Source code for utils


import numpy as np
import pandas as pd

def IntraSeries_2_DayReturn(series):
    '''
    Computes the daily return of a time series of intraday prices.

    INPUTS:
        - series: pandas Series,
            the time series of intraday prices
    OUTPUTS:
        - output: numpy array,
            the daily log returns

    Example of usage
    ----------------
    .. code-block:: python

        import pandas as pd
        from utils import IntraSeries_2_DayReturn

        df = pd.read_csv('stock.csv')
        df.index = pd.to_datetime(df.index)
        daily_ret = IntraSeries_2_DayReturn(df.Close) #Compute the daily log returns
    '''
    output = list() #Initialize the output
    for day, _ in series.groupby(pd.Grouper(freq='D')): #Iterate over each day
        end_day = day + pd.Timedelta(days=1)
        temp_y = series[(series.index >= day) & (series.index < end_day)] #Isolate the daily time series
        if len(temp_y) > 0: #If the daily time series is not empty, compute the log return
            output.append(np.log(temp_y.iloc[-1]) - np.log(temp_y.iloc[0])) #Positional access via iloc
    output = np.array(output)
    return output
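
# Illustrative sketch (not part of the original module): exercising
# IntraSeries_2_DayReturn on synthetic minute data. The date, seed, and price
# path below are assumptions made purely for the example.
def _demo_intraseries_2_dayreturn():
    rng = np.random.default_rng(0) #Fixed seed, chosen arbitrarily
    idx = pd.date_range('2024-01-02 09:30', '2024-01-02 16:00', freq='min') #One synthetic trading day
    prices = pd.Series(100 * np.exp(rng.normal(0, 1e-4, len(idx)).cumsum()), index=idx) #Toy price path
    ret = IntraSeries_2_DayReturn(prices) #One log return per day; here, a length-1 array
    assert np.isclose(ret[0], np.log(prices.iloc[-1]) - np.log(prices.iloc[0]))
    return ret
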
class Subordinator():
    '''
    Subordinator class. It is intended to be used on each day separately.

    Parameters:
    ----------------
        - c: int
            the number of time indexes to sample
        - sub_type: str, optional
            Type of subordinator to use. Either 'clock' for the clock time;
            'tpv' for the TriPower Variation; 'vol' for the volume; or
            'identity' for the identity (that is, all the indexes are returned).
            Default is 'clock'

    Example of usage
    ----------------
    .. code-block:: python

        import numpy as np
        import pandas as pd
        from utils import Subordinator

        df = pd.read_csv('stock.csv')
        df.index = pd.to_datetime(df.index)
        day_price = df.Close
        day_vol = df.Volume

        subordinator = Subordinator(78, sub_type='vol') #Initialize the subordinator
        target_points = day_price.iloc[subordinator.predict(day_price, vol=day_vol)] #Extract the subordinated prices
        log_ret = np.log(target_points).diff().dropna() #Compute the subordinated logarithmic returns

    Methods:
    ----------------
    '''
    def __init__(self, c, sub_type='clock'):
        self.c = c+1
        self.sub_type = sub_type
    def sample_with_tie_break(self, a):
        '''
        Ensure the sampled time indexes are unique (i.e., not overlapping).
        That is, it manages the tie-break rule for transforming a non-injective
        function into an injective subordinator.

        INPUTS:
            - a: ndarray
                the cumulative intensities
        OUTPUTS:
            - indices: ndarray
                the indexes corresponding to the subordinated values
        '''
        indices = np.searchsorted(
            a, np.linspace(0, a[-1], self.c-1), side='left') #Initialize an attempt of indices
        for i in range(1, len(indices)): #Avoid duplicates - forward pass
            if indices[i] == indices[i - 1]:
                indices[i] += 1
        if indices[-1] >= len(a)-1: #Remove values outside the data index
            indices[-1] = len(a) - 2
        for i in range(len(indices)-1, 1, -1): #Avoid all the repetitions - backward pass
            if indices[i] <= indices[i - 1]:
                indices[i - 1] = indices[i] - 1
        return indices
    def predict(self, data, vol=None, tpv_int_min=15):
        r'''
        Returns the index positions of the subordinated values. That is, it
        returns the vector \tau(j), with j = 0, ..., c, where \tau is intended
        to be the subordinator.

        INPUTS:
            - data: pd.Series
                the time series of intra-day prices, over the whole day (that is,
                from 09:30 to 16:00)
            - vol: pd.Series, optional
                the volume series, on the same indexes as data. Only used when
                self.sub_type == 'vol'
            - tpv_int_min: int, optional
                half-length of the window used for computing the tri-power
                variation. Only used when self.sub_type == 'tpv'. Default is 15
        OUTPUTS:
            - sub_idxs: list
                the indexes corresponding to the subordinated values
        '''
        # Distinguish different situations according to self.sub_type
        if self.sub_type == 'clock':
            sub_idxs = [int( len(data.index) / (self.c-1) * i ) for i in range(self.c-1)] + [len(data.index)-1]
            return sub_idxs
        elif self.sub_type == 'identity':
            sub_idxs = range(len(data.index))
            return sub_idxs
        elif self.sub_type == 'tpv':
            cte = 1.9357924048803463 #Normalization constant
            # Compute the TriPower Variation intensity
            tpv = list()
            for t in data.index: #Iterate over every minute
                temp_data = data[(data.index >= t-pd.Timedelta(minutes=tpv_int_min)) &
                                 (data.index <= t+pd.Timedelta(minutes=tpv_int_min))].values #Isolate the time window
                temp_data = temp_data[1:] - temp_data[:-1] #Compute the price delta inside each window
                temp_data = temp_data[ temp_data != 0 ] #Remove zeros
                if len(temp_data) < 3: #If there are not enough values for computing the tpv, just return 0
                    tpv.append(0)
                else:
                    temp_tpv = 0 #Compute the sum for the tpv
                    for j in range(2, len(temp_data)):
                        temp_tpv += np.abs(temp_data[j-2])**(2/3) *\
                            np.abs(temp_data[j-1])**(2/3) *\
                            np.abs(temp_data[j])**(2/3)
                    tpv.append(temp_tpv * cte) #Normalize
            tpv = np.array(tpv).cumsum() #From intensities to cumulative intensities
            # Compute the subordinator as equispaced points in the TPV
            sub_idxs = list(self.sample_with_tie_break(tpv)) + [len(data.index)-1]
            return sub_idxs
        elif self.sub_type == 'vol':
            t_vol = np.where(vol.values >= 0, vol.values, 0).cumsum() #Clean potential missing/corrupted values
            # Compute the subordinator as equispaced points in the vol
            sub_idxs = list(self.sample_with_tie_break(t_vol)) + [len(data.index)-1]
            return sub_idxs
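
# Illustrative sketch (not part of the original module): comparing the 'clock'
# and 'vol' subordinators on one synthetic trading day. The synthetic prices
# and volumes are assumptions for the example; c=10 is arbitrary.
def _demo_subordinator():
    rng = np.random.default_rng(1) #Fixed seed, chosen arbitrarily
    idx = pd.date_range('2024-01-02 09:30', '2024-01-02 16:00', freq='min')
    day_price = pd.Series(100 * np.exp(rng.normal(0, 1e-4, len(idx)).cumsum()), index=idx) #Toy prices
    day_vol = pd.Series(rng.integers(0, 1000, len(idx)).astype(float), index=idx) #Toy volumes
    for sub_type in ('clock', 'vol'):
        sub = Subordinator(10, sub_type=sub_type) #10 subordinated returns per day
        target_points = day_price.iloc[sub.predict(day_price, vol=day_vol)] #Subordinated prices
        log_ret = np.log(target_points).diff().dropna() #Subordinated log returns
        print(sub_type, len(log_ret)) #Both subordinators yield 10 returns, at different time indexes
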
def Fill_RTH_Minutes(df):
    '''
    Data preprocessing. Given a pandas dataframe, it fills missing values in the
    Regular Trading Hours (RTH) (09:30 - 16:00). Only the days where there is at
    least one observation are used, as they are assumed to be all and only the
    working days.

    INPUTS:
        - df: pandas DataFrame,
            the dataframe, made up of minute-by-minute values
    OUTPUTS:
        - df: pandas DataFrame,
            the preprocessed dataframe, with no holes and filled missing values.

    Example of usage
    ----------------
    .. code-block:: python

        import pandas as pd
        from utils import Fill_RTH_Minutes

        df = pd.read_csv('stock.csv')
        df.index = pd.to_datetime(df.index)
        df_full = Fill_RTH_Minutes(df) #Data Preprocessing
    '''
    full_days = pd.Index(df.index.date).unique() #Obtain the list of days (as dates, to match the 'Day' column below)
    rth_minutes = pd.date_range('09:30', '16:00', freq='min').time # Create a time range for all minutes during RTH
    full_index = pd.MultiIndex.from_product([full_days, rth_minutes], names=['Day', 'Minute']) # Create a MultiIndex for all possible day-minute combinations
    # Re-index df by 'Day' and 'Minute'
    df['Day'] = df.index.date
    df['Minute'] = df.index.time
    df.set_index(['Day', 'Minute'], inplace=True)
    # Reindex the DataFrame to include all possible minutes
    df = df.reindex(full_index)
    df[['Open', 'High', 'Low', 'Close']] = df[['Open', 'High', 'Low', 'Close']].ffill() # Forward fill the price columns
    df['Volume'] = df['Volume'].fillna(0) # Set Volume to 0 for the newly filled minutes
    df.reset_index(inplace=True) # Reset index to have 'Day' and 'Minute' as columns
    # Rebuild the datetime index from the 'Day' and 'Minute' columns
    df['timestamp'] = pd.to_datetime(df['Day'].astype(str) + ' ' + df['Minute'].astype(str))
    df.set_index('timestamp', inplace=True)
    df.drop(columns=['Day', 'Minute'], inplace=True) # Drop 'Day' and 'Minute', not needed anymore
    return df
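
# Illustrative sketch (not part of the original module): filling a one-day
# OHLCV frame with some missing minutes. The toy data below are assumptions;
# note that Fill_RTH_Minutes also modifies the passed dataframe in place
# (it sets a temporary ('Day', 'Minute') index on it).
def _demo_fill_rth_minutes():
    rng = np.random.default_rng(2) #Fixed seed, chosen arbitrarily
    idx = pd.date_range('2024-01-02 09:30', '2024-01-02 16:00', freq='min')
    idx = idx.delete([5, 6, 100]) #Drop some minutes to create holes
    close = 100 * np.exp(rng.normal(0, 1e-4, len(idx)).cumsum()) #Toy price path
    df = pd.DataFrame({'Open': close, 'High': close, 'Low': close,
                       'Close': close, 'Volume': 500.0}, index=idx)
    df_full = Fill_RTH_Minutes(df)
    assert len(df_full) == 391 #One row per RTH minute (09:30 - 16:00, inclusive)
    return df_full
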
def price2params(y, c, mu=0., sub_type='clock', vol=None):
    '''
    Fit the intra-day distribution, which is assumed to be a Student's
    t-distribution.

    INPUTS:
        - y: pandas Series,
            the price time series, with a minute-by-minute granularity, over all
            the considered period (e.g., one year of data)
        - c: int
            the number of time indexes to sample
        - mu: float,
            the intra-day mean. It could be either a float or None. In the latter
            case, it will be estimated from the data. It is preferable not to set
            mu=None: if you don't have a reliable estimate for it, simply use 0.
            Default is 0
        - sub_type: str, optional
            Type of subordinator to use. Either 'clock' for the clock time;
            'tpv' for the TriPower Variation; 'vol' for the volume; or
            'identity' for the identity (that is, all the indexes are returned).
            Default is 'clock'
        - vol: pd.Series, optional
            the volume series, on the same indexes as data. Only used when
            sub_type == 'vol'
    OUTPUTS:
        - out_pars: dict,
            the fitted parameters. Every key corresponds to a day of the y
            series. Every value is a list containing nu, mu, sigma.

    Example of usage
    ----------------
    .. code-block:: python

        import pandas as pd
        from utils import price2params

        df = pd.read_csv('stock.csv')
        df.index = pd.to_datetime(df.index)
        price = df.Close
        vol = df.Volume
        fitted_pars = price2params(price, c=78, sub_type='vol', vol=vol)
    '''
    from scipy.stats import t as stud_t #Load the Student's t-distribution

    subordinator = Subordinator(c, sub_type=sub_type) #Initialize the subordinator
    out_pars = dict() #Initialize the output dictionary
    for day, _ in y.groupby(pd.Grouper(freq='D')): #Iterate over the days
        # Isolate the day to work with
        end_day = day + pd.Timedelta(days=1)
        temp_y = y[(y.index >= day) & (y.index < end_day)]
        # If available, add the volume
        if isinstance(vol, type(None)):
            temp_vol = None
        else:
            temp_vol = vol[(vol.index >= day) & (vol.index < end_day)]
        if len(temp_y) > 0: #If it is a working day
            target_points = temp_y.iloc[subordinator.predict(temp_y, vol=temp_vol)] #Subordinated prices
            log_ret = np.log(target_points).diff().dropna() #Subordinated logarithmic returns
            if isinstance(mu, type(None)): #If mu is not given, it has to be estimated
                temp_out = list(stud_t.fit(log_ret))
                if temp_out[0] < 2+1e-6: #Cap on the degrees of freedom nu
                    temp_out = list(stud_t.fit(log_ret, f0=2+1e-6))
                if temp_out[2] < 1e-6: #Cap on the standard deviation
                    temp_out[2] = 1e-6
            else: #If mu is given, fit the other parameters by keeping it fixed
                temp_out = list(stud_t.fit(log_ret, floc=mu))
                if temp_out[0] < 2+1e-6: #Cap on the degrees of freedom nu
                    temp_out = list(stud_t.fit(log_ret, f0=2+1e-6, floc=mu))
                if temp_out[2] < 1e-6: #Cap on the standard deviation
                    temp_out[2] = 1e-6
            out_pars[day] = temp_out #Add the results to the output dictionary
    return out_pars
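
# Illustrative sketch (not part of the original module): fitting the daily
# Student's t parameters on synthetic data and sampling from one fitted day.
# The synthetic series and c=10 are assumptions for the example.
def _demo_price2params():
    from scipy.stats import t as stud_t
    rng = np.random.default_rng(3) #Fixed seed, chosen arbitrarily
    idx = pd.date_range('2024-01-02 09:30', '2024-01-02 16:00', freq='min')
    price = pd.Series(100 * np.exp(rng.normal(0, 1e-4, len(idx)).cumsum()), index=idx) #Toy price path
    fitted_pars = price2params(price, c=10) #Default clock subordinator, mu fixed to 0
    day, (nu, mu_hat, sigma) = next(iter(fitted_pars.items())) #Parameters of the first day
    draws = stud_t.rvs(nu, loc=mu_hat, scale=sigma, size=5, random_state=0) #Sample subordinated returns
    return day, (nu, mu_hat, sigma), draws
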
def price2params_ma(y, c, mu=0, sub_type='clock', vol=None):
    '''
    Fit the intra-day distribution, which is assumed to be a MA(1) process with
    Student's t innovations.

    INPUTS:
        - y: pandas Series,
            the price time series, with a minute-by-minute granularity, over all
            the considered period (e.g., one year of data)
        - c: int
            the number of time indexes to sample
        - mu: float,
            the intra-day mean. It could be either a float or None. In the latter
            case, it will be estimated from the data. It is preferable not to set
            mu=None: if you don't have a reliable estimate for it, simply use 0.
            Default is 0
        - sub_type: str, optional
            Type of subordinator to use. Either 'clock' for the clock time;
            'tpv' for the TriPower Variation; 'vol' for the volume; or
            'identity' for the identity (that is, all the indexes are returned).
            Default is 'clock'
        - vol: pd.Series, optional
            the volume series, on the same indexes as data. Only used when
            sub_type == 'vol'
    OUTPUTS:
        - out_pars: dict,
            the fitted parameters. Every key corresponds to a day of the y
            series. Every value is a list containing phi, nu, mu, sigma.

    Example of usage
    ----------------
    .. code-block:: python

        import pandas as pd
        from utils import price2params_ma

        df = pd.read_csv('stock.csv')
        df.index = pd.to_datetime(df.index)
        price = df.Close
        vol = df.Volume
        fitted_pars = price2params_ma(price, c=39, sub_type='vol', vol=vol)
    '''
    from scipy.stats import t as stud_t #Load the Student's t-distribution
    from scipy.optimize import minimize, Bounds #Load the functions for the minimization

    def my_t_fit(obs, mu): #Fit the innovations to a Student's t-distribution
        if isinstance(mu, type(None)): #If mu is not given, it has to be estimated
            temp_out = list(stud_t.fit(obs))
            if temp_out[0] < 2+1e-6: #Cap on the degrees of freedom nu
                temp_out = list(stud_t.fit(obs, f0=2+1e-6))
            if temp_out[2] < 1e-6: #Cap on the standard deviation
                temp_out[2] = 1e-6
        else: #If mu is given, fit the other parameters by keeping it fixed
            temp_out = list(stud_t.fit(obs, floc=mu))
            if temp_out[0] < 2+1e-6: #Cap on the degrees of freedom nu
                temp_out = list(stud_t.fit(obs, f0=2+1e-6, floc=mu))
            if temp_out[2] < 1e-6: #Cap on the standard deviation
                temp_out[2] = 1e-6
        return temp_out

    def ma_resids(y, phi, xi0): #Filter the residuals of the MA(1) process
        inn = [xi0] #Initialize the residuals
        for y_curr in y: #Iterate over the observations
            inn.append( y_curr - phi*inn[-1] )
        return inn

    def ma_neg_ll(params, y, mu): #Negative log-likelihood of the MA(1) process
        phi, xi0 = params
        inn = ma_resids(y, phi, xi0) #Filter the residuals
        if isinstance(mu, type(None)): #If mu is not given, it has to be estimated
            return - stud_t.logpdf(inn, *my_t_fit(inn, None)).sum() #Return the negative log-likelihood
        else: #If mu is given, it has to be adjusted according to phi
            return - stud_t.logpdf(inn, *my_t_fit(inn, mu/(1+phi))).sum() #Return the negative log-likelihood

    starting_point = [-0.05, 0] #Starting point for the minimization: phi=-0.05, xi0=0
    bounds = Bounds([-0.1, -1], [0.1, 1]) #Bounds for the parameters
    subordinator = Subordinator(c, sub_type=sub_type) #Initialize the subordinator
    out_pars = dict() #Initialize the output dictionary
    for day, _ in y.groupby(pd.Grouper(freq='D')): #Iterate over the days
        # Isolate the day to work with
        end_day = day + pd.Timedelta(days=1)
        temp_y = y[(y.index >= day) & (y.index < end_day)]
        # If available, add the volume
        if isinstance(vol, type(None)):
            temp_vol = None
        else:
            temp_vol = vol[(vol.index >= day) & (vol.index < end_day)]
        if len(temp_y) > 0: #If it is a working day
            target_points = temp_y.iloc[subordinator.predict(temp_y, vol=temp_vol)] #Subordinated prices
            log_ret = np.log(target_points).diff().dropna() #Subordinated logarithmic returns
            temp_out = minimize(
                ma_neg_ll, starting_point, args=(log_ret, mu),
                method='SLSQP', bounds=bounds).x #Find the optimal phi and starting residual
            temp_out = [temp_out[0]] + list(
                my_t_fit(ma_resids(log_ret, *temp_out), mu)) #Concatenate phi and the innovations' parameters
            out_pars[day] = temp_out #Add the results to the output dictionary
    return out_pars
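
# Illustrative sketch (not part of the original module): fitting the MA(1)
# model on synthetic data and simulating one day of subordinated returns from
# the fitted parameters. The residual filter above implies the model
# y_t = xi_t + phi * xi_{t-1}, with Student's t innovations xi_t; the
# synthetic series and c=10 are assumptions for the example.
def _demo_price2params_ma():
    from scipy.stats import t as stud_t
    rng = np.random.default_rng(4) #Fixed seed, chosen arbitrarily
    idx = pd.date_range('2024-01-02 09:30', '2024-01-02 16:00', freq='min')
    price = pd.Series(100 * np.exp(rng.normal(0, 1e-4, len(idx)).cumsum()), index=idx) #Toy price path
    fitted_pars = price2params_ma(price, c=10) #Default clock subordinator, mu fixed to 0
    day, (phi, nu, mu_xi, sigma) = next(iter(fitted_pars.items())) #Parameters of the first day
    xi = stud_t.rvs(nu, loc=mu_xi, scale=sigma, size=11, random_state=0) #Simulated innovations
    sim_ret = xi[1:] + phi * xi[:-1] #Simulated MA(1) subordinated returns
    return day, (phi, nu, mu_xi, sigma), sim_ret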