import numpy as np
import pandas as pd
def IntraSeries_2_DayReturn(series):
    '''
    Computes the day return of a time series representing intraday prices.
    For every calendar day with at least one observation, the return is
    log(last price of the day) - log(first price of the day).
    INPUTS:
    - series: pandas Series,
        the time series of intraday prices, indexed by a DatetimeIndex
    OUTPUTS:
    - output: numpy array,
        the daily log returns (one entry per non-empty day)
    Example of usage
    ----------------
    .. code-block:: python
        import pandas as pd
        from utils import IntraSeries_2_DayReturn
        df = pd.read_csv('stock.csv')
        df.index = pd.to_datetime(df.index)
        daily_ret = IntraSeries_2_DayReturn(df.Close) #Compute the daily log returns
    '''
    output = list() #Initialize the output
    # Iterate directly over the daily groups instead of re-slicing the series by hand
    for _, temp_y in series.groupby(pd.Grouper(freq='D')):
        if len(temp_y) > 0: #If the daily time series is not empty, compute the log return
            # .iloc is required for positional access: plain [] on a datetime-indexed
            # Series is label-based, and the positional fallback was removed in pandas 2.x
            output.append(np.log(temp_y.iloc[-1]) - np.log(temp_y.iloc[0]))
    return np.array(output)
[docs]
class Subordinator():
'''
Subordinator class. It is intended to be used on each day separately.
Parameters:
----------------
- c: int
the number of time indexes to sample
- sub_type: str, optional
Type of subordinator to use. Either 'clock' for the clock time; 'tpv' for the TriPower Variation;
'vol' for the volume; or 'identity' for the identity (that is, all the indexes are returned). Default is 'clock'
Example of usage
----------------
.. code-block:: python
import numpy as np
import pandas as pd
from utils import subordinator
df = pd.read_csv('stock.csv')
df.index = pd.to_datetime(df.index)
day_price = df.Close
day_vol = df.Volume
subordinator = Subordinator(78, sub_type='vol') #Initialize the subordinator
target_idxs = day_price.iloc[subordinator.predict(day_price, vol=day_vol)] #Extract the subordinated indexes
log_ret = np.log(target_points).diff().dropna() #Compute the subordinated logarithmic returns
Methods:
----------------
'''
def __init__(self, c, sub_type='clock'):
self.c = c+1
self.sub_type = sub_type
[docs]
def sample_with_tie_break(self, a):
'''
Ensure the sampled time indexes are unique (i.e., not overlapping). That is, it manage the tie-break rule for transofrming a non-injective function into an injective subordinator.
INPUTS:
- a: ndarray
the cumulative intensities
OUTPUTS:
- indices: ndarray
the indexes corresponding to the subordinated values
'''
indices = np.searchsorted(
a, np.linspace(0, a[-1], self.c-1), side='left') # Initialize an attempt of indices
for i in range(1, len(indices)): #Avoid duplicates - forward pass
if indices[i] == indices[i - 1]:
indices[i] += 1
if indices[-1] >= len(a)-1: #Remove values outside the data index
indices[-1] = len(a) - 2
for i in range(len(indices)-1, 1, -1): #Avoid all the repretitions - backward pass
if indices[i] <= indices[i - 1]:
indices[i - 1] = indices[i] - 1
return indices
[docs]
def predict(self, data, vol=None, tpv_int_min=15):
'''
Returns the index position of the subordinated values. That is, return the vector \tau(j), with j=0,..,c, where \tau is intended to be the subordinator.
INPUTS:
- data: pd.Series
the time series of intra-day prices, over all the day (that is, from 09:30 to 16:00)
- tpv_int_min: int, optional
half-length of the window used for computing the tri-power variation. Only used when self.sub_type == 'tpv'. Default is 15
- vol: pd.Series, optional
the volume series, on the same indexes as data. Only used when self.sub_type == 'vol'
OUTPUTS:
- sub_idxs: list
the indexes corresponding to the subordinated values
'''
from datetime import datetime
# Distinguish different situations according to self.sub_type
if self.sub_type == 'clock':
sub_idxs = [int(
len(data.index) / (self.c-1) * i
) for i in range(self.c-1)] + [len(data.index)-1]
return sub_idxs
elif self.sub_type == 'identity':
sub_idxs = range(len(data.index))
return sub_idxs
elif self.sub_type == 'tpv':
cte = 1.9357924048803463 #Normalization constant
# Compute the TriPower Variation intensity
tpv = list()
for t in data.index: #Iterate over every minute
temp_data = data[(data.index>=t-pd.Timedelta(minutes=tpv_int_min)) &\
(data.index<=t+pd.Timedelta(minutes=tpv_int_min))].values #Isolate the time window
temp_data = temp_data[1:] - temp_data[:-1] #Compute the price delta inside each window
temp_data = temp_data[ temp_data != 0 ] #Remove zeros
if len(temp_data) < 3: #If there are not enough values for computing the tpv, just return 0
tpv.append(0)
else:
temp_tpv = 0 #Compute the sum for the tpv
for j in range(2, len(temp_data)):
temp_tpv += np.abs(temp_data[j-2])**(2/3) *\
np.abs(temp_data[j-1])**(2/3) *\
np.abs(temp_data[j])**(2/3)
tpv.append(temp_tpv * cte) #Normalize
tpv = np.array(tpv).cumsum() #From intensities to cumulative intensities
# Compute the subordinator as equispaced points in the TPV
sub_idxs = list(self.sample_with_tie_break(tpv)) + [len(data.index)-1]
return sub_idxs
elif self.sub_type == 'vol':
t_vol = np.where(vol.values>=0, vol.values, 0).cumsum() #Clean potential missing/corrupted values
# Compute the subordinator as equispaced points in the vol
sub_idxs = list(self.sample_with_tie_break(t_vol)) + [len(data.index)-1]
return sub_idxs
def Fill_RTH_Minutes(df):
    '''
    Data preprocessing. Given a pandas dataframe, it fills missing values in the Regular Trading Hours (RTH) (09:30 - 16:00).
    Only the days where there is at least one observation are used, as they are assumed to be all and only the working days.
    The input dataframe is not modified: all the work is done on a copy.
    INPUTS:
    - df: pandas DataFrame,
        the dataframe, made up of minute-by-minute values, with a DatetimeIndex and
        'Open', 'High', 'Low', 'Close', 'Volume' columns
    OUTPUTS:
    - df: pandas DataFrame,
        the preprocessed dataframe, with no holes and filled missing values.
    Example of usage
    ----------------
    .. code-block:: python
        import pandas as pd
        from utils import Fill_RTH_Minutes
        df = pd.read_csv('stock.csv')
        df.index = pd.to_datetime(df.index)
        df_full = Fill_RTH_Minutes(df) #Data Preprocessing
    '''
    df = df.copy() #Work on a copy so the caller's dataframe is not mutated
    full_days = pd.to_datetime(df.index.date).unique() #Obtain the list of days
    rth_minutes = pd.date_range('09:30', '16:00', freq='min').time # Minute grid over RTH ('min' replaces the deprecated 'T' alias)
    full_index = pd.MultiIndex.from_product([full_days, rth_minutes], names=['Day', 'Minute']) # Create a MultiIndex for all possible day-minute combinations
    # Re-index df by ('Day', 'Minute'); pd.to_datetime normalizes the Day level so it
    # aligns with the Timestamp level of full_index (raw datetime.date objects would not match)
    df['Day'] = pd.to_datetime(df.index.date)
    df['Minute'] = df.index.time
    df.set_index(['Day', 'Minute'], inplace=True)
    # Reindex the DataFrame to include all possible minutes
    df = df.reindex(full_index)
    df[['Open', 'High', 'Low', 'Close']] = df[['Open', 'High', 'Low', 'Close']].ffill() # Forward fill the prices (.ffill() replaces the deprecated fillna(method='ffill'))
    df['Volume'] = df['Volume'].fillna(0) # Set Volume to 0 for newly filled minutes
    df.reset_index(inplace=True) # Back to 'Day' and 'Minute' columns
    # Rebuild the minute-by-minute timestamp index from the two levels
    df['timestamp'] = pd.to_datetime(df['Day'].dt.strftime('%Y-%m-%d') + ' ' + df['Minute'].astype(str))
    df.set_index('timestamp', inplace=True)
    df.drop(columns=['Day', 'Minute'], inplace=True) # Drop the helper columns
    return df
def price2params(y, c, mu=0., sub_type='clock', vol=None):
    '''
    Fit the intra-day distribution, which is assumed to be a Student's t-distribution.
    INPUTS:
    - y: pandas Series,
        the price time series, with a minute-by-minute granularity, over all the considered period (e.g., one year of data)
    - c: int
        the number of time indexes to sample
    - mu: float, optional
        the intra-day mean. It could be either a float or None. In the latter case, it will be estimated from the data.
        It is preferable to not set mu=None. If you don't have a reliable estimate for it, simply use 0. Default is 0
    - sub_type: str, optional
        Type of subordinator to use. Either 'clock' for the clock time; 'tpv' for the TriPower Variation;
        'vol' for the volume; or 'identity' for the identity (that is, all the indexes are returned). Default is 'clock'
    - vol: pd.Series, optional
        the volume series, on the same indexes as data. Only used when sub_type == 'vol'
    OUTPUTS:
    - out_pars: dict,
        the fitted parameters. Every key corresponds to a day of the y series. Every value is a list containing nu, mu, sigma.
    Example of usage
    ----------------
    .. code-block:: python
        import pandas as pd
        from utils import price2params
        df = pd.read_csv('stock.csv')
        df.index = pd.to_datetime(df.index)
        price = df.Close
        vol = df.Volume
        fitted_pars = price2params(price, c=78, sub_type='vol', vol=vol)
    '''
    from scipy.stats import t as stud_t #Load the Student's t-distribution
    subordinator = Subordinator(c, sub_type=sub_type) #Initialize the subordinator
    out_pars = dict() #Initialize the output dictionary
    for day, _ in y.groupby(pd.Grouper(freq='D')): #Iterate over the days
        # Isolate the day to work with
        end_day = day + pd.Timedelta(days=1)
        temp_y = y[(y.index >= day) & (y.index < end_day)]
        # If available, isolate the volume of the day as well
        temp_vol = None if vol is None else vol[(vol.index >= day) & (vol.index < end_day)]
        if len(temp_y) > 0: #If it is a working day
            target_points = temp_y.iloc[subordinator.predict(temp_y, vol=temp_vol)] #Subordinated prices
            log_ret = np.log(target_points).diff().dropna() #Subordinated logarithmic returns
            if mu is None: #If mu is not given, it has to be estimated
                temp_out = list(stud_t.fit(log_ret))
                if temp_out[0] < 2+1e-6: #Cap on the degrees of freedom nu
                    temp_out = list(stud_t.fit(log_ret, f0=2+1e-6))
            else: #If mu is given, fit the other parameters by keeping it fixed
                temp_out = list(stud_t.fit(log_ret, floc=mu))
                if temp_out[0] < 2+1e-6: #Cap on the degrees of freedom nu
                    temp_out = list(stud_t.fit(log_ret, f0=2+1e-6, floc=mu))
            if temp_out[2] < 1e-6: #Floor on the scale parameter sigma
                temp_out[2] = 1e-6
            out_pars[day] = temp_out #Add the results to the output dictionary
    return out_pars
def price2params_ma(y, c, mu=0, sub_type='clock', vol=None):
    '''
    Fit the intra-day distribution, which is assumed to be a MA(1) process with Student's t innovations.
    INPUTS:
    - y: pandas Series,
        the price time series, with a minute-by-minute granularity, over all the considered period (e.g., one year of data)
    - c: int
        the number of time indexes to sample
    - mu: float, optional
        the intra-day mean. It could be either a float or None. In the latter case, it will be estimated from the data.
        It is preferable to not set mu=None. If you don't have a reliable estimate for it, simply use 0. Default is 0
    - sub_type: str, optional
        Type of subordinator to use. Either 'clock' for the clock time; 'tpv' for the TriPower Variation;
        'vol' for the volume; or 'identity' for the identity (that is, all the indexes are returned). Default is 'clock'
    - vol: pd.Series, optional
        the volume series, on the same indexes as data. Only used when sub_type == 'vol'
    OUTPUTS:
    - out_pars: dict,
        the fitted parameters. Every key corresponds to a day of the y series. Every value is a list
        containing phi, nu, mu, sigma: the MA(1) coefficient followed by the Student's t parameters of the innovations.
    Example of usage
    ----------------
    .. code-block:: python
        import pandas as pd
        from utils import price2params_ma
        df = pd.read_csv('stock.csv')
        df.index = pd.to_datetime(df.index)
        price = df.Close
        vol = df.Volume
        fitted_pars = price2params_ma(price, c=39, sub_type='vol', vol=vol)
    '''
    from scipy.stats import t as stud_t #Load the Student's t-distribution
    from scipy.optimize import minimize, Bounds #Load function for minimization
    def my_t_fit(obs, mu): #Fit the innovations to a Student's t-distribution
        if mu is None: #If mu is not given, it has to be estimated
            temp_out = list(stud_t.fit(obs))
            if temp_out[0] < 2+1e-6: #Cap on the degrees of freedom nu
                temp_out = list(stud_t.fit(obs, f0=2+1e-6))
        else: #If mu is given, fit the other parameters by keeping it fixed
            temp_out = list(stud_t.fit(obs, floc=mu))
            if temp_out[0] < 2+1e-6: #Cap on the degrees of freedom nu
                temp_out = list(stud_t.fit(obs, f0=2+1e-6, floc=mu))
        if temp_out[2] < 1e-6: #Floor on the scale parameter sigma
            temp_out[2] = 1e-6
        return temp_out
    def ma_resids(y, phi, xi0): #Filter the residuals of the MA(1) process
        inn = [xi0] #Initialize the residuals with the starting innovation
        for y_curr in y: #Iterate over the observations
            inn.append(y_curr - phi*inn[-1])
        return inn
    def ma_neg_ll(params, y, mu): #Negative log-likelihood of the MA(1) process
        phi, xi0 = params
        inn = ma_resids(y, phi, xi0) #Filter the residuals
        if mu is None: #If mu is not given, it has to be estimated
            return - stud_t.logpdf(inn, *my_t_fit(inn, None)).sum() #Return the negative log-likelihood
        else: #If mu is given, it has to be adjusted according to phi
            return - stud_t.logpdf(inn, *my_t_fit(inn, mu/(1+phi))).sum() #Return the negative log-likelihood
    starting_point = [-0.05, 0] #Starting point for the minimization: phi=-0.05, xi0=0
    bounds = Bounds([-0.1, -1], [0.1, 1]) #Bounds for the parameters
    subordinator = Subordinator(c, sub_type=sub_type) #Initialize the subordinator
    out_pars = dict() #Initialize the output dictionary
    for day, _ in y.groupby(pd.Grouper(freq='D')): #Iterate over the days
        # Isolate the day to work with
        end_day = day + pd.Timedelta(days=1)
        temp_y = y[(y.index >= day) & (y.index < end_day)]
        # If available, isolate the volume of the day as well
        temp_vol = None if vol is None else vol[(vol.index >= day) & (vol.index < end_day)]
        if len(temp_y) > 0: #If it is a working day
            target_points = temp_y.iloc[subordinator.predict(temp_y, vol=temp_vol)] #Subordinated prices
            log_ret = np.log(target_points).diff().dropna() #Subordinated logarithmic returns
            temp_out = minimize(
                ma_neg_ll, starting_point, args=(log_ret, mu),
                method='SLSQP', bounds=bounds).x #Find the optimal phi and starting innovation
            temp_out = [temp_out[0]] + list(
                my_t_fit(ma_resids(log_ret, *temp_out), mu)) #Concatenate phi and the innovations' parameters
            out_pars[day] = temp_out #Add the results to the output dictionary
    return out_pars