Module scrilla.analysis.estimators
A module of statistical point estimators and likelihood functions.
Expand source code
# This file is part of scrilla: https://github.com/chinchalinchin/scrilla.
# scrilla is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 3
# as published by the Free Software Foundation.
# scrilla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with scrilla. If not, see <https://www.gnu.org/licenses/>
# or <https://github.com/chinchalinchin/scrilla/blob/develop/main/LICENSE>.
"""
A module of statistical point estimators and likelihood functions.
"""
from os import path
from sys import path as sys_path
from typing import List, Union
from numpy import inf
from math import log, sqrt, exp
from scipy.stats import norm, multivariate_normal
if __name__ == "__main__":
APP_DIR = path.dirname(path.dirname(path.abspath(__file__)))
sys_path.append(APP_DIR)
from scrilla import settings, errors, cache
from scrilla.static.constants import constants
import scrilla.util.outputter as outputter
logger = outputter.Logger('scrilla.analysis.estimators', settings.LOG_LEVEL)
profile_cache = cache.ProfileCache()
correlation_cache = cache.CorrelationCache()
def univariate_normal_likelihood_function(params: list, data: list) -> float:
    """
    Returns the log-likelihood of a vector of parameters being observed from a sample of univariate normal data. It can be used as objective function input for `scipy.optimize`'s optimization methods.

    Parameters
    ----------
    1. **params** : ``list``
        Array representing a vector of parameters, in this case the mean rate of return and volatility from a sample of data. Must be ordered: ``params[0]`` = mean, ``params[1]`` = standard deviation.
    2. **data** : ``list``
        A list of data that has been drawn from a univariate normal population.
    """
    # sum of log-densities is the log of the product of densities, i.e. the
    # log-likelihood of observing the whole sample
    return sum(norm.logpdf(x=point, loc=params[0], scale=params[1])
               for point in data)
def bivariate_normal_likelihood_function(params: list, data: list) -> float:
    r"""
    Returns the log-likelihood of a vector of parameters being observed from a sample of bivariate normal data. It can be used as objective function input for `scipy.optimize`'s optimization methods.

    Parameters
    ----------
    1. **params** : ``list``
        Array representing a vector of parameters, in this case the mean rates of return, volatilities and covariance for a bivariate normal distribution. *Important*: The vector must be ordered: 1. params[0] = \\(\mu_x\\), params[1]=\\(\mu_y\\), params[2] = \\(\sigma_x\\), params[3] = \\(\sigma_y\\), params[4] = \\(\rho_{xy} \cdot \sigma_x \cdot \sigma_y\\). The matrix is parameterized in this manner in order to interface more easily with `scipy.optimize.minimize`.
    2. **data** : ``list``
        A list of data that has been drawn from a bivariate normal population. Must be formatted in the following manner: `[ [x1,y1], [x2,y2],...]`

    .. notes::
        * the covariance matrix of a bivariate normal distribution must be positive semi-definite (PSD) and non-singular. PSD can be checked with the [Sylvester Criterion](https://en.wikipedia.org/wiki/Sylvester%27s_criterion) or [Cauchy-Schwarz Inequality](https://en.wikipedia.org/wiki/Cauchy%E2%80%93Schwarz_inequality#Probability_theory). Since sample variance will always be positive, this reduces to checking the determinant of the covariance matrix is greater than 0. This function will return `numpy.inf` if the covariance matrix is singular or non-positive semi-definite.
    """
    mean = [params[0], params[1]]
    cov = [[params[2], params[4]], [params[4], params[3]]]
    # determinant of the 2x2 covariance matrix; must be strictly positive for
    # the distribution to be non-singular
    determinant = params[2]*params[3] - params[4]**2
    # a single lower-bound comparison subsumes the zero/negative checks, since
    # the tolerance 10**(-ACCURACY) is itself positive
    if determinant < (10**(-constants['ACCURACY'])):
        return inf
    likelihood = 0
    for point in data:
        likelihood += multivariate_normal.logpdf(x=point, mean=mean, cov=cov)
    return likelihood
def sample_percentile(data: List[float], percentile: float):
    """
    Returns the observation in a sample of data corresponding to the given percentile, i.e. the observation from a sorted sample where the percentage of the observations below that point is specified by the percentile. If the percentile falls between data points, the observation is linearly interpolated between the two adjoining observations.

    Parameters
    ----------
    1. **data** : ``list``
        Array representing the set of data whose percentile is to be calculated.
    2. **percentile**: ``float``
        The percentile corresponding to the desired observation.
    """
    # sort a copy so the caller's list is not mutated as a side effect
    ordered = sorted(data)
    obs_number = (len(ordered) + 1)*percentile
    # clamp order statistics that fall outside the observed range; without the
    # lower clamp, small percentiles produced a negative index that blended the
    # LAST observation into the result
    if obs_number < 1:
        return ordered[0]
    if obs_number > len(ordered):
        return ordered[-1]
    first_index = int(obs_number) - 1
    weight = obs_number - int(obs_number)
    if weight == 0:
        return ordered[first_index]
    # interpolate between the two adjoining order statistics
    return (1 - weight)*ordered[first_index] + weight*ordered[first_index + 1]
def empirical_copula(sample: List[List[float]], x_order: float, y_order: float):
    r"""
    Computes an empirical estimate of the copula distribution for a bivariate sample, i.e.

    $$ C(u_{1}, u_{2}) = F_{c}(\textbf{X}_{1} < F_x^{-1}(u_{1}), \textbf{X}_{2} < F_y^{-1}(u_{2})) $$

    using the empirical estimate defined by,

    $$ C_{n} = \frac{\#\{ (x,y) \,\vert\, x \leq x_{p} \text{ and } y \leq y_{p} \}}{n} $$

    where \\(x_p\\) and \\(y_p\\) are the *p*-th percentiles of their respective univariate samples.
    """
    total = len(sample)
    # fraction of observations jointly dominated by the given order statistics
    dominated = sum(
        1 for point in sample
        if point[0] <= x_order and point[1] <= y_order
    )
    return dominated / total
def sample_correlation(x: List[float], y: List[float]):
    """
    Returns the sample correlation calculated using the Pearson correlation coefficient estimator,

    .. todo:: Pearson coefficient formula here

    Parameters
    ----------
    1. **x**: ``list``
        The *x* sample of paired data (*x*, *y*). Must preserve order with **y**.
    2. **y**: ``list``
        The *y* sample of paired data (*x*, *y*). Must preserve order with **x**.

    Raises
    ------
    1. `scrilla.errors.SampleSizeError` :
        If the sample sizes do not meet the requirements for estimation, this error will be thrown.
    2. **ValueError** :
        If the denominator of the correlation coefficient becomes too small for floating point arithmetic, this error is thrown.

    .. todos ::
        * Possibly wrap the correlation coefficient numerator and denominator in `Decimal` class before calculation to bypass the **ValueError** that occurs in some samples where the denominator is too small for the arithmetic to detect.
    """
    if len(x) != len(y):
        raise errors.SampleSizeError('Samples are not of comparable lengths')
    if len(x) in [0, 1]:
        raise errors.SampleSizeError(
            'Sample correlation cannot be computed for a sample size less than or equal to 1.')
    sumproduct, sum_x_squared, sum_x, sum_y, sum_y_squared = 0, 0, 0, 0, 0
    n = len(x)
    # single pass accumulating the five moments the Pearson estimator needs
    for i, item in enumerate(x):
        sumproduct += item*y[i]
        sum_x += item
        sum_x_squared += item**2
        sum_y += y[i]
        sum_y_squared += y[i]**2
    correl_num = ((n*sumproduct) - sum_x*sum_y)
    correl_den = sqrt((n*sum_x_squared-sum_x**2)*(n*sum_y_squared-sum_y**2))
    # Evaluate A/B as exp(log(A) - log(B)) to sidestep division issues when the
    # denominator is tiny. Only valid when both A and B are strictly positive,
    # since log of a non-positive number is undefined over the reals.
    if correl_num > 0 and correl_den > 0:
        log_correl = log(correl_num) - log(correl_den)
        correlation = exp(log_correl)
    else:
        if correl_den != 0:
            correlation = correl_num / correl_den
        else:
            raise ValueError(
                'Denominator for correlation formula too small for division')
    return correlation
def recursive_rolling_correlation(correl_previous, new_x_observation, lost_x_obs,
                                  new_y_obs, lost_y_obs, n=settings.DEFAULT_ANALYSIS_PERIOD):
    # TODO: unimplemented stub. Intended to update a rolling Pearson
    # correlation recursively as the window of size n slides forward (one
    # observation enters, one leaves). Currently returns None.
    pass
def sample_mean(x: List[float]) -> float:
    r"""
    Returns the sample mean from a sample of data \\(\{x_1 , x_2, ... , x_n \}\\),

    $$ \bar{x} = \frac{\sum_{i=1}^{n} x_i}{n} $$

    Parameters
    ----------
    1. **x**: ``List[Union[float,int]]``
        List containing a sample of numerical data.

    Raises
    ------
    1. **scrilla.errors.SampleSizeError**
        If ``len(x)==0``, this error will be thrown.
    2. **ValueError**
        If the sample contains null or non-numerical data, this error will be thrown.
    """
    if not all(this_x is not None and isinstance(this_x, (float, int)) for this_x in x):
        raise ValueError(
            'Sample contains null values')
    if len(x) == 0:
        raise errors.SampleSizeError(
            'Sample mean cannot be computed for a sample size of 0.')
    # a single division avoids the per-term rounding error of summing i/n
    return sum(x) / len(x)
def recursive_rolling_mean(xbar_previous, new_obs, lost_obs, n=settings.DEFAULT_ANALYSIS_PERIOD):
    """
    Recursively updates a rolling mean as the window of size *n* slides
    forward: the lost observation's contribution is swapped for the new
    observation's.
    """
    delta = (new_obs - lost_obs) / n
    return xbar_previous + delta
def sample_variance(x: List[float]):
    r"""
    Returns the sample variance from a sample of data \\(\{x_1 , x_2, ... , x_n \}\\),

    $$ s^2=\frac{\sum_{i=1}^{n} (x_i - \bar{x})^2}{n-1} $$

    Parameters
    ----------
    1. **x**: ``list``
        List containing a sample of numerical data.

    Raises
    ------
    1. `scrilla.errors.SampleSizeError`
        If ``len(x)`` is 0 or 1, i.e. there are insufficient degrees of freedom for the unbiased estimator.
    2. **ValueError**
        If the sample contains null or non-numerical data, this error will be thrown.
    """
    if not all(this_x is not None and isinstance(this_x, (float, int)) for this_x in x):
        raise ValueError(
            'Sample contains null values')
    n = len(x)
    # validate the sample size BEFORE computing the mean, so that degenerate
    # samples raise this function's error rather than sample_mean's
    if n in [0, 1]:
        raise errors.SampleSizeError(
            'Sample variance cannot be computed for a sample size less than or equal to 1.')
    mu = sample_mean(x=x)
    return sum((this_x - mu)**2 for this_x in x) / (n - 1)
def recursive_rolling_variance(var_previous, xbar_previous, new_obs, lost_obs, n=settings.DEFAULT_ANALYSIS_PERIOD):
    # Recursively update a rolling sample variance as the window of size n
    # slides forward one observation: `lost_obs` drops out, `new_obs` enters.
    # The mean must be updated first, since the recurrence needs both the
    # previous and the updated mean.
    xbar_new = recursive_rolling_mean(xbar_previous=xbar_previous, new_obs=new_obs,
                                      lost_obs=lost_obs, n=n)
    # NOTE(review): recurrence appears to follow from expanding
    # (n-1)*s^2 = sum(x_i^2) - n*xbar^2 before and after the swap — confirm
    # against a reference derivation before relying on it.
    var_new = var_previous + \
        (n/(n-1))*((new_obs**2 - lost_obs**2)/n + (xbar_previous**2-xbar_new**2))
    return var_new
def sample_covariance(x: list, y: list):
    """
    Returns the sample covariance of a paired sample (*x*, *y*), computed with the unbiased (n-1) estimator.

    Parameters
    ----------
    1. **x**: ``list``
        The *x* sample of paired data (*x*, *y*). Must preserve order with **y**.
    2. **y**: ``list``
        The *y* sample of paired data (*x*, *y*). Must preserve order with **x**.

    Raises
    ------
    1. `scrilla.errors.SampleSizeError`
        If ``len(x) != len(y)`` (samples of incomparable length) or ``len(x) in [0,1]`` (insufficient data/degrees of freedom), this error will be thrown.
    """
    if len(x) != len(y):
        raise errors.SampleSizeError('Samples are not of comparable length')
    if len(x) in [0, 1]:
        # error message previously said "Sample correlation"; corrected to
        # name this estimator
        raise errors.SampleSizeError(
            'Sample covariance cannot be computed for a sample size less than or equal to 1.')
    n = len(x)
    x_mean, y_mean = sample_mean(x=x), sample_mean(x=y)
    return sum((x_i - x_mean)*(y_i - y_mean) for x_i, y_i in zip(x, y)) / (n - 1)
def recursive_rolling_covariance(covar_previous: float, new_x_obs: float, lost_x_obs: float, previous_x_bar: float, new_y_obs: float, lost_y_obs: float, previous_y_bar: float, n: int = settings.DEFAULT_ANALYSIS_PERIOD):
    # Recursively update a rolling sample covariance as the window of size n
    # slides forward one observation: (lost_x_obs, lost_y_obs) drops out and
    # (new_x_obs, new_y_obs) enters. previous_x_bar / previous_y_bar are the
    # window means BEFORE the slide.
    new_sum_term = new_x_obs*new_y_obs - lost_x_obs*lost_y_obs
    xy_cross_term = previous_x_bar*(new_y_obs-lost_y_obs)
    yx_cross_term = previous_y_bar*(new_x_obs-lost_x_obs)
    # second-order correction for the means themselves shifting by
    # (new - lost)/n during the slide
    perturbation = (new_x_obs-lost_x_obs)*(new_y_obs-lost_y_obs) / n
    numerator = new_sum_term - xy_cross_term - yx_cross_term - perturbation
    # NOTE(review): recurrence appears to follow from expanding
    # (n-1)*cov = sum(x_i*y_i) - n*x_bar*y_bar before and after the swap —
    # confirm against a reference derivation before relying on it.
    covar_new = covar_previous + numerator / (n-1)
    return covar_new
def simple_regression_beta(x: List[float], y: List[float]):
    """
    Returns the slope estimate of a simple linear regression of *y* against *x*, computed as the sample correlation scaled by the ratio of sample volatilities.

    Parameters
    ----------
    1. **x**: ``list``
        The *x* sample of paired data (*x*, *y*). Must preserve order with **y**.
    2. **y**: ``list``
        The *y* sample of paired data (*x*, *y*). Must preserve order with **x**.

    Raises
    ------
    1. `scrilla.errors.SampleSizeError`
        If ``len(x) != len(y)`` (samples of incomparable length) or ``len(x) < 3`` (insufficient data/degrees of freedom), this error will be thrown.
    """
    if len(x) != len(y):
        raise errors.SampleSizeError(f'len(x) = {len(x)} != len(y) = {len(y)}')
    if len(x) < 3:
        raise errors.SampleSizeError(
            f'Sample size of {len(x)} is less than the necessary degrees of freedom (n > 2) for regression estimation.')
    # beta = correlation * (sigma_y / sigma_x), algebraically equivalent to
    # cov(x, y) / var(x)
    correl = sample_correlation(x=x, y=y)
    vol_x = sqrt(sample_variance(x=x))
    vol_y = sqrt(sample_variance(x=y))
    beta = correl * vol_y / vol_x
    return beta
def simple_regression_alpha(x: List[float], y: List[float]):
    r"""
    Returns the intercept estimate of a simple linear regression of *y* against *x*, i.e. \\(\alpha = \bar{y} - \beta \bar{x}\\).

    Parameters
    ----------
    1. **x**: ``list``
        The *x* sample of paired data (*x*, *y*). Must preserve order with **y**.
    2. **y**: ``list``
        The *y* sample of paired data (*x*, *y*). Must preserve order with **x**.

    Raises
    ------
    1. `scrilla.errors.SampleSizeError`
        If ``len(x) != len(y)`` (samples of incomparable length) or ``len(x) < 3`` (insufficient data/degrees of freedom), this error will be thrown.
    """
    if len(x) != len(y):
        raise errors.SampleSizeError(
            f'len(x) == {len(x)} != len(y) == {len(y)}')
    if len(x) < 3:
        raise errors.SampleSizeError(
            f'Sample size of {len(x)} is less than the necessary degrees of freedom (n > 2) for regression estimation.')
    slope = simple_regression_beta(x=x, y=y)
    return sample_mean(y) - slope * sample_mean(x)
def qq_series_for_sample(sample: List[float]) -> List[list]:
    """
    Calculates the QQ series for a sample of data, i.e. the set defined by the ordered pairs of theoretical normal percentiles and sample percentiles. A sample's normality can be assessed by how linear the resulting graph is.

    Parameters
    ----------
    1. **sample**: ``list``
        A sample of numerical data.
    """
    size = len(sample)
    series = []
    for rank in range(size):
        # midpoint percentile for the rank-th order statistic
        prob = (rank + 0.5) / size
        observed = sample_percentile(data=sample, percentile=prob)
        theoretical = norm.ppf(q=prob)
        series.append([theoretical, observed])
    return series
def standardize(x: List[float]):
    """
    Returns the sample transformed into z-scores: each observation is centered
    by the sample mean and scaled by the sample standard deviation.
    """
    location = sample_mean(x)
    scale = sqrt(sample_variance(x))
    return [(obs - location) / scale for obs in x]
Functions
def bivariate_normal_likelihood_function(params: list, data: list) ‑> float
-
Returns the likelihood of a vector of parameters being observed from a sample bivariate data of normal data. It can be used as objective function input for
scipy.optimize
's optimization methods.Parameters
- params :
list
Array representing a vector of parameters, in this case the mean rate of returns, voledatilities and covariance for a bivariate normal distribution. Important: The vector must be order: 1. params[0] = \(\mu_x\), params[1]=\(\mu_y\), params[2] = \(\sigma_x\), params[3] = \(\sigma_y\), params[4] = \(\rho_{xy} \cdot \sigma_x \cdot \sigma_y\). The matrix is parameterized in this manner in order to interface more easily withscipy.optimize.minimize
. - data :
list
A list of data that has been drawn from a bivariate normal population. Must be formatted in the following manner:[ [x1,y1], [x2,y2],…]
Notes
- the covariance matrix of a bivariate normal distribution must be positive semi-definite (PSD) and non-singular. PSD can be checked with the Slyvester Criterion or Cauchy-Schwarz Inequality. Since sample variance will always be positive, this reduces to checking the determinant of the covariance matrix is greater than 0. This function will return
numpy.inf
if the covariance matrix is singular or non-positive semi-definite.
Expand source code
def bivariate_normal_likelihood_function(params: list, data: list) -> float: r""" Returns the likelihood of a vector of parameters being observed from a sample bivariate data of normal data. It can be used as objective function input for `scipy.optimize`'s optimization methods. Parameters ---------- 1. **params** : ``list`` Array representing a vector of parameters, in this case the mean rate of returns, voledatilities and covariance for a bivariate normal distribution. *Important*: The vector must be order: 1. params[0] = \\(\mu_x\\), params[1]=\\(\mu_y\\), params[2] = \\(\sigma_x\\), params[3] = \\(\sigma_y\\), params[4] = \\(\rho_{xy} \cdot \sigma_x \cdot \sigma_y\\). The matrix is parameterized in this manner in order to interface more easily with `scipy.optimize.minimize`. 2. **data** : ``list`` A list of data that has been drawn from a bivariate normal population. Must be formatted in the following manner: `[ [x1,y1], [x2,y2],...]` .. notes:: * the covariance matrix of a bivariate normal distribution must be positive semi-definite (PSD) and non-singular. PSD can be checked with the [Slyvester Criterion](https://en.wikipedia.org/wiki/Sylvester%27s_criterion) or [Cauchy-Schwarz Inequality](https://en.wikipedia.org/wiki/Cauchy%E2%80%93Schwarz_inequality#Probability_theory). Since sample variance will always be positive, this reduces to checking the determinant of the covariance matrix is greater than 0. This function will return `numpy.inf` if the covariance matrix is singular or non-positive semi-definite. """ mean = [params[0], params[1]] cov = [[params[2], params[4]], [params[4], params[3]]] determinant = params[2]*params[3] - params[4]**2 if determinant == 0 or determinant < 0 or determinant < (10**(-constants['ACCURACY'])): return inf likelihood = 0 for point in data: likelihood += multivariate_normal.logpdf(x=point, mean=mean, cov=cov) return likelihood
- params :
def empirical_copula(sample: List[List[float]], x_order: float, y_order: float)
-
Computes an empirical estimate of the copula distribution for a bivariate sample, i.e.
$$ C(u_{1}, u_{2}) = F_{c}(\textbf{X}_{1} < F_x^{-1}(u_{1}), \textbf{X}_{2} < F_y^{-1}(u_{2})) $$
Using the empirical estimate defined by,
$$ C_{n} = \frac{\#\{ (x,y) \,\vert\, x \leq x_{p} \text{ and } y \leq y_{p} \}}{n} $$
where x_p and y_p are the p-th percentiles of their respective univariate samples.
Expand source code
def empirical_copula(sample: List[List[float]], x_order: float, y_order: float): """ Computes an empirical estimate of the copula distribution for a bivariate sample, i.e. $$ C(u_{1}, u_{2}) = F_{c} (\textbf{X_{1}}<F_x^{-1}(u_{1}),\textbf{X_{2}}<F_x^{-1}(u_{2})) $$ Using the empirical estimate defined by, $$ C_{n} = \frac{\{ (x,y) \vert x \leq x_{p} & y \leq y_{p} \}}{n}$$ where \\(x_p\\) and \\(y_p\\) are the *p*-th percentiles of their respective univariate samples. """ n = len(sample) def x_order_bounds(test_point): return test_point < x_order or test_point == x_order def y_order_bounds(test_point): return test_point < y_order or test_point == y_order copula = [1 for point in sample if x_order_bounds( point[0]) and y_order_bounds(point[1])] return len(copula) / n
def qq_series_for_sample(sample: List[float]) ‑> List[list]
-
Calculates the QQ series for a sample of data, i.e. the set defined by the ordered pair of sample percentiles and theoretical normal percentiles. A sample's normality can be assessed by how linear the result graph is.
Parameters
- sample:
list
A sample of numerical data.
Expand source code
def qq_series_for_sample(sample: List[float]) -> List[list]: """ Calculates the QQ series for a sample of data, i.e. the set defined by the ordered pair of sample percentiles and theoretical normal percentiles. A sample's normality can be assessed by how linear the result graph is. Parameters ---------- 1. **sample**: ``list`` A sample of numerical data. """ qq_series = [] n = len(sample) for i in range(len(sample)): percentile = (i + 0.5)/n percentile_sample = sample_percentile( data=sample, percentile=percentile) percentile_norm = norm.ppf(q=percentile) qq_series += [[percentile_norm, percentile_sample]] return qq_series
- sample:
def recursive_rolling_correlation(correl_previous, new_x_observation, lost_x_obs, new_y_obs, lost_y_obs, n=100)
-
Expand source code
def recursive_rolling_correlation(correl_previous, new_x_observation, lost_x_obs, new_y_obs, lost_y_obs, n=settings.DEFAULT_ANALYSIS_PERIOD): pass
def recursive_rolling_covariance(covar_previous: float, new_x_obs: float, lost_x_obs: float, previous_x_bar: float, new_y_obs: float, lost_y_obs: float, previous_y_bar: float, n: int = 100)
-
Expand source code
def recursive_rolling_covariance(covar_previous: float, new_x_obs: float, lost_x_obs: float, previous_x_bar: float, new_y_obs: float, lost_y_obs: float, previous_y_bar: float, n: int = settings.DEFAULT_ANALYSIS_PERIOD): new_sum_term = new_x_obs*new_y_obs - lost_x_obs*lost_y_obs xy_cross_term = previous_x_bar*(new_y_obs-lost_y_obs) yx_cross_term = previous_y_bar*(new_x_obs-lost_x_obs) perturbation = (new_x_obs-lost_x_obs)*(new_y_obs-lost_y_obs) / n numerator = new_sum_term - xy_cross_term - yx_cross_term - perturbation covar_new = covar_previous + numerator / (n-1) return covar_new
def recursive_rolling_mean(xbar_previous, new_obs, lost_obs, n=100)
-
Expand source code
def recursive_rolling_mean(xbar_previous, new_obs, lost_obs, n=settings.DEFAULT_ANALYSIS_PERIOD): xbar_next = xbar_previous + (new_obs - lost_obs)/n return xbar_next
def recursive_rolling_variance(var_previous, xbar_previous, new_obs, lost_obs, n=100)
-
Expand source code
def recursive_rolling_variance(var_previous, xbar_previous, new_obs, lost_obs, n=settings.DEFAULT_ANALYSIS_PERIOD): xbar_new = recursive_rolling_mean(xbar_previous=xbar_previous, new_obs=new_obs, lost_obs=lost_obs, n=n) var_new = var_previous + \ (n/(n-1))*((new_obs**2 - lost_obs**2)/n + (xbar_previous**2-xbar_new**2)) return var_new
def sample_correlation(x: List[float], y: List[float])
-
Returns the sample correlation calculated using the Pearson correlation coefficient estimator,
TODO
Pearson coefficient formula here
Parameters
- x:
list
The x sample of paired data (x, y). Must preserve order with y. - y:
list
The y sample of paired data (x, y). Must preserve order with x.
Raises
SampleSizeError
: If the sample sizes do not meet the requirements for estimation, this error will be thrown.- ValueError : If the denominator of the correlation coefficient becomes too small for floating point arithmetic, this error is thrown.
.. todos :: * Possibly wrap the correlation coefficient numerator and denominator in
Decimal
class before calculation to bypass the ValueError that occurs in some samples where the denominator is too small for the arithmetic to detect.Expand source code
def sample_correlation(x: List[float], y: List[float]): """ Returns the sample correlation calculated using the Pearson correlation coefficient estimator, .. todo:: Pearson coefficient formula here Parameters ---------- 1. **x**: ``list`` The *x* sample of paired data (*x*, *y*). Must preserve order with **y**. 2. **y**: ``list`` The *y* sample of paired data (*x*, *y*). Must preserve order with **x**. Raises ------ 1. `scrilla.errors.SampleSizeError` : If the sample sizes do not meet the requirements for estimation, this error will be thrown. 2. **ValueError** : If the denominator of the correlation coefficient becomes too small for floating point arithmetic, this error is thrown. .. todos :: * Possibly wrap the correlation coefficient numerator and denominator in `Decimal` class before calculation to bypass the **ValueError** that occurs in some samples where the denominator is too small for the arithmetic to detect. """ if len(x) != len(y): raise errors.SampleSizeError('Samples are not of comparable lengths') if len(x) in [0, 1]: raise errors.SampleSizeError( 'Sample correlation cannot be computed for a sample size less than or equal to 1.') sumproduct, sum_x_squared, sum_x, sum_y, sum_y_squared = 0, 0, 0, 0, 0 n = len(x) for i, item in enumerate(x): sumproduct += item*y[i] sum_x += item sum_x_squared += item**2 sum_y += y[i] sum_y_squared += y[i]**2 correl_num = ((n*sumproduct) - sum_x*sum_y) correl_den = sqrt((n*sum_x_squared-sum_x**2)*(n*sum_y_squared-sum_y**2)) # LET'S DO SOME MATHEMATICS! (to get around division by zero!) # Unfortunately, this only works when A and B > 0 because log # of a negative number only exists in complex plane. # 1. correl = A/B # 2. log(correl) = log(A/B) = log(A) - log(B) # 3. exp(log(correl)) = exp(log(A/B)) # 4. 
correl = exp(log(A/B)) if correl_num > 0 and correl_den > 0: log_correl = log(correl_num) - log(correl_den) correlation = exp(log_correl) else: if correl_den != 0: correlation = correl_num / correl_den else: raise ValueError( 'Denominator for correlation formula to small for division') return correlation
- x:
def sample_covariance(x: list, y: list)
-
Parameters
- x:
list
The x sample of paired data (x, y). Must preserve order with y. - y:
list
The y sample of paired data (x, y). Must preserve order with x.
Raises
SampleSizeError
Iflen(x) != len(y)
(samples of incomparable length) orlen(x) in [0,1]
(insufficient data/degrees of freedom), this error will be thrown.
Expand source code
def sample_covariance(x: list, y: list): """ Parameters ---------- 1. **x**: ``list`` The *x* sample of paired data (*x*, *y*). Must preserve order with **y**. 2. **y**: ``list`` The *y* sample of paired data (*x*, *y*). Must preserve order with **x**. Raises ------ 1. `scrilla.errors.SampleSizeError` If ``len(x) != len(y)`` (samples of incomparable length) or ``len(x) in [0,1]`` (insufficient data/degrees of freedom), this error will be thrown. """ if len(x) != len(y): raise errors.SampleSizeError('Samples are not of comparable length') if len(x) in [0, 1]: raise errors.SampleSizeError( 'Sample correlation cannot be computed for a sample size less than or equal to 1.') n, covariance = len(x), 0 x_mean, y_mean = sample_mean(x=x), sample_mean(x=y) for i, item in enumerate(x): covariance += (item - x_mean)*(y[i] - y_mean) / (n - 1) return covariance
- x:
def sample_mean(x: List[float]) ‑> float
-
Returns the sample mean from a sample of data \({x_1 , x_2, … , x_n }\),
$$ \bar{x} = \frac{\sum_{i=1}^{n} x_i}{n} $$
Parameters
- x:
List[Union[float,int]]
List containing a sample of numerical data.
Raises
- scrilla.errors.SampleSizeError
If
len(x)==0
, this error will be thrown. - ValueError If the sample contains null or non-numerical data, this error will be thrown.
Expand source code
def sample_mean(x: List[float]) -> float: r""" Returns the sample mean from a sample of data \\(\{x_1 , x_2, ... , x_n \}\\), $$ \bar{x} = \frac{\sum_{i=1}^{n} x_i}/{n} $$ Parameters ---------- 1. **x**: ``List[Union[float,int]]`` List containing a sample of numerical data. Raises ------ 1. **scrilla.errors.SampleSizeError** If ``len(x)==0``, this error will be thrown. 2. **ValueError** If the sample contains null or non-numerical data, this error will be thrown. """ xbar, n = 0, len(x) if not all(this_x is not None and isinstance(this_x, (float, int)) for this_x in x): raise ValueError( 'Sample contains null values') if n == 0: raise errors.SampleSizeError( 'Sample mean cannot be computed for a sample size of 0.') for i in x: xbar += i/n return xbar
- x:
def sample_percentile(data: List[float], percentile: float)
-
Returns the observation in a sample data corresponding to the given percentile, i.e. the observation from a sorted sample where the percentage of the observations below that point is specified by the percentile. If the percentile falls between data points, the observation is smoothed based on the distance from the adjoining observations in the following manner,
TODO
add latex here
Parameters
- data :
list
Array representing the set of data whose percentile is to be calculated. - percentile:
float
The percentile corresponding to the desired observation.
Expand source code
def sample_percentile(data: List[float], percentile: float): """ Returns the observation in a sample data corresponding to the given percentile, i.e. the observation from a sorted sample where the percentage of the observations below that point is specified by the percentile. If the percentile falls between data points, the observation is smoothed based on the distance from the adjoining observations in the following manner, .. todo:: add latex here Parameters ---------- 1. **data** : ``list`` Array representing the set of data whose percentile is to be calculated. 2. **percentile**: ``float`` The percentile corresponding to the desired observation. """ data.sort() obs_number = (len(data) + 1)*percentile extrapolate = obs_number - int(obs_number) if extrapolate == 0: return data[int(obs_number)-1] if obs_number > len(data): return data[-1] first_index = int(obs_number) - 1 second_index = first_index + 1 weight = obs_number - int(obs_number) return (1-weight)*data[first_index] + weight*data[second_index]
- data :
def sample_variance(x: List[float])
-
Returns the sample variance from a sample of data \({x_1 , x_2, … , x_n }\),
$$ s^2 = \frac{\sum_{i=1}^{n} (x_i - \bar{x})^2}{n-1} $$
Parameters
- x:
list
List containing a sample of numerical data.
Raises
Expand source code
def sample_variance(x: List[float]): r""" Returns the sample variance from a sample of data \\(\{x_1 , x_2, ... , x_n \}\\), $$ s^2=\frac{\sum_{i=1}^{n} (x_i - \bar{x})^2}/{n-1} $$ Parameters ---------- 1. **x**: ``list`` List containing a sample of numerical data. Raises ------ 1. `scrilla.errors.SampleSizeError` """ mu, sigma, n = sample_mean(x=x), 0, len(x) if not all(this_x is not None and isinstance(this_x, (float, int)) for this_x in x): raise ValueError( 'Sample contains null values') if n in [0, 1]: raise errors.SampleSizeError( 'Sample variance cannot be computed for a sample size less than or equal to 1.') for i in x: sigma += ((i-mu)**2)/(n-1) return sigma
- x:
def simple_regression_alpha(x: List[float], y: List[float])
-
Parameters
- x:
list
The x sample of paired data (x, y). Must preserve order with y. - y:
list
The y sample of paired data (x, y). Must preserve order with x.
Raises
SampleSizeError
Iflen(x) != len(y)
(samples of incomparable length) orlen(x) < 3
(insufficient data/degrees of freedom), this error will be thrown.
Expand source code
def simple_regression_alpha(x: List[float], y: List[float]): """ Parameters ---------- 1. **x**: ``list`` The *x* sample of paired data (*x*, *y*). Must preserve order with **y**. 2. **y**: ``list`` The *y* sample of paired data (*x*, *y*). Must preserve order with **x**. Raises ------ 1. `scrilla.errors.SampleSizeError` If ``len(x) != len(y)`` (samples of incomparable length) or ``len(x) < 3`` (insufficient data/degrees of freedom), this error will be thrown. """ if len(x) != len(y): raise errors.SampleSizeError( f'len(x) == {len(x)} != len(y) == {len(y)}') if len(x) < 3: raise errors.SampleSizeError( f'Sample size of {len(x)} is less than the necessary degrees of freedom (n > 2) for regression estimation.') y_mean, x_mean = sample_mean(y), sample_mean(x) alpha = y_mean - simple_regression_beta(x=x, y=y)*x_mean return alpha
- x:
def simple_regression_beta(x: List[float], y: List[float])
-
Parameters
- x:
list
The x sample of paired data (x, y). Must preserve order with y. - y:
list
The y sample of paired data (x, y). Must preserve order with x.
Raises
scrilla.errors.statistics.SampleSizeError
Iflen(x) != len(y)
(samples of incomparable length) orlen(x) < 3
(insufficient data/degrees of freedom), this error will be thrown.
Expand source code
def simple_regression_beta(x: List[float], y: List[float]): """ Parameters ---------- 1. **x**: ``list`` The *x* sample of paired data (*x*, *y*). Must preserve order with **y**. 2. **y**: ``list`` The *y* sample of paired data (*x*, *y*). Must preserve order with **x**. Raises ------ 1. `scrilla.errors.statistics.SampleSizeError` If ``len(x) != len(y)`` (samples of incomparable length) or ``len(x) < 3`` (insufficient data/degrees of freedom), this error will be thrown. """ if len(x) != len(y): raise errors.SampleSizeError(f'len(x) = {len(x)} != len(y) = {len(y)}') if len(x) < 3: raise errors.SampleSizeError( f'Sample size of {len(x)} is less than the necessary degrees of freedom (n > 2) for regression estimation.') correl = sample_correlation(x=x, y=y) vol_x = sqrt(sample_variance(x=x)) vol_y = sqrt(sample_variance(x=y)) beta = correl * vol_y / vol_x return beta
- x:
def standardize(x: List[float])
-
Expand source code
def standardize(x: List[float]): mu = sample_mean(x) sigma = sqrt(sample_variance(x)) return [(this_x - mu)/sigma for this_x in x]
def univariate_normal_likelihood_function(params: list, data: list) ‑> float
-
This function returns the likelihood of a vector of parameters being observed from a sample univariate data of normal data. It can be used as objective function input for
scipy.optimize
's optimization methods.Parameters
- x :
list
Array representing a vector of parameters , in this case the mean rate of return and volatility from a sample of data. - data :
list
A list of data that has been drawn from a univariate normal population.
Expand source code
def univariate_normal_likelihood_function(params: list, data: list) -> float: """ This function returns the likelihood of a vector of parameters being observed from a sample univariate data of normal data. It can be used as objective function input for `scipy.optimize`'s optimization methods. Parameters ---------- 1. **x** : ``list`` Array representing a vector of parameters , in this case the mean rate of return and volatility from a sample of data. 2. **data** : ``list`` A list of data that has been drawn from a univariate normal population. """ likelihood = 0 for point in data: likelihood += norm.logpdf(x=point, loc=params[0], scale=params[1]) return likelihood
- x :