import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.dpi'] = 100
plt.rcParams['figure.figsize'] = [10,5]
plt.rcParams['axes.grid'] = True
import pandas as pd
import pandas_datareader.data as web
import datetime
import pprint as pp
import scipy as sc
import scipy.linalg as lg # SciPy Linear Algebra Library
import scipy.sparse as sp
import scipy.sparse.linalg as sl
from IPython.display import HTML
from matplotlib import animation, rc
"Oh, honestly, Ron. If you ask me, Divination's a woolly discipline."
def l(n, x, Xn):
    """n-th Lagrange basis polynomial for the nodes Xn, evaluated at x."""
    res = 1
    for k in range(len(Xn)):
        if k != n:
            res *= (x - Xn[k])/(Xn[n] - Xn[k])
    return res
def show_interpolation_example(F, N=11, my_ylims=()):
    x = np.linspace(-1, 1.5, 1000)   # dense grid for plotting
    X = np.linspace(-1, 1, N)        # collocation points
    Y = F(X)
    y = F(x)
    res = 0*x
    for n in range(N):
        res += l(n, x, X)*Y[n]       # Lagrange interpolant
    plt.grid(True)
    plt.xlabel("$x$")
    plt.ylabel("$y$")
    plt.ylim(*my_ylims)
    plt.plot(x, y)
    plt.plot(x[x <= 1], res[x <= 1])
    plt.plot(X, Y, "o")
    plt.plot(x[x > 1], res[x > 1], linestyle='dashed')
    plt.legend(['$f(x)$', '$f_{aprx}(x)$', 'col. p.', 'extrapolation'])
    plt.show()
    return {'max_error': np.max(np.abs(y - res))}
def f(x): return 1/(25*x**2+1)
plt.title("Lagrange interpolation")
show_interpolation_example(f, N=11, my_ylims=(-6, 2))
{'max_error': 3813.8276479129663}
def f(x): return 1/(25*(x+1.2)**2+1)
plt.title("Lagrange interpolation")
show_interpolation_example(f, N=11)
{'max_error': 0.41060566040601904}
def f(x): return 1/(25*(x+1.2)**2+1) + 1e-2*np.random.random(len(x))
plt.title("Lagrange interpolation with noisy data")
show_interpolation_example(f, N=11, my_ylims=(-0.5, 0.5))
{'max_error': 112.34458879852396}
One-dimensional case: we have a single time-dependent variable $X_t$ (stock prices).
If the returns
$$R_t=\frac{X_t-X_{t-1}}{X_{t-1}}$$are stationary with mean $\bar R$ and standard deviation $\sigma$,
then we can generate the series into the future. Let $t=0$ denote the end of the recorded data and the beginning of the generated data; then we can expect
$$E(X_{t>0}) = X_0 + \bar R t$$with uncertainty
$$\sqrt{\textrm{var}(X_{t>0})} = \sigma\sqrt{t}$$Or, in the simplest scheme for a small time step $dt$,
$$X_{t+dt} = X_t+\bar R\, dt + \sigma\sqrt{dt}\,\epsilon_t$$where $\epsilon_t$ is a random number drawn from a (presumably normal) distribution.
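A minimal sketch of this scheme (the starting price, drift and volatility below are made-up numbers used only for illustration, and $\epsilon_t$ is taken as standard normal):

X0, R_bar, sigma = 100.0, 0.0005, 0.01      # assumed start price, mean return, volatility per step
dt, nsteps, npaths = 1.0, 250, 20

rng = np.random.default_rng(0)
eps = rng.normal(size=(npaths, nsteps))           # epsilon_t ~ N(0, 1)
increments = R_bar*dt + sigma*np.sqrt(dt)*eps     # X_{t+dt} - X_t, as in the formula above
Xpaths = X0 + np.cumsum(increments, axis=1)       # simulated paths

plt.plot(Xpaths.T, alpha=0.5)
plt.xlabel('$t$')
plt.ylabel('$X_t$')
plt.show()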
What about multidimensional data, when $X$ and $R$ are vectors, as we have for a portfolio?
Instead of a variance we have a covariance matrix
$$\Sigma_{ij}=\Sigma_{ji} = E\left[(R_i-E(R_i))\left(R_j-E(R_j)\right)\right]$$which is positive semi-definite: $x^T\Sigma x\geq 0$ for all real vectors $x$.
We need to generate numbers which are correlated.
Suppose that a vector $Y$ has elements which are independent and drawn from a normal distribution with $E[Y]=0$ and
$$ E\left[Y_iY_j\right] = \delta_{ij}$$For the transformed variable $R' = L Y$ we have
$$E\left[R'_iR'_j\right]=E\left[L_{ik}Y_kL_{jl}Y_l\right]=L_{ik}L_{jl}E\left[Y_kY_l\right] = L_{ik}L_{jl}\delta_{kl}=L_{ik}L_{jk}$$or, in matrix notation,
$$\Sigma=LL^T$$In order to generate correlated data it is therefore enough to find such an $L$ (e.g. from the Cholesky decomposition of $\Sigma$) and set $R' = LY$.
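A short sketch of this recipe; the 3-asset covariance matrix and mean returns below are invented purely for illustration:

Sigma = np.array([[0.04, 0.01, 0.00],
                  [0.01, 0.09, 0.02],
                  [0.00, 0.02, 0.16]])     # illustrative covariance matrix of returns
mean_R = np.array([0.001, 0.002, 0.0015])  # assumed mean returns

L = np.linalg.cholesky(Sigma)              # Sigma = L @ L.T
rng = np.random.default_rng(1)
Y = rng.standard_normal(size=(3, 10000))   # independent N(0, 1) samples
R = mean_R[:, None] + L @ Y                # correlated returns R' = L Y (plus the mean)

print(np.cov(R))                           # should be close to Sigma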
A similar but computationally more expensive method is to construct an eigenportfolio. Diagonalizing $\Sigma$ we obtain
$$\Sigma = O^T\Sigma'O\qquad \text{where}\qquad O^TO=OO^T=\mathbb{1}$$and $\Sigma'$ is a diagonal matrix with the eigenvalues (eigen-variances) on the diagonal.
The vector $R$ can be decomposed as
$$R=\sum_{i}\alpha_iv_i$$Since $v_i^Tv_j=\delta_{ij}$, there is no correlation between the components along $v_i$ and $v_j$ for $i\neq j$, so the coefficients $\alpha_i$ can be drawn independently (with variances equal to the eigenvalues).
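A sketch of the eigen-decomposition variant, reusing the same hypothetical covariance matrix as above:

Sigma = np.array([[0.04, 0.01, 0.00],
                  [0.01, 0.09, 0.02],
                  [0.00, 0.02, 0.16]])          # same illustrative matrix as above

eigvals, V = np.linalg.eigh(Sigma)              # columns of V are the eigenvectors v_i
rng = np.random.default_rng(2)
# draw alpha_i independently with variance equal to the eigenvalue, then rotate back
alpha = rng.standard_normal(size=(3, 10000)) * np.sqrt(eigvals)[:, None]
R = V @ alpha                                   # R = sum_i alpha_i v_i

print(np.cov(R))                                # again close to Sigma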
In general we have some function
$$y_{i}=\beta _{1}x_{i1}+\beta _{2}x_{i2}+\cdots +\beta _{p}x_{ip}+\epsilon _{i}$$or in a matrix form
$$y=X\beta +\epsilon$$where the $\beta_p$ are unknown parameters and the $x_{ip}$ are observed quantities (not necessarily raw values; they can be functions of some independent variables, weighted values, etc.)
Mathematical properties are well understood only for functions linear in $\beta$.
But still there are a lot of questions that need to be answered:
How to find the parameters:
Two uses of regression models: prediction (forecasting new values of $y$) and inference (explaining the relation between $y$ and the regressors).
x = np.linspace(-3,3, 1000)
plt.plot(x, np.exp(-(x)**2/0.7), label = 'more accurate')
plt.plot(x, np.exp(-(x-0.2)**2/0.01)*1.2, label='biased but better predictions')
plt.axvline(x=0, color='black', linestyle = 'dashed')
plt.legend()
OLS is BLUE (Best Linear Unbiased Estimator)
Usually we look for the minimum of the $\chi^2$ cost function
$$\chi^2(\beta) = ||y-X\beta||^2=\sum_i\epsilon_i^2$$Usually $X$ is an $N\times M$ matrix with $N>M$.
Why not $||\cdot||=\min$ (MAD estimator), or $||\cdot||^7=\min$, or $\log||\cdot||=\min$?
$||\cdot||^2$ is BLUE
OLS is BLUE if (Gauss-Markov assumptions) the errors have zero mean, equal variance (homoskedasticity) and are mutually uncorrelated.
OLS may work badly if one of these assumptions is not met.
def homoskedasticity():
    plt.figure(figsize=(10, 3))
    plt.subplot(1, 2, 1)
    x = np.linspace(0, 10, 1000)
    y = 10 + 0.4*x
    sig = 1 + 0.1*x**2              # noise level growing with x
    y1 = y + sig
    y2 = y - sig
    plt.title('scale effect')
    plt.plot(x, y, 'black', linestyle='dashed')
    plt.plot(x, y1, 'blue')
    plt.plot(x, y2, 'blue')
    plt.fill_between(x, y1, y2, alpha=0.25)
    plt.xlabel('$x$')
    plt.ylabel('$y$')
    plt.subplot(1, 2, 2)
    sig = 5*np.exp(-0.2*x)          # noise level decaying with x
    y1 = y + sig
    y2 = y - sig
    plt.title('learning effect')
    plt.plot(x, y, 'black', linestyle='dashed')
    plt.plot(x, y1, 'green')
    plt.plot(x, y2, 'green')
    plt.fill_between(x, y1, y2, alpha=0.15, color='green')
    plt.xlabel('$x$')
    plt.ylabel('$y$')
    plt.plot()

homoskedasticity()
In that case weight each residual by $1/\sigma_i$ (weighted least squares).
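A minimal weighted-least-squares sketch using statsmodels; the heteroskedastic noise model below is made up, and note that `sm.WLS` expects weights proportional to $1/\sigma_i^2$:

import statsmodels.api as sm

rng = np.random.default_rng(3)
xw = np.linspace(0, 10, 500)
sigma_w = 1 + 0.1*xw**2                            # assumed noise level (the 'scale effect' above)
yw = 10 + 0.4*xw + rng.normal(scale=sigma_w)       # heteroskedastic data

Xw = sm.add_constant(xw)                           # design matrix [1, x]
ols_fit = sm.OLS(yw, Xw).fit()
wls_fit = sm.WLS(yw, Xw, weights=1/sigma_w**2).fit()   # inverse-variance weights
print(ols_fit.params, wls_fit.params)              # WLS parameter errors are usually tighter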
def error_correlation():
    plt.figure(figsize=(10, 3))
    plt.subplot(1, 2, 1)
    x = np.linspace(0, 10, 1000)
    y = 10 + 0.4*x
    plt.plot(x, y, 'black', linestyle='dashed')
    x1 = np.linspace(0, 10, 50)
    errors = np.random.normal(size=len(x1), scale=0.2)
    y1 = 10 + 0.4*x1 + errors
    plt.scatter(x1, y1, s=10)
    plt.title('uncorrelated errors')
    plt.xlabel('$x$')
    plt.ylabel('$y$')
    plt.subplot(1, 2, 2)
    plt.plot(x, y, 'black', linestyle='dashed')
    x1 = np.linspace(0, 10, 50)
    # AR(1)-like errors: each error remembers the previous one
    errors = np.zeros_like(x1)
    errors[0] = np.random.normal(scale=1)
    for n in range(1, 50):
        errors[n] = 0.8*errors[n-1] + np.random.normal(scale=0.1)
    y1 = 10 + 0.4*x1 + errors
    plt.scatter(x1, y1, color='red', s=10)
    plt.title('correlated errors')
    plt.xlabel('$x$')
    plt.ylabel('$y$')
    plt.plot()

error_correlation()
Suppose we have a model
$$y_i = Ax_i + \epsilon_i$$
while the actual relation is $y=Ax+B$ (the intercept is omitted from the model).
import statsmodels.api as sm
""" Preparing some example data """
nsample = 10000
x = np.linspace(0, 2, nsample)
y_exact = 0.31416 + 0.27182*x
y = y_exact + np.random.normal(size=len(x), scale=0.15)
""" Preparing toregression """
X = np.ones_like(x)
X = np.column_stack((X, x)) # Two vectors X ~ [1, x]
model = sm.OLS(y, X)
results = model.fit()
results.summary()
Dep. Variable: | y | R-squared: | 0.530 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.530 |
Method: | Least Squares | F-statistic: | 1.127e+04 |
Date: | Thu, 23 Jan 2020 | Prob (F-statistic): | 0.00 |
Time: | 19:40:11 | Log-Likelihood: | 4801.4 |
No. Observations: | 10000 | AIC: | -9599. |
Df Residuals: | 9998 | BIC: | -9584. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 0.3099 | 0.003 | 103.488 | 0.000 | 0.304 | 0.316 |
x1 | 0.2753 | 0.003 | 106.161 | 0.000 | 0.270 | 0.280 |
Omnibus: | 1.377 | Durbin-Watson: | 1.996 |
---|---|---|---|
Prob(Omnibus): | 0.502 | Jarque-Bera (JB): | 1.342 |
Skew: | 0.018 | Prob(JB): | 0.511 |
Kurtosis: | 3.044 | Cond. No. | 3.78 |
def show_reg1(x, y, y_exact, y_fit):
    plt.figure(figsize=(10, 6))
    plt.subplot(2, 2, 3)
    plt.errorbar(x, y_exact, [0*x, y-y_exact], alpha=0.1, color='orange')
    plt.plot(x, y_exact, 'black')
    plt.plot(x, y_fit, 'red', ls='dashed')
    plt.subplot(2, 2, (1, 2))
    plt.errorbar(x, 0*x, [0*x, y-y_fit], alpha=0.1, color='green')   # residuals
    plt.subplot(2, 2, 4)
    plt.hist(y-y_fit, np.linspace(-1, 1, 50), density=1)             # residual histogram
𝛽 = results.params
# y_fit = 𝛽[0]*X[:, 0] + 𝛽[1]*X[:, 1] # or better
y_fit = np.dot(X, 𝛽)
show_reg1(x, y, y_exact, y_fit)
X2 = x # only one vector X2 = [x]
model2 = sm.OLS(y, X2)
results2 = model2.fit()
results2.summary()
Dep. Variable: | y | R-squared (uncentered): | 0.881 |
---|---|---|---|
Model: | OLS | Adj. R-squared (uncentered): | 0.881 |
Method: | Least Squares | F-statistic: | 7.402e+04 |
Date: | Thu, 23 Jan 2020 | Prob (F-statistic): | 0.00 |
Time: | 19:40:12 | Log-Likelihood: | 1160.7 |
No. Observations: | 10000 | AIC: | -2319. |
Df Residuals: | 9999 | BIC: | -2312. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
x1 | 0.5077 | 0.002 | 272.070 | 0.000 | 0.504 | 0.511 |
Omnibus: | 35.515 | Durbin-Watson: | 0.964 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 27.874 |
Skew: | -0.036 | Prob(JB): | 8.85e-07 |
Kurtosis: | 2.752 | Cond. No. | 1.00 |
b2 = results2.params
y_fit2 = b2[0]*X2
show_reg1(x, y, y_exact, y_fit2)
X3 = np.ones_like(x) # only one vector X3 = [1]
model3 = sm.OLS(y, X3)
results3 = model3.fit()
b3 = results3.params
y_fit3 = b3[0]*X3
results3.summary()
Dep. Variable: | y | R-squared: | 0.000 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.000 |
Method: | Least Squares | F-statistic: | inf |
Date: | Thu, 23 Jan 2020 | Prob (F-statistic): | nan |
Time: | 19:40:42 | Log-Likelihood: | 1027.3 |
No. Observations: | 10000 | AIC: | -2053. |
Df Residuals: | 9999 | BIC: | -2045. |
Df Model: | 0 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 0.5851 | 0.002 | 267.972 | 0.000 | 0.581 | 0.589 |
Omnibus: | 66.201 | Durbin-Watson: | 0.938 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 45.852 |
Skew: | 0.003 | Prob(JB): | 1.11e-10 |
Kurtosis: | 2.668 | Cond. No. | 1.00 |
show_reg1(x, y, y_exact, y_fit3)
X4 = np.column_stack((X, x**2)) # Overfitting with three vectors X ~ [1, x, x**2]
model4 = sm.OLS(y, X4)
results4 = model4.fit()
#results4.summary()
Finding $\beta$'s $$\chi^2(\beta) = \beta^TX^TX\beta - 2\beta^TX^Ty +y^Ty$$
$$\frac{\partial \chi^2}{\partial \beta}=2X^TX\beta - 2X^Ty = 0$$The best-fitting $\beta$ is
$$\beta = (X^TX)^{-1}X^Ty$$If $X^TX$ is singular or nearly singular, an SVD decomposition should be used instead.
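A sketch of solving the normal equations directly for the example design matrix `X` and data `y` built above (in practice `np.linalg.lstsq` is preferable to forming $(X^TX)^{-1}$ explicitly):

# beta = (X^T X)^{-1} X^T y, written as a linear solve instead of an explicit inverse
beta_normal = np.linalg.solve(X.T @ X, X.T @ y)

# the numerically safer route (uses the SVD internally)
beta_lstsq, *_ = np.linalg.lstsq(X, y, rcond=None)

print(beta_normal, beta_lstsq)     # both should agree with results.params above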
SVD decomposition:
$$X = U\Sigma V^*$$where $U$ and $V$ are unitary matrices and $\Sigma$ is a (rectangular) diagonal matrix with non-negative singular values on the diagonal.
Zero diagonal elements of $\Sigma$ correspond to the null space of $X$.
Pseudo-inverse
$$X^+ = V\Sigma ^+U^*$$$$\left.XX^+ = U\Sigma V^* V\Sigma ^+U^* = U\Sigma\Sigma ^+U^* =\mathbb{1}\right|_{\text{range}}$$if $$\Sigma = \begin{pmatrix} \lambda_1 \\ & \lambda_2 \\ & & 0 \\ & & & \ddots \\ \end{pmatrix}\quad\textrm{then}\quad \Sigma^+ = \begin{pmatrix} \lambda_1^{-1} \\ & \lambda_2^{-1} \\ & & 0 \\ & & & \ddots \\ \end{pmatrix}$$
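The same fit written via the pseudo-inverse, as a short sketch with NumPy (equivalent to the SVD route above):

X_pinv = np.linalg.pinv(X)     # computed from the SVD; tiny singular values are cut off
beta_pinv = X_pinv @ y
print(beta_pinv)               # again should match results.params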
In principle we could also minimize
$$F(\beta) = ||y-f(x; \beta)||^2$$where $f$ is an arbitrary (even nonlinear) function of $\beta$. But then the solution may be neither unique nor easy to find.
The errors of $\beta$ may also have poor statistical properties.
Example: $$f(x;y_0, A, x_0, \sigma) = y_0+Ae^{-\frac{(x-x_0)^2}{2\sigma^2}}$$
The above function cannot be written in terms of linear regression.
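A sketch of fitting this Gaussian model with `scipy.optimize.curve_fit`; the synthetic data and starting values are invented for illustration, and nonlinear fits generally need a sensible initial guess:

from scipy.optimize import curve_fit

def gauss(x, y0, A, x0, sigma):
    return y0 + A*np.exp(-(x - x0)**2/(2*sigma**2))

rng = np.random.default_rng(4)
xg = np.linspace(-3, 3, 200)
yg = gauss(xg, 0.5, 2.0, 0.3, 0.8) + rng.normal(scale=0.05, size=xg.size)

p0 = [0, 1, 0, 1]                           # initial guess for (y0, A, x0, sigma)
popt, pcov = curve_fit(gauss, xg, yg, p0=p0)
print(popt)                                 # fitted parameters
print(np.sqrt(np.diag(pcov)))               # their (approximate) standard errors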
However, a sum of exponentials with unknown (possibly complex) frequencies
$$y(t_i) = \sum_n A_ne^{i \omega_n t_i}$$can be solved using linear algebra after finding the zeros of a certain polynomial. This works for uniform sampling $t_i = i\,dt$ (Prony's method).
For noisy data the original Prony's method performs badly, but using the SVD one can deal with the noise.
Applications:
Information criterion
$$AIC = 2k-2\log(L)$$where $L$ is the likelihood function and $k$ the number of parameters.
Relative likelihood:
$$r_{12} = \exp\left(\frac{AIC_1-AIC_2}{2}\right)\qquad AIC_1<AIC_2$$
idx = ['$\large \\beta_0 + \\beta_1x$', '$\large \\beta_0 x$', ' $\large \\beta_0 $', ' $\large \\beta_0 + \\beta_1x + \\beta_2 x^2$']
data = np.array([results.aic, results2.aic, results3.aic, results4.aic])
data = np.column_stack((data, np.exp(0.5*(data[0]-data))))
pd.DataFrame(data=data, index=idx, columns=['AIC', 'rel. likelihood'])
AIC | rel. likelihood | |
---|---|---|
$\large \beta_0 + \beta_1x$ | -9598.749646 | 1.000000 |
$\large \beta_0 x$ | -2319.481421 | 0.000000 |
$\large \beta_0 $ | -2052.505505 | 0.000000 |
$\large \beta_0 + \beta_1x + \beta_2 x^2$ | -9602.665141 | 7.083357 |
The model $y=\beta_0+\beta_1x$ is roughly $10^{1600}$ times more likely than $y=\beta_0$ or $y=\beta_0x$,
while the relative likelihood of $y=\beta_0+\beta_1x+\beta_2x^2$ is of order one (the exact value depends on the random sample), so the extra quadratic term is not clearly justified.
$AR(p)$, an autoregressive model of order $p$, is a model in which the value at time $t$ is a linear function of the previous $p$ values:
$$y_t = c+\beta_1y_{t-1}+\beta_2y_{t-2} + \beta_3y_{t-3} + \cdots + \beta_py_{t-p} + \epsilon_t$$Often the backshift (lag) operator $B_sy_t = y_{t-s}$ is used; then the $AR(p)$ model can be written in a more concise form
$$B_0y_t=c+\sum_{i=1}^p\beta_iB_iy_t+\epsilon_t$$or even
$$\beta(B)y_t=c+\epsilon_t$$The last expression can be used to test whether the residuals have the appropriate statistical properties, i.e. whether $\epsilon_t$ is white noise.
def generate_AR_series(𝛽, c, scale=0.1, nsamples=200):
    y = np.empty(nsamples)
    y[0] = 0
    for n in range(1, nsamples):
        y[n] = c + 𝛽*y[n-1] + np.random.normal(scale=scale)
    return (np.arange(nsamples), y)
def show_AR_examples():
    for n, parameters in enumerate([(0, 0, 0.1), (0, 1, 0.1), (0.03, 1, 0.1), (0.1, 0.9, 0.)]):
        plt.subplot(2, 2, n+1)
        c, 𝛽, scale = parameters
        plt.plot(*generate_AR_series(𝛽, c, scale), f'C{n}',
                 label=f'$c$ = {c}, $\\beta_1=${𝛽}, $\\sigma={scale}$')
        plt.legend()

show_AR_examples()
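To recover the parameters from such a series one can fit an AR model back to it; a sketch using statsmodels (the `AutoReg` class is assumed to be available in the installed version):

from statsmodels.tsa.ar_model import AutoReg

# simulate an AR(1) series with known parameters and fit it back
_, y_ar = generate_AR_series(0.9, 0.1, scale=0.1, nsamples=2000)
ar_fit = AutoReg(y_ar, lags=1).fit()
print(ar_fit.params)     # should be close to [c, beta_1] = [0.1, 0.9]
print(ar_fit.aic)        # information criterion, as discussed above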
In the case of the $AR(1)$ model: for $|\beta_1|<1$ the series is stationary with mean $c/(1-\beta_1)$; for $\beta_1=1$ it is a random walk (with drift if $c\neq 0$); for $|\beta_1|>1$ it explodes.
In AR models any shock $\epsilon_t$ implicitly affects all the future values:
$$y_t = c+\beta_1y_{t-1}+\epsilon_t = c+\beta_1(c+\beta_1y_{t-2}+\color{red}{\epsilon_{t-1}})+\epsilon_t = \ldots = F(\epsilon_t, \epsilon_{t-1}, \ldots, \epsilon_0)$$Moving average ($MA(q)$) models, in contrast, limit the shocks to only the last $q$ steps.
def show_ma_example():
    nsamples = 20000 + 3
    eps = np.random.normal(size=nsamples)
    # MA(3) series: y_t = 1 + eps_t - 0.9 eps_{t-1} + 0 eps_{t-2} - 0.8 eps_{t-3}
    y = 1 + eps[3:] - 0.9*eps[2:-1] + 0.*eps[1:-2] - 0.8*eps[0:-3]
    n = np.arange(nsamples - 3)
    plt.subplot(2, 2, 1)
    plt.plot(n, y)
    for lag in range(1, 4):
        plt.subplot(2, 2, lag+1)
        plt.xlabel('n')
        plt.ylabel(f'n-{lag}')
        plt.scatter(y[0:-lag], y[lag:], s=10, alpha=0.01, color=f'C{lag}')

show_ma_example()
#np.cov()
A = np.array([y[0:-3], y[1:-2], y[2:-1], y[3:]])
np.corrcoef(A)
array([[1.        , 0.53033413, 0.53200663, 0.52811241],
       [0.53033413, 1.        , 0.53035447, 0.5322098 ],
       [0.53200663, 0.53035447, 1.        , 0.53044532],
       [0.52811241, 0.5322098 , 0.53044532, 1.        ]])
High-order $AR(p)$ models are rather impractical: it is difficult to obtain the parameters (the OLS fit loses many degrees of freedom), and their memory is still finite.
Koyck distributed lag model - an infinitely lagged model with geometrically decaying coefficients:
$$y_t = c+\beta_0\sum_k\lambda^kx_{t-k}+\epsilon_t$$We need to obtain only three parameters.
The model works for $0<\lambda<1$.
Writing the same equation for $y_{t-1}$, multiplying it by $\lambda$ and subtracting it from the equation for $y_t$, we obtain:
$$y_t=(1-\lambda)c + \beta_0x_t+\lambda y_{t-1}+\xi_t$$where $\xi_t=\epsilon_t-\lambda\epsilon_{t-1}$ is a composite error term.
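A sketch of estimating the transformed Koyck equation by OLS; the exogenous series and the true parameters below are invented for illustration, and note that the lagged $y_{t-1}$ regressor together with the MA(1) error $\xi_t$ formally violates the OLS assumptions, so this gives only a rough estimate:

import statsmodels.api as sm

rng = np.random.default_rng(5)
nsamples, c, beta0, lam = 5000, 1.0, 0.5, 0.7      # assumed true parameters

x_t = rng.normal(size=nsamples)                    # made-up exogenous variable
eps = rng.normal(scale=0.1, size=nsamples)
y_t = np.zeros(nsamples)
for t in range(1, nsamples):
    # transformed Koyck recursion: y_t = (1-lam)c + beta0 x_t + lam y_{t-1} + xi_t
    y_t[t] = (1 - lam)*c + beta0*x_t[t] + lam*y_t[t-1] + eps[t] - lam*eps[t-1]

# regress y_t on [1, x_t, y_{t-1}]
Xk = sm.add_constant(np.column_stack((x_t[1:], y_t[:-1])))
print(sm.OLS(y_t[1:], Xk).fit().params)            # roughly [(1-lam)c, beta0, lam]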
All these models have generalizations to the multivariate case, in which $y_i, x_i, \epsilon_i$ are vectors whose components can moreover be correlated, etc.
Nonlinear models also admit various kinds of combinations, such as $$\sum_i\sum_j y_{t-i}y_{t-j}$$ $$\sum_i\sum_j y_{t-i}x_{t-j}$$ $$\sum_i\sum_j y_{t-i}\epsilon_{t-j}$$ $$\sum_i\sum_j\sum_k y_{t-i}y_{t-j}y_{t-k}$$
etc.