## Timeseries Analysis in Python


import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

lynx_df = pd.read_csv("https://drive.google.com/uc?export=download&id=1-FzO-QtYX9saH2Xpb5Or3wtx0dY7jSkY", index_col=0, header=0, names=['year', 'trappings'])
nottem_df = pd.read_csv("https://drive.google.com/uc?export=download&id=1-Crjfr2hipny9RN_4saB0zNj5ymOZw2m", index_col=0, header=0, names=['index', 'temp'])


lynx_ts = pd.Series(lynx_df["trappings"].values, pd.date_range('31/12/1821', periods=len(lynx_df), freq='A-DEC'))
nottem_ts = pd.Series(nottem_df["temp"].values, pd.date_range('31/01/1920', periods=len(nottem_df), freq='M'))

plt.plot(lynx_ts)
plt.show()

plt.plot(nottem_ts)
plt.show()


from statsmodels.tsa.stattools import adfuller
adfuller(lynx_ts, autolag='AIC')

(-2.996303675142558,
 0.03524103535901374,
 7,
 106,
 {'1%': -3.4936021509366793,
  '5%': -2.8892174239808703,
  '10%': -2.58153320754717},
 1635.975223662428)


from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plot_acf(lynx_ts, lags=20);
plot_pacf(lynx_ts, lags=20);


import random

def gen_ts(start, coefs, count, error):
  assert(len(start) == len(coefs))
  assert(count > len(start))

  lst = start + [0] * (count - len(start))

  for i in range(len(start), count):
    lst[i] = random.uniform(-0.5, 0.5) * error
    for j in range (1, len(start)+1):
      lst[i] += coefs[j-1] * lst[i-j]

  return lst


from statsmodels.tsa.stattools import adfuller

v = gen_ts([0] * 3, [0.0, 0, 0.1], 100, 1.0) # sum of coefficients < 1
plt.plot(v)
adf = adfuller(v, autolag='AIC')[1]
print(adf)

1.3940180104247353e-19


plt.plot(lynx_ts.rolling(5).mean())

[<matplotlib.lines.Line2D at 0x7fe1e76862e8>]


plt.plot(lynx_ts)
plt.plot(lynx_ts.ewm(alpha=0.5).mean())
plt.plot(lynx_ts.rolling(window=10, win_type="gaussian", center=True).mean(std=1))

[<matplotlib.lines.Line2D at 0x7fe1f9f36ac8>]


from statsmodels.tsa.arima.model import ARIMA

m = ARIMA(lynx_ts, order=([1], 0, 1))
results = m.fit()
plt.plot(lynx_ts)
plt.plot(results.fittedvalues, color="orange")
print(results.arparams)

[0.54235655]


resid = lynx_ts - results.fittedvalues
plt.plot(resid)

import scipy.stats as stats
plt.hist(resid)
plot_acf(resid)
stats.shapiro(stats.zscore(resid))[1]

results.summary()


# plot various timeseries
white_noise = np.random.normal(0, 10, size=100)
plot_acf(white_noise)
plt.show()
plot_pacf(white_noise)
plt.show()


# ar(1) process
ts2 = [white_noise[0]]
phi_0 = 10
phi_1 = 0.8
k = 00.5 # 0 = perfectly noiseless, 1 = very noisy

# Expected value of the timeseries (perfect timeseries converges to this value)
miu = phi_0 / (1-phi_1)
print("Expected value: ", miu)

for i in range(1, 100):
    # note that without the error term this goes fast to the mean
    # AR(1)
    ts2.append(phi_0 + ts2[i-1] * phi_1 + k * white_noise[i])

ts2 = np.array(ts2)
plt.plot(ts2)

plot_acf(ts2, lags=40)
plt.show()
plot_pacf(ts2, lags=40)
plt.show()

Expected value:  50.000000000000014


ts3 = [white_noise[0]]
mean = phi_0 = 10 # this is already the mean
phi_1 = 0.8
for i in range(1, 100):
    # MA(1)
    ts3.append(phi_0 + white_noise[i] + phi_1 * white_noise[i-1])

ts3 = np.array(ts3)
plt.plot(ts3)

plot_acf(ts3, lags=40)
plt.show()
plot_pacf(ts3, lags=40)
plt.show()


arma_series = [white_noise[0], white_noise[1]]
m = 5
phi_1 = 0.4
phi_2 = 0.3
theta_1 = 0.2
theta_2 = 0.2

# AR(2) I(0) MA(2)
for i in range(2, 100):
    arma_series.append( \
        m + \
        arma_series[i-1] * phi_1 + arma_series [i-2] * phi_2 + \
        white_noise[i] + theta_1 * white_noise[i-1] + theta_2 * white_noise [i-2])

plt.plot(arma_series)
plt.show()

adf = adfuller(arma_series, autolag='AIC')[1]
print(adf) # stationary

arma_series = np.array(arma_series)

# fit the model
plot_acf(arma_series)
plt.show()
plot_pacf(arma_series)
plt.show()

0.1807099668382452


# sharp cutoff in PACF after lag 1 and slow decay in autocorrelation => try ARIMA(1, 0, 0)
from statsmodels.tsa.arima.model import ARIMA

m = ARIMA(arma_series, order=(1,0,0))
results = m.fit()
plt.plot(arma_series)
plt.plot(results.fittedvalues, color="orange")
print(results.arparams)

[0.76092039]


resid = arma_series - results.fittedvalues
plt.hist(resid)

import scipy.stats as stats
# visual inspection of the residuals
plt.hist(resid)
# Shapiro test for normality
stats.shapiro(stats.zscore(resid))[1]

# no autocorrelation
plot_acf(resid)
plt.show()
plot_pacf(resid)
plt.show()

Dep. Variable:	y	No. Observations:	114
Model:	ARIMA(2, 0, 0)	Log Likelihood	-935.020
Date:	Sun, 21 Nov 2021	AIC	1878.041
Time:	15:13:04	BIC	1888.985
Sample:	12-31-1821	HQIC	1882.483
	- 12-31-1934
Covariance Type:	opg

	coef	std err	z	P>\|z\|	[0.025	0.975]
const	1538.0176	258.564	5.948	0.000	1031.241	2044.794
ar.L1	1.1473	0.064	17.871	0.000	1.021	1.273
ar.L2	-0.5996	0.085	-7.079	0.000	-0.766	-0.434
sigma2	7.768e+05	8.08e+04	9.612	0.000	6.18e+05	9.35e+05

Ljung-Box (L1) (Q):	0.01	Jarque-Bera (JB):	47.33
Prob(Q):	0.93	Prob(JB):	0.00
Heteroskedasticity (H):	1.82	Skew:	0.86
Prob(H) (two-sided):	0.07	Kurtosis:	5.65