import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.dates as mdates
import scipy.stats
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pylab
sns.set(style='white')
from pmdarima import auto_arima
from statsmodels.tsa.stattools import adfuller
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
df = pd.read_csv('KOTAKBANK.csv')
df.head()
 | Date | Symbol | Series | Prev Close | Open | High | Low | Last | Close | VWAP | Volume | Turnover | Trades | Deliverable Volume | %Deliverble
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 2000-01-03 | KOTAKMAH | EQ | 212.35 | 220.0 | 229.35 | 220.00 | 229.35 | 229.35 | 229.13 | 7086 | 1.623640e+11 | NaN | NaN | NaN |
1 | 2000-01-04 | KOTAKMAH | EQ | 229.35 | 247.7 | 247.70 | 225.25 | 247.70 | 246.95 | 244.12 | 73681 | 1.798729e+12 | NaN | NaN | NaN |
2 | 2000-01-05 | KOTAKMAH | EQ | 246.95 | 229.0 | 240.00 | 227.20 | 228.00 | 228.40 | 233.75 | 105799 | 2.473093e+12 | NaN | NaN | NaN |
3 | 2000-01-06 | KOTAKMAH | EQ | 228.40 | 235.1 | 239.00 | 217.00 | 224.95 | 225.90 | 226.84 | 40202 | 9.119546e+11 | NaN | NaN | NaN |
4 | 2000-01-07 | KOTAKMAH | EQ | 225.90 | 213.0 | 219.00 | 207.85 | 207.85 | 208.85 | 209.94 | 24463 | 5.135747e+11 | NaN | NaN | NaN |
# Exploratory data analysis
def eda(data):
    print("Size and shape of the data: ")
    print(data.size)
    print(data.shape)
    print('-'*50)
    print("\nData types of the features: ")
    print(data.dtypes)
eda(df)
Size and shape of the data: 
74775
(4985, 15)
--------------------------------------------------

Data types of the features: 
Date                   object
Symbol                 object
Series                 object
Prev Close            float64
Open                  float64
High                  float64
Low                   float64
Last                  float64
Close                 float64
VWAP                  float64
Volume                  int64
Turnover              float64
Trades                float64
Deliverable Volume    float64
%Deliverble           float64
dtype: object
df['Date'] = pd.to_datetime(df['Date'])
df.set_index(['Date'], inplace=True)
df.head()
Date | Symbol | Series | Prev Close | Open | High | Low | Last | Close | VWAP | Volume | Turnover | Trades | Deliverable Volume | %Deliverble
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
2000-01-03 | KOTAKMAH | EQ | 212.35 | 220.0 | 229.35 | 220.00 | 229.35 | 229.35 | 229.13 | 7086 | 1.623640e+11 | NaN | NaN | NaN |
2000-01-04 | KOTAKMAH | EQ | 229.35 | 247.7 | 247.70 | 225.25 | 247.70 | 246.95 | 244.12 | 73681 | 1.798729e+12 | NaN | NaN | NaN |
2000-01-05 | KOTAKMAH | EQ | 246.95 | 229.0 | 240.00 | 227.20 | 228.00 | 228.40 | 233.75 | 105799 | 2.473093e+12 | NaN | NaN | NaN |
2000-01-06 | KOTAKMAH | EQ | 228.40 | 235.1 | 239.00 | 217.00 | 224.95 | 225.90 | 226.84 | 40202 | 9.119546e+11 | NaN | NaN | NaN |
2000-01-07 | KOTAKMAH | EQ | 225.90 | 213.0 | 219.00 | 207.85 | 207.85 | 208.85 | 209.94 | 24463 | 5.135747e+11 | NaN | NaN | NaN |
df.describe()
 | Prev Close | Open | High | Low | Last | Close | VWAP | Volume | Turnover | Trades | Deliverable Volume | %Deliverble
---|---|---|---|---|---|---|---|---|---|---|---|---
count | 4985.000000 | 4985.000000 | 4985.000000 | 4985.000000 | 4985.000000 | 4985.000000 | 4985.000000 | 4.985000e+03 | 4.985000e+03 | 2456.000000 | 4.789000e+03 | 4789.000000 |
mean | 696.291755 | 697.154925 | 708.147543 | 684.984112 | 696.556409 | 696.615135 | 696.772050 | 1.283626e+06 | 1.355506e+14 | 54912.741857 | 6.701163e+05 | 0.514785 |
std | 440.761023 | 441.037354 | 445.558375 | 435.579256 | 440.924524 | 440.938692 | 440.580761 | 2.486726e+06 | 3.589591e+14 | 60401.337897 | 1.577341e+06 | 0.166689 |
min | 27.300000 | 26.950000 | 30.000000 | 26.000000 | 26.500000 | 27.300000 | 27.670000 | 1.050000e+02 | 4.608250e+08 | 375.000000 | 1.002000e+03 | 0.062300 |
25% | 355.150000 | 355.000000 | 366.900000 | 346.650000 | 356.000000 | 355.600000 | 355.180000 | 1.855800e+05 | 5.388055e+12 | 19646.500000 | 1.126580e+05 | 0.401600 |
50% | 650.950000 | 652.650000 | 662.000000 | 638.450000 | 650.500000 | 651.200000 | 651.390000 | 7.506900e+05 | 5.000255e+13 | 33764.500000 | 3.548760e+05 | 0.511100 |
75% | 966.400000 | 967.000000 | 979.600000 | 955.250000 | 968.000000 | 966.850000 | 966.270000 | 1.577817e+06 | 1.243851e+14 | 66802.500000 | 8.194760e+05 | 0.631900 |
max | 2019.650000 | 2016.700000 | 2049.000000 | 1999.000000 | 2023.900000 | 2019.650000 | 2028.690000 | 8.385990e+07 | 1.498222e+16 | 846705.000000 | 5.853186e+07 | 0.990100 |
def missing_value_table(df):
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)
    # Build a table with the results
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    # Rename the columns
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})
    # Sort the table by percentage of missing values, descending
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:, 1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
    # Print a short summary
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")
    return mis_val_table_ren_columns
missing_table = missing_value_table(df)
missing_table
Your selected dataframe has 14 columns. There are 3 columns that have missing values.
 | Missing Values | % of Total Values
---|---|---
Trades | 2529 | 50.7 |
Deliverable Volume | 196 | 3.9 |
%Deliverble | 196 | 3.9 |
msno.matrix(df)
<Axes: >
df.Trades.plot()
<Axes: xlabel='Date'>
df.Trades[:2000]
Date
2000-01-03   NaN
2000-01-04   NaN
2000-01-05   NaN
2000-01-06   NaN
2000-01-07   NaN
              ..
2009-04-09   NaN
2009-04-13   NaN
2009-04-15   NaN
2009-04-16   NaN
2009-04-17   NaN
Name: Trades, Length: 2000, dtype: float64
# Drop the columns with missing values
df.drop(['Trades', 'Deliverable Volume', '%Deliverble'], axis=1, inplace=True)
import plotly.io as pio
pio.renderers.default = "notebook_connected"
fig = go.Figure([go.Scatter(x=df.index, y=df['VWAP'])])
fig.update_layout(
autosize = False,
width = 1000,
height = 500,
title ='VWAP over time',
template='simple_white'
)
fig.update_xaxes(title="Date")
fig.update_yaxes(title="VWAP")
fig.show()
The long-term plot shows how the stock has performed since its IPO; the bullish run does not hold throughout.
There is a visible decline in price after 2008.
sns.kdeplot(df['VWAP'], fill=True)
<Axes: xlabel='VWAP', ylabel='Density'>
fig = go.Figure([go.Scatter(x=df.loc['2019', 'VWAP'].index, y=df.loc['2019', 'VWAP'])])
fig.update_layout(
autosize = False,
width = 1000,
height=500,
title='VWAP in 2019',
template="simple_white"
)
fig.update_xaxes(title="Date")
fig.update_yaxes(title='VWAP')
fig.show()
fig = go.Figure([go.Scatter(x=df.loc['2020', 'VWAP'].index, y=df.loc['2020', 'VWAP'])])
fig.update_layout(
autosize = False,
width = 1000,
height=500,
title='VWAP in 2020',
template="simple_white"
)
fig.update_xaxes(title="Date")
fig.update_yaxes(title='VWAP')
fig.show()
cols_plot = ['Open', 'Close', 'High','Low']
axes = df[cols_plot].plot(figsize=(11, 9), subplots=True)
for ax in axes:
    ax.set_ylabel('Daily trade')
fig = go.Figure([go.Scatter(x=df.index, y=df['Volume'])])
fig.update_layout(
autosize=False,
width = 1000,
height = 500,
template = 'simple_white',
title='Volume over time'
)
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Volume')
fig.show()
fig = go.Figure([go.Scatter(x=df.loc['2020', 'Volume'].index, y=df.loc['2020', 'Volume'])])
fig.update_layout(
autosize=False,
width = 1000,
height = 500,
template = 'simple_white',
title = 'Volume in 2020'
)
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Volume')
fig.show()
scipy.stats.probplot(df.VWAP, plot=pylab)
pylab.show()
The Augmented Dickey-Fuller test is a statistical test known as a unit root test.
The intuition behind a unit root test is that it determines how strongly a time series is defined by a trend. It uses an autoregressive model and optimizes an information criterion across several different lag values.
The null hypothesis of the test is that the time series can be represented by a unit root, i.e. that it is not stationary (it has some time-dependent structure). The alternative hypothesis (rejecting the null hypothesis) is that the time series is stationary.
Null hypothesis (H0): if it fails to be rejected, the time series has a unit root and is non-stationary. It has some time-dependent structure.
Alternative hypothesis (H1): the null hypothesis is rejected; the time series has no unit root, meaning it is stationary. It does not have time-dependent structure.
We interpret the result using the test's p-value. A p-value below a threshold (such as 5% or 1%) means we reject the null hypothesis (stationary); a p-value above the threshold means we fail to reject the null hypothesis (non-stationary).
p-value > 0.05: fail to reject the null hypothesis (H0); the data has a unit root and is non-stationary. p-value <= 0.05: reject the null hypothesis (H0); the data has no unit root and is stationary.
# Dickey-Fuller test function
def dickyFullerTest(x):
    result = adfuller(x)
    print("ADF Statistics: %f" % result[0])
    print("p-value: %f" % result[1])
    print('Critical Value')
    for key, value in result[4].items():
        print('\t%s: %.3f' % (key, value))
    if result[1] > 0.05:
        print("Fail to reject null hypothesis(H0), the data is non-stationary.")
    else:
        print("Reject the null hypothesis(H0), the data is stationary.")
dickyFullerTest(df['VWAP'])
ADF Statistics: -0.906425
p-value: 0.785849
Critical Value
	1%: -3.432
	5%: -2.862
	10%: -2.567
Fail to reject null hypothesis(H0), the data is non-stationary.
In Python, the statsmodels library provides a seasonal_decompose() method that lets you decompose a time series into trend, seasonality, and noise in a single line of code.
Additive time series
If the components of a time series add together to form the series, it is called an additive time series. Visually, if the magnitude of the series' increases and decreases stays similar across the whole series, we can say the time series is additive. Any additive time series can be written as:
y(t) = Level + Trend + Seasonality + Noise
Multiplicative time series
If the components of a time series multiply together, it is called a multiplicative time series. Visually, if the series grows or decays exponentially over time, it can be treated as multiplicative. A multiplicative time series can be written as (a quick check of the extracted seasonality follows the decomposition code below):
y(t) = Level * Trend * Seasonality * Noise
from statsmodels.tsa.seasonal import seasonal_decompose
from dateutil.parser import parse
plt.rcParams.update({"figure.figsize":(10,10)})
y = df['VWAP'].to_frame()
# 乘法分解
result_mul = seasonal_decompose(y, model='multiplicative',period=52)
# 加法分解
result_add = seasonal_decompose(y, model='additive',period = 52)
plt.rcParams.update({'figure.figsize': (10,10)})
result_mul.plot().suptitle('Multiplicative Decompose', fontsize=22)
result_add.plot().suptitle('Additive Decompose', fontsize=22)
plt.show()
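To gauge whether the period-52 seasonality extracted above is actually meaningful, one option is the seasonal-strength heuristic from Hyndman and Athanasopoulos, F_s = max(0, 1 - Var(residual) / Var(seasonal + residual)): values near 0 mean negligible seasonality, values near 1 mean strong seasonality. A minimal sketch against the additive result; seasonal_strength is our own helper, not part of statsmodels.
def seasonal_strength(result):
    # squeeze() in case the components come back as single-column DataFrames
    resid = result.resid.squeeze()
    detrended = (result.seasonal.squeeze() + resid).dropna()
    return max(0.0, 1 - resid.dropna().var() / detrended.var())
print("Seasonal strength (additive):", round(seasonal_strength(result_add), 3))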
# First-order differencing of VWAP (used below to inspect stationarity visually)
df['vwap_diff'] = df['VWAP'] - df['VWAP'].shift(1)
fig = go.Figure([go.Scatter(x=df.index, y=df.VWAP)])
fig.update_layout(
autosize=False,
width = 1000,
height = 500,
template = 'simple_white',
title = 'VWAP over time'
)
fig.show()
fig = go.Figure([go.Scatter(x=df.index,y=df.vwap_diff)])
fig.update_layout(
autosize=False,
width=1000,
height=500,
template='simple_white',
title='Differenced VWAP over time')
fig.show()
Autocorrelation and partial autocorrelation plots are used heavily in time series analysis and forecasting.
These plots graphically summarize the strength of the relationship between an observation in a time series and observations at prior time steps.
Statistical correlation summarizes the strength of the relationship between two variables.
We can compute the correlation between time series observations and observations at previous time steps, called lags. Because the correlation is computed with values of the same series at earlier times, it is called serial correlation, or autocorrelation.
A plot of a time series' autocorrelation by lag is called the autocorrelation function, or ACF for short. The plot is sometimes called a correlogram or an autocorrelation plot.
A partial autocorrelation summarizes the relationship between an observation in a time series and observations at prior time steps with the relationships of intervening observations removed.
The autocorrelation of an observation with an observation at a prior time step consists of both the direct correlation and indirect correlations. The indirect correlations are a linear function of the correlations between the observation and observations at intervening time steps.
It is these indirect correlations that the partial autocorrelation function seeks to remove. Without going into the math, that is the intuition behind partial autocorrelation.
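pandas' Series.autocorr computes exactly this lagged Pearson correlation, so before reading the plots below we can sanity-check a few lags numerically (a minimal sketch on the VWAP series):
# Serial correlation of VWAP with its own past values at a few lags
for lag in (1, 7, 30):
    print(f"lag {lag:>2}: {df['VWAP'].autocorr(lag=lag):.3f}")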
sm.graphics.tsa.plot_acf(df['VWAP'].iloc[1:], lags=40, title='Auto Correlation of VWAP')
plt.show()
sm.graphics.tsa.plot_pacf(df['VWAP'].iloc[1:], lags=40, title='Partial Auto Correlation of VWAP')
plt.show()
sm.graphics.tsa.plot_acf(df['vwap_diff'].iloc[1:], lags=40, title='Auto Correlation of difference VWAP', zero=False)
plt.show()
sm.graphics.tsa.plot_pacf(df['vwap_diff'].iloc[1:], lags=40, title='Partial Auto Correlation of difference VWAP', zero=False)
plt.show()
Add lagged features for High, Low, Volume, and Turnover. We use three sets of rolling statistics (3-day, 7-day, and 30-day windows, each shifted by one day so only past values are used) as proxies for recent, last-week, and last-month behaviour.
df.head()
Date | Symbol | Series | Prev Close | Open | High | Low | Last | Close | VWAP | Volume | Turnover | vwap_diff
---|---|---|---|---|---|---|---|---|---|---|---|---
2000-01-03 | KOTAKMAH | EQ | 212.35 | 220.0 | 229.35 | 220.00 | 229.35 | 229.35 | 229.13 | 7086 | 1.623640e+11 | NaN |
2000-01-04 | KOTAKMAH | EQ | 229.35 | 247.7 | 247.70 | 225.25 | 247.70 | 246.95 | 244.12 | 73681 | 1.798729e+12 | 14.99 |
2000-01-05 | KOTAKMAH | EQ | 246.95 | 229.0 | 240.00 | 227.20 | 228.00 | 228.40 | 233.75 | 105799 | 2.473093e+12 | -10.37 |
2000-01-06 | KOTAKMAH | EQ | 228.40 | 235.1 | 239.00 | 217.00 | 224.95 | 225.90 | 226.84 | 40202 | 9.119546e+11 | -6.91 |
2000-01-07 | KOTAKMAH | EQ | 225.90 | 213.0 | 219.00 | 207.85 | 207.85 | 208.85 | 209.94 | 24463 | 5.135747e+11 | -16.90 |
df = df.reset_index()
lag_features= ["High", "Low", "Volume","Turnover", "Close"]
window1 = 3
window2 = 7
window3 = 30
df_rolled_3d = df[lag_features].rolling(window=window1, min_periods=0)
df_rolled_7d = df[lag_features].rolling(window=window2, min_periods=0)
df_rolled_30d = df[lag_features].rolling(window=window3, min_periods=0)
# Rolling means (shifted by one day so each row only sees past data)
df_mean_3d = df_rolled_3d.mean().shift(1).reset_index().astype(np.float32)
df_mean_7d = df_rolled_7d.mean().shift(1).reset_index().astype(np.float32)
df_mean_30d = df_rolled_30d.mean().shift(1).reset_index().astype(np.float32)
# Rolling standard deviations
df_std_3d = df_rolled_3d.std().shift(1).reset_index().astype(np.float32)
df_std_7d = df_rolled_7d.std().shift(1).reset_index().astype(np.float32)
df_std_30d = df_rolled_30d.std().shift(1).reset_index().astype(np.float32)
for feature in lag_features:
    df[f"{feature}_mean_lag{window1}"] = df_mean_3d[feature]
    df[f"{feature}_mean_lag{window2}"] = df_mean_7d[feature]
    df[f"{feature}_mean_lag{window3}"] = df_mean_30d[feature]
    df[f"{feature}_std_lag{window1}"] = df_std_3d[feature]
    df[f"{feature}_std_lag{window2}"] = df_std_7d[feature]
    df[f"{feature}_std_lag{window3}"] = df_std_30d[feature]
# Select the numeric columns, then fill missing values with their column means
numeric_cols = df.select_dtypes(include=[np.number])
df.fillna(numeric_cols.mean(), inplace=True)
# Set Date as the index (keeping the column as well)
df.set_index("Date", drop=False, inplace=True)
df.Date = pd.to_datetime(df.Date, format="%Y-%m-%d")
df["month"] = df.Date.dt.month
df["week"] = df.Date.dt.isocalendar().week
df["day"] = df.Date.dt.day
df["day_of_week"] = df.Date.dt.dayofweek
df.head()
Date | Date | Symbol | Series | Prev Close | Open | High | Low | Last | Close | VWAP | ... | Close_mean_lag3 | Close_mean_lag7 | Close_mean_lag30 | Close_std_lag3 | Close_std_lag7 | Close_std_lag30 | month | week | day | day_of_week
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
2000-01-03 | 2000-01-03 | KOTAKMAH | EQ | 212.35 | 220.0 | 229.35 | 220.00 | 229.35 | 229.35 | 229.13 | ... | 696.088013 | 695.472778 | 691.873108 | 10.377261 | 16.013401 | 33.444679 | 1 | 1 | 3 | 0 |
2000-01-04 | 2000-01-04 | KOTAKMAH | EQ | 229.35 | 247.7 | 247.70 | 225.25 | 247.70 | 246.95 | 244.12 | ... | 229.350006 | 229.350006 | 229.350006 | 10.377261 | 16.013401 | 33.444679 | 1 | 1 | 4 | 1 |
2000-01-05 | 2000-01-05 | KOTAKMAH | EQ | 246.95 | 229.0 | 240.00 | 227.20 | 228.00 | 228.40 | 233.75 | ... | 238.149994 | 238.149994 | 238.149994 | 12.445080 | 12.445080 | 12.445080 | 1 | 1 | 5 | 2 |
2000-01-06 | 2000-01-06 | KOTAKMAH | EQ | 228.40 | 235.1 | 239.00 | 217.00 | 224.95 | 225.90 | 226.84 | ... | 234.899994 | 234.899994 | 234.899994 | 10.446411 | 10.446411 | 10.446411 | 1 | 1 | 6 | 3 |
2000-01-07 | 2000-01-07 | KOTAKMAH | EQ | 225.90 | 213.0 | 219.00 | 207.85 | 207.85 | 208.85 | 209.94 | ... | 233.750000 | 232.649994 | 232.649994 | 11.499674 | 9.643737 | 9.643737 | 1 | 1 | 7 | 4 |
5 rows × 47 columns
# Make sure the dates are parsed correctly and set them as the index
df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d")
df.set_index('Date', inplace=True)
# Split the data into training and validation sets
df_train = df[df.index < "2019"].copy()
df_valid = df[df.index >= "2019"].copy()
exogenous_features = ["High_mean_lag3", "High_std_lag3", "Low_mean_lag3", "Low_std_lag3",
"Volume_mean_lag3", "Volume_std_lag3", "Turnover_mean_lag3",
"Turnover_std_lag3","High_mean_lag7", "High_std_lag7", "Low_mean_lag7", "Low_std_lag7",
"Volume_mean_lag7", "Volume_std_lag7", "Turnover_mean_lag7",
"Turnover_std_lag7","High_mean_lag30", "High_std_lag30", "Low_mean_lag30", "Low_std_lag30",
"Volume_mean_lag30", "Volume_std_lag30", "Turnover_mean_lag30",
"Close_mean_lag3", "Close_mean_lag7","Close_mean_lag30","Close_std_lag3","Close_std_lag7","Close_std_lag30",
"Turnover_std_lag30","month","week","day","day_of_week"]
# Auto-ARIMA model
model = auto_arima(df_train.VWAP, exogenous=df_train[exogenous_features],
trace=True, error_action="ignore", suppress_warnings=True)
model.fit(df_train.VWAP, exogenous=df_train[exogenous_features])
forecast = model.predict(n_periods=len(df_valid), exogenous=df_valid[exogenous_features])
df_valid.loc[:, "Forecast_ARIMAX"] = forecast
Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept : AIC=38672.159, Time=1.59 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept : AIC=38685.808, Time=0.08 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept : AIC=38668.282, Time=0.11 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept : AIC=38667.567, Time=0.41 sec
 ARIMA(0,1,0)(0,0,0)[0]           : AIC=38684.428, Time=0.05 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept : AIC=38668.168, Time=0.67 sec
 ARIMA(0,1,2)(0,0,0)[0] intercept : AIC=38668.433, Time=0.27 sec
 ARIMA(1,1,2)(0,0,0)[0] intercept : AIC=38667.003, Time=1.05 sec
 ARIMA(1,1,3)(0,0,0)[0] intercept : AIC=38668.951, Time=2.22 sec
 ARIMA(0,1,3)(0,0,0)[0] intercept : AIC=38670.404, Time=0.76 sec
 ARIMA(2,1,1)(0,0,0)[0] intercept : AIC=38670.513, Time=0.32 sec
 ARIMA(2,1,3)(0,0,0)[0] intercept : AIC=38671.009, Time=2.29 sec
 ARIMA(1,1,2)(0,0,0)[0]           : AIC=38665.670, Time=0.46 sec
 ARIMA(0,1,2)(0,0,0)[0]           : AIC=38666.998, Time=0.14 sec
 ARIMA(1,1,1)(0,0,0)[0]           : AIC=38666.730, Time=0.33 sec
 ARIMA(2,1,2)(0,0,0)[0]           : AIC=38670.724, Time=0.55 sec
 ARIMA(1,1,3)(0,0,0)[0]           : AIC=38667.619, Time=0.75 sec
 ARIMA(0,1,1)(0,0,0)[0]           : AIC=38666.113, Time=0.18 sec
 ARIMA(0,1,3)(0,0,0)[0]           : AIC=38668.965, Time=0.39 sec
 ARIMA(2,1,1)(0,0,0)[0]           : AIC=38669.068, Time=0.15 sec
 ARIMA(2,1,3)(0,0,0)[0]           : AIC=38669.669, Time=0.96 sec

Best model: ARIMA(1,1,2)(0,0,0)[0]
Total fit time: 13.742 seconds
d:\anaconda\envs\pytorch2.0\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`.
d:\anaconda\envs\pytorch2.0\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.
model.summary()
Dep. Variable: | y | No. Observations: | 4408
---|---|---|---
Model: | SARIMAX(1, 1, 2) | Log Likelihood | -19328.835
Date: | Mon, 08 Jan 2024 | AIC | 38665.670
Time: | 22:31:05 | BIC | 38691.234
Sample: | 0 - 4408 | HQIC | 38674.687
Covariance Type: | opg | |
 | coef | std err | z | P>|z| | [0.025 | 0.975]
---|---|---|---|---|---|---
ar.L1 | 0.7859 | 0.155 | 5.071 | 0.000 | 0.482 | 1.090 |
ma.L1 | -0.7190 | 0.156 | -4.611 | 0.000 | -1.025 | -0.413 |
ma.L2 | -0.0732 | 0.011 | -6.888 | 0.000 | -0.094 | -0.052 |
sigma2 | 377.6698 | 0.645 | 585.116 | 0.000 | 376.405 | 378.935 |
Ljung-Box (L1) (Q): | 0.00 | Jarque-Bera (JB): | 52232264.51 |
---|---|---|---|
Prob(Q): | 1.00 | Prob(JB): | 0.00 |
Heteroskedasticity (H): | 4.34 | Skew: | -15.42 |
Prob(H) (two-sided): | 0.00 | Kurtosis: | 535.45 |
SARIMAX stands for Seasonal AutoRegressive Integrated Moving Average with eXogenous variables. This model is commonly used for time series with seasonal patterns and/or exogenous regressors. The parts of the summary are explained below:
Dep. Variable: y: the dependent (response) variable is 'y'.
No. Observations: 4408: the dataset contains 4408 observations.
Model: SARIMAX(1, 1, 2): a SARIMAX model with parameters (p=1, d=1, q=2) was used. Here p=1 is the order of the autoregressive term, d=1 is the order of differencing (used to make the series stationary), and q=2 is the order of the moving-average term.
Log Likelihood: the log-likelihood, a measure of model fit; the higher the value, the better the fit.
AIC: the Akaike information criterion, a goodness-of-fit measure that accounts for model complexity. The lower the AIC, the better the model (a quick arithmetic check follows this list).
BIC: the Bayesian information criterion, similar to AIC but with a stricter penalty on model complexity. The lower the BIC, the better the model.
HQIC: the Hannan-Quinn information criterion, another measure of model fit, intermediate between AIC and BIC.
Covariance Type: opg: the covariance matrix is estimated via the outer product of gradients ('opg').
coef: the estimated model parameters. For example, the coefficient of ar.L1 (the autoregressive term) is 0.7859.
std err: the standard error of each parameter estimate. For example, the standard error of ar.L1 is 0.155.
z: the z-score of each parameter estimate.
P>|z|: the p-value of each parameter estimate. The smaller the p-value, the more significant the parameter.
[0.025 0.975]: the 95% confidence interval of each parameter estimate.
sigma2: the residual variance of the model.
Ljung-Box, Jarque-Bera: diagnostic tests on the residuals, checking for autocorrelation (Ljung-Box) and normality (Jarque-Bera).
Heteroskedasticity (H): a test for heteroskedasticity; an H value far from 1 may indicate heteroskedastic residuals.
Skew: the skewness of the residuals.
Kurtosis: the kurtosis of the residuals.
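As a quick sanity check on the reported numbers (a sketch, assuming k = 4 estimated parameters: ar.L1, ma.L1, ma.L2, and sigma2):
# AIC = 2k - 2*log-likelihood; with k = 4 and the reported log-likelihood
# of -19328.835 this reproduces the AIC shown in the summary.
k, log_likelihood = 4, -19328.835
print(2 * k - 2 * log_likelihood)  # 38665.67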
df_valid[["VWAP", "Forecast_ARIMAX"]].plot(figsize=(14,7))
<Axes: xlabel='Date'>
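mean_absolute_error and mean_squared_error were imported at the top but never used; a minimal sketch that puts numbers on the validation fit:
# Quantify the validation error of the ARIMAX forecast
mae = mean_absolute_error(df_valid["VWAP"], df_valid["Forecast_ARIMAX"])
rmse = np.sqrt(mean_squared_error(df_valid["VWAP"], df_valid["Forecast_ARIMAX"]))
print(f"MAE:  {mae:.2f}")
print(f"RMSE: {rmse:.2f}")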