import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.dates as mdates
import scipy.stats
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pylab
sns.set(style='white')
from pmdarima import auto_arima
from statsmodels.tsa.stattools import adfuller
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
df = pd.read_csv('KOTAKBANK.csv')
df.head()
 | Date | Symbol | Series | Prev Close | Open | High | Low | Last | Close | VWAP | Volume | Turnover | Trades | Deliverable Volume | %Deliverble
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 2000-01-03 | KOTAKMAH | EQ | 212.35 | 220.0 | 229.35 | 220.00 | 229.35 | 229.35 | 229.13 | 7086 | 1.623640e+11 | NaN | NaN | NaN |
1 | 2000-01-04 | KOTAKMAH | EQ | 229.35 | 247.7 | 247.70 | 225.25 | 247.70 | 246.95 | 244.12 | 73681 | 1.798729e+12 | NaN | NaN | NaN |
2 | 2000-01-05 | KOTAKMAH | EQ | 246.95 | 229.0 | 240.00 | 227.20 | 228.00 | 228.40 | 233.75 | 105799 | 2.473093e+12 | NaN | NaN | NaN |
3 | 2000-01-06 | KOTAKMAH | EQ | 228.40 | 235.1 | 239.00 | 217.00 | 224.95 | 225.90 | 226.84 | 40202 | 9.119546e+11 | NaN | NaN | NaN |
4 | 2000-01-07 | KOTAKMAH | EQ | 225.90 | 213.0 | 219.00 | 207.85 | 207.85 | 208.85 | 209.94 | 24463 | 5.135747e+11 | NaN | NaN | NaN |
# Exploratory data analysis
def eda(data):
    print("Size and shape of the data: ")
    print(data.size)
    print(data.shape)
    print('-'*50)
    print("\nData types of the features: ")
    print(data.dtypes)
eda(df)
Size and shape of the data: 
74775
(4985, 15)
--------------------------------------------------

Data types of the features: 
Date                   object
Symbol                 object
Series                 object
Prev Close            float64
Open                  float64
High                  float64
Low                   float64
Last                  float64
Close                 float64
VWAP                  float64
Volume                  int64
Turnover              float64
Trades                float64
Deliverable Volume    float64
%Deliverble           float64
dtype: object
df['Date'] = pd.to_datetime(df['Date'])
df.set_index(['Date'], inplace=True)
df.head()
Date | Symbol | Series | Prev Close | Open | High | Low | Last | Close | VWAP | Volume | Turnover | Trades | Deliverable Volume | %Deliverble
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
2000-01-03 | KOTAKMAH | EQ | 212.35 | 220.0 | 229.35 | 220.00 | 229.35 | 229.35 | 229.13 | 7086 | 1.623640e+11 | NaN | NaN | NaN |
2000-01-04 | KOTAKMAH | EQ | 229.35 | 247.7 | 247.70 | 225.25 | 247.70 | 246.95 | 244.12 | 73681 | 1.798729e+12 | NaN | NaN | NaN |
2000-01-05 | KOTAKMAH | EQ | 246.95 | 229.0 | 240.00 | 227.20 | 228.00 | 228.40 | 233.75 | 105799 | 2.473093e+12 | NaN | NaN | NaN |
2000-01-06 | KOTAKMAH | EQ | 228.40 | 235.1 | 239.00 | 217.00 | 224.95 | 225.90 | 226.84 | 40202 | 9.119546e+11 | NaN | NaN | NaN |
2000-01-07 | KOTAKMAH | EQ | 225.90 | 213.0 | 219.00 | 207.85 | 207.85 | 208.85 | 209.94 | 24463 | 5.135747e+11 | NaN | NaN | NaN |
df.describe()
 | Prev Close | Open | High | Low | Last | Close | VWAP | Volume | Turnover | Trades | Deliverable Volume | %Deliverble
---|---|---|---|---|---|---|---|---|---|---|---|---
count | 4985.000000 | 4985.000000 | 4985.000000 | 4985.000000 | 4985.000000 | 4985.000000 | 4985.000000 | 4.985000e+03 | 4.985000e+03 | 2456.000000 | 4.789000e+03 | 4789.000000 |
mean | 696.291755 | 697.154925 | 708.147543 | 684.984112 | 696.556409 | 696.615135 | 696.772050 | 1.283626e+06 | 1.355506e+14 | 54912.741857 | 6.701163e+05 | 0.514785 |
std | 440.761023 | 441.037354 | 445.558375 | 435.579256 | 440.924524 | 440.938692 | 440.580761 | 2.486726e+06 | 3.589591e+14 | 60401.337897 | 1.577341e+06 | 0.166689 |
min | 27.300000 | 26.950000 | 30.000000 | 26.000000 | 26.500000 | 27.300000 | 27.670000 | 1.050000e+02 | 4.608250e+08 | 375.000000 | 1.002000e+03 | 0.062300 |
25% | 355.150000 | 355.000000 | 366.900000 | 346.650000 | 356.000000 | 355.600000 | 355.180000 | 1.855800e+05 | 5.388055e+12 | 19646.500000 | 1.126580e+05 | 0.401600 |
50% | 650.950000 | 652.650000 | 662.000000 | 638.450000 | 650.500000 | 651.200000 | 651.390000 | 7.506900e+05 | 5.000255e+13 | 33764.500000 | 3.548760e+05 | 0.511100 |
75% | 966.400000 | 967.000000 | 979.600000 | 955.250000 | 968.000000 | 966.850000 | 966.270000 | 1.577817e+06 | 1.243851e+14 | 66802.500000 | 8.194760e+05 | 0.631900 |
max | 2019.650000 | 2016.700000 | 2049.000000 | 1999.000000 | 2023.900000 | 2019.650000 | 2028.690000 | 8.385990e+07 | 1.498222e+16 | 846705.000000 | 5.853186e+07 | 0.990100 |
def missing_value_table(df):
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)
    # Build a table with the results
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    # Rename the columns
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})
    # Sort the table by percentage of missing values, descending
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:, 1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
    # Print a short summary
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")
    return mis_val_table_ren_columns
missing_table = missing_value_table(df)
missing_table
Your selected dataframe has 14 columns. There are 3 columns that have missing values.
 | Missing Values | % of Total Values
---|---|---
Trades | 2529 | 50.7 |
Deliverable Volume | 196 | 3.9 |
%Deliverble | 196 | 3.9 |
msno.matrix(df)
<Axes: >
df.Trades.plot()
<Axes: xlabel='Date'>
df.Trades[:2000]
Date
2000-01-03   NaN
2000-01-04   NaN
2000-01-05   NaN
2000-01-06   NaN
2000-01-07   NaN
              ..
2009-04-09   NaN
2009-04-13   NaN
2009-04-15   NaN
2009-04-16   NaN
2009-04-17   NaN
Name: Trades, Length: 2000, dtype: float64
# Drop the columns with missing values
df.drop(['Trades', 'Deliverable Volume', '%Deliverble'], axis=1, inplace=True)
import plotly.io as pio
pio.renderers.default = "notebook_connected"
fig = go.Figure([go.Scatter(x=df.index, y=df['VWAP'])])
fig.update_layout(
autosize = False,
width = 1000,
height = 500,
title ='VWAP over time',
template='simple_white'
)
fig.update_xaxes(title="Date")
fig.update_yaxes(title="VWAP")
fig.show()
The long-term plot shows how the stock has performed since its IPO; the bullish run does not hold throughout.
There is a visible decline in price after 2008.
sns.kdeplot(df['VWAP'], fill=True)
<Axes: xlabel='VWAP', ylabel='Density'>
fig = go.Figure([go.Scatter(x=df.loc['2019', 'VWAP'].index, y=df.loc['2019', 'VWAP'])])
fig.update_layout(
autosize = False,
width = 1000,
height=500,
title='VWAP in 2019',
template="simple_white"
)
fig.update_xaxes(title="Date")
fig.update_yaxes(title='VWAP')
fig.show()
fig = go.Figure([go.Scatter(x=df.loc['2020', 'VWAP'].index, y=df.loc['2020', 'VWAP'])])
fig.update_layout(
autosize = False,
width = 1000,
height=500,
title='VWAP in 2020',
template="simple_white"
)
fig.update_xaxes(title="Date")
fig.update_yaxes(title='VWAP')
fig.show()
cols_plot = ['Open', 'Close', 'High','Low']
axes = df[cols_plot].plot(figsize=(11, 9), subplots=True)
for ax in axes:
    ax.set_ylabel('Daily trade')
fig = go.Figure([go.Scatter(x=df.index, y=df['Volume'])])
fig.update_layout(
autosize=False,
width = 1000,
height = 500,
template = 'simple_white',
title='Volume over time'
)
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Volume')
fig.show()
fig = go.Figure([go.Scatter(x=df.loc['2020', 'Volume'].index, y=df.loc['2020', 'Volume'])])
fig.update_layout(
autosize=False,
width = 1000,
height = 500,
template = 'simple_white',
title = 'Volume in 2020'
)
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Volume')
fig.show()
scipy.stats.probplot(df.VWAP, plot=pylab)
pylab.show()
The Augmented Dickey-Fuller test is a statistical test known as a unit root test.
The intuition behind a unit root test is that it determines how strongly a time series is defined by a trend. It uses an autoregressive model and optimizes an information criterion across several different lag values.
The null hypothesis of the test is that the time series can be represented by a unit root, i.e. that it is not stationary (it has some time-dependent structure). The alternative hypothesis (rejecting the null hypothesis) is that the time series is stationary.
Null hypothesis (H0): if it fails to be rejected, the time series has a unit root and is non-stationary. It has some time-dependent structure.
Alternative hypothesis (H1): the null hypothesis is rejected; the time series has no unit root, meaning it is stationary. It does not have time-dependent structure.
We interpret the result using the test's p-value. A p-value below a threshold (such as 5% or 1%) means we reject the null hypothesis (stationary); a p-value above the threshold means we fail to reject the null hypothesis (non-stationary).
p-value > 0.05: fail to reject the null hypothesis (H0); the data has a unit root and is non-stationary. p-value <= 0.05: reject the null hypothesis (H0); the data has no unit root and is stationary.
# Dickey-Fuller test function
def dickyFullerTest(x):
    result = adfuller(x)
    print("ADF Statistics: %f" % result[0])
    print("p-value: %f" % result[1])
    print('Critical Value')
    for key, value in result[4].items():
        print('\t%s: %.3f' % (key, value))
    if result[1] > 0.05:
        print("Fail to reject null hypothesis(H0), the data is non-stationary.")
    else:
        print("Reject the null hypothesis(H0), the data is stationary.")
dickyFullerTest(df['VWAP'])
ADF Statistics: -0.906425
p-value: 0.785849
Critical Value
	1%: -3.432
	5%: -2.862
	10%: -2.567
Fail to reject null hypothesis(H0), the data is non-stationary.
In Python, the statsmodels library provides a seasonal_decompose() method that lets you decompose a time series into trend, seasonality, and noise in a single line of code.
Additive time series
If the components of a time series add together to form the series, it is called an additive time series. Visually, if the magnitude of the series' increases and decreases stays similar across the whole series, we can say the time series is additive. Any additive time series can be written as:
y(t) = Level + Trend + Seasonality + Noise
Multiplicative time series
If the components of a time series multiply together, it is called a multiplicative time series. Visually, if the series grows or decays exponentially over time, it can be treated as multiplicative. A multiplicative time series can be written as (a quick check of the extracted seasonality follows the decomposition code below):
y(t) = Level * Trend * Seasonality * Noise
from statsmodels.tsa.seasonal import seasonal_decompose
from dateutil.parser import parse
plt.rcParams.update({"figure.figsize":(10,10)})
y = df['VWAP'].to_frame()
# 乘法分解
result_mul = seasonal_decompose(y, model='multiplicative',period=52)
# 加法分解
result_add = seasonal_decompose(y, model='additive',period = 52)
plt.rcParams.update({'figure.figsize': (10,10)})
result_mul.plot().suptitle('Multiplicative Decompose', fontsize=22)
result_add.plot().suptitle('Additive Decompose', fontsize=22)
plt.show()
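To gauge whether the period-52 seasonality extracted above is actually meaningful, one option is the seasonal-strength heuristic from Hyndman and Athanasopoulos, F_s = max(0, 1 - Var(residual) / Var(seasonal + residual)): values near 0 mean negligible seasonality, values near 1 mean strong seasonality. A minimal sketch against the additive result; seasonal_strength is our own helper, not part of statsmodels.
def seasonal_strength(result):
    # squeeze() in case the components come back as single-column DataFrames
    resid = result.resid.squeeze()
    detrended = (result.seasonal.squeeze() + resid).dropna()
    return max(0.0, 1 - resid.dropna().var() / detrended.var())
print("Seasonal strength (additive):", round(seasonal_strength(result_add), 3))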
# First-order differencing of VWAP (used below to inspect stationarity visually)
df['vwap_diff'] = df['VWAP'] - df['VWAP'].shift(1)
fig = go.Figure([go.Scatter(x=df.index, y=df.VWAP)])
fig.update_layout(
autosize=False,
width = 1000,
height = 500,
template = 'simple_white',
title = 'VWAP over time'
)
fig.show()
fig = go.Figure([go.Scatter(x=df.index,y=df.vwap_diff)])
fig.update_layout(
autosize=False,
width=1000,
height=500,
template='simple_white',
title='Differenced VWAP over time')
fig.show()
Autocorrelation and partial autocorrelation plots are used heavily in time series analysis and forecasting.
These plots graphically summarize the strength of the relationship between an observation in a time series and observations at prior time steps.
Statistical correlation summarizes the strength of the relationship between two variables.
We can compute the correlation between time series observations and observations at previous time steps, called lags. Because the correlation is computed with values of the same series at earlier times, it is called serial correlation, or autocorrelation.
A plot of a time series' autocorrelation by lag is called the autocorrelation function, or ACF for short. The plot is sometimes called a correlogram or an autocorrelation plot.
A partial autocorrelation summarizes the relationship between an observation in a time series and observations at prior time steps with the relationships of intervening observations removed.
The autocorrelation of an observation with an observation at a prior time step consists of both the direct correlation and indirect correlations. The indirect correlations are a linear function of the correlations between the observation and observations at intervening time steps.
It is these indirect correlations that the partial autocorrelation function seeks to remove. Without going into the math, that is the intuition behind partial autocorrelation.
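pandas' Series.autocorr computes exactly this lagged Pearson correlation, so before reading the plots below we can sanity-check a few lags numerically (a minimal sketch on the VWAP series):
# Serial correlation of VWAP with its own past values at a few lags
for lag in (1, 7, 30):
    print(f"lag {lag:>2}: {df['VWAP'].autocorr(lag=lag):.3f}")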
sm.graphics.tsa.plot_acf(df['VWAP'].iloc[1:], lags=40, title='Auto Correlation of VWAP')
plt.show()
sm.graphics.tsa.plot_pacf(df['VWAP'].iloc[1:], lags=40, title='Partial Auto Correlation of VWAP')
plt.show()
sm.graphics.tsa.plot_acf(df['vwap_diff'].iloc[1:], lags=40, title='Auto Correlation of difference VWAP', zero=False)
plt.show()
sm.graphics.tsa.plot_pacf(df['vwap_diff'].iloc[1:], lags=40, title='Partial Auto Correlation of difference VWAP', zero=False)
plt.show()
Add lagged features for High, Low, Volume, and Turnover. We use three sets of rolling statistics (3-day, 7-day, and 30-day windows, each shifted by one day so only past values are used) as proxies for recent, last-week, and last-month behaviour.
df.head()
Date | Symbol | Series | Prev Close | Open | High | Low | Last | Close | VWAP | Volume | Turnover | vwap_diff
---|---|---|---|---|---|---|---|---|---|---|---|---
2000-01-03 | KOTAKMAH | EQ | 212.35 | 220.0 | 229.35 | 220.00 | 229.35 | 229.35 | 229.13 | 7086 | 1.623640e+11 | NaN |
2000-01-04 | KOTAKMAH | EQ | 229.35 | 247.7 | 247.70 | 225.25 | 247.70 | 246.95 | 244.12 | 73681 | 1.798729e+12 | 14.99 |
2000-01-05 | KOTAKMAH | EQ | 246.95 | 229.0 | 240.00 | 227.20 | 228.00 | 228.40 | 233.75 | 105799 | 2.473093e+12 | -10.37 |
2000-01-06 | KOTAKMAH | EQ | 228.40 | 235.1 | 239.00 | 217.00 | 224.95 | 225.90 | 226.84 | 40202 | 9.119546e+11 | -6.91 |
2000-01-07 | KOTAKMAH | EQ | 225.90 | 213.0 | 219.00 | 207.85 | 207.85 | 208.85 | 209.94 | 24463 | 5.135747e+11 | -16.90 |
df = df.reset_index()
lag_features= ["High", "Low", "Volume","Turnover", "Close"]
window1 = 3
window2 = 7
window3 = 30
df_rolled_3d = df[lag_features].rolling(window=window1, min_periods=0)
df_rolled_7d = df[lag_features].rolling(window=window2, min_periods=0)
df_rolled_30d = df[lag_features].rolling(window=window3, min_periods=0)
# Rolling means (shifted by one day so each row only sees past data)
df_mean_3d = df_rolled_3d.mean().shift(1).reset_index().astype(np.float32)
df_mean_7d = df_rolled_7d.mean().shift(1).reset_index().astype(np.float32)
df_mean_30d = df_rolled_30d.mean().shift(1).reset_index().astype(np.float32)
# Rolling standard deviations
df_std_3d = df_rolled_3d.std().shift(1).reset_index().astype(np.float32)
df_std_7d = df_rolled_7d.std().shift(1).reset_index().astype(np.float32)
df_std_30d = df_rolled_30d.std().shift(1).reset_index().astype(np.float32)
for feature in lag_features:
    df[f"{feature}_mean_lag{window1}"] = df_mean_3d[feature]
    df[f"{feature}_mean_lag{window2}"] = df_mean_7d[feature]
    df[f"{feature}_mean_lag{window3}"] = df_mean_30d[feature]
    df[f"{feature}_std_lag{window1}"] = df_std_3d[feature]
    df[f"{feature}_std_lag{window2}"] = df_std_7d[feature]
    df[f"{feature}_std_lag{window3}"] = df_std_30d[feature]
# Select the numeric columns, then fill missing values with their column means
numeric_cols = df.select_dtypes(include=[np.number])
df.fillna(numeric_cols.mean(), inplace=True)
# Set Date as the index (keeping the column as well)
df.set_index("Date", drop=False, inplace=True)
df.Date = pd.to_datetime(df.Date, format="%Y-%m-%d")
df["month"] = df.Date.dt.month
df["week"] = df.Date.dt.isocalendar().week
df["day"] = df.Date.dt.day
df["day_of_week"] = df.Date.dt.dayofweek
df.head()
Date | Date | Symbol | Series | Prev Close | Open | High | Low | Last | Close | VWAP | ... | Close_mean_lag3 | Close_mean_lag7 | Close_mean_lag30 | Close_std_lag3 | Close_std_lag7 | Close_std_lag30 | month | week | day | day_of_week
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
2000-01-03 | 2000-01-03 | KOTAKMAH | EQ | 212.35 | 220.0 | 229.35 | 220.00 | 229.35 | 229.35 | 229.13 | ... | 696.088013 | 695.472778 | 691.873108 | 10.377261 | 16.013401 | 33.444679 | 1 | 1 | 3 | 0 |
2000-01-04 | 2000-01-04 | KOTAKMAH | EQ | 229.35 | 247.7 | 247.70 | 225.25 | 247.70 | 246.95 | 244.12 | ... | 229.350006 | 229.350006 | 229.350006 | 10.377261 | 16.013401 | 33.444679 | 1 | 1 | 4 | 1 |
2000-01-05 | 2000-01-05 | KOTAKMAH | EQ | 246.95 | 229.0 | 240.00 | 227.20 | 228.00 | 228.40 | 233.75 | ... | 238.149994 | 238.149994 | 238.149994 | 12.445080 | 12.445080 | 12.445080 | 1 | 1 | 5 | 2 |
2000-01-06 | 2000-01-06 | KOTAKMAH | EQ | 228.40 | 235.1 | 239.00 | 217.00 | 224.95 | 225.90 | 226.84 | ... | 234.899994 | 234.899994 | 234.899994 | 10.446411 | 10.446411 | 10.446411 | 1 | 1 | 6 | 3 |
2000-01-07 | 2000-01-07 | KOTAKMAH | EQ | 225.90 | 213.0 | 219.00 | 207.85 | 207.85 | 208.85 | 209.94 | ... | 233.750000 | 232.649994 | 232.649994 | 11.499674 | 9.643737 | 9.643737 | 1 | 1 | 7 | 4 |
5 rows × 47 columns
# Make sure the dates are parsed correctly and set them as the index
df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d")
df.set_index('Date', inplace=True)
# Split the data into training and validation sets
df_train = df[df.index < "2019"].copy()
df_valid = df[df.index >= "2019"].copy()
exogenous_features = ["High_mean_lag3", "High_std_lag3", "Low_mean_lag3", "Low_std_lag3",
"Volume_mean_lag3", "Volume_std_lag3", "Turnover_mean_lag3",
"Turnover_std_lag3","High_mean_lag7", "High_std_lag7", "Low_mean_lag7", "Low_std_lag7",
"Volume_mean_lag7", "Volume_std_lag7", "Turnover_mean_lag7",
"Turnover_std_lag7","High_mean_lag30", "High_std_lag30", "Low_mean_lag30", "Low_std_lag30",
"Volume_mean_lag30", "Volume_std_lag30", "Turnover_mean_lag30",
"Close_mean_lag3", "Close_mean_lag7","Close_mean_lag30","Close_std_lag3","Close_std_lag7","Close_std_lag30",
"Turnover_std_lag30","month","week","day","day_of_week"]
# Auto-ARIMA model
model = auto_arima(df_train.VWAP, exogenous=df_train[exogenous_features],
trace=True, error_action="ignore", suppress_warnings=True)
model.fit(df_train.VWAP, exogenous=df_train[exogenous_features])
forecast = model.predict(n_periods=len(df_valid), exogenous=df_valid[exogenous_features])
df_valid.loc[:, "Forecast_ARIMAX"] = forecast
Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept : AIC=38672.159, Time=1.59 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept : AIC=38685.808, Time=0.08 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept : AIC=38668.282, Time=0.11 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept : AIC=38667.567, Time=0.41 sec
 ARIMA(0,1,0)(0,0,0)[0]           : AIC=38684.428, Time=0.05 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept : AIC=38668.168, Time=0.67 sec
 ARIMA(0,1,2)(0,0,0)[0] intercept : AIC=38668.433, Time=0.27 sec
 ARIMA(1,1,2)(0,0,0)[0] intercept : AIC=38667.003, Time=1.05 sec
 ARIMA(1,1,3)(0,0,0)[0] intercept : AIC=38668.951, Time=2.22 sec
 ARIMA(0,1,3)(0,0,0)[0] intercept : AIC=38670.404, Time=0.76 sec
 ARIMA(2,1,1)(0,0,0)[0] intercept : AIC=38670.513, Time=0.32 sec
 ARIMA(2,1,3)(0,0,0)[0] intercept : AIC=38671.009, Time=2.29 sec
 ARIMA(1,1,2)(0,0,0)[0]           : AIC=38665.670, Time=0.46 sec
 ARIMA(0,1,2)(0,0,0)[0]           : AIC=38666.998, Time=0.14 sec
 ARIMA(1,1,1)(0,0,0)[0]           : AIC=38666.730, Time=0.33 sec
 ARIMA(2,1,2)(0,0,0)[0]           : AIC=38670.724, Time=0.55 sec
 ARIMA(1,1,3)(0,0,0)[0]           : AIC=38667.619, Time=0.75 sec
 ARIMA(0,1,1)(0,0,0)[0]           : AIC=38666.113, Time=0.18 sec
 ARIMA(0,1,3)(0,0,0)[0]           : AIC=38668.965, Time=0.39 sec
 ARIMA(2,1,1)(0,0,0)[0]           : AIC=38669.068, Time=0.15 sec
 ARIMA(2,1,3)(0,0,0)[0]           : AIC=38669.669, Time=0.96 sec

Best model: ARIMA(1,1,2)(0,0,0)[0]
Total fit time: 13.742 seconds
d:\anaconda\envs\pytorch2.0\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`.
d:\anaconda\envs\pytorch2.0\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.
model.summary()
Dep. Variable: | y | No. Observations: | 4408
---|---|---|---
Model: | SARIMAX(1, 1, 2) | Log Likelihood | -19328.835
Date: | Mon, 08 Jan 2024 | AIC | 38665.670
Time: | 22:31:05 | BIC | 38691.234
Sample: | 0 - 4408 | HQIC | 38674.687
Covariance Type: | opg | |
 | coef | std err | z | P>|z| | [0.025 | 0.975]
---|---|---|---|---|---|---
ar.L1 | 0.7859 | 0.155 | 5.071 | 0.000 | 0.482 | 1.090 |
ma.L1 | -0.7190 | 0.156 | -4.611 | 0.000 | -1.025 | -0.413 |
ma.L2 | -0.0732 | 0.011 | -6.888 | 0.000 | -0.094 | -0.052 |
sigma2 | 377.6698 | 0.645 | 585.116 | 0.000 | 376.405 | 378.935 |
Ljung-Box (L1) (Q): | 0.00 | Jarque-Bera (JB): | 52232264.51 |
---|---|---|---|
Prob(Q): | 1.00 | Prob(JB): | 0.00 |
Heteroskedasticity (H): | 4.34 | Skew: | -15.42 |
Prob(H) (two-sided): | 0.00 | Kurtosis: | 535.45 |
SARIMAX stands for Seasonal AutoRegressive Integrated Moving Average with eXogenous variables. This model is commonly used for time series with seasonal patterns and/or exogenous regressors. The parts of the summary are explained below:
Dep. Variable: y: the dependent (response) variable is 'y'.
No. Observations: 4408: the dataset contains 4408 observations.
Model: SARIMAX(1, 1, 2): a SARIMAX model with parameters (p=1, d=1, q=2) was used. Here p=1 is the order of the autoregressive term, d=1 is the order of differencing (used to make the series stationary), and q=2 is the order of the moving-average term.
Log Likelihood: the log-likelihood, a measure of model fit; the higher the value, the better the fit.
AIC: the Akaike information criterion, a goodness-of-fit measure that accounts for model complexity. The lower the AIC, the better the model (a quick arithmetic check follows this list).
BIC: the Bayesian information criterion, similar to AIC but with a stricter penalty on model complexity. The lower the BIC, the better the model.
HQIC: the Hannan-Quinn information criterion, another measure of model fit, intermediate between AIC and BIC.
Covariance Type: opg: the covariance matrix is estimated via the outer product of gradients ('opg').
coef: the estimated model parameters. For example, the coefficient of ar.L1 (the autoregressive term) is 0.7859.
std err: the standard error of each parameter estimate. For example, the standard error of ar.L1 is 0.155.
z: the z-score of each parameter estimate.
P>|z|: the p-value of each parameter estimate. The smaller the p-value, the more significant the parameter.
[0.025 0.975]: the 95% confidence interval of each parameter estimate.
sigma2: the residual variance of the model.
Ljung-Box, Jarque-Bera: diagnostic tests on the residuals, checking for autocorrelation (Ljung-Box) and normality (Jarque-Bera).
Heteroskedasticity (H): a test for heteroskedasticity; an H value far from 1 may indicate heteroskedastic residuals.
Skew: the skewness of the residuals.
Kurtosis: the kurtosis of the residuals.
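As a quick sanity check on the reported numbers (a sketch, assuming k = 4 estimated parameters: ar.L1, ma.L1, ma.L2, and sigma2):
# AIC = 2k - 2*log-likelihood; with k = 4 and the reported log-likelihood
# of -19328.835 this reproduces the AIC shown in the summary.
k, log_likelihood = 4, -19328.835
print(2 * k - 2 * log_likelihood)  # 38665.67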
df_valid[["VWAP", "Forecast_ARIMAX"]].plot(figsize=(14,7))
<Axes: xlabel='Date'>
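mean_absolute_error and mean_squared_error were imported at the top but never used; a minimal sketch that puts numbers on the validation fit:
# Quantify the validation error of the ARIMAX forecast
mae = mean_absolute_error(df_valid["VWAP"], df_valid["Forecast_ARIMAX"])
rmse = np.sqrt(mean_squared_error(df_valid["VWAP"], df_valid["Forecast_ARIMAX"]))
print(f"MAE:  {mae:.2f}")
print(f"RMSE: {rmse:.2f}")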