In [1]:
import matplotlib as mpl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import seasonal_decompose
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('nflx_2014_2023.csv')
df.head(5)
Out[1]:
| date | open | high | low | close | volume | rsi_7 | rsi_14 | cci_7 | cci_14 | sma_50 | ema_50 | sma_100 | ema_100 | macd | bollinger | TrueRange | atr_7 | atr_14 | next_day_close | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-01-02 | 52.401428 | 52.511429 | 51.542858 | 51.831429 | 12325600 | 34.729664 | 49.183584 | -89.573201 | -131.288579 | 50.112828 | 50.235157 | 46.385428 | 46.650698 | 0.751929 | 52.607357 | 1.052857 | 1.161182 | 1.247748 | 51.871429 |
| 1 | 2014-01-03 | 52.000000 | 52.495712 | 51.842857 | 51.871429 | 10817100 | 35.587886 | 49.457208 | -65.820581 | -103.026189 | 50.228771 | 50.299327 | 46.537571 | 46.754726 | 0.624259 | 52.656143 | 0.664283 | 1.090197 | 1.206072 | 51.367142 |
| 2 | 2014-01-06 | 51.889999 | 52.044285 | 50.475716 | 51.367142 | 15501500 | 29.820674 | 46.087900 | -121.472559 | -139.640566 | 50.312571 | 50.341203 | 46.680971 | 46.846621 | 0.476890 | 52.666928 | 1.568569 | 1.158535 | 1.231965 | 48.500000 |
| 3 | 2014-01-07 | 49.684284 | 49.698570 | 48.152859 | 48.500000 | 36167600 | 14.371863 | 32.522091 | -206.762171 | -238.029120 | 50.336228 | 50.268997 | 46.791957 | 46.879558 | 0.127277 | 52.560214 | 3.214283 | 1.452214 | 1.373559 | 48.712856 |
| 4 | 2014-01-08 | 48.104286 | 49.425713 | 48.074287 | 48.712856 | 20001100 | 18.049045 | 34.073549 | -117.836707 | -180.766801 | 50.373257 | 50.207969 | 46.917071 | 46.916075 | -0.131106 | 52.455357 | 1.351426 | 1.437815 | 1.371978 | 48.150002 |
In [2]:
df.shape
df.info()
df.describe().T
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2516 entries, 0 to 2515 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 2516 non-null object 1 open 2516 non-null float64 2 high 2516 non-null float64 3 low 2516 non-null float64 4 close 2516 non-null float64 5 volume 2516 non-null int64 6 rsi_7 2516 non-null float64 7 rsi_14 2516 non-null float64 8 cci_7 2516 non-null float64 9 cci_14 2516 non-null float64 10 sma_50 2516 non-null float64 11 ema_50 2516 non-null float64 12 sma_100 2516 non-null float64 13 ema_100 2516 non-null float64 14 macd 2516 non-null float64 15 bollinger 2516 non-null float64 16 TrueRange 2516 non-null float64 17 atr_7 2516 non-null float64 18 atr_14 2516 non-null float64 19 next_day_close 2516 non-null float64 dtypes: float64(18), int64(1), object(1) memory usage: 393.2+ KB
Out[2]:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| open | 2516.0 | 2.744558e+02 | 1.660051e+02 | 4.460571e+01 | 1.099825e+02 | 2.880000e+02 | 3.845425e+02 | 6.923500e+02 |
| high | 2516.0 | 2.786389e+02 | 1.682052e+02 | 4.584286e+01 | 1.118875e+02 | 2.926900e+02 | 3.913175e+02 | 7.009900e+02 |
| low | 2516.0 | 2.701246e+02 | 1.636123e+02 | 4.278571e+01 | 1.071175e+02 | 2.826600e+02 | 3.777950e+02 | 6.860900e+02 |
| close | 2516.0 | 2.744870e+02 | 1.659030e+02 | 4.488714e+01 | 1.100650e+02 | 2.882300e+02 | 3.845600e+02 | 6.916900e+02 |
| volume | 2516.0 | 1.049230e+07 | 9.173072e+06 | 1.144000e+06 | 5.017050e+06 | 7.795950e+06 | 1.299060e+07 | 1.333875e+08 |
| rsi_7 | 2516.0 | 5.339958e+01 | 1.776386e+01 | 4.374756e+00 | 4.001592e+01 | 5.395392e+01 | 6.715112e+01 | 9.630571e+01 |
| rsi_14 | 2516.0 | 5.342408e+01 | 1.308827e+01 | 9.152344e+00 | 4.373117e+01 | 5.366803e+01 | 6.337870e+01 | 9.154787e+01 |
| cci_7 | 2516.0 | 1.192076e+01 | 1.008320e+02 | -2.333333e+02 | -7.456580e+01 | 2.563277e+01 | 9.421276e+01 | 2.333333e+02 |
| cci_14 | 2516.0 | 1.570504e+01 | 1.113966e+02 | -4.240129e+02 | -7.187032e+01 | 2.716595e+01 | 1.003219e+02 | 3.567957e+02 |
| sma_50 | 2516.0 | 2.704167e+02 | 1.644050e+02 | 4.967994e+01 | 1.076279e+02 | 2.860346e+02 | 3.765922e+02 | 6.485930e+02 |
| ema_50 | 2516.0 | 2.705223e+02 | 1.634548e+02 | 4.940977e+01 | 1.059231e+02 | 2.873826e+02 | 3.812841e+02 | 6.427655e+02 |
| sma_100 | 2516.0 | 2.667604e+02 | 1.631895e+02 | 4.638543e+01 | 1.056630e+02 | 2.876342e+02 | 3.657409e+02 | 6.167718e+02 |
| ema_100 | 2516.0 | 2.668134e+02 | 1.610500e+02 | 4.665070e+01 | 1.024905e+02 | 2.913791e+02 | 3.693439e+02 | 6.107501e+02 |
| macd | 2516.0 | 1.151103e+00 | 9.655979e+00 | -5.923381e+01 | -1.929731e+00 | 1.535425e+00 | 5.693881e+00 | 2.434734e+01 |
| bollinger | 2516.0 | 2.728651e+02 | 1.652599e+02 | 4.764357e+01 | 1.066550e+02 | 2.844448e+02 | 3.807084e+02 | 6.721270e+02 |
| TrueRange | 2516.0 | 9.249735e+00 | 8.414224e+00 | 5.999980e-01 | 3.102141e+00 | 7.830006e+00 | 1.271501e+01 | 1.361000e+02 |
| atr_7 | 2516.0 | 9.231142e+00 | 6.141865e+00 | 1.090197e+00 | 3.349633e+00 | 9.445428e+00 | 1.322733e+01 | 3.522181e+01 |
| atr_14 | 2516.0 | 9.206930e+00 | 5.898383e+00 | 1.206072e+00 | 3.351988e+00 | 9.852787e+00 | 1.321154e+01 | 2.780988e+01 |
| next_day_close | 2516.0 | 2.746526e+02 | 1.658886e+02 | 4.488714e+01 | 1.101225e+02 | 2.882850e+02 | 3.848575e+02 | 6.916900e+02 |
In [19]:
#t 典型的股票价格等于最高、最低和收盘价格的平均值
df['typical'] = df[['high', 'low','close']].mean(axis=1)
df_timeseries = df[['date','typical']]
df_timeseries
Out[19]:
| date | typical | |
|---|---|---|
| 0 | 2014-01-02 | 51.961905 |
| 1 | 2014-01-03 | 52.069999 |
| 2 | 2014-01-06 | 51.295714 |
| 3 | 2014-01-07 | 48.783810 |
| 4 | 2014-01-08 | 48.737619 |
| ... | ... | ... |
| 2511 | 2023-12-22 | 489.410004 |
| 2512 | 2023-12-26 | 489.683339 |
| 2513 | 2023-12-27 | 491.686666 |
| 2514 | 2023-12-28 | 490.823344 |
| 2515 | 2023-12-29 | 487.016673 |
2516 rows × 2 columns
In [20]:
df_timeseries['date'] = pd.to_datetime(df_timeseries['date'])
df_timeseries.set_index('date', inplace=True)
plt.plot(df_timeseries)
plt.ylabel('Typical Price')
plt.title('Typical Stock Price Over Time')
plt.show()
In [21]:
# 按月重新抽样数据
monthly_data = df_timeseries.resample('M').mean()
plt.plot(monthly_data)
plt.ylabel('Typical Price')
plt.title('Typical Stock Price Over Time by Month')
plt.show()
In [22]:
# 提取时间序列值
ts_values = monthly_data['typical'].values
# 执行ADF检验
result = adfuller(ts_values)
# 打印测试结果
print('ADF Statistic:', result[0])
print('p-value:', result[1])
print('Critical Values:')
for key, value in result[4].items():
print(f'\t{key}: {value}')
ADF Statistic: -1.538395370579076 p-value: 0.5144394812928306 Critical Values: 1%: -3.4870216863700767 5%: -2.8863625166643136 10%: -2.580009026141913
In [23]:
start_date = "2018-01-01"
end_date = "2023-12-31"
mask = (df_timeseries.index > start_date)
df_timeseries_filtered = df_timeseries[mask]
df_timeseries_filtered
Out[23]:
| typical | |
|---|---|
| date | |
| 2018-01-02 | 199.380000 |
| 2018-01-03 | 204.253337 |
| 2018-01-04 | 205.560003 |
| 2018-01-05 | 208.533335 |
| 2018-01-08 | 210.996668 |
| ... | ... |
| 2023-12-22 | 489.410004 |
| 2023-12-26 | 489.683339 |
| 2023-12-27 | 491.686666 |
| 2023-12-28 | 490.823344 |
| 2023-12-29 | 487.016673 |
1509 rows × 1 columns
In [24]:
# 按月重新抽样数据
monthly_data = df_timeseries.resample('M').mean()
plt.plot(monthly_data)
plt.ylabel('Typical Price')
plt.title('Typical Stock Price Over Time by Month')
plt.show()
In [25]:
# 提取时间序列值
ts_values = monthly_data['typical'].values
# 执行ADF检验
result = adfuller(ts_values)
# 打印测试结果
print('ADF Statistic:', result[0])
print('p-value:', result[1])
print('Critical Values:')
for key, value in result[4].items():
print(f'\t{key}: {value}')
ADF Statistic: -1.538395370579076 p-value: 0.5144394812928306 Critical Values: 1%: -3.4870216863700767 5%: -2.8863625166643136 10%: -2.580009026141913
In [ ]:
In [26]:
# 假设您的时间序列数据存储在名为“ts”的系列中。
# Plot ACF
plot_acf(ts_values_2, lags=15)
plt.title('Autocorrelation Function (ACF)')
plt.show()
In [27]:
# 进行季节性分解
decomposition = seasonal_decompose(df_timeseries_filtered_monthly, model='additive')
plt.figure(figsize=(10, 8))
# 原始时间序列
plt.subplot(411)
plt.plot(df_timeseries_filtered_monthly, label='Original')
plt.legend()
# 趋势组件
plt.subplot(412)
plt.plot(decomposition.trend, label='Trend')
plt.legend()
# 季节性的组件
plt.subplot(413)
plt.plot(decomposition.seasonal, label='Seasonal')
plt.legend()
# 残差分量
plt.subplot(414)
plt.plot(decomposition.resid, label='Residual')
plt.legend()
plt.tight_layout()
plt.show()
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[27], line 3 1 # 进行季节性分解 ----> 3 decomposition = seasonal_decompose(df_timeseries_filtered_monthly, model='additive') 5 plt.figure(figsize=(10, 8)) 7 # 原始时间序列 File D:\Python310\lib\site-packages\statsmodels\tsa\seasonal.py:163, in seasonal_decompose(x, model, filt, period, two_sided, extrapolate_trend) 161 if period is None: 162 if pfreq is not None: --> 163 pfreq = freq_to_period(pfreq) 164 period = pfreq 165 else: File D:\Python310\lib\site-packages\statsmodels\tsa\tsatools.py:822, in freq_to_period(freq) 820 return 24 821 else: # pragma : no cover --> 822 raise ValueError( 823 "freq {} not understood. Please report if you " 824 "think this is in error.".format(freq) 825 ) ValueError: freq ME not understood. Please report if you think this is in error.
In [28]:
ts = df_timeseries_filtered_monthly['typical']
# 定义并拟合SARIMA模型
model = SARIMAX(ts, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
results = model.fit()
# 预测未来值
forecast = results.forecast(steps=12) # 预测未来一年
# 绘制原始数据和预测
plt.plot(ts.index, ts, label='Original Data')
plt.plot(forecast.index, forecast, label='Forecast')
plt.xlabel('Date')
plt.ylabel('Typical Stock Price')
plt.title('Time Series Forecasting with SARIMA')
plt.legend()
plt.show()
In [ ]: