import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')
pd.options.display.max_columns = 999


df = pd.read_csv('hour.csv')
df.head()


# 统计信息
df.describe()


# 数据类型信息
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17379 non-null  int64  
 1   dteday      17379 non-null  object 
 2   season      17379 non-null  int64  
 3   yr          17379 non-null  int64  
 4   mnth        17379 non-null  int64  
 5   hr          17379 non-null  int64  
 6   holiday     17379 non-null  int64  
 7   weekday     17379 non-null  int64  
 8   workingday  17379 non-null  int64  
 9   weathersit  17379 non-null  int64  
 10  temp        17379 non-null  float64
 11  atemp       17379 non-null  float64
 12  hum         17379 non-null  float64
 13  windspeed   17379 non-null  float64
 14  casual      17379 non-null  int64  
 15  registered  17379 non-null  int64  
 16  cnt         17379 non-null  int64  
dtypes: float64(4), int64(12), object(1)
memory usage: 2.3+ MB


# 每个特征中不重复的值
df.apply(lambda x: len(x.unique()))

instant       17379
dteday          731
season            4
yr                2
mnth             12
hr               24
holiday           2
weekday           7
workingday        2
weathersit        4
temp             50
atemp            65
hum              89
windspeed        30
casual          322
registered      776
cnt             869
dtype: int64


# 检查是否有空值
df.isnull().sum()

instant       0
dteday        0
season        0
yr            0
mnth          0
hr            0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64


df = df.rename(columns={'weathersit':'weather',
                       'yr':'year',
                       'mnth':'month',
                       'hr':'hour',
                       'hum':'humidity',
                       'cnt':'count'})
df.head()


df = df.drop(columns=['instant', 'dteday', 'year'])


# 将 int 列更改为类别
cols = ['season','month','hour','holiday','weekday','workingday','weather']

for col in cols:
    df[col] = df[col].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   season      17379 non-null  category
 1   month       17379 non-null  category
 2   hour        17379 non-null  category
 3   holiday     17379 non-null  category
 4   weekday     17379 non-null  category
 5   workingday  17379 non-null  category
 6   weather     17379 non-null  category
 7   temp        17379 non-null  float64 
 8   atemp       17379 non-null  float64 
 9   humidity    17379 non-null  float64 
 10  windspeed   17379 non-null  float64 
 11  casual      17379 non-null  int64   
 12  registered  17379 non-null  int64   
 13  count       17379 non-null  int64   
dtypes: category(7), float64(4), int64(3)
memory usage: 1.0 MB


fig, ax = plt.subplots(figsize=(20,10))
sns.pointplot(data=df, x='hour', y='count', hue='weekday', ax=ax)
ax.set(title='工作日和周末的自行车数量')

[Text(0.5, 1.0, '工作日和周末的自行车数量')]


fig, ax = plt.subplots(figsize=(20,10))
sns.pointplot(data=df, x='hour', y='casual', hue='weekday', ax=ax)
ax.set(title='工作日和周末的自行车数量：未注册用户')

[Text(0.5, 1.0, '工作日和周末的自行车数量：未注册用户')]


fig, ax = plt.subplots(figsize=(20,10))
sns.pointplot(data=df, x='hour', y='registered', hue='weekday', ax=ax)
ax.set(title='工作日和周末的自行车数量：注册用户')

[Text(0.5, 1.0, '工作日和周末的自行车数量：注册用户')]


fig, ax = plt.subplots(figsize=(20,10))
sns.pointplot(data=df, x='hour', y='count', hue='weather', ax=ax)
ax.set(title='不同天气下的自行车数量')

[Text(0.5, 1.0, '不同天气下的自行车数量')]


fig, ax = plt.subplots(figsize=(20,10))
sns.pointplot(data=df, x='hour', y='count', hue='season', ax=ax)
ax.set(title='不同季节下的自行车数量')

[Text(0.5, 1.0, '不同季节下的自行车数量')]


fig, ax = plt.subplots(figsize=(20,10))
sns.barplot(data=df, x='month', y='count', ax=ax)
ax.set(title='不同月份下的自行车数量')

[Text(0.5, 1.0, '不同月份下的自行车数量')]


fig, ax = plt.subplots(figsize=(20,10))
sns.barplot(data=df, x='weekday', y='count', ax=ax)
ax.set(title='不同天的自行车数量')

[Text(0.5, 1.0, '不同天的自行车数量')]


fig, (ax1,ax2) = plt.subplots(ncols=2, figsize=(20,6))
sns.regplot(x=df['temp'], y=df['count'], ax=ax1)
ax1.set(title="气温与用户数量的关系")
sns.regplot(x=df['humidity'], y=df['count'], ax=ax2)
ax2.set(title="湿度与用户数量的关系")

[Text(0.5, 1.0, '湿度与用户数量的关系')]


from statsmodels.graphics.gofplots import qqplot
fig, (ax1,ax2) = plt.subplots(ncols=2, figsize=(20,6))
sns.distplot(df['count'], ax=ax1)
ax1.set(title='用户数量分布')
qqplot(df['count'], ax=ax2, line='s')
ax2.set(title='理论分位数与样本分位数的比较（QQ图）')

[Text(0.5, 1.0, '理论分位数与样本分位数的比较（QQ图）')]


df['count'] = np.log(df['count'])


fig, (ax1,ax2) = plt.subplots(ncols=2, figsize=(20,6))
sns.distplot(df['count'], ax=ax1)
ax1.set(title='Distribution of the users')
qqplot(df['count'], ax=ax2, line='s')
ax2.set(title='Theoritical quantiles')

[Text(0.5, 1.0, 'Theoritical quantiles')]


corr = df.corr()
plt.figure(figsize=(15,10))
sns.heatmap(corr, annot=True, annot_kws={'size':15})

<Axes: >


pd.get_dummies(df['season'], prefix='season', drop_first=True)


df_oh = df

def one_hot_encoding(data, column):
    data = pd.concat([data, pd.get_dummies(data[column], prefix=column, drop_first=True)], axis=1)
    data = data.drop([column], axis=1)
    return data

cols = ['season','month','hour','holiday','weekday','workingday','weather']

for col in cols:
    df_oh = one_hot_encoding(df_oh, col)
df_oh.head()


X = df_oh.drop(columns=['atemp', 'windspeed', 'casual', 'registered', 'count'], axis=1)
y = df_oh['count']


from sklearn.linear_model import LinearRegression, Ridge, HuberRegressor, ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor

models = [LinearRegression(),
         Ridge(),
         HuberRegressor(),
         ElasticNetCV(),
         DecisionTreeRegressor(),
         RandomForestRegressor(),
         ExtraTreesRegressor(),
         GradientBoostingRegressor()]


from sklearn import model_selection
def train(model):
    kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=42) 
    pred = model_selection.cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_squared_error')
    cv_score = pred.mean()
    print('Model:',model)
    print('CV score:', abs(cv_score))


for model in models:
    train(model)

Model: LinearRegression()
CV score: 0.44849511159541205
Model: Ridge()
CV score: 0.4484090089563206
Model: HuberRegressor()
CV score: 0.46596807512124105
Model: ElasticNetCV()
CV score: 0.45614918135359145
Model: DecisionTreeRegressor()
CV score: 0.44255199359646225
Model: RandomForestRegressor()
CV score: 0.23279282002190094
Model: ExtraTreesRegressor()
CV score: 0.23485168754583902
Model: GradientBoostingRegressor()
CV score: 0.35702811006978274


from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


model = RandomForestRegressor()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)


# 绘制误差差
error = y_test - y_pred
fig, ax = plt.subplots()
ax.scatter(y_test, error)
ax.axhline(lw=3, color='black')
ax.set_xlabel('Observed')
ax.set_ylabel('Error')
plt.show()


from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_test, y_pred))

0.48527134611361483

	instant	season	yr	mnth	hr	holiday	weekday	workingday	weathersit	temp	atemp	hum	windspeed	casual	registered	cnt
count	17379.0000	17379.000000	17379.000000	17379.000000	17379.000000	17379.000000	17379.000000	17379.000000	17379.000000	17379.000000	17379.000000	17379.000000	17379.000000	17379.000000	17379.000000	17379.000000
mean	8690.0000	2.501640	0.502561	6.537775	11.546752	0.028770	3.003683	0.682721	1.425283	0.496987	0.475775	0.627229	0.190098	35.676218	153.786869	189.463088
std	5017.0295	1.106918	0.500008	3.438776	6.914405	0.167165	2.005771	0.465431	0.639357	0.192556	0.171850	0.192930	0.122340	49.305030	151.357286	181.387599
min	1.0000	1.000000	0.000000	1.000000	0.000000	0.000000	0.000000	0.000000	1.000000	0.020000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000
25%	4345.5000	2.000000	0.000000	4.000000	6.000000	0.000000	1.000000	0.000000	1.000000	0.340000	0.333300	0.480000	0.104500	4.000000	34.000000	40.000000
50%	8690.0000	3.000000	1.000000	7.000000	12.000000	0.000000	3.000000	1.000000	1.000000	0.500000	0.484800	0.630000	0.194000	17.000000	115.000000	142.000000
75%	13034.5000	3.000000	1.000000	10.000000	18.000000	0.000000	5.000000	1.000000	2.000000	0.660000	0.621200	0.780000	0.253700	48.000000	220.000000	281.000000
max	17379.0000	4.000000	1.000000	12.000000	23.000000	1.000000	6.000000	1.000000	4.000000	1.000000	1.000000	1.000000	0.850700	367.000000	886.000000	977.000000

	season_2	season_3	season_4
0	0	0	0
1	0	0	0
2	0	0	0
3	0	0	0
4	0	0	0
...	...	...	...
17374	0	0	0
17375	0	0	0
17376	0	0	0
17377	0	0	0
17378	0	0	0

	temp	atemp	humidity	casual	registered	count	hour_1	hour_2	hour_3	hour_4	weekday_6
0	0.24	0.2879	0.81	3	13	2.772589	0	0	0	0	1
1	0.22	0.2727	0.80	8	32	3.688879	1	0	0	0	1
2	0.22	0.2727	0.80	5	27	3.465736	0	1	0	0	1
3	0.24	0.2879	0.75	3	10	2.564949	0	0	1	0	1
4	0.24	0.2879	0.75	0	1	0.000000	0	0	0	1	1

导入模块¶

加载数据集¶

预处理数据集¶

探索性数据分析¶

相关矩阵¶

独热编码¶

模型训练¶

	instant	dteday	season	mnth	hr	weekday	weathersit	temp	atemp	hum	casual	registered	cnt
0	1	2011-01-01	1	1	0	6	1	0.24	0.2879	0.81	3	13	16
1	2	2011-01-01	1	1	1	6	1	0.22	0.2727	0.80	8	32	40
2	3	2011-01-01	1	1	2	6	1	0.22	0.2727	0.80	5	27	32
3	4	2011-01-01	1	1	3	6	1	0.24	0.2879	0.75	3	10	13
4	5	2011-01-01	1	1	4	6	1	0.24	0.2879	0.75	0	1	1