In [18]:
import warnings 
import matplotlib.pyplot as plt 
import pandas as pd 
plt.style.use("ggplot")
warnings.simplefilter("ignore")  # suppress library warnings in the notebook output
In [19]:
plt.rcParams['figure.figsize'] = (12,8)

Task 2: Exploratory Data Analysis

In [20]:
hr=pd.read_csv('employee_data.csv')
hr.head()
Out[20]:
   satisfaction_level  last_evaluation  number_project  average_montly_hours  time_spend_company  Work_accident  quit  promotion_last_5years department  salary
0                0.38             0.53               2                   157                   3              0     1                    0.0      sales     low
1                0.80             0.86               5                   262                   6              0     1                    0.0      sales  medium
2                0.11             0.88               7                   272                   4              0     1                    0.0      sales  medium
3                0.72             0.87               5                   223                   5              0     1                    0.0      sales     low
4                0.37             0.52               2                   159                   3              0     1                    0.0      sales     low
In [21]:
# Optional: automated EDA report via pandas-profiling (ydata-profiling);
# uncomment if the package is installed
# hr.profile_report(title="DATA REPORT")
In [22]:
pd.crosstab(hr.salary, hr.quit).plot(kind='bar')
plt.title('Turnover Frequency by Salary Bracket')
plt.xlabel('Salary')
plt.ylabel('Frequency of turnover')
plt.show()
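Because the salary brackets contain very different numbers of employees, raw counts can be misleading; a normalized view (a minimal sketch, not one of the original cells) makes the turnover rates directly comparable:

# Share of leavers within each salary bracket (each bar sums to 1)
pd.crosstab(hr.salary, hr.quit, normalize='index').plot(kind='bar', stacked=True)
plt.title('Turnover Rate by Salary Bracket')
plt.xlabel('Salary')
plt.ylabel('Proportion')
plt.show()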
In [23]:
pd.crosstab(hr.department, hr.quit).plot(kind='bar')
plt.title('Turnover Frequency by Department')
plt.xlabel('Department')
plt.ylabel('Frequency of turnover')
plt.show()

Task 3: Encode Categorical Features

In [24]:
cat_vars = ['department', 'salary']
for col in cat_vars:
    # One-hot encode each categorical column and append the dummy columns
    dummies = pd.get_dummies(hr[col], prefix=col)
    hr = hr.join(dummies)
In [25]:
hr.head()
Out[25]:
   satisfaction_level  last_evaluation  number_project  average_montly_hours  time_spend_company  Work_accident  quit  promotion_last_5years department  salary  ...  department_hr  department_management  department_marketing  department_product_mng  department_sales  department_support  department_technical  salary_high  salary_low  salary_medium
0                0.38             0.53               2                   157                   3              0     1                    0.0      sales     low  ...              0                      0                     0                       0                 1                   0                     0            0           1              0
1                0.80             0.86               5                   262                   6              0     1                    0.0      sales  medium  ...              0                      0                     0                       0                 1                   0                     0            0           0              1
2                0.11             0.88               7                   272                   4              0     1                    0.0      sales  medium  ...              0                      0                     0                       0                 1                   0                     0            0           0              1
3                0.72             0.87               5                   223                   5              0     1                    0.0      sales     low  ...              0                      0                     0                       0                 1                   0                     0            0           1              0
4                0.37             0.52               2                   159                   3              0     1                    0.0      sales     low  ...              0                      0                     0                       0                 1                   0                     0            0           1              0

5 rows × 23 columns

In [26]:
hr.drop(columns=['department', 'salary'], inplace=True)  # originals are now one-hot encoded
hr.dropna(inplace=True)
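As an aside, the encoding loop and the drop above can be collapsed into a single call, since pd.get_dummies can replace the listed columns directly; a hypothetical equivalent on a fresh copy of the data:

# One-step alternative: encode 'department' and 'salary' and drop the originals
hr_alt = pd.get_dummies(pd.read_csv('employee_data.csv'), columns=cat_vars).dropna()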

Task 4: Visualize Class Imbalance

In [27]:
from yellowbrick.target import ClassBalance
plt.style.use("ggplot")
plt.rcParams['figure.figsize'] = (12,8)
In [28]:
visualizer = ClassBalance(labels=['stayed', 'quit']).fit(hr.quit)
visualizer.show()
Out[28]:
<Axes: title={'center': 'Class Balance for 11,581 Instances'}, ylabel='support'>

Task 5: Create Training and Test Sets

In [29]:
x = hr.loc[:, hr.columns != 'quit']
y = hr.quit
In [30]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0, test_size=0.2, stratify=y)
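A quick sanity check (a sketch) that stratify=y preserved the class ratio across the two splits:

# With stratification, train and test should show (nearly) identical class proportions
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))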

Build an Interactive Decision Tree Classifier

Supervised learning:

  • The inputs are random variables $X = X_1, \ldots, X_p$;
  • The output is a random variable $Y$.
  • The data is a finite learning set $$\mathbb{L}=\{(x_i, y_i) \mid i = 0, \ldots, N-1\},$$ where $x_i \in \mathcal{X} = \mathcal{X}_1 \times \cdots \times \mathcal{X}_p$ and $y_i \in \mathcal{Y}$ are drawn at random from $P_{X,Y}$.

For example, $(x_i, y_i) = ((\text{salary = low, department = sales}, \ldots), \text{quit} = 1)$

  • The goal is to find a model $\varphi_\mathbb{L}: \mathcal{X} \mapsto \mathcal{Y}$ that minimizes the generalization error $$\text{Err}(\varphi_\mathbb{L}) = \mathbb{E}_{X,Y}\{L(Y, \varphi_\mathbb{L}(X))\}$$ (a sketch of its empirical estimate follows).
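Under 0-1 loss, $\text{Err}$ is estimated by the misclassification rate on held-out data, i.e. one minus the test accuracy. A minimal sketch of that estimate for any fitted model:

import numpy as np

# Empirical estimate of Err(phi) under 0-1 loss: the mean of L(y_i, phi(x_i))
def empirical_error(model, x, y):
    # equivalent to 1 - accuracy_score(y, model.predict(x))
    return np.mean(model.predict(x) != y)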

About decision trees:

  • Decision trees are non-parametric models that can capture arbitrarily complex relationships between inputs and outputs, without any prior assumptions;

  • Decision trees handle both numerical and categorical variables;

  • They perform implicit feature selection, which makes them robust (up to a point) to noisy features;

  • They are robust to outliers and label errors;

  • They are easy to interpret, even for non-machine-learning practitioners.

Decision trees partition the feature space:

  • Decision trees generally have low bias but high variance.
  • We will tackle the high-variance problem; the sketch below illustrates it.
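This sketch (an illustration, not one of the original cells) makes the bias-variance trade-off concrete: as max_depth grows, training accuracy approaches 1.0 while the gap to test accuracy widens.

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Deeper trees drive training error toward zero (low bias), but the
# widening train/test gap signals growing variance
for depth in [2, 5, 10, 20]:
    m = DecisionTreeClassifier(max_depth=depth, random_state=0).fit(x_train, y_train)
    print(f'depth={depth:2d}  train={accuracy_score(y_train, m.predict(x_train)):.3f}  '
          f'test={accuracy_score(y_test, m.predict(x_test)):.3f}')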
In [31]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score  
from graphviz import Source
from IPython.display import display
from ipywidgets import interactive, IntSlider, FloatSlider, interact 
from IPython.display import Image 
In [32]:
@interact
def plot_tree(crit=['gini','entropy'],
              split=['best','random'],
              depth=IntSlider(min=1,max=30,value=2, continuous_update=False),
              min_split=IntSlider(min=2,max=5,value=2, continuous_update=False),
              min_leaf=IntSlider(min=1,max=5,value=1, continuous_update=False)):
    estimator=DecisionTreeClassifier(random_state=0,
                                    criterion=crit,
                                    splitter=split,
                                    max_depth=depth,
                                    min_samples_split=min_split,
                                    min_samples_leaf=min_leaf)
    estimator.fit(x_train,y_train)
    print('Decision Tree Training Accuracy: {:.3f}'.format(accuracy_score(y_train,estimator.predict(x_train))))
    print('Decision Tree Testing Accuracy: {:.3f}'.format(accuracy_score(y_test,estimator.predict(x_test))))
    
    graph = Source(tree.export_graphviz(estimator, out_file=None,
                                        feature_names=x_train.columns,
                                        class_names=['stayed', 'quit'],
                                        filled=True))
    display(Image(data=graph.pipe(format='png')))
    return estimator

Task 8: Build an Interactive Random Forest Classifier

Although randomization increases bias, it makes it possible to reduce the overall variance of the model. Random forests are among the most powerful machine learning algorithms for a wide range of problems.

  • Randomization and averaging reduce variance and improve accuracy;
  • The implementation is readily parallelizable;
  • Memory consumption and training time can be reduced through bootstrapping;
  • Sampling features, and not only examples, is crucial for improving accuracy (see the sketch after this list).
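To see the variance reduction from averaging, a minimal sketch comparing one fully grown tree with a 100-tree forest on the same split:

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Averaging many randomized trees typically generalizes better than a single unpruned tree
single = DecisionTreeClassifier(random_state=1).fit(x_train, y_train)
forest = RandomForestClassifier(n_estimators=100, random_state=1, n_jobs=-1).fit(x_train, y_train)
print('single tree test accuracy :', accuracy_score(y_test, single.predict(x_test)))
print('random forest test accuracy:', accuracy_score(y_test, forest.predict(x_test)))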
In [33]:
@interact
def plot_tree_rf(crit=['gini', 'entropy'],
                 bootstrap=['True', 'False'],  # Keep the interact options as strings
                 depth=IntSlider(min=1, max=30, value=3, continuous_update=False),
                 forests=IntSlider(min=1, max=200, value=100, continuous_update=False),
                 min_split=IntSlider(min=2, max=5, value=2, continuous_update=False),
                 min_leaf=IntSlider(min=1, max=5, value=1, continuous_update=False)):
    
    # Convert the bootstrap parameter from string to boolean
    bootstrap_bool = bootstrap == 'True'

    estimator = RandomForestClassifier(random_state=1,
                                       criterion=crit,
                                       bootstrap=bootstrap_bool,  # Use the converted boolean value
                                       n_estimators=forests,
                                       max_depth=depth,
                                       min_samples_split=min_split,
                                       min_samples_leaf=min_leaf,
                                       n_jobs=-1,
                                       verbose=False)
    estimator.fit(x_train, y_train)

    print('Random Forests Training Accuracy: {:.3f}'.format(accuracy_score(y_train, estimator.predict(x_train))))
    print('Random Forests Testing Accuracy: {:.3f}'.format(accuracy_score(y_test, estimator.predict(x_test))))
    first_tree = estimator.estimators_[0]  # inspect the first tree in the ensemble
    print('Visualizing Tree: ', 0)
    graph = Source(tree.export_graphviz(first_tree,
                                        out_file=None,
                                        feature_names=x_train.columns,
                                        class_names=['stayed', 'quit'],
                                        filled=True))
    display(Image(data=graph.pipe(format='png')))
    return estimator

Feature Importances and Evaluation Metrics

In [34]:
from yellowbrick.model_selection import FeatureImportances
plt.rcParams['figure.figsize'] = (12,8)
plt.style.use("ggplot")
In [37]:
rf = RandomForestClassifier(bootstrap=True,
                            class_weight=None,
                            criterion='gini',
                            max_depth=5,
                            max_features='sqrt',  # 'auto' was removed in scikit-learn 1.3; 'sqrt' is the classifier default
                            max_leaf_nodes=None,
                            min_impurity_decrease=0.0,
                            min_samples_leaf=1, 
                            min_samples_split=2,
                            min_weight_fraction_leaf=0.0, 
                            n_estimators=100, 
                            n_jobs=-1,
                            oob_score=False, 
                            random_state=1, 
                            verbose=False,
                            warm_start=False)

viz = FeatureImportances(rf)
viz.fit(x_train, y_train)
viz.show();
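The same ranking can also be read directly from the fitted estimator (viz.fit above fits the wrapped model); a sketch:

# Impurity-based importances from the fitted forest, largest first
importances = pd.Series(rf.feature_importances_, index=x_train.columns)
print(importances.sort_values(ascending=False).head(10))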
In [40]:
dt = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                            max_features=None, max_leaf_nodes=None,
                            min_impurity_decrease=0.0,
                            min_samples_leaf=1, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, random_state=0,
                            splitter='best')

viz = FeatureImportances(dt)
viz.fit(x_train, y_train)
viz.show();
In [39]:
from yellowbrick.classifier import ROCAUC

visualizer = ROCAUC(rf, classes=['stayed', 'quit'])
visualizer.fit(x_train, y_train)
visualizer.score(x_test, y_test)
visualizer.show()
Out[39]:
<Axes: title={'center': 'ROC Curves for RandomForestClassifier'}, xlabel='False Positive Rate', ylabel='True Positive Rate'>
In [41]:
from yellowbrick.classifier import ROCAUC

visualizer = ROCAUC(dt, classes=['stayed', 'quit'])
visualizer.fit(x_train, y_train)
visualizer.score(x_test, y_test)
visualizer.show()
Out[41]:
<Axes: title={'center': 'ROC Curves for DecisionTreeClassifier'}, xlabel='False Positive Rate', ylabel='True Positive Rate'>
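The AUC summary can also be computed without a plot, using the positive-class probabilities of each fitted model; a minimal sketch:

from sklearn.metrics import roc_auc_score

# Area under the ROC curve from predicted probabilities for class 'quit' (=1)
print('Random Forest AUC:', roc_auc_score(y_test, rf.predict_proba(x_test)[:, 1]))
print('Decision Tree AUC:', roc_auc_score(y_test, dt.predict_proba(x_test)[:, 1]))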