import warnings
import matplotlib.pyplot as plt
import pandas as pd
plt.style.use("ggplot")
warnings.simplefilter("ignore")
plt.rcParams['figure.figsize'] = (12,8)
hr = pd.read_csv('employee_data.csv')
hr.head()
|   | satisfaction_level | last_evaluation | number_project | average_montly_hours | time_spend_company | Work_accident | quit | promotion_last_5years | department | salary |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 1 | 0.0 | sales | low |
| 1 | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 1 | 0.0 | sales | medium |
| 2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 1 | 0.0 | sales | medium |
| 3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 1 | 0.0 | sales | low |
| 4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 1 | 0.0 | sales | low |
# hr.profile_report(title="DATA REPORT")
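Before plotting, a quick sanity check on shape and missing values helps; the float dtype of promotion_last_5years in the preview above hints at NaNs, which is why a dropna() appears later. A minimal sketch (not part of the original notebook):
print(hr.shape)
print(hr.isna().sum())  # promotion_last_5years being float suggests missing values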
pd.crosstab(hr.salary, hr.quit).plot(kind='bar')
plt.title('Turnover Frequency by Salary Bracket')
plt.xlabel('Salary')
plt.ylabel('Frequency of turnover')
plt.show()
pd.crosstab(hr.department, hr.quit).plot(kind='bar')
plt.title('Turnover Frequency by Department')
plt.xlabel('Department')
plt.ylabel('Frequency of turnover')
plt.show()
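Raw counts can mask the turnover *rate* when group sizes differ; normalizing each crosstab row shows the proportion that quit within each bracket (a sketch along the same lines as the plots above):
pd.crosstab(hr.salary, hr.quit, normalize='index').plot(kind='bar', stacked=True)
plt.title('Turnover Proportion by Salary Bracket')
plt.xlabel('Salary')
plt.ylabel('Proportion')
plt.show()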
cat_vars = ['department', 'salary']
for i in cat_vars:
    cat_list = pd.get_dummies(hr[i], prefix=i)
    hr = hr.join(cat_list)
hr.head()
|   | satisfaction_level | last_evaluation | number_project | average_montly_hours | time_spend_company | Work_accident | quit | promotion_last_5years | department | salary | ... | department_hr | department_management | department_marketing | department_product_mng | department_sales | department_support | department_technical | salary_high | salary_low | salary_medium |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 1 | 0.0 | sales | low | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 1 | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 1 | 0.0 | sales | medium | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 1 | 0.0 | sales | medium | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 1 | 0.0 | sales | low | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 1 | 0.0 | sales | low | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
5 rows × 23 columns
hr.drop(columns=['department', 'salary'], inplace=True)  # originals now one-hot encoded; axis is implied by columns=
hr.dropna(inplace=True)
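For reference, the loop-join-drop sequence above can be collapsed into a single get_dummies call (an equivalent sketch, not used by the rest of the notebook):
hr_alt = pd.get_dummies(pd.read_csv('employee_data.csv'),
                        columns=['department', 'salary']).dropna()
print(hr_alt.shape)  # should match hr.shape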
from yellowbrick.target import ClassBalance
plt.style.use("ggplot")
plt.rcParams['figure.figsize'] = (12,8)
visualizer = ClassBalance(labels=['stayed', 'quit']).fit(hr.quit)
visualizer.show()
<Axes: title={'center': 'Class Balance for 11,581 Instances'}, ylabel='support'>
x = hr.loc[:, hr.columns != 'quit']
y = hr.quit
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0, test_size=0.2, stratify=y)
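Because stratify=y was passed, the class ratio should be (nearly) identical in both splits; a quick check:
print(y.value_counts(normalize=True))
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))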
Supervised learning:

For example, $(x_i, y_i) = ((\text{salary=low, department=sales, ...}), \text{quit}=1)$

About decision trees:

- Decision trees are non-parametric models that can capture arbitrarily complex relationships between inputs and outputs, without any prior assumptions.
- Decision trees handle both numeric and categorical variables.
- They perform implicit feature selection, which makes them robust (to an extent) to noisy features.
- They are robust to outliers or errors in the labels.
- They are easy to interpret, even for non-machine-learning practitioners.
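Both split criteria offered by the interactive widget below reduce to simple node-impurity formulas: for class proportions $p_k$, Gini impurity is $1 - \sum_k p_k^2$ and entropy is $-\sum_k p_k \log_2 p_k$. A minimal sketch (not part of the original notebook):
import numpy as np

def gini(p):
    """Gini impurity for a vector of class proportions."""
    p = np.asarray(p, dtype=float)
    return 1.0 - np.sum(p ** 2)

def entropy(p):
    """Shannon entropy (in bits) for a vector of class proportions."""
    p = np.asarray(p, dtype=float)
    p = p[p > 0]  # 0 * log(0) is taken as 0
    return -np.sum(p * np.log2(p))

print(gini([0.5, 0.5]), entropy([0.5, 0.5]))  # maximal impurity: 0.5 and 1.0
print(gini([0.9, 0.1]), entropy([0.9, 0.1]))  # purer node: ~0.18 and ~0.47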
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from graphviz import Source
from IPython.display import display
from ipywidgets import interactive, IntSlider, FloatSlider, interact
from IPython.display import Image
@interact
def plot_tree(crit=['gini', 'entropy'],
              split=['best', 'random'],
              depth=IntSlider(min=1, max=30, value=2, continuous_update=False),
              min_split=IntSlider(min=2, max=5, value=2, continuous_update=False),
              min_leaf=IntSlider(min=1, max=5, value=1, continuous_update=False)):
    estimator = DecisionTreeClassifier(random_state=0,
                                       criterion=crit,
                                       splitter=split,
                                       max_depth=depth,
                                       min_samples_split=min_split,
                                       min_samples_leaf=min_leaf)
    estimator.fit(x_train, y_train)
    print('Decision Tree Training Accuracy: {:.3f}'.format(accuracy_score(y_train, estimator.predict(x_train))))
    print('Decision Tree Testing Accuracy: {:.3f}'.format(accuracy_score(y_test, estimator.predict(x_test))))
    graph = Source(tree.export_graphviz(estimator, out_file=None,
                                        feature_names=x_train.columns,
                                        class_names=['stayed', 'quit'],
                                        filled=True))
    display(Image(data=graph.pipe(format='png')))
    return estimator
Although randomization increases the bias, it can reduce the overall variance. Random forests are among the most powerful machine learning algorithms for a wide range of problems.
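To make the bias-variance point concrete, here is a hedged comparison (not in the original notebook) of a single unpruned tree against a 100-tree forest; one would expect the forest's cross-validation scores to vary less from fold to fold:
from sklearn.model_selection import cross_val_score

for name, model in [('single tree', DecisionTreeClassifier(random_state=0)),
                    ('forest', RandomForestClassifier(n_estimators=100,
                                                      random_state=0,
                                                      n_jobs=-1))]:
    scores = cross_val_score(model, x_train, y_train, cv=5)
    print('{}: mean={:.3f}, std={:.3f}'.format(name, scores.mean(), scores.std()))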
@interact
def plot_tree_rf(crit=['gini', 'entropy'],
                 bootstrap=['True', 'False'],  # interact needs string options
                 depth=IntSlider(min=1, max=30, value=3, continuous_update=False),
                 forests=IntSlider(min=1, max=200, value=100, continuous_update=False),
                 min_split=IntSlider(min=2, max=5, value=2, continuous_update=False),
                 min_leaf=IntSlider(min=1, max=5, value=1, continuous_update=False)):
    # Convert the bootstrap parameter from string to boolean
    bootstrap_bool = bootstrap == 'True'
    estimator = RandomForestClassifier(random_state=1,
                                       criterion=crit,
                                       bootstrap=bootstrap_bool,
                                       n_estimators=forests,
                                       max_depth=depth,
                                       min_samples_split=min_split,
                                       min_samples_leaf=min_leaf,
                                       n_jobs=-1,
                                       verbose=False)
    estimator.fit(x_train, y_train)
    print('Random Forests Training Accuracy: {:.3f}'.format(accuracy_score(y_train, estimator.predict(x_train))))
    print('Random Forests Testing Accuracy: {:.3f}'.format(accuracy_score(y_test, estimator.predict(x_test))))
    num_tree = estimator.estimators_[0]
    print('Visualizing Tree: ', 0)
    graph = Source(tree.export_graphviz(num_tree,
                                        out_file=None,
                                        feature_names=x_train.columns,
                                        class_names=['stayed', 'quit'],
                                        filled=True))
    display(Image(data=graph.pipe(format='png')))
    return estimator
from yellowbrick.model_selection import FeatureImportances
plt.rcParams['figure.figsize'] = (12,8)
plt.style.use("ggplot")
rf = RandomForestClassifier(bootstrap=True,
                            class_weight=None,
                            criterion='gini',
                            max_depth=5,
                            max_features='sqrt',  # 'auto' was removed in scikit-learn 1.3; 'sqrt' is the classifier equivalent
                            max_leaf_nodes=None,
                            min_impurity_decrease=0.0,
                            min_samples_leaf=1,
                            min_samples_split=2,
                            min_weight_fraction_leaf=0.0,
                            n_estimators=100,
                            n_jobs=-1,
                            oob_score=False,
                            random_state=1,
                            verbose=False,
                            warm_start=False)
viz = FeatureImportances(rf)
viz.fit(x_train, y_train)
viz.show();
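FeatureImportances fits the estimator it wraps, so after the call above the ranking can also be read numerically from rf itself (a small sketch):
importances = pd.Series(rf.feature_importances_, index=x_train.columns)
print(importances.sort_values(ascending=False).head(10))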
dt = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                            max_features=None, max_leaf_nodes=None,
                            min_impurity_decrease=0.0,
                            min_samples_leaf=1, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, random_state=0,
                            splitter='best')
viz = FeatureImportances(dt)
viz.fit(x_train, y_train)
viz.show();
from yellowbrick.classifier import ROCAUC
visualizer = ROCAUC(rf, classes=['stayed', 'quit'])
visualizer.fit(x_train, y_train)
visualizer.score(x_test, y_test)
visualizer.show()  # poof() is deprecated in favor of show()
<Axes: title={'center': 'ROC Curves for RandomForestClassifier'}, xlabel='False Positive Rate', ylabel='True Positive Rate'>
visualizer = ROCAUC(dt, classes=['stayed', 'quit'])
visualizer.fit(x_train, y_train)
visualizer.score(x_test, y_test)
visualizer.show()
<Axes: title={'center': 'ROC Curves for DecisionTreeClassifier'}, xlabel='False Positive Rate', ylabel='True Positive Rate'>
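As a cross-check on the plotted curves, the test-set AUC can be computed directly with scikit-learn (a sketch; both models were fitted by the visualizers above):
from sklearn.metrics import roc_auc_score

for name, model in [('random forest', rf), ('decision tree', dt)]:
    auc = roc_auc_score(y_test, model.predict_proba(x_test)[:, 1])
    print('{} test AUC: {:.3f}'.format(name, auc))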