import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('diabetes.csv')
print("Dataset loaded succesfully !!")
Dataset loaded succesfully !!
# First 10 rows of the dataset
df.head(10)
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
5 | 5 | 116 | 74 | 0 | 0 | 25.6 | 0.201 | 30 | 0 |
6 | 3 | 78 | 50 | 32 | 88 | 31.0 | 0.248 | 26 | 1 |
7 | 10 | 115 | 0 | 0 | 0 | 35.3 | 0.134 | 29 | 0 |
8 | 2 | 197 | 70 | 45 | 543 | 30.5 | 0.158 | 53 | 1 |
9 | 8 | 125 | 96 | 0 | 0 | 0.0 | 0.232 | 54 | 1 |
# Number of rows and columns in the dataset
df.shape
(768, 9)
# Statistical summary of the data
df.describe()
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
count | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 |
mean | 3.845052 | 120.894531 | 69.105469 | 20.536458 | 79.799479 | 31.992578 | 0.471876 | 33.240885 | 0.348958 |
std | 3.369578 | 31.972618 | 19.355807 | 15.952218 | 115.244002 | 7.884160 | 0.331329 | 11.760232 | 0.476951 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.078000 | 21.000000 | 0.000000 |
25% | 1.000000 | 99.000000 | 62.000000 | 0.000000 | 0.000000 | 27.300000 | 0.243750 | 24.000000 | 0.000000 |
50% | 3.000000 | 117.000000 | 72.000000 | 23.000000 | 30.500000 | 32.000000 | 0.372500 | 29.000000 | 0.000000 |
75% | 6.000000 | 140.250000 | 80.000000 | 32.000000 | 127.250000 | 36.600000 | 0.626250 | 41.000000 | 1.000000 |
max | 17.000000 | 199.000000 | 122.000000 | 99.000000 | 846.000000 | 67.100000 | 2.420000 | 81.000000 | 1.000000 |
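Note that Glucose, BloodPressure, SkinThickness, Insulin, and BMI all have a minimum of 0, which is physiologically implausible and almost certainly encodes missing values. A minimal sketch (not part of the original workflow) of how one might count, and optionally impute, those zeros:

# Columns where a value of 0 most likely means "missing"
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
print((df[zero_as_missing] == 0).sum())
# Optional fix: treat the zeros as NaN and impute with the column median
# df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)
# df[zero_as_missing] = df[zero_as_missing].fillna(df[zero_as_missing].median())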
# Check the value counts of 'Outcome'
df['Outcome'].value_counts()
0    500
1    268
Name: Outcome, dtype: int64
0 --> non-diabetic
1 --> diabetic
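The classes are imbalanced (500 non-diabetic vs. 268 diabetic). A quick sketch to express that split as proportions:

# Class proportions: roughly 65% non-diabetic vs. 35% diabetic
print(df['Outcome'].value_counts(normalize=True).round(3))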
# Mean of each feature for the two outcomes
df.groupby('Outcome').mean()
Outcome | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age |
---|---|---|---|---|---|---|---|---|
0 | 3.298000 | 109.980000 | 68.184000 | 19.664000 | 68.792000 | 30.304200 | 0.429734 | 31.190000 |
1 | 4.865672 | 141.257463 | 70.824627 | 22.164179 | 100.335821 | 35.142537 | 0.550500 | 37.067164 |
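To make these per-class differences easier to see, here is a small optional sketch (plain pandas/matplotlib, not in the original notebook) that plots the class means side by side:

# Bar chart of feature means per Outcome class; transpose so features sit on the x-axis
df.groupby('Outcome').mean().T.plot(kind='bar', figsize=(10, 4))
plt.ylabel('Mean value')
plt.title('Feature means by Outcome')
plt.show()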
# Separate the features and the label
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
print(X)
[[  6.  148.   72.  ...  33.6    0.627  50. ]
 [  1.   85.   66.  ...  26.6    0.351  31. ]
 [  8.  183.   64.  ...  23.3    0.672  32. ]
 ...
 [  5.  121.   72.  ...  26.2    0.245  30. ]
 [  1.  126.   60.  ...  30.1    0.349  47. ]
 [  1.   93.   70.  ...  30.4    0.315  23. ]]
print(y)
[1 0 1 0 1 0 1 0 1 1 0 1 0 1 1 1 1 1 0 1 0 0 1 1 1 1 1 0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 1 0 0 1 1 1 0 0 0 1 0 0 0 1 1 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0 1 1 1 1 1 0 0 1 1 0 1 0 1 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 1 1 0 1 1 1 1 0 0 0 0 0 1 0 0 1 1 0 0 0 1 1 1 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 1 0 0 0 1 1 1 0 0 1 0 1 0 1 1 0 1 0 0 1 0 1 1 0 0 1 0 1 0 0 1 0 1 0 1 1 1 0 0 1 0 1 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 1 1 0 0 1 0 0 1 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 1 0 0 1 0 1 1 0 1 0 1 0 1 0 1 1 0 0 0 0 1 1 0 1 0 1 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1 1 1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 1 0 1 0 1 0 1 0 1 0 0 1 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 1 1 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 1 0 1 1 1 1 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 0 1 1 0 0 0 1 0 1 1 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 1 1 1 0 0 1 1 1 0 1 0 1 0 1 0 0 0 0 1 0]
# Standardize the features
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
scaled_X = sc.fit_transform(X)
scaled_X
array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,  0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195, -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,  0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964, -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459, -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881, -0.47378505, -0.87137393]])
# Replace X with scaled_X
X = scaled_X
X
array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,  0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195, -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,  0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964, -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459, -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881, -0.47378505, -0.87137393]])
Split the dataset into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2)
# Let's check the shapes
print(X.shape, X_train.shape, X_test.shape)
(768, 8) (614, 8) (154, 8)
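One caveat: the scaler above was fitted on the full dataset before splitting, so the test rows influence the scaling statistics (a mild form of data leakage). A hedged sketch of the leak-free order of operations, using illustrative variable names; the pipelines below avoid the issue anyway because they scale inside the pipeline:

# Illustrative only: split first, then fit the scaler on the training portion
X_raw = df.iloc[:, :-1].values
X_tr, X_te, y_tr, y_te = train_test_split(X_raw, y, test_size=0.2, stratify=y, random_state=2)
sc_split = StandardScaler()
X_tr = sc_split.fit_transform(X_tr)   # statistics learned from the training split only
X_te = sc_split.transform(X_te)       # the same statistics reused on the test split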
Our task is to predict the disease, but there are many machine learning classification algorithms. Rather than picking one of them arbitrarily, we first build a pipeline for each candidate and compare them to find a suitable algorithm.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
# Logistic Regression
pipeline_lr = Pipeline([('scaler1', StandardScaler()),
('pca1', PCA(n_components=2)),
('lr', LogisticRegression())])
# Support Vector Classifier
pipeline_svc = Pipeline([('scaler2', StandardScaler()),
('pca2', PCA(n_components=2)),
('svc', SVC(kernel='linear'))])
# Decision Tree Classifier
pipeline_dt = Pipeline([('scaler3', StandardScaler()),
('pca3', PCA(n_components=2)),
('dt', DecisionTreeClassifier())])
# Random Forest Classifier
pipeline_rf = Pipeline([('scaler4', StandardScaler()),
('pca4', PCA(n_components=2)),
('rf', RandomForestClassifier(n_estimators=200))])
# K-Means (unsupervised; included for comparison, but note that Pipeline.score here returns negative inertia, not accuracy)
pipeline_km = Pipeline([('scaler5', StandardScaler()),
('pca5', PCA(n_components=2)),
('km', KMeans(n_clusters=2, random_state=0))])
# Naive Bayes - GaussianNB
pipeline_gnb = Pipeline([('scaler6', StandardScaler()),
('pca6', PCA(n_components=2)),
('gnb', GaussianNB())])
# XGBoost Classifier
pipeline_xgb = Pipeline([('scaler7', StandardScaler()),
('pca7', PCA(n_components=2)),
('xgb', XGBClassifier())])
# K-Nearest Neighbors
pipeline_knb = Pipeline([('scaler8', StandardScaler()),
('pca8', PCA(n_components=2)),
('knb', KNeighborsClassifier())])
# List of pipelines
pipelines = [pipeline_svc, pipeline_dt, pipeline_gnb, pipeline_km, pipeline_lr, pipeline_xgb, pipeline_knb, pipeline_rf]
best_accuracy = 0.0
best_classifier = 0
best_pipeline = ""
# Names in the same order as the pipelines list above
pipe_dict = {
    0: 'Support Vector Classifier',
    1: 'Decision Tree',
    2: 'Naive Bayes',
    3: 'KMeans',
    4: 'Logistic Regression',
    5: 'XGBoostClassifier',
    6: 'K-Nearest Neighbors',
    7: 'Random Forest Classifier'
}
for pipe in pipelines:
    pipe.fit(X_train, y_train)

for i, model in enumerate(pipelines):
    print('{} Test Accuracy: {}\n'.format(pipe_dict[i], model.score(X_test, y_test)))
Support Vector Classifier Test Accuracy: 0.7012987012987013

Decision Tree Test Accuracy: 0.6168831168831169

Naive Bayes Test Accuracy: 0.6948051948051948

KMeans Test Accuracy: -335.34699890879193

Logistic Regression Test Accuracy: 0.7012987012987013

XGBoostClassifier Test Accuracy: 0.6493506493506493

K-Nearest Neighbors Test Accuracy: 0.6948051948051948

Random Forest Classifier Test Accuracy: 0.6818181818181818
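Note that the KMeans figure is not an accuracy at all: for a pipeline ending in KMeans, .score() returns the negative inertia (sum of squared distances to the cluster centres), hence the large negative number. If you want an accuracy-like number for KMeans, one option (a rough sketch, not from the original notebook) is to map each cluster to the majority class seen in the training data:

from sklearn.metrics import accuracy_score
# Map each KMeans cluster to the majority training class, then score the mapped predictions
train_clusters = pipeline_km.predict(X_train)
cluster_to_class = {c: np.bincount(y_train[train_clusters == c]).argmax()
                    for c in np.unique(train_clusters)}
test_clusters = pipeline_km.predict(X_test)
y_km_pred = np.array([cluster_to_class[c] for c in test_clusters])
print('KMeans (cluster-to-class) accuracy:', accuracy_score(y_test, y_km_pred))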
for i, model in enumerate(pipelines):
    score = model.score(X_test, y_test)
    if score > best_accuracy:
        best_accuracy = score
        best_pipeline = model
        best_classifier = i
print("Classifier with best accuracy: {}".format(pipe_dict[best_classifier]))
Classifier with best accuracy: Support Vector Classifier
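A single 80/20 split is noisy, and two pipelines tie at roughly 0.70 here. A hedged sketch of a more robust comparison using 5-fold cross-validation on the training data (cross_val_score is standard scikit-learn; the name/pipeline pairs below simply mirror the pipelines defined above):

from sklearn.model_selection import cross_val_score
# KMeans is left out because its .score() is negative inertia, not accuracy
for name, pipe in [('Logistic Regression', pipeline_lr), ('SVC', pipeline_svc),
                   ('Decision Tree', pipeline_dt), ('Random Forest', pipeline_rf),
                   ('Naive Bayes', pipeline_gnb), ('XGBoost', pipeline_xgb),
                   ('KNN', pipeline_knb)]:
    scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy')
    print('{}: {:.3f} +/- {:.3f}'.format(name, scores.mean(), scores.std()))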
OK, now we will build the machine learning model. After running the pipelines we will go with Logistic Regression: its pipeline ties for the best test accuracy, and our output is binary (0 and 1), which is exactly the setting logistic regression is designed for.
Logistic Regression
model = LogisticRegression()
model.fit(X_train,y_train)
LogisticRegression()
Let's check the classification report, confusion matrix, and accuracy score for the model above.
# Import the evaluation tools
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
# Predict the classes for the test set
y_pred = model.predict(X_test)
Classification report
print("CLASSIFICATION REPORT: ")
print(classification_report(y_test, y_pred))
CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       0.77      0.89      0.83       100
           1       0.72      0.52      0.60        54

    accuracy                           0.76       154
   macro avg       0.75      0.70      0.72       154
weighted avg       0.75      0.76      0.75       154
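Because the classes are imbalanced, accuracy alone can be misleading; note the recall of only 0.52 for the diabetic class. A small sketch (not in the original) that adds ROC-AUC, computed from the predicted probabilities rather than the hard labels:

from sklearn.metrics import roc_auc_score
# Probability of the positive (diabetic) class is column 1 of predict_proba
y_prob = model.predict_proba(X_test)[:, 1]
print('ROC-AUC:', round(roc_auc_score(y_test, y_prob), 3))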
Accuracy score
# Accuracy score on the training data
X_train_pred = model.predict(X_train)
training_data_accuracy = accuracy_score(y_train, X_train_pred) * 100
print("ACCURACY OF TRAINING DATA: ",training_data_accuracy.round(2))
ACCURACY OF TRAINING DATA: 78.5
# Accuracy score on the test data
X_test_pred = model.predict(X_test)
testing_data_accuracy = accuracy_score(y_test, X_test_pred) * 100
print("ACCURACY OF TESTING DATA: ",testing_data_accuracy.round(2))
ACCURACY OF TESTING DATA: 75.97
Confusion matrix
# Compute the confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Visualize the confusion matrix
import seaborn as sns
sns.heatmap(cm,annot=True)
<Axes: >
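The default heatmap annotates counts in scientific notation and has no axis labels. An optional sketch that makes the matrix easier to read (plain seaborn/matplotlib; the class labels are the 0/1 meanings noted earlier):

# Integer counts (fmt='d') plus labelled axes for the confusion matrix
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Non-diabetic', 'Diabetic'],
            yticklabels=['Non-diabetic', 'Diabetic'])
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()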
# input_data = (4,110,92,0,0,37.6,0.191,30) # not diabetic
input_data = (5,166,72,19,175,25.8,0.587,51) # diabetic
# Convert the input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)
# Reshape the array since we are predicting for a single instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)
# Standardize the input data with the scaler fitted earlier
std_data = sc.transform(input_data_reshaped)
# print(std_data)
prediction = model.predict(std_data)  # the key prediction step
# print(prediction)
if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')
The person is diabetic
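Finally, to reuse the fitted scaler and model outside this notebook, one option (a sketch; the file names are arbitrary) is to persist both with joblib:

import joblib
# Save the fitted scaler and model so new inputs can be preprocessed and scored identically
joblib.dump(sc, 'scaler.joblib')
joblib.dump(model, 'diabetes_lr_model.joblib')
# Later: reload and predict on a new sample
# sc = joblib.load('scaler.joblib')
# model = joblib.load('diabetes_lr_model.joblib')
# model.predict(sc.transform([[5, 166, 72, 19, 175, 25.8, 0.587, 51]]))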