The dataset used in this project is available from the UCI Machine Learning Repository.
import pandas as pd
import numpy as np
import scipy as sp
import math as mt
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.impute import SimpleImputer
Checking the dataset for null values
df = pd.read_csv("arrhythmia.csv", header=None)
# The raw file marks missing values with '?'; convert them to NaN so every column is numeric
df = df.replace('?', np.nan).astype(float)
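To see where those nulls are concentrated, we can count the missing values per column; a minimal check using the standard pandas API:
# Count the missing values in each column and show the worst offenders
missing_counts = df.isnull().sum()
print(missing_counts[missing_counts > 0].sort_values(ascending=False))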
Column 13 contains more than 350 missing values out of 452 instances in total, so we will drop it. The other attributes have relatively few nulls, so instead of dropping them we will replace their null values with the attribute's mean.
# Drop column 13
df.drop(columns = 13, inplace=True)
Using the mean imputation strategy
# Make a copy to avoid changing the original data when imputing
new_df = df.copy()
# Record which columns contain missing values (these are the ones that will be imputed)
cols_with_missing = [col for col in new_df.columns if new_df[col].isnull().any()]
# Impute missing values with the column mean
my_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
new_df = pd.DataFrame(my_imputer.fit_transform(new_df))
new_df.columns = df.columns
# The imputed dataframe
new_df.head()
|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 75.0 | 0.0 | 190.0 | 80.0 | 91.0 | 193.0 | 371.0 | 174.0 | 121.0 | -16.0 | ... | 0.0 | 9.0 | -0.9 | 0.0 | 0.0 | 0.9 | 2.9 | 23.3 | 49.4 | 8.0 |
| 1 | 56.0 | 1.0 | 165.0 | 64.0 | 81.0 | 174.0 | 401.0 | 149.0 | 39.0 | 25.0 | ... | 0.0 | 8.5 | 0.0 | 0.0 | 0.0 | 0.2 | 2.1 | 20.4 | 38.8 | 6.0 |
| 2 | 54.0 | 0.0 | 172.0 | 95.0 | 138.0 | 163.0 | 386.0 | 185.0 | 102.0 | 96.0 | ... | 0.0 | 9.5 | -2.4 | 0.0 | 0.0 | 0.3 | 3.4 | 12.3 | 49.0 | 10.0 |
| 3 | 55.0 | 0.0 | 175.0 | 94.0 | 100.0 | 202.0 | 380.0 | 179.0 | 143.0 | 28.0 | ... | 0.0 | 12.2 | -2.2 | 0.0 | 0.0 | 0.4 | 2.6 | 34.6 | 61.6 | 1.0 |
| 4 | 75.0 | 0.0 | 190.0 | 80.0 | 88.0 | 181.0 | 360.0 | 177.0 | 103.0 | -16.0 | ... | 0.0 | 13.1 | -3.6 | 0.0 | 0.0 | -0.1 | 3.9 | 25.4 | 62.8 | 7.0 |
5 rows × 279 columns
# The dataset now contains zero null values.
pd.isnull(new_df).sum().sum()
0
Generating the final dataset
# Create the column names
final_df_columns=["Age","Sex","Height","Weight","QRS_Dur",
"P-R_Int","Q-T_Int","T_Int","P_Int","QRS","T","P","J","Heart_Rate",
"Q_Wave","R_Wave","S_Wave","R'_Wave","S'_Wave","Int_Def","Rag_R_Nom",
"Diph_R_Nom","Rag_P_Nom","Diph_P_Nom","Rag_T_Nom","Diph_T_Nom",
"DII00", "DII01","DII02", "DII03", "DII04","DII05","DII06","DII07","DII08","DII09","DII10","DII11",
"DIII00","DIII01","DIII02", "DIII03", "DIII04","DIII05","DIII06","DIII07","DIII08","DIII09","DIII10","DIII11",
"AVR00","AVR01","AVR02","AVR03","AVR04","AVR05","AVR06","AVR07","AVR08","AVR09","AVR10","AVR11",
"AVL00","AVL01","AVL02","AVL03","AVL04","AVL05","AVL06","AVL07","AVL08","AVL09","AVL10","AVL11",
"AVF00","AVF01","AVF02","AVF03","AVF04","AVF05","AVF06","AVF07","AVF08","AVF09","AVF10","AVF11",
"V100","V101","V102","V103","V104","V105","V106","V107","V108","V109","V110","V111",
"V200","V201","V202","V203","V204","V205","V206","V207","V208","V209","V210","V211",
"V300","V301","V302","V303","V304","V305","V306","V307","V308","V309","V310","V311",
"V400","V401","V402","V403","V404","V405","V406","V407","V408","V409","V410","V411",
"V500","V501","V502","V503","V504","V505","V506","V507","V508","V509","V510","V511",
"V600","V601","V602","V603","V604","V605","V606","V607","V608","V609","V610","V611",
"JJ_Wave","Amp_Q_Wave","Amp_R_Wave","Amp_S_Wave","R_Prime_Wave","S_Prime_Wave","P_Wave","T_Wave",
"QRSA","QRSTA","DII170","DII171","DII172","DII173","DII174","DII175","DII176","DII177","DII178","DII179",
"DIII180","DIII181","DIII182","DIII183","DIII184","DIII185","DIII186","DIII187","DIII188","DIII189",
"AVR190","AVR191","AVR192","AVR193","AVR194","AVR195","AVR196","AVR197","AVR198","AVR199",
"AVL200","AVL201","AVL202","AVL203","AVL204","AVL205","AVL206","AVL207","AVL208","AVL209",
"AVF210","AVF211","AVF212","AVF213","AVF214","AVF215","AVF216","AVF217","AVF218","AVF219",
"V1220","V1221","V1222","V1223","V1224","V1225","V1226","V1227","V1228","V1229",
"V2230","V2231","V2232","V2233","V2234","V2235","V2236","V2237","V2238","V2239",
"V3240","V3241","V3242","V3243","V3244","V3245","V3246","V3247","V3248","V3249",
"V4250","V4251","V4252","V4253","V4254","V4255","V4256","V4257","V4258","V4259",
"V5260","V5261","V5262","V5263","V5264","V5265","V5266","V5267","V5268","V5269",
"V6270","V6271","V6272","V6273","V6274","V6275","V6276","V6277","V6278","V6279","class"]
# Add the column names to the dataset
new_df.columns=final_df_columns
new_df.to_csv("new data with target class.csv")
new_df.head()
|   | Age | Sex | Height | Weight | QRS_Dur | P-R_Int | Q-T_Int | T_Int | P_Int | QRS | ... | V6271 | V6272 | V6273 | V6274 | V6275 | V6276 | V6277 | V6278 | V6279 | class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 75.0 | 0.0 | 190.0 | 80.0 | 91.0 | 193.0 | 371.0 | 174.0 | 121.0 | -16.0 | ... | 0.0 | 9.0 | -0.9 | 0.0 | 0.0 | 0.9 | 2.9 | 23.3 | 49.4 | 8.0 |
| 1 | 56.0 | 1.0 | 165.0 | 64.0 | 81.0 | 174.0 | 401.0 | 149.0 | 39.0 | 25.0 | ... | 0.0 | 8.5 | 0.0 | 0.0 | 0.0 | 0.2 | 2.1 | 20.4 | 38.8 | 6.0 |
| 2 | 54.0 | 0.0 | 172.0 | 95.0 | 138.0 | 163.0 | 386.0 | 185.0 | 102.0 | 96.0 | ... | 0.0 | 9.5 | -2.4 | 0.0 | 0.0 | 0.3 | 3.4 | 12.3 | 49.0 | 10.0 |
| 3 | 55.0 | 0.0 | 175.0 | 94.0 | 100.0 | 202.0 | 380.0 | 179.0 | 143.0 | 28.0 | ... | 0.0 | 12.2 | -2.2 | 0.0 | 0.0 | 0.4 | 2.6 | 34.6 | 61.6 | 1.0 |
| 4 | 75.0 | 0.0 | 190.0 | 80.0 | 88.0 | 181.0 | 360.0 | 177.0 | 103.0 | -16.0 | ... | 0.0 | 13.1 | -3.6 | 0.0 | 0.0 | -0.1 | 3.9 | 25.4 | 62.8 | 7.0 |
5 rows × 279 columns
Now that our dataframe is fully cleaned and preprocessed, we separate out the target attribute and store the final feature dataframe.
target=new_df["class"]
final_df = new_df.drop(columns ="class")
We will use 80% of the dataset for training and 20% for testing.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(final_df, target ,test_size=0.2, random_state=1)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
import warnings
warnings.filterwarnings('ignore')
Since the dependent variable is categorical, we will use classification models. A good evaluation strategy for classification here is to compare precision and recall, not accuracy alone. Considering the stakes of our model's predictions, we cannot accept a result that tells a person who actually has arrhythmia that they are healthy (a false negative).
We will therefore focus on sensitivity (the percentage of sick people who are correctly identified as having the condition) rather than specificity (the percentage of healthy people who are correctly identified as not having it).
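As a minimal, self-contained illustration of how these quantities are computed (the labels below are made-up placeholders, not model output):
from sklearn.metrics import recall_score, confusion_matrix
# Placeholder labels, for illustration only
y_true_demo = [1, 1, 2, 2, 3]
y_pred_demo = [1, 2, 2, 2, 3]
# Weighted recall averages per-class sensitivity, weighted by each class's support
print(recall_score(y_true_demo, y_pred_demo, average='weighted'))  # 0.8
print(confusion_matrix(y_true_demo, y_pred_demo))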
# Import the evaluation metrics.
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
# This dataframe will store the results of each model.
result = pd.DataFrame(columns=['Model','Train Accuracy','Test Accuracy'])
from sklearn.neighbors import KNeighborsClassifier
knnclassifier = KNeighborsClassifier()
knnclassifier.fit(X_train, y_train)
y_pred = knnclassifier.predict(X_test)
knn_train_accuracy = accuracy_score(y_train, knnclassifier.predict(X_train))
knn_test_accuracy = accuracy_score(y_test, knnclassifier.predict(X_test))
result = pd.concat([result, pd.DataFrame([{'Model': 'KNN Classifier', 'Train Accuracy': knn_train_accuracy, 'Test Accuracy': knn_test_accuracy}])], ignore_index=True)
result
|   | Model | Train Accuracy | Test Accuracy |
|---|---|---|---|
| 0 | KNN Classifier | 0.650970 | 0.648352 |
from sklearn.linear_model import LogisticRegression
lgclassifier = LogisticRegression(solver = 'saga',random_state = 0)
lgclassifier.fit(X_train, y_train)
y_pred = lgclassifier.predict(X_test)
lg_train_recall = recall_score(y_train, lgclassifier.predict(X_train),average='weighted')
lg_test_recall = recall_score(y_test, lgclassifier.predict(X_test),average='weighted')
lg_train_accuracy = accuracy_score(y_train, lgclassifier.predict(X_train))
lg_test_accuracy = accuracy_score(y_test, lgclassifier.predict(X_test))
result = pd.concat([result, pd.DataFrame([{'Model': 'Logistic Regression', 'Train Accuracy': lg_train_accuracy, 'Test Accuracy': lg_test_accuracy}])], ignore_index=True)
result
|   | Model | Train Accuracy | Test Accuracy |
|---|---|---|---|
| 0 | KNN Classifier | 0.650970 | 0.648352 |
| 1 | Logistic Regression | 0.939058 | 0.747253 |
from sklearn.tree import DecisionTreeClassifier
dtclassifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0,max_depth=5)
dtclassifier.fit(X_train, y_train)
y_pred_test = dtclassifier.predict(X_test)
y_pred_train = dtclassifier.predict(X_train)
dt_train_accuracy = accuracy_score(y_train,y_pred_train )
dt_test_accuracy = accuracy_score(y_test, y_pred_test)
result = pd.concat([result, pd.DataFrame([{'Model': 'Decision Tree Classifier', 'Train Accuracy': dt_train_accuracy, 'Test Accuracy': dt_test_accuracy}])], ignore_index=True)
result
|   | Model | Train Accuracy | Test Accuracy |
|---|---|---|---|
| 0 | KNN Classifier | 0.650970 | 0.648352 |
| 1 | Logistic Regression | 0.939058 | 0.747253 |
| 2 | Decision Tree Classifier | 0.789474 | 0.681319 |
from sklearn.svm import LinearSVC
lsvclassifier = LinearSVC(C=0.01)
lsvclassifier.fit(X_train, y_train)
y_pred_test = lsvclassifier.predict(X_test)
y_pred_train = lsvclassifier.predict(X_train)
lsvc_train_accuracy_score = accuracy_score(y_train, y_pred_train)
lsvc_test_accuracy_score = accuracy_score(y_test, y_pred_test)
result = pd.concat([result, pd.DataFrame([{'Model': 'Linear SVC', 'Train Accuracy': lsvc_train_accuracy_score, 'Test Accuracy': lsvc_test_accuracy_score}])], ignore_index=True)
result
|   | Model | Train Accuracy | Test Accuracy |
|---|---|---|---|
| 0 | KNN Classifier | 0.650970 | 0.648352 |
| 1 | Logistic Regression | 0.939058 | 0.747253 |
| 2 | Decision Tree Classifier | 0.789474 | 0.681319 |
| 3 | Linear SVC | 0.880886 | 0.780220 |
from sklearn import svm
KSVC_clf = svm.SVC(kernel='sigmoid',C=10,gamma=0.001)
KSVC_clf.fit(X_train, y_train)
y_pred_train = KSVC_clf.predict(X_train)
y_pred_test = KSVC_clf.predict(X_test)
ksvc_train_accuracy_score = accuracy_score(y_train, y_pred_train)
ksvc_test_accuracy_score = accuracy_score(y_test, y_pred_test)
result = pd.concat([result, pd.DataFrame([{'Model': 'Kernelized SVC', 'Train Accuracy': ksvc_train_accuracy_score, 'Test Accuracy': ksvc_test_accuracy_score}])], ignore_index=True)
result
|   | Model | Train Accuracy | Test Accuracy |
|---|---|---|---|
| 0 | KNN Classifier | 0.650970 | 0.648352 |
| 1 | Logistic Regression | 0.939058 | 0.747253 |
| 2 | Decision Tree Classifier | 0.789474 | 0.681319 |
| 3 | Linear SVC | 0.880886 | 0.780220 |
| 4 | Kernelized SVC | 0.847645 | 0.780220 |
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=300, criterion='gini',max_features=100,max_depth=10,max_leaf_nodes=30)
rf_clf.fit(X_train, y_train)
RandomForestClassifier(max_depth=10, max_features=100, max_leaf_nodes=30, n_estimators=300)
y_pred_train = rf_clf.predict(X_train)
y_pred_test = rf_clf.predict(X_test)
rf_train_accuracy_score = accuracy_score(y_train, y_pred_train)
rf_test_accuracy_score = accuracy_score(y_test, y_pred_test)
result = pd.concat([result, pd.DataFrame([{'Model': 'Random Forest Classifier', 'Train Accuracy': rf_train_accuracy_score, 'Test Accuracy': rf_test_accuracy_score}])], ignore_index=True)
result
|   | Model | Train Accuracy | Test Accuracy |
|---|---|---|---|
| 0 | KNN Classifier | 0.650970 | 0.648352 |
| 1 | Logistic Regression | 0.939058 | 0.747253 |
| 2 | Decision Tree Classifier | 0.789474 | 0.681319 |
| 3 | Linear SVC | 0.880886 | 0.780220 |
| 4 | Kernelized SVC | 0.847645 | 0.780220 |
| 5 | Random Forest Classifier | 0.886427 | 0.747253 |
We find that, in terms of recall, the best baseline model is the kernelized SVM, which reaches a test accuracy of 78.02% (tied with the Linear SVC), while Logistic Regression achieves the better training accuracy.
Our dataset is imbalanced: classes 7 and 8 have only 2 and 3 instances respectively, while class 1 has 245. We will therefore try to address the class imbalance by randomly resampling the training data with oversampling.
We will then use PCA (Principal Component Analysis) to reduce the dimensionality of the resampled dataset and keep the most informative components, in pursuit of higher accuracy.
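Before resampling, we can confirm the imbalance directly from the target column defined earlier:
# Frequency of each class label before oversampling
print(target.value_counts().sort_index())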
# Perform oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(final_df, target)
X_resampled.shape
(3185, 278)
# Find the frequency of each class
import collections
counter = collections.Counter(y_resampled)
counter
Counter({8.0: 245, 6.0: 245, 10.0: 245, 1.0: 245, 7.0: 245, 14.0: 245, 3.0: 245, 16.0: 245, 2.0: 245, 4.0: 245, 5.0: 245, 9.0: 245, 15.0: 245})
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_resampled, y_resampled , test_size=0.2, random_state=1)
scaler = StandardScaler()
scaler.fit(X_train1)
X_train1 = scaler.transform(X_train1)
X_test1 = scaler.transform(X_test1)
from sklearn.decomposition import PCA
# Keep the smallest number of components that together explain 98% of the variance
pca = PCA(0.98)
pca.fit(X_train1)
pca.n_components_
99
X_train1 = pca.transform(X_train1)
X_test1 = pca.transform(X_test1)
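Passing a float to PCA keeps the smallest number of components whose cumulative explained variance reaches that fraction, which we can verify on the fitted object:
# Cumulative variance explained by the 99 retained components (should be >= 0.98)
print(pca.explained_variance_ratio_.sum())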
classifier = KNeighborsClassifier()
classifier.fit(X_train1, y_train1)
Y_pred = classifier.predict(X_test1)
knnp_train_accuracy = accuracy_score(y_train1,classifier.predict(X_train1))
knnp_test_accuracy = accuracy_score(y_test1,Y_pred)
#print(knnp_train_accuracy,knnp_test_accuracy)
result = pd.concat([result, pd.DataFrame([{'Model': 'KNN with PCA', 'Train Accuracy': knnp_train_accuracy, 'Test Accuracy': knnp_test_accuracy}])], ignore_index=True)
result
|   | Model | Train Accuracy | Test Accuracy |
|---|---|---|---|
| 0 | KNN Classifier | 0.650970 | 0.648352 |
| 1 | Logistic Regression | 0.939058 | 0.747253 |
| 2 | Decision Tree Classifier | 0.789474 | 0.681319 |
| 3 | Linear SVC | 0.880886 | 0.780220 |
| 4 | Kernelized SVC | 0.847645 | 0.780220 |
| 5 | Random Forest Classifier | 0.886427 | 0.747253 |
| 6 | KNN with PCA | 0.967818 | 0.943485 |
from sklearn.linear_model import LogisticRegression
lgpclassifier = LogisticRegression(C=10,random_state = 0)
lgpclassifier.fit(X_train1, y_train1)
y_pred_train1 = lgpclassifier.predict(X_train1)
y_pred_test1 = lgpclassifier.predict(X_test1)
lgp_train_accuracy = accuracy_score(y_train1,y_pred_train1)
lgp_test_accuracy = accuracy_score(y_test1,y_pred_test1)
#print(lgp_train_accuracy, lgp_test_accuracy)
result = pd.concat([result, pd.DataFrame([{'Model': 'Logistic Regression PCA', 'Train Accuracy': lgp_train_accuracy, 'Test Accuracy': lgp_test_accuracy}])], ignore_index=True)
result
|   | Model | Train Accuracy | Test Accuracy |
|---|---|---|---|
| 0 | KNN Classifier | 0.650970 | 0.648352 |
| 1 | Logistic Regression | 0.939058 | 0.747253 |
| 2 | Decision Tree Classifier | 0.789474 | 0.681319 |
| 3 | Linear SVC | 0.880886 | 0.780220 |
| 4 | Kernelized SVC | 0.847645 | 0.780220 |
| 5 | Random Forest Classifier | 0.886427 | 0.747253 |
| 6 | KNN with PCA | 0.967818 | 0.943485 |
| 7 | Logistic Regression PCA | 0.999215 | 0.968603 |
from sklearn.tree import DecisionTreeClassifier
dtpclassifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
dtpclassifier.fit(X_train1, y_train1)
y_pred_test = dtpclassifier.predict(X_test1)
y_pred_train = dtpclassifier.predict(X_train1)
dtp_train_recall_score = recall_score(y_train1, y_pred_train, average='weighted')
dtp_test_recall_score = recall_score(y_test1, y_pred_test, average='weighted')
dtp_train_accuracy_score = accuracy_score(y_train1, y_pred_train)
dtp_test_accuracy_score = accuracy_score(y_test1, y_pred_test)
result = pd.concat([result, pd.DataFrame([{'Model': 'Decision Tree with PCA', 'Train Accuracy': dtp_train_accuracy_score, 'Test Accuracy': dtp_test_accuracy_score}])], ignore_index=True)
result
|   | Model | Train Accuracy | Test Accuracy |
|---|---|---|---|
| 0 | KNN Classifier | 0.650970 | 0.648352 |
| 1 | Logistic Regression | 0.939058 | 0.747253 |
| 2 | Decision Tree Classifier | 0.789474 | 0.681319 |
| 3 | Linear SVC | 0.880886 | 0.780220 |
| 4 | Kernelized SVC | 0.847645 | 0.780220 |
| 5 | Random Forest Classifier | 0.886427 | 0.747253 |
| 6 | KNN with PCA | 0.967818 | 0.943485 |
| 7 | Logistic Regression PCA | 0.999215 | 0.968603 |
| 8 | Decision Tree with PCA | 1.000000 | 0.962323 |
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0,probability=True)
classifier.fit(X_train1, y_train1)
y_pred = classifier.predict(X_test1)
#print("准确率得分",accuracy_score(y_test1,y_pred)*100)
lsvcp_train_accuracy = accuracy_score(y_train1,classifier.predict(X_train1))
lsvcp_test_accuracy = accuracy_score(y_test1,y_pred)
result = pd.concat([result, pd.DataFrame([{'Model': 'Linear SVM with PCA', 'Train Accuracy': lsvcp_train_accuracy, 'Test Accuracy': lsvcp_test_accuracy}])], ignore_index=True)
result
|   | Model | Train Accuracy | Test Accuracy |
|---|---|---|---|
| 0 | KNN Classifier | 0.650970 | 0.648352 |
| 1 | Logistic Regression | 0.939058 | 0.747253 |
| 2 | Decision Tree Classifier | 0.789474 | 0.681319 |
| 3 | Linear SVC | 0.880886 | 0.780220 |
| 4 | Kernelized SVC | 0.847645 | 0.780220 |
| 5 | Random Forest Classifier | 0.886427 | 0.747253 |
| 6 | KNN with PCA | 0.967818 | 0.943485 |
| 7 | Logistic Regression PCA | 0.999215 | 0.968603 |
| 8 | Decision Tree with PCA | 1.000000 | 0.962323 |
| 9 | Linear SVM with PCA | 0.999608 | 0.973312 |
from sklearn import svm
KSVC_clf = svm.SVC(kernel='rbf',C=1,gamma=0.1)
KSVC_clf.fit(X_train1, y_train1)
y_pred_train1 = KSVC_clf.predict(X_train1)
y_pred_test1 = KSVC_clf.predict(X_test1)
ksvcp_train_accuracy_score = accuracy_score(y_train1, y_pred_train1)
ksvcp_test_accuracy_score = accuracy_score(y_test1, y_pred_test1)
result = pd.concat([result, pd.DataFrame([{'Model': 'Kernelized SVM with PCA', 'Train Accuracy': ksvcp_train_accuracy_score, 'Test Accuracy': ksvcp_test_accuracy_score}])], ignore_index=True)
result
|   | Model | Train Accuracy | Test Accuracy |
|---|---|---|---|
| 0 | KNN Classifier | 0.650970 | 0.648352 |
| 1 | Logistic Regression | 0.939058 | 0.747253 |
| 2 | Decision Tree Classifier | 0.789474 | 0.681319 |
| 3 | Linear SVC | 0.880886 | 0.780220 |
| 4 | Kernelized SVC | 0.847645 | 0.780220 |
| 5 | Random Forest Classifier | 0.886427 | 0.747253 |
| 6 | KNN with PCA | 0.967818 | 0.943485 |
| 7 | Logistic Regression PCA | 0.999215 | 0.968603 |
| 8 | Decision Tree with PCA | 1.000000 | 0.962323 |
| 9 | Linear SVM with PCA | 0.999608 | 0.973312 |
| 10 | Kernelized SVM with PCA | 1.000000 | 0.995290 |
from sklearn.ensemble import RandomForestClassifier
rfp_clf = RandomForestClassifier()
rfp_clf.fit(X_train1, y_train1)
RandomForestClassifier()
y_pred_train1 = rfp_clf.predict(X_train1)
y_pred_test1 = rfp_clf.predict(X_test1)
rfp_train_accuracy_score = accuracy_score(y_train1, y_pred_train1)
rfp_test_accuracy_score = accuracy_score(y_test1, y_pred_test1)
result = pd.concat([result, pd.DataFrame([{'Model': 'Random Forest with PCA', 'Train Accuracy': rfp_train_accuracy_score, 'Test Accuracy': rfp_test_accuracy_score}])], ignore_index=True)
result
|   | Model | Train Accuracy | Test Accuracy |
|---|---|---|---|
| 0 | KNN Classifier | 0.650970 | 0.648352 |
| 1 | Logistic Regression | 0.939058 | 0.747253 |
| 2 | Decision Tree Classifier | 0.789474 | 0.681319 |
| 3 | Linear SVC | 0.880886 | 0.780220 |
| 4 | Kernelized SVC | 0.847645 | 0.780220 |
| 5 | Random Forest Classifier | 0.886427 | 0.747253 |
| 6 | KNN with PCA | 0.967818 | 0.943485 |
| 7 | Logistic Regression PCA | 0.999215 | 0.968603 |
| 8 | Decision Tree with PCA | 1.000000 | 0.962323 |
| 9 | Linear SVM with PCA | 0.999608 | 0.973312 |
| 10 | Kernelized SVM with PCA | 1.000000 | 0.995290 |
| 11 | Random Forest with PCA | 1.000000 | 0.995290 |
# Plot each model's train and test accuracy, labelling the x-axis with the model names
result.plot(x='Model', kind='bar', figsize=(15, 4))
plt.title('Train & Test Scores of Classifiers')
plt.xlabel('Models')
plt.ylabel('Scores')
plt.legend(loc=4, bbox_to_anchor=(1.2, 0))
plt.show()
The models perform much better once we apply PCA to the resampled data. The reason is that PCA reduces the complexity of the data: it builds components that emphasize the directions of largest variance, and the components it creates are mutually uncorrelated, which takes care of collinearity in a large feature set. PCA also improves the overall training time and quality of the models, which is very beneficial when working with a large number of variables. One caveat: because the oversampling was applied before the train/test split, duplicated minority-class samples can land in both sets, so these scores are likely somewhat optimistic.
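As a quick sanity check of the non-collinearity claim (reusing X_train1 from above), the covariance matrix of the PCA-transformed training data should be approximately diagonal:
import numpy as np
# Off-diagonal covariances between PCA components are numerically close to zero
cov = np.cov(X_train1, rowvar=False)
off_diag = cov - np.diag(np.diag(cov))
print(np.abs(off_diag).max())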
The best model by recall score is the kernelized SVM with PCA, with a test accuracy of 99.53%.
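To report the recall this conclusion refers to, we can evaluate the fitted KSVC_clf from above on the held-out split:
# Weighted recall (sensitivity) of the kernelized SVM with PCA on the test split
print(recall_score(y_test1, KSVC_clf.predict(X_test1), average='weighted'))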