The dataset used in this project consists of 452 distinct examples distributed over 16 classes. Of the 452 examples, 245 are "normal" subjects. The remaining examples cover 12 different types of arrhythmia; among these, the best represented are "Ischemic changes (coronary artery disease)" and "Right bundle branch block".
import pandas as pd
import numpy as np
import scipy as sp
import math as mt
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.impute import SimpleImputer
We first read the data file and set up a dataframe in which to collect the output of every model we will run, so that the models can be compared easily later on (a minimal sketch of such a container follows the cell below).
#read the csv file
df=pd.read_csv("arrhythmia.csv",header=None)
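The comparison table itself is only needed once we start fitting models; the sketch below shows what such a container could look like (the column names are illustrative, not taken from the original analysis).
# hypothetical results table, to be filled in during the modeling stage
model_results = pd.DataFrame(columns=["Model", "Accuracy", "F1_score"])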
1. View the first five rows of the dataset
df.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 75 | 0 | 190 | 80 | 91 | 193 | 371 | 174 | 121 | -16 | ... | 0.0 | 9.0 | -0.9 | 0.0 | 0.0 | 0.9 | 2.9 | 23.3 | 49.4 | 8 |
1 | 56 | 1 | 165 | 64 | 81 | 174 | 401 | 149 | 39 | 25 | ... | 0.0 | 8.5 | 0.0 | 0.0 | 0.0 | 0.2 | 2.1 | 20.4 | 38.8 | 6 |
2 | 54 | 0 | 172 | 95 | 138 | 163 | 386 | 185 | 102 | 96 | ... | 0.0 | 9.5 | -2.4 | 0.0 | 0.0 | 0.3 | 3.4 | 12.3 | 49.0 | 10 |
3 | 55 | 0 | 175 | 94 | 100 | 202 | 380 | 179 | 143 | 28 | ... | 0.0 | 12.2 | -2.2 | 0.0 | 0.0 | 0.4 | 2.6 | 34.6 | 61.6 | 1 |
4 | 75 | 0 | 190 | 80 | 88 | 181 | 360 | 177 | 103 | -16 | ... | 0.0 | 13.1 | -3.6 | 0.0 | 0.0 | -0.1 | 3.9 | 25.4 | 62.8 | 7 |
5 rows × 280 columns
2. View the last five rows of the dataset
df.tail()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
447 | 53 | 1 | 160 | 70 | 80 | 199 | 382 | 154 | 117 | -37 | ... | 0.0 | 4.3 | -5.0 | 0.0 | 0.0 | 0.7 | 0.6 | -4.4 | -0.5 | 1 |
448 | 37 | 0 | 190 | 85 | 100 | 137 | 361 | 201 | 73 | 86 | ... | 0.0 | 15.6 | -1.6 | 0.0 | 0.0 | 0.4 | 2.4 | 38.0 | 62.4 | 10 |
449 | 36 | 0 | 166 | 68 | 108 | 176 | 365 | 194 | 116 | -85 | ... | 0.0 | 16.3 | -28.6 | 0.0 | 0.0 | 1.5 | 1.0 | -44.2 | -33.2 | 2 |
450 | 32 | 1 | 155 | 55 | 93 | 106 | 386 | 218 | 63 | 54 | ... | -0.4 | 12.0 | -0.7 | 0.0 | 0.0 | 0.5 | 2.4 | 25.0 | 46.6 | 1 |
451 | 78 | 1 | 160 | 70 | 79 | 127 | 364 | 138 | 78 | 28 | ... | 0.0 | 10.4 | -1.8 | 0.0 | 0.0 | 0.5 | 1.6 | 21.3 | 32.8 | 1 |
5 rows × 280 columns
3. Basic description of the dataframe
#dimensions of the dataset
df.shape
(452, 280)
#concise summary of the dataframe
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452 entries, 0 to 451
Columns: 280 entries, 0 to 279
dtypes: float64(120), int64(155), object(5)
memory usage: 988.9+ KB
#descriptive statistics of the dataframe
df.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
0 | 452.0 | 46.471239 | 16.466631 | 0.0 | 36.00 | 47.00 | 58.000 | 83.0 |
1 | 452.0 | 0.550885 | 0.497955 | 0.0 | 0.00 | 1.00 | 1.000 | 1.0 |
2 | 452.0 | 166.188053 | 37.170340 | 105.0 | 160.00 | 164.00 | 170.000 | 780.0 |
3 | 452.0 | 68.170354 | 16.590803 | 6.0 | 59.00 | 68.00 | 79.000 | 176.0 |
4 | 452.0 | 88.920354 | 15.364394 | 55.0 | 80.00 | 86.00 | 94.000 | 188.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
275 | 452.0 | 0.514823 | 0.347531 | -0.8 | 0.40 | 0.50 | 0.700 | 2.4 |
276 | 452.0 | 1.222345 | 1.426052 | -6.0 | 0.50 | 1.35 | 2.100 | 6.0 |
277 | 452.0 | 19.326106 | 13.503922 | -44.2 | 11.45 | 18.10 | 25.825 | 88.8 |
278 | 452.0 | 29.473230 | 18.493927 | -38.6 | 17.55 | 27.90 | 41.125 | 115.9 |
279 | 452.0 | 3.880531 | 4.407097 | 1.0 | 1.00 | 1.00 | 6.000 | 16.0 |
275 rows × 8 columns
While exploring the dataset we noticed that 5 of the 279 attributes contain missing values encoded as '?'. Our approach is to first replace '?' with NaN and then use isnull to count how many null values there are.
Check for null values in the dataset
#count the total number of null values
pd.isnull(df).sum().sum()
0
#replace '?' with NaN
df = df.replace('?', np.nan)
#final count of the total number of null values in the dataset
nu=pd.isnull(df).sum().sum()
nu
408
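To see where these 408 nulls actually live, we can look at the per-column counts; this is just a quick sketch built on the same isnull sums used above.
# per-column null counts, keeping only the columns that have any
null_counts = pd.isnull(df).sum()
null_counts[null_counts > 0]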
Visualize the distribution of the missing data:
pd.isnull(df).sum().plot()
plt.xlabel('Columns')
plt.ylabel('Total number of null values in each column')
Text(0, 0.5, 'Total number of null values in each column')
#zoom in
pd.isnull(df).sum()[7:17].plot(kind="pie")
plt.xlabel('Columns')
plt.ylabel('Total number of null values in each column')
Text(0, 0.5, 'Total number of null values in each column')
#visualize exactly which columns contain the missing values
pd.isnull(df).sum()[9:16].plot(kind="bar")
plt.xlabel('Columns')
plt.ylabel('Total number of null values in each column')
Text(0, 0.5, 'Total number of null values in each column')
#drop column 13, since it contains many missing values.
df.drop(columns = 13, inplace=True)
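Dropping column 13 by hand is fine here; an equivalent, rule-based formulation would drop every column whose missing fraction exceeds a threshold. The sketch below is meant to be run in place of the manual drop above (i.e. while column 13 is still present), not after it.
# hypothetical threshold-based version of the drop above
missing_fraction = df.isnull().mean()
cols_to_drop = missing_fraction[missing_fraction > 0.5].index
# df = df.drop(columns=cols_to_drop)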
Imputation with SimpleImputer, using the default mean strategy on the NaN values created above.
# make a copy to avoid changing the original data (when imputing)
new_df = df.copy()
# the columns that contained '?' are still of object dtype, so convert
# everything to numeric before imputing
new_df = new_df.apply(pd.to_numeric, errors="coerce")
# keep a list of the columns that contain missing values (for reference;
# adding indicator columns here would break the column renaming below)
cols_with_missing = [col for col in new_df.columns
                     if new_df[col].isnull().any()]
# imputation (SimpleImputer defaults to the mean strategy)
my_imputer = SimpleImputer()
new_df = pd.DataFrame(my_imputer.fit_transform(new_df))
new_df.columns = df.columns
# the imputed dataframe
new_df.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 75.0 | 0.0 | 190.0 | 80.0 | 91.0 | 193.0 | 371.0 | 174.0 | 121.0 | -16.0 | ... | 0.0 | 9.0 | -0.9 | 0.0 | 0.0 | 0.9 | 2.9 | 23.3 | 49.4 | 8.0 |
1 | 56.0 | 1.0 | 165.0 | 64.0 | 81.0 | 174.0 | 401.0 | 149.0 | 39.0 | 25.0 | ... | 0.0 | 8.5 | 0.0 | 0.0 | 0.0 | 0.2 | 2.1 | 20.4 | 38.8 | 6.0 |
2 | 54.0 | 0.0 | 172.0 | 95.0 | 138.0 | 163.0 | 386.0 | 185.0 | 102.0 | 96.0 | ... | 0.0 | 9.5 | -2.4 | 0.0 | 0.0 | 0.3 | 3.4 | 12.3 | 49.0 | 10.0 |
3 | 55.0 | 0.0 | 175.0 | 94.0 | 100.0 | 202.0 | 380.0 | 179.0 | 143.0 | 28.0 | ... | 0.0 | 12.2 | -2.2 | 0.0 | 0.0 | 0.4 | 2.6 | 34.6 | 61.6 | 1.0 |
4 | 75.0 | 0.0 | 190.0 | 80.0 | 88.0 | 181.0 | 360.0 | 177.0 | 103.0 | -16.0 | ... | 0.0 | 13.1 | -3.6 | 0.0 | 0.0 | -0.1 | 3.9 | 25.4 | 62.8 | 7.0 |
5 rows × 279 columns
# the dataset now contains zero null values.
pd.isnull(new_df).sum().sum()
0
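SimpleImputer also supports other strategies and can append missing-indicator columns on its own. A sketch of a median-based alternative is shown below; it is not used in this notebook, because the extra indicator columns would have to be dropped again before the column renaming that follows.
# hypothetical alternative: median imputation with built-in indicator columns
alt_imputer = SimpleImputer(strategy="median", add_indicator=True)
alt_values = alt_imputer.fit_transform(df.apply(pd.to_numeric, errors="coerce"))
# add_indicator=True appends one boolean column per feature that had missing
# values, so alt_values is wider than df and would need trimming
alt_values.shape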
Generate the final dataset
#create the column names
final_df_columns=["Age","Sex","Height","Weight","QRS_Dur",
"P-R_Int","Q-T_Int","T_Int","P_Int","QRS","T","P","J","Heart_Rate",
"Q_Wave","R_Wave","S_Wave","R'_Wave","S'_Wave","Int_Def","Rag_R_Nom",
"Diph_R_Nom","Rag_P_Nom","Diph_P_Nom","Rag_T_Nom","Diph_T_Nom",
"DII00", "DII01","DII02", "DII03", "DII04","DII05","DII06","DII07","DII08","DII09","DII10","DII11",
"DIII00","DIII01","DIII02", "DIII03", "DIII04","DIII05","DIII06","DIII07","DIII08","DIII09","DIII10","DIII11",
"AVR00","AVR01","AVR02","AVR03","AVR04","AVR05","AVR06","AVR07","AVR08","AVR09","AVR10","AVR11",
"AVL00","AVL01","AVL02","AVL03","AVL04","AVL05","AVL06","AVL07","AVL08","AVL09","AVL10","AVL11",
"AVF00","AVF01","AVF02","AVF03","AVF04","AVF05","AVF06","AVF07","AVF08","AVF09","AVF10","AVF11",
"V100","V101","V102","V103","V104","V105","V106","V107","V108","V109","V110","V111",
"V200","V201","V202","V203","V204","V205","V206","V207","V208","V209","V210","V211",
"V300","V301","V302","V303","V304","V305","V306","V307","V308","V309","V310","V311",
"V400","V401","V402","V403","V404","V405","V406","V407","V408","V409","V410","V411",
"V500","V501","V502","V503","V504","V505","V506","V507","V508","V509","V510","V511",
"V600","V601","V602","V603","V604","V605","V606","V607","V608","V609","V610","V611",
"JJ_Wave","Amp_Q_Wave","Amp_R_Wave","Amp_S_Wave","R_Prime_Wave","S_Prime_Wave","P_Wave","T_Wave",
"QRSA","QRSTA","DII170","DII171","DII172","DII173","DII174","DII175","DII176","DII177","DII178","DII179",
"DIII180","DIII181","DIII182","DIII183","DIII184","DIII185","DIII186","DIII187","DIII188","DIII189",
"AVR190","AVR191","AVR192","AVR193","AVR194","AVR195","AVR196","AVR197","AVR198","AVR199",
"AVL200","AVL201","AVL202","AVL203","AVL204","AVL205","AVL206","AVL207","AVL208","AVL209",
"AVF210","AVF211","AVF212","AVF213","AVF214","AVF215","AVF216","AVF217","AVF218","AVF219",
"V1220","V1221","V1222","V1223","V1224","V1225","V1226","V1227","V1228","V1229",
"V2230","V2231","V2232","V2233","V2234","V2235","V2236","V2237","V2238","V2239",
"V3240","V3241","V3242","V3243","V3244","V3245","V3246","V3247","V3248","V3249",
"V4250","V4251","V4252","V4253","V4254","V4255","V4256","V4257","V4258","V4259",
"V5260","V5261","V5262","V5263","V5264","V5265","V5266","V5267","V5268","V5269",
"V6270","V6271","V6272","V6273","V6274","V6275","V6276","V6277","V6278","V6279","class"]
#add the column names to the dataset
new_df.columns=final_df_columns
new_df.to_csv("new data with target class.csv")
new_df.head()
Age | Sex | Height | Weight | QRS_Dur | P-R_Int | Q-T_Int | T_Int | P_Int | QRS | ... | V6271 | V6272 | V6273 | V6274 | V6275 | V6276 | V6277 | V6278 | V6279 | class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 75.0 | 0.0 | 190.0 | 80.0 | 91.0 | 193.0 | 371.0 | 174.0 | 121.0 | -16.0 | ... | 0.0 | 9.0 | -0.9 | 0.0 | 0.0 | 0.9 | 2.9 | 23.3 | 49.4 | 8.0 |
1 | 56.0 | 1.0 | 165.0 | 64.0 | 81.0 | 174.0 | 401.0 | 149.0 | 39.0 | 25.0 | ... | 0.0 | 8.5 | 0.0 | 0.0 | 0.0 | 0.2 | 2.1 | 20.4 | 38.8 | 6.0 |
2 | 54.0 | 0.0 | 172.0 | 95.0 | 138.0 | 163.0 | 386.0 | 185.0 | 102.0 | 96.0 | ... | 0.0 | 9.5 | -2.4 | 0.0 | 0.0 | 0.3 | 3.4 | 12.3 | 49.0 | 10.0 |
3 | 55.0 | 0.0 | 175.0 | 94.0 | 100.0 | 202.0 | 380.0 | 179.0 | 143.0 | 28.0 | ... | 0.0 | 12.2 | -2.2 | 0.0 | 0.0 | 0.4 | 2.6 | 34.6 | 61.6 | 1.0 |
4 | 75.0 | 0.0 | 190.0 | 80.0 | 88.0 | 181.0 | 360.0 | 177.0 | 103.0 | -16.0 | ... | 0.0 | 13.1 | -3.6 | 0.0 | 0.0 | -0.1 | 3.9 | 25.4 | 62.8 | 7.0 |
5 rows × 279 columns
# drop the target attribute to create the final dataframe.
final_df = new_df.drop(columns ="class")
final_df.to_csv("FInal Dataset with dropped class Attribute.csv")
final_df.head()
Age | Sex | Height | Weight | QRS_Dur | P-R_Int | Q-T_Int | T_Int | P_Int | QRS | ... | V6270 | V6271 | V6272 | V6273 | V6274 | V6275 | V6276 | V6277 | V6278 | V6279 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 75.0 | 0.0 | 190.0 | 80.0 | 91.0 | 193.0 | 371.0 | 174.0 | 121.0 | -16.0 | ... | -0.3 | 0.0 | 9.0 | -0.9 | 0.0 | 0.0 | 0.9 | 2.9 | 23.3 | 49.4 |
1 | 56.0 | 1.0 | 165.0 | 64.0 | 81.0 | 174.0 | 401.0 | 149.0 | 39.0 | 25.0 | ... | -0.5 | 0.0 | 8.5 | 0.0 | 0.0 | 0.0 | 0.2 | 2.1 | 20.4 | 38.8 |
2 | 54.0 | 0.0 | 172.0 | 95.0 | 138.0 | 163.0 | 386.0 | 185.0 | 102.0 | 96.0 | ... | 0.9 | 0.0 | 9.5 | -2.4 | 0.0 | 0.0 | 0.3 | 3.4 | 12.3 | 49.0 |
3 | 55.0 | 0.0 | 175.0 | 94.0 | 100.0 | 202.0 | 380.0 | 179.0 | 143.0 | 28.0 | ... | 0.1 | 0.0 | 12.2 | -2.2 | 0.0 | 0.0 | 0.4 | 2.6 | 34.6 | 61.6 |
4 | 75.0 | 0.0 | 190.0 | 80.0 | 88.0 | 181.0 | 360.0 | 177.0 | 103.0 | -16.0 | ... | -0.4 | 0.0 | 13.1 | -3.6 | 0.0 | 0.0 | -0.1 | 3.9 | 25.4 | 62.8 |
5 rows × 278 columns
pd.isnull(final_df).sum().sum()
0
A list with all 16 class names (a small code-to-name lookup is sketched right after the list).
#list with the class names
class_names = ["Normal",
"Ischemic changes (CAD)",
"Old Anterior Myocardial Infraction",
"Old Inferior Myocardial Infraction",
"Sinus tachycardy",
"Sinus bradycardy",
"Ventricular Premature Contraction (PVC)",
"Supraventricular Premature Contraction",
"Left Boundle branch block",
"Right boundle branch block",
"1.Degree AtrioVentricular block",
"2.Degree AV block",
"3.Degree AV block",
"Left Ventricule hypertrophy",
"Atrial Fibrillation or Flutter",
"Others"]
# check the data types of just a couple of attributes
final_df[["Age","Heart_Rate"]].dtypes
Age           float64
Heart_Rate    float64
dtype: object
We sort the dataset by the class attribute so that the per-class counts come out in class order, which lets us see how many instances are available for each class.
#sort by the class attribute.
t=new_df.sort_values(by=["class"])
# count the number of instances of each class
la = t["class"].value_counts(sort=False).tolist()
la
[245, 44, 15, 15, 13, 25, 3, 2, 9, 50, 4, 5, 22]
#visualization of the dataset with respect to the classes
labels = class_names
values = la[0:10]
values.extend([0,0,0])
values.extend(la[10:13])
print(values)
#we normalize the counts with a base-10 logarithm so that they can be plotted in a readable way
Log_Norm = []
for i in values:
    Log_Norm.append(mt.log10(i+1))
fig1, ax1 = plt.subplots(figsize=(16,9))
patches = plt.pie(Log_Norm, autopct='%1.1f%%', startangle=90)
leg = plt.legend( loc = 'best', labels=['%s, %1.1f %%' % (l, s) for l, s in zip(labels, Log_Norm)])
plt.axis('equal')
for text in leg.get_texts():
    plt.setp(text, color = 'Black')
plt.tight_layout()
plt.show()
[245, 44, 15, 15, 13, 25, 3, 2, 9, 50, 0, 0, 0, 4, 5, 22]
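Building the values list by hand with three zero entries works, but the same counts can be obtained more robustly by reindexing over all 16 class codes; the sketch below fills the absent classes (11, 12, 13) with zero.
# more robust way to get one count per class code 1-16
counts = new_df["class"].value_counts().reindex(np.arange(1.0, 17.0), fill_value=0)
counts.tolist()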
fdict = {}
j = 0
for i in labels:
    fdict[i] = Log_Norm[j]
    j+=1
fdf = pd.DataFrame(fdict,index=[0])
fdf = fdf.rename(index={0: ''})
fig, ax = plt.subplots(figsize=(15,10))
fdf.plot(kind="bar",ax=ax)
ax.set_title("(Log)-Histogram of the Classes Distributions ")
ax.set_ylabel('Number of examples')
ax.set_xlabel('Classes')
fig.tight_layout()
plt.show()
#grouped descriptive statistics
import scipy.stats as sp
grouped_mean = new_df.groupby(["class"]).mean()
grouped_median = new_df.groupby(["class"]).median()
grouped_mode = new_df.groupby(["class"]).agg(lambda x: sp.mode(x, axis=None, keepdims=True)[0])
gouped_std = new_df.groupby(['class']).std()
grouped_median.head()
Age | Sex | Height | Weight | QRS_Dur | P-R_Int | Q-T_Int | T_Int | P_Int | QRS | ... | V6270 | V6271 | V6272 | V6273 | V6274 | V6275 | V6276 | V6277 | V6278 | V6279 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
class | |||||||||||||||||||||
1.0 | 46.0 | 1.0 | 163.0 | 68.0 | 84.0 | 156.0 | 367.0 | 160.0 | 91.0 | 41.0 | ... | -0.1 | 0.0 | 9.1 | -0.9 | 0.0 | 0.0 | 0.5 | 1.6 | 19.10 | 31.7 |
2.0 | 55.0 | 1.0 | 160.5 | 70.0 | 90.5 | 159.0 | 366.0 | 176.5 | 91.0 | 37.5 | ... | -0.8 | 0.0 | 9.6 | -1.3 | 0.0 | 0.0 | 0.5 | -0.8 | 20.05 | 11.8 |
3.0 | 50.0 | 0.0 | 170.0 | 74.0 | 93.0 | 163.0 | 351.0 | 196.0 | 96.0 | 48.0 | ... | -0.2 | 0.0 | 6.3 | -2.0 | 0.0 | 0.0 | 0.8 | 0.6 | 12.30 | 19.3 |
4.0 | 57.0 | 0.0 | 165.0 | 76.0 | 93.0 | 157.0 | 357.0 | 161.0 | 95.0 | -2.0 | ... | -0.2 | -0.7 | 6.3 | -0.5 | 0.0 | 0.0 | 0.7 | 0.3 | 10.90 | 12.0 |
5.0 | 34.0 | 1.0 | 160.0 | 51.0 | 82.0 | 155.0 | 302.0 | 153.0 | 81.0 | 52.0 | ... | -0.3 | 0.0 | 5.1 | -2.1 | 0.0 | 0.0 | 0.5 | 0.9 | 6.00 | 10.7 |
5 rows × 278 columns
#相关热图
sns.heatmap(new_df[['Age', 'Sex', 'Height', 'Weight','class']].corr(), cmap='CMRmap',center=0)
plt.show()
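The heatmap above only covers four demographic attributes. To get a rough idea of which of the remaining features co-vary most with the class label, we can rank the absolute correlations; the class code is nominal, so this is only a crude screen, and the snippet is a sketch rather than part of the original analysis.
# crude screen: features with the largest absolute correlation to the class code
corr_with_class = new_df.corr()["class"].drop("class").abs().sort_values(ascending=False)
corr_with_class.head(10)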
#寻找成对关系和异常值
g = sns.PairGrid(final_df, vars=['Age', 'Sex', 'Height', 'Weight'],
hue='Sex', palette='BrBG')
g.map(plt.scatter, alpha=0.8)
g.add_legend();
Judging from the scatter plots, the "Height" and "Weight" attributes contain a few outliers. Let's check the largest values of Height and Weight.
sorted(final_df['Height'], reverse=True)[:10]
[780.0, 608.0, 190.0, 190.0, 190.0, 188.0, 186.0, 186.0, 186.0, 185.0]
The tallest person ever recorded measured 272 cm (1940); the next tallest were 267 cm (1905) and 263.5 cm (1969). We therefore replace the heights 780 and 608 with 180 cm and 108 cm, respectively.
final_df['Height']=final_df['Height'].replace(608,108)
final_df['Height']=final_df['Height'].replace(780,180)
sorted(final_df['Weight'], reverse=True)[:10]
[176.0, 124.0, 110.0, 106.0, 105.0, 105.0, 104.0, 104.0, 100.0, 98.0]
A weight of 176 kg is possible, so we keep these values in the dataframe.
g = sns.PairGrid(final_df, vars=['Age', 'Sex', 'Height', 'Weight'],
hue='Sex', palette='RdBu_r')
g.map(plt.scatter, alpha=0.8)
g.add_legend();
sns.boxplot(data=final_df[["QRS_Dur","P-R_Int","Q-T_Int","T_Int","P_Int"]]);
The P-R interval is the period, measured in milliseconds, that extends from the beginning of the P wave to the beginning of the QRS complex; its duration is normally between 120 and 200 ms.
final_df['P-R_Int'].value_counts().sort_index().head().plot(kind='bar')
plt.xlabel('P-R Interval Values')
plt.ylabel('Count');
final_df['P-R_Int'].value_counts().sort_index().tail().plot(kind='bar')
plt.xlabel('P-R Interval Values')
plt.ylabel('Count');
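Given the normal 120-200 ms range mentioned above, it is easy to count how many records fall outside it; the check below is a small sketch using the column defined earlier.
# number of records whose P-R interval lies outside the 120-200 ms range
((final_df["P-R_Int"] < 120) | (final_df["P-R_Int"] > 200)).sum()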
The Q-T interval is the time measured from the start of the Q wave to the end of the T wave in the heart's electrical cycle. The outliers that appear in the Q-T interval boxplot may be related to the outliers in the T interval data.
final_df['T_Int'].value_counts().sort_index(ascending=False).head().plot(kind='bar')
plt.xlabel('T Interval Values')
plt.ylabel('Count');
final_df['T_Int'].value_counts().sort_index(ascending=False).tail().plot(kind='bar')
plt.xlabel('T Interval Values')
plt.ylabel('Count');
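To check the suggestion that the Q-T outliers go hand in hand with the T interval, a quick correlation between the two columns is enough (a sketch, not part of the original notebook).
# linear correlation between the Q-T interval and the T interval
final_df[["Q-T_Int", "T_Int"]].corr()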
sns.boxplot(data=final_df[["QRS","T","P","J","Heart_Rate"]]);
sns.boxplot(data=final_df[["Q_Wave","R_Wave","S_Wave"]])
sns.swarmplot(data=final_df[["Q_Wave","R_Wave","S_Wave"]]);
d:\anaconda\envs\pytorch2.0\Lib\site-packages\seaborn\categorical.py:3544: UserWarning: 78.8% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
d:\anaconda\envs\pytorch2.0\Lib\site-packages\seaborn\categorical.py:3544: UserWarning: 46.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
d:\anaconda\envs\pytorch2.0\Lib\site-packages\seaborn\categorical.py:3544: UserWarning: 52.4% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
sns.boxplot(data=final_df[["R'_Wave","S'_Wave","Int_Def","Rag_R_Nom"]]);
final_df["R'_Wave"].value_counts().sort_index(ascending=False)
24.0      1
16.0      1
12.0      2
0.0     448
Name: R'_Wave, dtype: int64
final_df["S'_Wave"].value_counts().sort_index(ascending=False)
0.0    452
Name: S'_Wave, dtype: int64
final_df["Rag_R_Nom"].value_counts().sort_index(ascending=False)
1.0      1
0.0    451
Name: Rag_R_Nom, dtype: int64
sns.set(rc={'figure.figsize':(11.7,5.27)})
sns.boxplot(data=final_df[["Diph_R_Nom","Rag_P_Nom","Diph_P_Nom","Rag_T_Nom","Diph_T_Nom"]]);
final_df["Diph_R_Nom"].value_counts().sort_index(ascending=False)
1.0      5
0.0    447
Name: Diph_R_Nom, dtype: int64
final_df["Rag_P_Nom"].value_counts().sort_index(ascending=False)
1.0      5
0.0    447
Name: Rag_P_Nom, dtype: int64
final_df["Diph_P_Nom"].value_counts().sort_index(ascending=False)
1.0      2
0.0    450
Name: Diph_P_Nom, dtype: int64
final_df["Rag_T_Nom"].value_counts().sort_index(ascending=False)
1.0      2
0.0    450
Name: Rag_T_Nom, dtype: int64
final_df["Diph_T_Nom"].value_counts().sort_index(ascending=False)
1.0      4
0.0    448
Name: Diph_T_Nom, dtype: int64
sns.set(rc={'figure.figsize':(11.7,5.27)})
sns.boxplot(data=final_df[["DII00", "DII01","DII02", "DII03", "DII04","DII05","DII06","DII07","DII08","DII09","DII10","DII11"]]);
sns.set(rc={'figure.figsize':(11.7,5.27)})
sns.boxplot(data=final_df[["DIII00","DIII01","DIII02", "DIII03", "DIII04","DIII05","DIII06",
"DIII07","DIII08","DIII09","DIII10","DIII11"]]);
sns.set(rc={'figure.figsize':(11.7,5.27)})
sns.boxplot(data=final_df[["AVR00","AVR01","AVR02","AVR03","AVR04","AVR05",
"AVR06","AVR07","AVR08","AVR09","AVR10","AVR11"]]);
sns.set(rc={'figure.figsize':(11.7,5.27)})
sns.boxplot(data=final_df[["AVL00","AVL01","AVL02","AVL03","AVL04","AVL05","AVL06","AVL07","AVL08","AVL09","AVL10","AVL11"]]);
sns.set(rc={'figure.figsize':(11.7,5.27)})
sns.boxplot(data=final_df[["AVF00","AVF01","AVF02","AVF03","AVF04","AVF05","AVF06","AVF07","AVF08","AVF09","AVF10","AVF11"]]);
sns.set(rc={'figure.figsize':(12,6)})
sns.boxplot(data=final_df[["V100","V101","V102","V103","V104","V105","V106","V107","V108","V109","V110","V111"]]);
final_df["V101"].value_counts().sort_index(ascending=False)
216.0     1
112.0     1
84.0      1
72.0      1
68.0      1
64.0      1
48.0      6
44.0      6
40.0     13
36.0     36
32.0     63
28.0     81
24.0     88
20.0     57
16.0     13
12.0      4
0.0      79
Name: V101, dtype: int64
V101 has an outlier, but when we look at the corresponding columns of the other leads (V201, V301, V501) we can see a similar outlier there as well. Since our data are heavily imbalanced, I cannot say that these outliers should be removed.
For example, looking at the class counts, class 8 (Supraventricular Premature Contraction) has only 2 instances and class 7 (Ventricular Premature Contraction, PVC) has only 3. The outliers appearing in our plots may belong to these instances and need to be kept.
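That claim can be checked directly by looking up the class label of the extreme V101 record; the threshold of 200 below simply isolates the single extreme value (216) seen in the counts above.
# class label of the record with the extreme V101 value
new_df.loc[new_df["V101"] > 200, ["V101", "class"]]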
sns.set(rc={'figure.figsize':(11.7,5.27)})
sns.boxplot(data=final_df[["V200","V201","V202","V203","V204","V205","V206","V207","V208","V209","V210","V211"]]);
sns.set(rc={'figure.figsize':(11.7,5.27)})
sns.boxplot(data=final_df[["V300","V301","V302","V303","V304","V305","V306","V307","V308","V309","V310","V311"]]);
sns.set(rc={'figure.figsize':(11.7,5.27)})
sns.boxplot(data=final_df[["V400","V401","V402","V403","V404","V405","V406","V407","V408","V409","V410","V411"]]);
sns.set(rc={'figure.figsize':(11.7,5.27)})
sns.boxplot(data=final_df[["V500","V501","V502","V503","V504","V505","V506","V507","V508","V509","V510","V511"]]);
sns.set(rc={'figure.figsize':(11.7,5.27)})
sns.boxplot(data=final_df[["V600","V601","V602","V603","V604","V605","V606","V607","V608","V609","V610","V611"]]);
sns.set(rc={'figure.figsize':(11.7,5.27)})
sns.boxplot(data=final_df[["JJ_Wave","Amp_Q_Wave","Amp_R_Wave","Amp_S_Wave","R_Prime_Wave","S_Prime_Wave","P_Wave","T_Wave"]]);
sns.set(rc={'figure.figsize':(13.7,5.27)})
sns.boxplot(data=final_df[["QRSA","QRSTA","DII170","DII171","DII172","DII173","DII174","DII175","DII176","DII177","DII178","DII179"]]);
sns.set(rc={'figure.figsize':(11.7,5.27)})
sns.boxplot(data=final_df[["DIII180","DIII181","DIII182","DIII183","DIII184","DIII185","DIII186","DIII187","DIII188","DIII189"]]);
sns.set(rc={'figure.figsize':(11.7,5.27)})
sns.boxplot(data=final_df[["AVR190","AVR191","AVR192","AVR193","AVR194","AVR195","AVR196","AVR197","AVR198","AVR199"]]);
sns.set(rc={'figure.figsize':(11.7,5.27)})
sns.boxplot(data=final_df[["AVL200","AVL201","AVL202","AVL203","AVL204","AVL205","AVL206","AVL207","AVL208","AVL209"]]);
sns.set(rc={'figure.figsize':(11.7,5.27)})
sns.boxplot(data=final_df[["AVF210","AVF211","AVF212","AVF213","AVF214","AVF215","AVF216","AVF217","AVF218","AVF219"]]);
sns.set(rc={'figure.figsize':(11.7,5.27)})
sns.boxplot(data=final_df[["V1220","V1221","V1222","V1223","V1224","V1225","V1226","V1227","V1228","V1229"]]);
sns.set(rc={'figure.figsize':(11.7,5.27)})
sns.boxplot(data=final_df[["V2230","V2231","V2232","V2233","V2234","V2235","V2236","V2237","V2238","V2239"]]);
sns.set(rc={'figure.figsize':(11.7,5.27)})
sns.boxplot(data=final_df[["V3240","V3241","V3242","V3243","V3244","V3245","V3246","V3247","V3248","V3249"]]);
sns.set(rc={'figure.figsize':(11.7,5.27)})
sns.boxplot(data=final_df[["V4250","V4251","V4252","V4253","V4254","V4255","V4256","V4257","V4258","V4259"]]);
sns.set(rc={'figure.figsize':(11.7,5.27)})
sns.boxplot(data=final_df[["V5260","V5261","V5262","V5263","V5264","V5265","V5266","V5267","V5268","V5269"]]);
sns.set(rc={'figure.figsize':(11.7,5.27)})
sns.boxplot(data=final_df[["V6270","V6271","V6272","V6273","V6274","V6275","V6276","V6277","V6278","V6279"]]);
class_list = []
j = 1
new_class_names =[]
for i in class_names:
    if(i != "1.Degree AtrioVentricular block" and i!= '3.Degree AV block' and i!= "2.Degree AV block" ):
        class_list.append([j,i,grouped_mean["Age"][j],grouped_median["Age"][j],grouped_mode["Age"][j],gouped_std["Age"][j]])
        new_class_names.append(i)
    j+=1
AgeClass = {
"AgeAVG": grouped_mean["Age"],"AgeMdian":grouped_median["Age"],
"AgeMode": grouped_mode["Age"],"AgeStd":gouped_std['Age'],"Class":new_class_names }
AgeClassDF =pd.DataFrame(AgeClass)
fig, ax = plt.subplots(figsize=(15,10))
AgeClassDF.plot(kind="bar",ax=ax)
ax.set_title("Plot of the Age statistics")
ax.set_ylabel('Values of Mean, Median, Mode and Standard Deviation')
ax.set_xlabel('Classes')
fig.tight_layout()
plt.show()
AgeClassDF
AgeAVG | AgeMdian | AgeMode | AgeStd | Class | |
---|---|---|---|---|---|
class | |||||
1.0 | 46.273469 | 46.0 | 0.0 | 14.556092 | Normal |
2.0 | 51.750000 | 55.0 | 0.0 | 14.160418 | Ischemic changes (CAD) |
3.0 | 53.333333 | 50.0 | 0.0 | 11.286317 | Old Anterior Myocardial Infraction |
4.0 | 57.266667 | 57.0 | 0.0 | 11.895177 | Old Inferior Myocardial Infraction |
5.0 | 30.846154 | 34.0 | 0.0 | 27.904140 | Sinus tachycardy |
6.0 | 47.920000 | 46.0 | 0.0 | 13.165359 | Sinus bradycardy |
7.0 | 60.666667 | 67.0 | 0.0 | 18.339393 | Ventricular Premature Contraction (PVC) |
8.0 | 71.000000 | 71.0 | 0.0 | 5.656854 | Supraventricular Premature Contraction |
9.0 | 58.222222 | 66.0 | 0.0 | 21.211894 | Left Boundle branch block |
10.0 | 38.920000 | 38.5 | 0.0 | 17.692198 | Right boundle branch block |
14.0 | 19.500000 | 17.5 | 0.0 | 11.090537 | Left Ventricule hypertrophy |
15.0 | 57.400000 | 58.0 | 0.0 | 6.188699 | Atrial Fibrillation or Flutter |
16.0 | 44.272727 | 43.5 | 0.0 | 19.644900 | Others |
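The same kind of table can be produced in one step with pandas named aggregation, which avoids the manual loop and dictionary; the sketch below mirrors AgeClassDF (the scipy-based mode column is left out for brevity).
# concise alternative to the manual construction of AgeClassDF
age_stats = (new_df.groupby("class")["Age"]
             .agg(AgeMean="mean", AgeMedian="median", AgeStd="std"))
# attach the class names (codes 11-13 are absent from the data)
age_stats["Class"] = [class_names[int(c) - 1] for c in age_stats.index]
age_stats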
Basic statistics for the Sex attribute (male/female)
class_list = []
j = 1
new_class_names =[]
for i in class_names:
    if(i != "1.Degree AtrioVentricular block" and i!= '3.Degree AV block' and i!= "2.Degree AV block" ):
        class_list.append([j,i,grouped_mean["Sex"][j],grouped_median["Sex"][j],grouped_mode["Sex"][j],gouped_std["Sex"][j]])
        new_class_names.append(i)
    j+=1
SexClass = {
"SexAVG": grouped_mean["Sex"],"SexMdian":grouped_median["Sex"],
"SexMode": grouped_mode["Sex"],"SexStd":gouped_std['Sex'],"Class":new_class_names }
SexClassDF =pd.DataFrame(SexClass)
fig, ax = plt.subplots(figsize=(15,10))
SexClassDF.plot(kind="bar",ax=ax)
ax.set_title("Plot of the Sex statistics")
ax.set_ylabel('Values of Mean, Median, Mode and Standard Deviation')
ax.set_xlabel('Classes')
fig.tight_layout()
plt.show()
SexClassDF
SexAVG | SexMdian | SexMode | SexStd | Class | |
---|---|---|---|---|---|
class | |||||
1.0 | 0.653061 | 1.0 | 0.0 | 0.476970 | Normal |
2.0 | 0.590909 | 1.0 | 0.0 | 0.497350 | Ischemic changes (CAD) |
3.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | Old Anterior Myocardial Infraction |
4.0 | 0.266667 | 0.0 | 0.0 | 0.457738 | Old Inferior Myocardial Infraction |
5.0 | 0.692308 | 1.0 | 0.0 | 0.480384 | Sinus tachycardy |
6.0 | 0.440000 | 0.0 | 0.0 | 0.506623 | Sinus bradycardy |
7.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | Ventricular Premature Contraction (PVC) |
8.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | Supraventricular Premature Contraction |
9.0 | 0.555556 | 1.0 | 0.0 | 0.527046 | Left Boundle branch block |
10.0 | 0.440000 | 0.0 | 0.0 | 0.501427 | Right boundle branch block |
14.0 | 0.250000 | 0.0 | 0.0 | 0.500000 | Left Ventricule hypertrophy |
15.0 | 0.800000 | 1.0 | 0.0 | 0.447214 | Atrial Fibrillation or Flutter |
16.0 | 0.318182 | 0.0 | 0.0 | 0.476731 | Others |
class_list = []
j = 1
new_class_names =[]
for i in class_names:
    if(i != "1.Degree AtrioVentricular block" and i!= '3.Degree AV block' and i!= "2.Degree AV block" ):
        class_list.append([j,i,grouped_mean["Weight"][j],grouped_median["Weight"][j],grouped_mode["Weight"][j],gouped_std["Weight"][j]])
        new_class_names.append(i)
    j+=1
WeightClass = {
"WeightAVG": grouped_mean["Weight"],"WeightMdian":grouped_median["Weight"],
"WeightMode": grouped_mode["Weight"],"WeightStd":gouped_std['Weight'],"Class":new_class_names }
WeightClassDF =pd.DataFrame(WeightClass)
fig, ax = plt.subplots(figsize=(15,10))
WeightClassDF.plot(kind="bar",ax=ax)
ax.set_title("Plot of the Weights statistics")
ax.set_ylabel('Values of Mean, Median, Mode and Standard Deviation')
ax.set_xlabel('Classes')
fig.tight_layout()
plt.show()
WeightClassDF
WeightAVG | WeightMdian | WeightMode | WeightStd | Class | |
---|---|---|---|---|---|
class | |||||
1.0 | 68.669388 | 68.0 | 0.0 | 14.454595 | Normal |
2.0 | 73.363636 | 70.0 | 0.0 | 20.617323 | Ischemic changes (CAD) |
3.0 | 76.866667 | 74.0 | 0.0 | 12.397388 | Old Anterior Myocardial Infraction |
4.0 | 73.866667 | 76.0 | 0.0 | 9.318696 | Old Inferior Myocardial Infraction |
5.0 | 44.615385 | 51.0 | 0.0 | 29.259011 | Sinus tachycardy |
6.0 | 68.000000 | 66.0 | 0.0 | 11.045361 | Sinus bradycardy |
7.0 | 73.000000 | 74.0 | 0.0 | 7.549834 | Ventricular Premature Contraction (PVC) |
8.0 | 79.000000 | 79.0 | 0.0 | 1.414214 | Supraventricular Premature Contraction |
9.0 | 66.444444 | 72.0 | 0.0 | 18.808981 | Left Boundle branch block |
10.0 | 63.300000 | 63.5 | 0.0 | 16.579124 | Right boundle branch block |
14.0 | 50.250000 | 51.5 | 0.0 | 19.653244 | Left Ventricule hypertrophy |
15.0 | 76.000000 | 78.0 | 0.0 | 13.209845 | Atrial Fibrillation or Flutter |
16.0 | 68.136364 | 70.5 | 0.0 | 18.061523 | Others |
class_list = []
j = 1
new_class_names =[]
for i in class_names:
    if(i != "1.Degree AtrioVentricular block" and i!= '3.Degree AV block' and i!= "2.Degree AV block" ):
        class_list.append([j,i,grouped_mean["Height"][j],grouped_median["Height"][j],grouped_mode["Height"][j],gouped_std["Height"][j]])
        new_class_names.append(i)
    j+=1
HeightClass = {
"HeightAVG": grouped_mean["Height"],"HeightMdian":grouped_median["Height"],
"HeightMode": grouped_mode["Height"],"HeightStd":gouped_std['Height'],"Class":new_class_names }
HeightClassDF =pd.DataFrame(HeightClass)
fig, ax = plt.subplots(figsize=(15,10))
HeightClassDF.plot(kind="bar",ax=ax)
ax.set_title("Plot of the Heights statistics")
ax.set_ylabel('Values of Mean, Median, Mode and Standard Deviation')
ax.set_xlabel('Classes')
fig.tight_layout()
plt.show()
HeightClassDF
HeightAVG | HeightMdian | HeightMode | HeightStd | Class | |
---|---|---|---|---|---|
class | |||||
1.0 | 164.102041 | 163.0 | 0.0 | 8.048126 | Normal |
2.0 | 163.068182 | 160.5 | 0.0 | 7.456534 | Ischemic changes (CAD) |
3.0 | 170.600000 | 170.0 | 0.0 | 4.579457 | Old Anterior Myocardial Infraction |
4.0 | 166.000000 | 165.0 | 0.0 | 7.973169 | Old Inferior Myocardial Infraction |
5.0 | 233.615385 | 160.0 | 0.0 | 208.500015 | Sinus tachycardy |
6.0 | 165.040000 | 165.0 | 0.0 | 6.996904 | Sinus bradycardy |
7.0 | 178.000000 | 176.0 | 0.0 | 11.135529 | Ventricular Premature Contraction (PVC) |
8.0 | 176.500000 | 176.5 | 0.0 | 19.091883 | Supraventricular Premature Contraction |
9.0 | 155.666667 | 158.0 | 0.0 | 16.209565 | Left Boundle branch block |
10.0 | 163.520000 | 165.0 | 0.0 | 13.699754 | Right boundle branch block |
14.0 | 156.000000 | 167.0 | 0.0 | 24.097026 | Left Ventricule hypertrophy |
15.0 | 161.400000 | 160.0 | 0.0 | 4.098780 | Atrial Fibrillation or Flutter |
16.0 | 165.000000 | 168.0 | 0.0 | 13.288018 | Others |
class_list = []
j = 1
new_class_names =[]
for i in class_names:
    if(i != "1.Degree AtrioVentricular block" and i!= '3.Degree AV block' and i!= "2.Degree AV block" ):
        class_list.append([j,i,grouped_mean["QRS_Dur"][j],grouped_median["QRS_Dur"][j],grouped_mode["QRS_Dur"][j],gouped_std["QRS_Dur"][j]])
        new_class_names.append(i)
    j+=1
QRSTClass = {
"QRSDAVG": grouped_mean["QRS_Dur"],"QRSDMdian":grouped_median["QRS_Dur"],
"QRSDMode": grouped_mode["QRS_Dur"],"QRSDStd":gouped_std['QRS_Dur'],"Class":new_class_names }
QRSTClassDF =pd.DataFrame(QRSTClass)
fig, ax = plt.subplots(figsize=(15,10))
QRSTClassDF.plot(kind="bar",ax=ax)
ax.set_title("Plot of the QRS Duration statistics")
ax.set_ylabel('Values of Mean, Median, Mode and Standard Deviation')
ax.set_xlabel('Classes')
fig.tight_layout()
plt.show()
QRSTClassDF
QRSDAVG | QRSDMdian | QRSDMode | QRSDStd | Class | |
---|---|---|---|---|---|
class | |||||
1.0 | 84.277551 | 84.0 | 0.0 | 8.735439 | Normal |
2.0 | 89.818182 | 90.5 | 0.0 | 10.207778 | Ischemic changes (CAD) |
3.0 | 93.333333 | 93.0 | 0.0 | 9.131317 | Old Anterior Myocardial Infraction |
4.0 | 89.866667 | 93.0 | 0.0 | 10.979635 | Old Inferior Myocardial Infraction |
5.0 | 88.000000 | 82.0 | 0.0 | 29.796532 | Sinus tachycardy |
6.0 | 83.160000 | 83.0 | 0.0 | 8.561931 | Sinus bradycardy |
7.0 | 96.333333 | 92.0 | 0.0 | 11.150486 | Ventricular Premature Contraction (PVC) |
8.0 | 98.500000 | 98.5 | 0.0 | 10.606602 | Supraventricular Premature Contraction |
9.0 | 154.444444 | 147.0 | 0.0 | 16.409685 | Left Boundle branch block |
10.0 | 97.580000 | 95.0 | 0.0 | 15.129495 | Right boundle branch block |
14.0 | 99.250000 | 97.5 | 0.0 | 6.396614 | Left Ventricule hypertrophy |
15.0 | 88.400000 | 82.0 | 0.0 | 19.034180 | Atrial Fibrillation or Flutter |
16.0 | 92.136364 | 94.0 | 0.0 | 11.825298 | Others |
sns.jointplot(x="Sex", y="Age", data=final_df, kind='kde')
<seaborn.axisgrid.JointGrid at 0x1cc0210b050>
sns.jointplot(x="Sex", y="Weight", data=final_df, kind='resid')
<seaborn.axisgrid.JointGrid at 0x1cc0ed8b350>
sns.jointplot(x="Age", y="Weight", data=final_df, kind='scatter')
<seaborn.axisgrid.JointGrid at 0x1cc0ffbe1d0>
sns.jointplot(x="QRS", y="Age", data=final_df, kind='reg',color="green")
<seaborn.axisgrid.JointGrid at 0x1cc10398b90>
sns.jointplot(x="QRS", y="Sex", data=final_df, kind='reg',color="navy")
<seaborn.axisgrid.JointGrid at 0x1cc10719ad0>
sns.jointplot(x="QRS", y="Weight", data=final_df, kind='reg',color="brown")
<seaborn.axisgrid.JointGrid at 0x1cc10cdd190>
g = sns.JointGrid(x="Age", y="Weight", data=final_df)
g.plot_joint(sns.regplot, order=2)
g.plot_marginals(sns.histplot,color="red")
<seaborn.axisgrid.JointGrid at 0x1cc11dad190>
g = sns.JointGrid(x="Weight", y="QRS", data=final_df)
g.plot_joint(sns.regplot, order=2)
g.plot_marginals(sns.histplot)
<seaborn.axisgrid.JointGrid at 0x1cc11844d90>
g = sns.JointGrid(x="Age", y="QRS", data=final_df)
g.plot_joint(sns.regplot, order=2)
g.plot_marginals(sns.histplot)
<seaborn.axisgrid.JointGrid at 0x1cc1267c110>