import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
# Load the training split; the path is relative to the current working directory.
data=pd.read_csv('train.csv')
# Preview the first rows (rendered as the notebook cell output).
data.head()
Unnamed: 0 | id | Gender | Customer Type | Age | Type of Travel | Class | Flight Distance | Inflight wifi service | Departure/Arrival time convenient | ... | Inflight entertainment | On-board service | Leg room service | Baggage handling | Checkin service | Inflight service | Cleanliness | Departure Delay in Minutes | Arrival Delay in Minutes | satisfaction | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 70172 | Male | Loyal Customer | 13 | Personal Travel | Eco Plus | 460 | 3 | 4 | ... | 5 | 4 | 3 | 4 | 4 | 5 | 5 | 25 | 18.0 | neutral or dissatisfied |
1 | 1 | 5047 | Male | disloyal Customer | 25 | Business travel | Business | 235 | 3 | 2 | ... | 1 | 1 | 5 | 3 | 1 | 4 | 1 | 1 | 6.0 | neutral or dissatisfied |
2 | 2 | 110028 | Female | Loyal Customer | 26 | Business travel | Business | 1142 | 2 | 2 | ... | 5 | 4 | 3 | 4 | 4 | 4 | 5 | 0 | 0.0 | satisfied |
3 | 3 | 24026 | Female | Loyal Customer | 25 | Business travel | Business | 562 | 2 | 5 | ... | 2 | 2 | 5 | 3 | 1 | 4 | 2 | 11 | 9.0 | neutral or dissatisfied |
4 | 4 | 119299 | Male | Loyal Customer | 61 | Business travel | Business | 214 | 3 | 3 | ... | 3 | 3 | 4 | 4 | 3 | 3 | 3 | 0 | 0.0 | satisfied |
5 rows × 25 columns
# Print each column's dtype and non-null count, plus the frame's memory usage.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 103904 entries, 0 to 103903 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 103904 non-null int64 1 id 103904 non-null int64 2 Gender 103904 non-null object 3 Customer Type 103904 non-null object 4 Age 103904 non-null int64 5 Type of Travel 103904 non-null object 6 Class 103904 non-null object 7 Flight Distance 103904 non-null int64 8 Inflight wifi service 103904 non-null int64 9 Departure/Arrival time convenient 103904 non-null int64 10 Ease of Online booking 103904 non-null int64 11 Gate location 103904 non-null int64 12 Food and drink 103904 non-null int64 13 Online boarding 103904 non-null int64 14 Seat comfort 103904 non-null int64 15 Inflight entertainment 103904 non-null int64 16 On-board service 103904 non-null int64 17 Leg room service 103904 non-null int64 18 Baggage handling 103904 non-null int64 19 Checkin service 103904 non-null int64 20 Inflight service 103904 non-null int64 21 Cleanliness 103904 non-null int64 22 Departure Delay in Minutes 103904 non-null int64 23 Arrival Delay in Minutes 103594 non-null float64 24 satisfaction 103904 non-null object dtypes: float64(1), int64(19), object(5) memory usage: 19.8+ MB
# (number of rows, number of columns) of the training frame.
data.shape
(103904, 25)
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()
Unnamed: 0 | id | Age | Flight Distance | Inflight wifi service | Departure/Arrival time convenient | Ease of Online booking | Gate location | Food and drink | Online boarding | Seat comfort | Inflight entertainment | On-board service | Leg room service | Baggage handling | Checkin service | Inflight service | Cleanliness | Departure Delay in Minutes | Arrival Delay in Minutes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 103904.000000 | 103904.000000 | 103904.000000 | 103904.000000 | 103904.000000 | 103904.000000 | 103904.000000 | 103904.000000 | 103904.000000 | 103904.000000 | 103904.000000 | 103904.000000 | 103904.000000 | 103904.000000 | 103904.000000 | 103904.000000 | 103904.000000 | 103904.000000 | 103904.000000 | 103594.000000 |
mean | 51951.500000 | 64924.210502 | 39.379706 | 1189.448375 | 2.729683 | 3.060296 | 2.756901 | 2.976883 | 3.202129 | 3.250375 | 3.439396 | 3.358158 | 3.382363 | 3.351055 | 3.631833 | 3.304290 | 3.640428 | 3.286351 | 14.815618 | 15.178678 |
std | 29994.645522 | 37463.812252 | 15.114964 | 997.147281 | 1.327829 | 1.525075 | 1.398929 | 1.277621 | 1.329533 | 1.349509 | 1.319088 | 1.332991 | 1.288354 | 1.315605 | 1.180903 | 1.265396 | 1.175663 | 1.312273 | 38.230901 | 38.698682 |
min | 0.000000 | 1.000000 | 7.000000 | 31.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 25975.750000 | 32533.750000 | 27.000000 | 414.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 3.000000 | 3.000000 | 3.000000 | 2.000000 | 0.000000 | 0.000000 |
50% | 51951.500000 | 64856.500000 | 40.000000 | 843.000000 | 3.000000 | 3.000000 | 3.000000 | 3.000000 | 3.000000 | 3.000000 | 4.000000 | 4.000000 | 4.000000 | 4.000000 | 4.000000 | 3.000000 | 4.000000 | 3.000000 | 0.000000 | 0.000000 |
75% | 77927.250000 | 97368.250000 | 51.000000 | 1743.000000 | 4.000000 | 4.000000 | 4.000000 | 4.000000 | 4.000000 | 4.000000 | 5.000000 | 4.000000 | 4.000000 | 4.000000 | 5.000000 | 4.000000 | 5.000000 | 4.000000 | 12.000000 | 13.000000 |
max | 103903.000000 | 129880.000000 | 85.000000 | 4983.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 1592.000000 | 1584.000000 |
# Count missing values per column.
data.isna().sum()
Unnamed: 0 0 id 0 Gender 0 Customer Type 0 Age 0 Type of Travel 0 Class 0 Flight Distance 0 Inflight wifi service 0 Departure/Arrival time convenient 0 Ease of Online booking 0 Gate location 0 Food and drink 0 Online boarding 0 Seat comfort 0 Inflight entertainment 0 On-board service 0 Leg room service 0 Baggage handling 0 Checkin service 0 Inflight service 0 Cleanliness 0 Departure Delay in Minutes 0 Arrival Delay in Minutes 310 satisfaction 0 dtype: int64
# Drop every row that contains at least one missing value, modifying `data` in place.
data.dropna(axis=0, inplace=True)
# Re-check the per-column missing-value counts after the drop.
data.isna().sum()
Unnamed: 0 0 id 0 Gender 0 Customer Type 0 Age 0 Type of Travel 0 Class 0 Flight Distance 0 Inflight wifi service 0 Departure/Arrival time convenient 0 Ease of Online booking 0 Gate location 0 Food and drink 0 Online boarding 0 Seat comfort 0 Inflight entertainment 0 On-board service 0 Leg room service 0 Baggage handling 0 Checkin service 0 Inflight service 0 Cleanliness 0 Departure Delay in Minutes 0 Arrival Delay in Minutes 0 satisfaction 0 dtype: int64
### Encoding ###
# Integer-encode the string-valued columns (the target included). A single
# LabelEncoder is re-fitted for each column in turn, so once the loop ends
# `le` only holds the classes of the last column, "satisfaction".
le = LabelEncoder()
for column in ("Gender", "Customer Type", "Type of Travel", "satisfaction"):
    data[column] = le.fit_transform(data[column])
### Labeling ###
# "Class" is mapped by hand rather than with LabelEncoder so that the integer
# codes are exactly the ones written in this dictionary.
class_codes = {"Eco":1,"Eco Plus":2,"Business":3}
data["Class"] = data["Class"].replace(class_codes)
实例化LabelEncoder:
le = LabelEncoder()
这一步创建了一个LabelEncoder对象le,用于后续的标签编码。
对各个特征进行编码:
对于数据集data中的每个分类特征,LabelEncoder的fit_transform方法被用来转换其值。
data["Gender"] = le.fit_transform(data["Gender"]):对Gender特征进行编码,将文本标签转换为整数。LabelEncoder按类别排序后的顺序编号,因此"Female"对应0,"Male"对应1。
data["Customer Type"] = le.fit_transform(data["Customer Type"]):对Customer Type特征进行同样的处理。
data["Type of Travel"] = le.fit_transform(data["Type of Travel"]):对Type of Travel特征进行编码。
data["satisfaction"] = le.fit_transform(data["satisfaction"]):对satisfaction特征进行编码。
fit_transform方法首先从该列数据中学习所有不同的类别(fit),然后将每个取值转换为对应的整数编号(transform)。
手动标签替换
除了使用LabelEncoder,代码还手动替换了Class特征中的标签。
data["Class"] = data["Class"].replace({"Eco":1,"Eco Plus":2,"Business":3})
这一行代码将Class特征中的每个类别("Eco", "Eco Plus", "Business")映射到一个具体的数字(1, 2, 3)。这是一种更直接的类别编码方法,特别适用于类别数量不多、且希望指定特定数值的情况。这里手动指定还保留了舱位等级的顺序(Eco < Eco Plus < Business);如果使用LabelEncoder,则会按字母顺序编号,得到Business=0、Eco=1、Eco Plus=2。
# Columns that only identify a row, not describe it: "Unnamed: 0" mirrors the
# row position in the preview above (it appears to be a saved index) and "id"
# is a record identifier. They are kept out of the feature matrix in both
# splits so the models cannot fit on them.
id_columns = ["Unnamed: 0", "id"]

# --- Training features / target ---
X_train = data.drop("satisfaction", axis=1).drop(columns=id_columns, errors="ignore")
y_train = data["satisfaction"]
# Remember the feature layout so the test split is presented to the scaler
# with exactly the same columns in the same order.
feature_columns = X_train.columns

# Standardise the features; the scaler's statistics are learned from the
# training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# Candidate classifiers as (display name, unfitted estimator) pairs.
model_list = [
    ("Random Forest Classifier", RandomForestClassifier(random_state=42)),
    ("Support Vector Machines", SVC(random_state=42)),
    ("Decision Tree Classifier", DecisionTreeClassifier(random_state=42)),
    ("KNeighbors Classifier", KNeighborsClassifier(n_neighbors=2)),
    ("Gaussian Naive Bayes", GaussianNB()),
]

# --- Test split: repeat the preprocessing applied to the training split ---
testData = pd.read_csv('test.csv')
testData.dropna(axis=0, inplace=True)
### Encoding ###
# NOTE(review): the encoder is re-fitted on the test columns. The integer codes
# agree with the training codes only if both files contain the same set of
# categories in every column -- confirm, or keep one encoder per column fitted
# on the training data and call transform() here instead.
le = LabelEncoder()
for column in ("Gender", "Customer Type", "Type of Travel", "satisfaction"):
    testData[column] = le.fit_transform(testData[column])
### Labeling ###
testData["Class"] = testData["Class"].replace({"Eco":1,"Eco Plus":2,"Business":3})

Xtest = testData[feature_columns]
ytest = testData["satisfaction"]
# Reuse the statistics learned from the training split: transform(), not
# fit_transform(). Re-fitting on the test split would standardise it with its
# own mean/variance, which differs from what the models were trained on.
Xtest = scaler.transform(Xtest)
# Fit every candidate on the training split and score it on the test split.
# Results are collected in a plain list and turned into a DataFrame once, which
# avoids concatenating onto an empty frame on every iteration.
results = []
for algoName, model in model_list:
    model.fit(X_train, y_train)
    predictions = model.predict(Xtest)
    results.append(
        {"Model Name": algoName, "Accuracy Score": accuracy_score(ytest, predictions)}
    )

# One row per model; passing the column names keeps the frame well-formed even
# when model_list is empty.
models = pd.DataFrame(results, columns=["Model Name", "Accuracy Score"])
# Order the models from highest to lowest accuracy.
models = models.sort_values(by="Accuracy Score", ascending=False)
# Show the models together with their accuracy (notebook cell output).
models
Model Name | Accuracy Score | |
---|---|---|
0 | Random Forest Classifier | 0.963890 |
1 | Support Vector Machines | 0.956050 |
2 | Decision Tree Classifier | 0.945700 |
3 | KNeighbors Classifier | 0.909319 |
4 | Gaussian Naive Bayes | 0.861970 |