导入必要的库¶

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

读取数据¶

In [2]:
data=pd.read_csv('train.csv')
In [3]:
data.head()
Out[3]:
Unnamed: 0 id Gender Customer Type Age Type of Travel Class Flight Distance Inflight wifi service Departure/Arrival time convenient ... Inflight entertainment On-board service Leg room service Baggage handling Checkin service Inflight service Cleanliness Departure Delay in Minutes Arrival Delay in Minutes satisfaction
0 0 70172 Male Loyal Customer 13 Personal Travel Eco Plus 460 3 4 ... 5 4 3 4 4 5 5 25 18.0 neutral or dissatisfied
1 1 5047 Male disloyal Customer 25 Business travel Business 235 3 2 ... 1 1 5 3 1 4 1 1 6.0 neutral or dissatisfied
2 2 110028 Female Loyal Customer 26 Business travel Business 1142 2 2 ... 5 4 3 4 4 4 5 0 0.0 satisfied
3 3 24026 Female Loyal Customer 25 Business travel Business 562 2 5 ... 2 2 5 3 1 4 2 11 9.0 neutral or dissatisfied
4 4 119299 Male Loyal Customer 61 Business travel Business 214 3 3 ... 3 3 4 4 3 3 3 0 0.0 satisfied

5 rows × 25 columns

In [4]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 25 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Unnamed: 0                         103904 non-null  int64  
 1   id                                 103904 non-null  int64  
 2   Gender                             103904 non-null  object 
 3   Customer Type                      103904 non-null  object 
 4   Age                                103904 non-null  int64  
 5   Type of Travel                     103904 non-null  object 
 6   Class                              103904 non-null  object 
 7   Flight Distance                    103904 non-null  int64  
 8   Inflight wifi service              103904 non-null  int64  
 9   Departure/Arrival time convenient  103904 non-null  int64  
 10  Ease of Online booking             103904 non-null  int64  
 11  Gate location                      103904 non-null  int64  
 12  Food and drink                     103904 non-null  int64  
 13  Online boarding                    103904 non-null  int64  
 14  Seat comfort                       103904 non-null  int64  
 15  Inflight entertainment             103904 non-null  int64  
 16  On-board service                   103904 non-null  int64  
 17  Leg room service                   103904 non-null  int64  
 18  Baggage handling                   103904 non-null  int64  
 19  Checkin service                    103904 non-null  int64  
 20  Inflight service                   103904 non-null  int64  
 21  Cleanliness                        103904 non-null  int64  
 22  Departure Delay in Minutes         103904 non-null  int64  
 23  Arrival Delay in Minutes           103594 non-null  float64
 24  satisfaction                       103904 non-null  object 
dtypes: float64(1), int64(19), object(5)
memory usage: 19.8+ MB
In [5]:
data.shape
Out[5]:
(103904, 25)
In [6]:
data.describe()
Out[6]:
Unnamed: 0 id Age Flight Distance Inflight wifi service Departure/Arrival time convenient Ease of Online booking Gate location Food and drink Online boarding Seat comfort Inflight entertainment On-board service Leg room service Baggage handling Checkin service Inflight service Cleanliness Departure Delay in Minutes Arrival Delay in Minutes
count 103904.000000 103904.000000 103904.000000 103904.000000 103904.000000 103904.000000 103904.000000 103904.000000 103904.000000 103904.000000 103904.000000 103904.000000 103904.000000 103904.000000 103904.000000 103904.000000 103904.000000 103904.000000 103904.000000 103594.000000
mean 51951.500000 64924.210502 39.379706 1189.448375 2.729683 3.060296 2.756901 2.976883 3.202129 3.250375 3.439396 3.358158 3.382363 3.351055 3.631833 3.304290 3.640428 3.286351 14.815618 15.178678
std 29994.645522 37463.812252 15.114964 997.147281 1.327829 1.525075 1.398929 1.277621 1.329533 1.349509 1.319088 1.332991 1.288354 1.315605 1.180903 1.265396 1.175663 1.312273 38.230901 38.698682
min 0.000000 1.000000 7.000000 31.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 25975.750000 32533.750000 27.000000 414.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 3.000000 3.000000 3.000000 2.000000 0.000000 0.000000
50% 51951.500000 64856.500000 40.000000 843.000000 3.000000 3.000000 3.000000 3.000000 3.000000 3.000000 4.000000 4.000000 4.000000 4.000000 4.000000 3.000000 4.000000 3.000000 0.000000 0.000000
75% 77927.250000 97368.250000 51.000000 1743.000000 4.000000 4.000000 4.000000 4.000000 4.000000 4.000000 5.000000 4.000000 4.000000 4.000000 5.000000 4.000000 5.000000 4.000000 12.000000 13.000000
max 103903.000000 129880.000000 85.000000 4983.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 1592.000000 1584.000000

数据清洗¶

In [7]:
data.isna().sum()
Out[7]:
Unnamed: 0                             0
id                                     0
Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Inflight wifi service                  0
Departure/Arrival time convenient      0
Ease of Online booking                 0
Gate location                          0
Food and drink                         0
Online boarding                        0
Seat comfort                           0
Inflight entertainment                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Inflight service                       0
Cleanliness                            0
Departure Delay in Minutes             0
Arrival Delay in Minutes             310
satisfaction                           0
dtype: int64
In [8]:
data.dropna(axis=0, inplace=True)
In [9]:
data.isna().sum()
Out[9]:
Unnamed: 0                           0
id                                   0
Gender                               0
Customer Type                        0
Age                                  0
Type of Travel                       0
Class                                0
Flight Distance                      0
Inflight wifi service                0
Departure/Arrival time convenient    0
Ease of Online booking               0
Gate location                        0
Food and drink                       0
Online boarding                      0
Seat comfort                         0
Inflight entertainment               0
On-board service                     0
Leg room service                     0
Baggage handling                     0
Checkin service                      0
Inflight service                     0
Cleanliness                          0
Departure Delay in Minutes           0
Arrival Delay in Minutes             0
satisfaction                         0
dtype: int64
In [10]:
### Encoding ###
le = LabelEncoder()
data["Gender"] = le.fit_transform(data["Gender"])
data["Customer Type"] = le.fit_transform(data["Customer Type"])
data["Type of Travel"] = le.fit_transform(data["Type of Travel"])
data["satisfaction"] = le.fit_transform(data["satisfaction"])

### Labeling ###
data["Class"] = data["Class"].replace({"Eco":1,"Eco Plus":2,"Business":3})

实例化LabelEncoder:

le = LabelEncoder()

这一步创建了一个LabelEncoder对象le,用于后续的标签编码。

对各个特征进行编码:

对于数据集data中的每个分类特征,LabelEncoder的fit_transform方法被用来转换其值。

data["Gender"] = le.fit_transform(data["Gender"]):对Gender特征进行编码,将文本标签(如"Male","Female")转换为数字(如0, 1)。

data["Customer Type"] = le.fit_transform(data["Customer Type"]):对Customer Type特征进行同样的处理。

data["Type of Travel"] = le.fit_transform(data["Type of Travel"]):对Type of Travel特征进行编码。

data["satisfaction"] = le.fit_transform(data["satisfaction"]):对satisfaction特征进行编码。

fit_transform方法首先将标签拟合到数据上,然后将它们转换为适当的数值标签。

手动标签替换

除了使用LabelEncoder,代码还手动替换了Class特征中的标签。

data["Class"] = data["Class"].replace({"Eco":1,"Eco Plus":2,"Business":3})

这一行代码将Class特征中的每个类别("Eco", "Eco Plus", "Business")映射到一个具体的数字(1, 2, 3)。这是一种更直接的方法来进行类别编码,特别是当类别的数量不多,且您希望指定特定的数值时。

模型训练¶

In [11]:
X_train = data.drop("satisfaction", axis=1)
y_train= data["satisfaction"]
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
models = pd.DataFrame(columns=["Model Name","Accuracy Score"])
In [12]:
model_list = [("Random Forest Classifier",RandomForestClassifier(random_state=42)),
             ("Support Vector Machines",SVC(random_state=42)),
             ("Decision Tree Classifier", DecisionTreeClassifier(random_state=42)),
             ("KNeighbors Classifier",KNeighborsClassifier(n_neighbors=2)),
             ("Gaussian Naive Bayes", GaussianNB())]

模型验证¶

In [13]:
testData=pd.read_csv('test.csv')
testData.dropna(axis=0, inplace=True)

### Encoding ###
le = LabelEncoder()
testData["Gender"] = le.fit_transform(testData["Gender"])
testData["Customer Type"] = le.fit_transform(testData["Customer Type"])
testData["Type of Travel"] = le.fit_transform(testData["Type of Travel"])
testData["satisfaction"] = le.fit_transform(testData["satisfaction"])

### Labeling ###
testData["Class"] = testData["Class"].replace({"Eco":1,"Eco Plus":2,"Business":3})

Xtest = testData.drop("satisfaction", axis=1)
ytest= testData["satisfaction"]
Xtest = scaler.fit_transform(Xtest)
In [14]:
# 创建一个空的DataFrame来存储结果
models = pd.DataFrame(columns=["Model Name", "Accuracy Score"])

# 循环遍历模型
for algoName, model in model_list:
    model.fit(X_train, y_train)
    predictions = model.predict(Xtest)
    score = accuracy_score(ytest, predictions)
    new_row = {"Model Name": algoName, "Accuracy Score": score}

    # 使用 pd.concat 而不是 append
    models = pd.concat([models, pd.DataFrame([new_row])], ignore_index=True)

# 对模型按照准确率降序排列
models = models.sort_values(by="Accuracy Score", ascending=False)

# 显示模型及其准确率
models
Out[14]:
Model Name Accuracy Score
0 Random Forest Classifier 0.963890
1 Support Vector Machines 0.956050
2 Decision Tree Classifier 0.945700
3 KNeighbors Classifier 0.909319
4 Gaussian Naive Bayes 0.861970
In [ ]: