import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


# Load the training split from a path relative to the working directory.
data=pd.read_csv('train.csv')


# Preview the first rows to check that the columns were parsed as expected.
data.head()


# Print each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 25 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Unnamed: 0                         103904 non-null  int64  
 1   id                                 103904 non-null  int64  
 2   Gender                             103904 non-null  object 
 3   Customer Type                      103904 non-null  object 
 4   Age                                103904 non-null  int64  
 5   Type of Travel                     103904 non-null  object 
 6   Class                              103904 non-null  object 
 7   Flight Distance                    103904 non-null  int64  
 8   Inflight wifi service              103904 non-null  int64  
 9   Departure/Arrival time convenient  103904 non-null  int64  
 10  Ease of Online booking             103904 non-null  int64  
 11  Gate location                      103904 non-null  int64  
 12  Food and drink                     103904 non-null  int64  
 13  Online boarding                    103904 non-null  int64  
 14  Seat comfort                       103904 non-null  int64  
 15  Inflight entertainment             103904 non-null  int64  
 16  On-board service                   103904 non-null  int64  
 17  Leg room service                   103904 non-null  int64  
 18  Baggage handling                   103904 non-null  int64  
 19  Checkin service                    103904 non-null  int64  
 20  Inflight service                   103904 non-null  int64  
 21  Cleanliness                        103904 non-null  int64  
 22  Departure Delay in Minutes         103904 non-null  int64  
 23  Arrival Delay in Minutes           103594 non-null  float64
 24  satisfaction                       103904 non-null  object 
dtypes: float64(1), int64(19), object(5)
memory usage: 19.8+ MB


# (rows, columns) of the loaded frame.
data.shape

(103904, 25)


# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data.describe()


# Number of missing values in each column.
data.isna().sum()

Unnamed: 0                             0
id                                     0
Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Inflight wifi service                  0
Departure/Arrival time convenient      0
Ease of Online booking                 0
Gate location                          0
Food and drink                         0
Online boarding                        0
Seat comfort                           0
Inflight entertainment                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Inflight service                       0
Cleanliness                            0
Departure Delay in Minutes             0
Arrival Delay in Minutes             310
satisfaction                           0
dtype: int64


# Drop, in place, every row that contains at least one missing value.
data.dropna(axis=0, inplace=True)


# Re-count missing values to confirm that none remain after the drop.
data.isna().sum()

Unnamed: 0                           0
id                                   0
Gender                               0
Customer Type                        0
Age                                  0
Type of Travel                       0
Class                                0
Flight Distance                      0
Inflight wifi service                0
Departure/Arrival time convenient    0
Ease of Online booking               0
Gate location                        0
Food and drink                       0
Online boarding                      0
Seat comfort                         0
Inflight entertainment               0
On-board service                     0
Leg room service                     0
Baggage handling                     0
Checkin service                      0
Inflight service                     0
Cleanliness                          0
Departure Delay in Minutes           0
Arrival Delay in Minutes             0
satisfaction                         0
dtype: int64


### Encoding ###
# Integer-encode the categorical columns and the target. One encoder object is
# refit for every column, so after the loop `le` only remembers the classes of
# the last column ("satisfaction").
le = LabelEncoder()
for column in ("Gender", "Customer Type", "Type of Travel", "satisfaction"):
    data[column] = le.fit_transform(data[column])

### Labeling ###
# "Class" is ordinal, so it gets an explicit rank instead of an alphabetical
# code. Series.map is used rather than replace: replacing strings with ints on
# an object column relies on implicit downcasting, which pandas has deprecated.
# A label missing from the mapping becomes NaN, and the int64 cast then raises
# instead of letting an unmapped value travel downstream.
class_rank = {"Eco": 1, "Eco Plus": 2, "Business": 3}
data["Class"] = data["Class"].map(class_rank).astype("int64")


# Row-identifier columns. "Unnamed: 0" mirrors the row index of the CSV and
# "id" is presumably a record identifier (NOTE(review): confirm); neither
# describes the passenger, so they are kept out of the feature matrix.
ID_COLUMNS = ["Unnamed: 0", "id"]

# Training features / target. The feature column list is remembered so the
# test split can be reduced to exactly the same columns in the same order.
X_train = data.drop(columns="satisfaction").drop(columns=ID_COLUMNS, errors="ignore")
y_train = data["satisfaction"]
feature_columns = list(X_train.columns)

# Standardisation statistics are learned from the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)


# Candidate classifiers as (display name, unfitted estimator) pairs.
model_list = [
    ("Random Forest Classifier", RandomForestClassifier(random_state=42)),
    ("Support Vector Machines", SVC(random_state=42)),
    ("Decision Tree Classifier", DecisionTreeClassifier(random_state=42)),
    ("KNeighbors Classifier", KNeighborsClassifier(n_neighbors=2)),
    ("Gaussian Naive Bayes", GaussianNB()),
]


# Load the held-out split and drop rows with missing values, as done for the
# training split.
testData = pd.read_csv('test.csv')
testData.dropna(axis=0, inplace=True)

### Encoding ###
# NOTE(review): the encoder is refit on the test split. LabelEncoder assigns
# codes by sorted label order, so the codes match the training ones only if
# both splits contain the same set of labels in each column -- confirm.
le = LabelEncoder()
for column in ("Gender", "Customer Type", "Type of Travel", "satisfaction"):
    testData[column] = le.fit_transform(testData[column])

### Labeling ###
# Same ordinal ranking as the training split; an unmapped label becomes NaN
# and makes the int64 cast raise.
class_rank = {"Eco": 1, "Eco Plus": 2, "Business": 3}
testData["Class"] = testData["Class"].map(class_rank).astype("int64")

# Selecting by the training column list raises KeyError if a feature is
# missing and guarantees identical column order.
Xtest = testData[feature_columns]
ytest = testData["satisfaction"]
# Reuse the statistics fitted on the training split. Refitting the scaler here
# would standardise the test data with its own mean/std, so the models would
# be scored on inputs scaled differently from what they were trained on.
Xtest = scaler.transform(Xtest)


# Fit every candidate on the training split and record its accuracy on the
# held-out split. Rows are collected in a plain list and turned into a
# DataFrame once: growing a frame with pd.concat inside the loop copies it on
# every iteration and, when the starting frame is empty, triggers a pandas
# FutureWarning about concatenating empty entries.
results = []
for algoName, model in model_list:
    model.fit(X_train, y_train)
    predictions = model.predict(Xtest)
    results.append(
        {"Model Name": algoName, "Accuracy Score": accuracy_score(ytest, predictions)}
    )

# Explicit columns keep the frame well-formed even if model_list is empty.
models = pd.DataFrame(results, columns=["Model Name", "Accuracy Score"])

# Best-scoring model first.
models = models.sort_values(by="Accuracy Score", ascending=False)

# Display the ranked models and their accuracy.
models

	Unnamed: 0	id	Gender	Customer Type	Age	Type of Travel	Class	Flight Distance	Inflight wifi service	Departure/Arrival time convenient	...	Inflight entertainment	On-board service	Leg room service	Baggage handling	Checkin service	Inflight service	Cleanliness	Departure Delay in Minutes	Arrival Delay in Minutes	satisfaction
0	0	70172	Male	Loyal Customer	13	Personal Travel	Eco Plus	460	3	4	...	5	4	3	4	4	5	5	25	18.0	neutral or dissatisfied
1	1	5047	Male	disloyal Customer	25	Business travel	Business	235	3	2	...	1	1	5	3	1	4	1	1	6.0	neutral or dissatisfied
2	2	110028	Female	Loyal Customer	26	Business travel	Business	1142	2	2	...	5	4	3	4	4	4	5	0	0.0	satisfied
3	3	24026	Female	Loyal Customer	25	Business travel	Business	562	2	5	...	2	2	5	3	1	4	2	11	9.0	neutral or dissatisfied
4	4	119299	Male	Loyal Customer	61	Business travel	Business	214	3	3	...	3	3	4	4	3	3	3	0	0.0	satisfied

	Unnamed: 0	id	Age	Flight Distance	Inflight wifi service	Departure/Arrival time convenient	Ease of Online booking	Gate location	Food and drink	Online boarding	Seat comfort	Inflight entertainment	On-board service	Leg room service	Baggage handling	Checkin service	Inflight service	Cleanliness	Departure Delay in Minutes	Arrival Delay in Minutes
count	103904.000000	103904.000000	103904.000000	103904.000000	103904.000000	103904.000000	103904.000000	103904.000000	103904.000000	103904.000000	103904.000000	103904.000000	103904.000000	103904.000000	103904.000000	103904.000000	103904.000000	103904.000000	103904.000000	103594.000000
mean	51951.500000	64924.210502	39.379706	1189.448375	2.729683	3.060296	2.756901	2.976883	3.202129	3.250375	3.439396	3.358158	3.382363	3.351055	3.631833	3.304290	3.640428	3.286351	14.815618	15.178678
std	29994.645522	37463.812252	15.114964	997.147281	1.327829	1.525075	1.398929	1.277621	1.329533	1.349509	1.319088	1.332991	1.288354	1.315605	1.180903	1.265396	1.175663	1.312273	38.230901	38.698682
min	0.000000	1.000000	7.000000	31.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	25975.750000	32533.750000	27.000000	414.000000	2.000000	2.000000	2.000000	2.000000	2.000000	2.000000	2.000000	2.000000	2.000000	2.000000	3.000000	3.000000	3.000000	2.000000	0.000000	0.000000
50%	51951.500000	64856.500000	40.000000	843.000000	3.000000	3.000000	3.000000	3.000000	3.000000	3.000000	4.000000	4.000000	4.000000	4.000000	4.000000	3.000000	4.000000	3.000000	0.000000	0.000000
75%	77927.250000	97368.250000	51.000000	1743.000000	4.000000	4.000000	4.000000	4.000000	4.000000	4.000000	5.000000	4.000000	4.000000	4.000000	5.000000	4.000000	5.000000	4.000000	12.000000	13.000000
max	103903.000000	129880.000000	85.000000	4983.000000	5.000000	5.000000	5.000000	5.000000	5.000000	5.000000	5.000000	5.000000	5.000000	5.000000	5.000000	5.000000	5.000000	5.000000	1592.000000	1584.000000

导入必要的库

读取数据

数据清洗

模型训练

模型验证

	Model Name	Accuracy Score
0	Random Forest Classifier	0.963890
1	Support Vector Machines	0.956050
2	Decision Tree Classifier	0.945700
3	KNeighbors Classifier	0.909319
4	Gaussian Naive Bayes	0.861970