In [39]:
# Import the necessary libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
In [40]:
# Load the dataset
df = pd.read_csv('admission_predict.csv')

Exploring the Dataset

In [41]:
# Return the number of rows and columns in the dataset
df.shape
Out[41]:
(500, 9)
In [42]:
# head(n) returns the first n rows; if no number is given, it defaults to 5
df.head()
Out[42]:
Serial No. GRE Score TOEFL Score University Rating SOP LOR CGPA Research Chance of Admit
0 1 337 118 4 4.5 4.5 9.65 1 0.92
1 2 324 107 4 4.0 4.5 8.87 1 0.76
2 3 316 104 3 3.0 3.5 8.00 1 0.72
3 4 322 110 3 3.5 2.5 8.67 1 0.80
4 5 314 103 2 2.0 3.0 8.21 0 0.65
In [43]:
# tail(n) returns the last n rows; if no number is given, it defaults to 5
df.tail()
Out[43]:
Serial No. GRE Score TOEFL Score University Rating SOP LOR CGPA Research Chance of Admit
495 496 332 108 5 4.5 4.0 9.02 1 0.87
496 497 337 117 5 5.0 5.0 9.87 1 0.96
497 498 330 120 5 4.5 5.0 9.56 1 0.93
498 499 312 103 4 4.0 5.0 8.43 0 0.73
499 500 327 113 4 4.5 4.5 9.04 0 0.84
In [44]:
# Return an object containing all of the column labels
df.columns
Out[44]:
Index(['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
       'LOR ', 'CGPA', 'Research', 'Chance of Admit '],
      dtype='object')
In [45]:
# Return basic information about every column
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Serial No.         500 non-null    int64  
 1   GRE Score          500 non-null    int64  
 2   TOEFL Score        500 non-null    int64  
 3   University Rating  500 non-null    int64  
 4   SOP                500 non-null    float64
 5   LOR                500 non-null    float64
 6   CGPA               500 non-null    float64
 7   Research           500 non-null    int64  
 8   Chance of Admit    500 non-null    float64
dtypes: float64(4), int64(5)
memory usage: 35.3 KB
In [46]:
# Return basic statistics for the numeric columns
df.describe().T
Out[46]:
count mean std min 25% 50% 75% max
Serial No. 500.0 250.50000 144.481833 1.00 125.7500 250.50 375.25 500.00
GRE Score 500.0 316.47200 11.295148 290.00 308.0000 317.00 325.00 340.00
TOEFL Score 500.0 107.19200 6.081868 92.00 103.0000 107.00 112.00 120.00
University Rating 500.0 3.11400 1.143512 1.00 2.0000 3.00 4.00 5.00
SOP 500.0 3.37400 0.991004 1.00 2.5000 3.50 4.00 5.00
LOR 500.0 3.48400 0.925450 1.00 3.0000 3.50 4.00 5.00
CGPA 500.0 8.57644 0.604813 6.80 8.1275 8.56 9.04 9.92
Research 500.0 0.56000 0.496884 0.00 0.0000 1.00 1.00 1.00
Chance of Admit 500.0 0.72174 0.141140 0.34 0.6300 0.72 0.82 0.97
In [47]:
# Return the data type of each column (float, int, string, bool, etc.)
df.dtypes
Out[47]:
Serial No.             int64
GRE Score              int64
TOEFL Score            int64
University Rating      int64
SOP                  float64
LOR                  float64
CGPA                 float64
Research               int64
Chance of Admit      float64
dtype: object
In [48]:
# Return True for any column that contains null values, otherwise False
df.isnull().any()
Out[48]:
Serial No.           False
GRE Score            False
TOEFL Score          False
University Rating    False
SOP                  False
LOR                  False
CGPA                 False
Research             False
Chance of Admit      False
dtype: bool
In [49]:
# Rename the columns with cleaner names
df = df.rename(columns={'GRE Score': 'GRE', 'TOEFL Score': 'TOEFL', 'LOR ': 'LOR', 'Chance of Admit ': 'Probability'})
df.head()
Out[49]:
Serial No. GRE TOEFL University Rating SOP LOR CGPA Research Probability
0 1 337 118 4 4.5 4.5 9.65 1 0.92
1 2 324 107 4 4.0 4.5 8.87 1 0.76
2 3 316 104 3 3.0 3.5 8.00 1 0.72
3 4 322 110 3 3.5 2.5 8.67 1 0.80
4 5 314 103 2 2.0 3.0 8.21 0 0.65

Data Visualization

In [50]:
# Visualize the GRE feature
plt.hist(df['GRE'], rwidth=0.7)
plt.title("Distribution of GRE Scores")
plt.xlabel('GRE Scores')
plt.ylabel('Count')
plt.show()
In [51]:
# Visualize the TOEFL feature
plt.hist(df['TOEFL'], rwidth=0.7)
plt.title('Distribution of TOEFL Scores')
plt.xlabel('TOEFL Scores')
plt.ylabel('Count')
plt.show()
In [52]:
# Visualize the University Rating feature
plt.hist(df['University Rating'], rwidth=0.7)
plt.title('Distribution of University Rating')
plt.xlabel('University Rating')
plt.ylabel('Count')
plt.show()
In [53]:
# Visualize the Statement of Purpose (SOP) rating
plt.hist(df['SOP'], rwidth=0.7)
plt.title('Distribution of SOP')
plt.xlabel('SOP Rating')
plt.ylabel('Count')
plt.show()
In [54]:
# Visualize the Letter of Recommendation (LOR) rating
plt.hist(df['LOR'], rwidth=0.7)
plt.title('Distribution of LOR Rating')
plt.xlabel('LOR Rating')
plt.ylabel('Count')
plt.show()
In [55]:
# Visualize the undergraduate CGPA
plt.hist(df['CGPA'], rwidth=0.7)
plt.title('Distribution of CGPA')
plt.xlabel('CGPA')
plt.ylabel('Count')
plt.show()
In [56]:
# Visualize the Research feature (research experience: 0 or 1)
plt.hist(df['Research'], rwidth=0.7)
plt.title('Distribution of Research Papers')
plt.xlabel('Research')
plt.ylabel('Count')
plt.show()
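
A quick way to gauge which of these features matter most is to check their correlation with the admission probability. The line below is a minimal sketch using only the pandas API already imported above (every column here is numeric, so a plain corr() call is enough):

# Correlation of every column with the target, strongest first
print(df.corr()['Probability'].sort_values(ascending=False))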

Data Cleaning

In [57]:
# Drop the Serial No. column
df.drop('Serial No.', axis='columns', inplace=True)
df.head()
Out[57]:
GRE TOEFL University Rating SOP LOR CGPA Research Probability
0 337 118 4 4.5 4.5 9.65 1 0.92
1 324 107 4 4.0 4.5 8.87 1 0.76
2 316 104 3 3.0 3.5 8.00 1 0.72
3 322 110 3 3.5 2.5 8.67 1 0.80
4 314 103 2 2.0 3.0 8.21 0 0.65
In [58]:
# Replace 0 values in ['GRE','TOEFL','University Rating','SOP','LOR','CGPA'] with NaN
df_copy = df.copy(deep=True)
df_copy[['GRE','TOEFL','University Rating','SOP','LOR','CGPA']] = df_copy[['GRE','TOEFL','University Rating','SOP','LOR','CGPA']].replace(0, np.nan)
df_copy.isnull().sum()
Out[58]:
GRE                  0
TOEFL                0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Probability          0
dtype: int64
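
No zeros were found, so no imputation is needed here. Had any appeared, one simple option would be to fill the resulting NaNs with each column's median, as in this sketch:

# Fill any NaNs with the column median (a no-op for this dataset)
df_copy = df_copy.fillna(df_copy.median())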

Building the Model

In [59]:
# Split the dataset into features and label
X = df_copy.drop('Probability', axis='columns')
y = df_copy['Probability']
In [60]:
# Use GridSearchCV to find the best algorithm for this problem
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
In [61]:
# Create a function to find the best model for this problem
def find_best_model(X, y):
    models = {
        'linear_regression': {
            'model': LinearRegression(),
            'parameters': {
                'fit_intercept': [True, False]
            }
        },
        
        'lasso': {
            'model': Lasso(),
            'parameters': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        
        'svr': {
            'model': SVR(),
            'parameters': {
                'gamma': ['auto','scale']
            }
        },
        
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'parameters': {
                'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],  
                'splitter': ['best', 'random']
            }
        },
        
        'random_forest': {
            'model': RandomForestRegressor(),
            'parameters': {
                'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],   
                'n_estimators': [5, 10, 15, 20]
            }
        },
        
        'knn': {
            'model': KNeighborsRegressor(algorithm='auto'),
            'parameters': {
                'n_neighbors': [2,5,10,20]
            }
        }
    }
    
    scores = []
    for model_name, model_params in models.items():
        gs = GridSearchCV(model_params['model'], model_params['parameters'], cv=5, return_train_score=False)
        gs.fit(X, y)
        scores.append({
            'model': model_name,
            'best_parameters': gs.best_params_,
            'score': gs.best_score_
        })
        
    return pd.DataFrame(scores, columns=['model','best_parameters','score'])
        
find_best_model(X, y)
Out[61]:
model best_parameters score
0 linear_regression {'fit_intercept': True} 0.810802
1 lasso {'alpha': 1, 'selection': 'random'} 0.215088
2 svr {'gamma': 'scale'} 0.654099
3 decision_tree {'criterion': 'absolute_error', 'splitter': 'b... 0.606752
4 random_forest {'criterion': 'absolute_error', 'n_estimators'... 0.780361
5 knn {'n_neighbors': 20} 0.723017

Since the linear regression algorithm scores highest, it is the model chosen for this problem.

In [62]:
# Use cross_val_score to get the cross-validated accuracy
from sklearn.model_selection import cross_val_score
scores = cross_val_score(LinearRegression(), X, y, cv=5)
print('Highest Accuracy : {}%'.format(round(scores.mean() * 100)))
Highest Accuracy : 81%
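
For a fuller picture than the single averaged figure, the per-fold scores and their spread can be printed as well. A minimal sketch, reusing the scores array from the cell above:

# R^2 score of each fold, plus the mean and standard deviation
print('Fold scores :', [round(s, 3) for s in scores])
print('Mean +/- std: {:.3f} +/- {:.3f}'.format(scores.mean(), scores.std()))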
In [63]:
# Split the dataset into training and test samples
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=5)
print(len(X_train), len(X_test))
400 100
In [64]:
# Create the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)
model.score(X_test, y_test)
Out[64]:
0.8214787365889655
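
The R^2 score above can be complemented with error metrics and a look at the fitted coefficients. A minimal sketch, assuming model, X_test and y_test from the cells above:

# Mean absolute error, root mean squared error, and per-feature coefficients
from sklearn.metrics import mean_absolute_error, mean_squared_error
y_pred = model.predict(X_test)
print('MAE :', round(mean_absolute_error(y_test, y_pred), 4))
print('RMSE:', round(mean_squared_error(y_test, y_pred) ** 0.5, 4))
print(pd.Series(model.coef_, index=X.columns).sort_values(ascending=False))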

Predicting Values with Our Trained Model

In [65]:
# Define the column names to match the training data
columns = ['GRE','TOEFL','University Rating','SOP','LOR','CGPA','Research']

# Create a DataFrame for the input data
input_data = pd.DataFrame([[337, 118, 4, 4.5, 4.5, 9.65, 0]], columns=columns)

# Make the prediction
predicted_chance = model.predict(input_data)[0]

# Display the result
print('Chance of getting into UCLA is {}%'.format(round(predicted_chance * 100, 3)))
Chance of getting into UCLA is 92.855%
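
If the trained model is to be reused outside this notebook, it can be persisted with joblib. A minimal sketch; the file name admission_model.joblib is an arbitrary choice:

# Save the fitted model to disk and load it back when needed
import joblib
joblib.dump(model, 'admission_model.joblib')
# model = joblib.load('admission_model.joblib')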