# Import the required libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
# Load the dataset
df = pd.read_csv('admission_predict.csv')
# Return the number of rows and columns in the dataset
df.shape
(500, 9)
# head(n) returns the first n rows; with no argument it returns the first 5
df.head()
| | Serial No. | GRE Score | TOEFL Score | University Rating | SOP | LOR | CGPA | Research | Chance of Admit |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 1 | 337 | 118 | 4 | 4.5 | 4.5 | 9.65 | 1 | 0.92 |
| 1 | 2 | 324 | 107 | 4 | 4.0 | 4.5 | 8.87 | 1 | 0.76 |
| 2 | 3 | 316 | 104 | 3 | 3.0 | 3.5 | 8.00 | 1 | 0.72 |
| 3 | 4 | 322 | 110 | 3 | 3.5 | 2.5 | 8.67 | 1 | 0.80 |
| 4 | 5 | 314 | 103 | 2 | 2.0 | 3.0 | 8.21 | 0 | 0.65 |
# tail(n) returns the last n rows; with no argument it returns the last 5
df.tail()
| | Serial No. | GRE Score | TOEFL Score | University Rating | SOP | LOR | CGPA | Research | Chance of Admit |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 495 | 496 | 332 | 108 | 5 | 4.5 | 4.0 | 9.02 | 1 | 0.87 |
| 496 | 497 | 337 | 117 | 5 | 5.0 | 5.0 | 9.87 | 1 | 0.96 |
| 497 | 498 | 330 | 120 | 5 | 4.5 | 5.0 | 9.56 | 1 | 0.93 |
| 498 | 499 | 312 | 103 | 4 | 4.0 | 5.0 | 8.43 | 0 | 0.73 |
| 499 | 500 | 327 | 113 | 4 | 4.5 | 4.5 | 9.04 | 0 | 0.84 |
# Return an Index object containing all column labels
df.columns
Index(['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA', 'Research', 'Chance of Admit '], dtype='object')
# Return basic information about every column
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Serial No.         500 non-null    int64
 1   GRE Score          500 non-null    int64
 2   TOEFL Score        500 non-null    int64
 3   University Rating  500 non-null    int64
 4   SOP                500 non-null    float64
 5   LOR                500 non-null    float64
 6   CGPA               500 non-null    float64
 7   Research           500 non-null    int64
 8   Chance of Admit    500 non-null    float64
dtypes: float64(4), int64(5)
memory usage: 35.3 KB
# Return summary statistics for the numeric columns
df.describe().T
| | count | mean | std | min | 25% | 50% | 75% | max |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| Serial No. | 500.0 | 250.50000 | 144.481833 | 1.00 | 125.7500 | 250.50 | 375.25 | 500.00 |
| GRE Score | 500.0 | 316.47200 | 11.295148 | 290.00 | 308.0000 | 317.00 | 325.00 | 340.00 |
| TOEFL Score | 500.0 | 107.19200 | 6.081868 | 92.00 | 103.0000 | 107.00 | 112.00 | 120.00 |
| University Rating | 500.0 | 3.11400 | 1.143512 | 1.00 | 2.0000 | 3.00 | 4.00 | 5.00 |
| SOP | 500.0 | 3.37400 | 0.991004 | 1.00 | 2.5000 | 3.50 | 4.00 | 5.00 |
| LOR | 500.0 | 3.48400 | 0.925450 | 1.00 | 3.0000 | 3.50 | 4.00 | 5.00 |
| CGPA | 500.0 | 8.57644 | 0.604813 | 6.80 | 8.1275 | 8.56 | 9.04 | 9.92 |
| Research | 500.0 | 0.56000 | 0.496884 | 0.00 | 0.0000 | 1.00 | 1.00 | 1.00 |
| Chance of Admit | 500.0 | 0.72174 | 0.141140 | 0.34 | 0.6300 | 0.72 | 0.82 | 0.97 |
# Return the data type of each column (float, int, object, bool, etc.)
df.dtypes
Serial No.             int64
GRE Score              int64
TOEFL Score            int64
University Rating      int64
SOP                  float64
LOR                  float64
CGPA                 float64
Research               int64
Chance of Admit      float64
dtype: object
# Return True for each column that contains null values, otherwise False
df.isnull().any()
Serial No.           False
GRE Score            False
TOEFL Score          False
University Rating    False
SOP                  False
LOR                  False
CGPA                 False
Research             False
Chance of Admit      False
dtype: bool
# Rename the columns with cleaner names (note the trailing spaces in 'LOR ' and 'Chance of Admit ')
df = df.rename(columns={'GRE Score': 'GRE', 'TOEFL Score': 'TOEFL', 'LOR ': 'LOR', 'Chance of Admit ': 'Probability'})
df.head()
| | Serial No. | GRE | TOEFL | University Rating | SOP | LOR | CGPA | Research | Probability |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 1 | 337 | 118 | 4 | 4.5 | 4.5 | 9.65 | 1 | 0.92 |
| 1 | 2 | 324 | 107 | 4 | 4.0 | 4.5 | 8.87 | 1 | 0.76 |
| 2 | 3 | 316 | 104 | 3 | 3.0 | 3.5 | 8.00 | 1 | 0.72 |
| 3 | 4 | 322 | 110 | 3 | 3.5 | 2.5 | 8.67 | 1 | 0.80 |
| 4 | 5 | 314 | 103 | 2 | 2.0 | 3.0 | 8.21 | 0 | 0.65 |
# Visualize the GRE feature
plt.hist(df['GRE'], rwidth=0.7)
plt.title("Distribution of GRE Scores")
plt.xlabel('GRE Scores')
plt.ylabel('Count')
plt.show()
# Visualize the TOEFL feature
plt.hist(df['TOEFL'], rwidth=0.7)
plt.title('Distribution of TOEFL Scores')
plt.xlabel('TOEFL Scores')
plt.ylabel('Count')
plt.show()
# Visualize the University Rating feature
plt.hist(df['University Rating'], rwidth=0.7)
plt.title('Distribution of University Rating')
plt.xlabel('University Rating')
plt.ylabel('Count')
plt.show()
# Visualize the Statement of Purpose (SOP) rating
plt.hist(df['SOP'], rwidth=0.7)
plt.title('Distribution of SOP')
plt.xlabel('SOP Rating')
plt.ylabel('Count')
plt.show()
# Visualize the Letter of Recommendation (LOR) rating
plt.hist(df['LOR'], rwidth=0.7)
plt.title('Distribution of LOR Rating')
plt.xlabel('LOR Rating')
plt.ylabel('Count')
plt.show()
# Visualize the undergraduate CGPA
plt.hist(df['CGPA'], rwidth=0.7)
plt.title('Distribution of CGPA')
plt.xlabel('CGPA')
plt.ylabel('Count')
plt.show()
# Visualize the Research feature (research experience indicator)
plt.hist(df['Research'], rwidth=0.7)
plt.title('Distribution of Research Papers')
plt.xlabel('Research')
plt.ylabel('Count')
plt.show()
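The seven plotting cells above repeat the same pattern. Purely as an editor's sketch (not part of the original notebook), the same histograms could be produced with a single loop over the renamed feature columns:

# Sketch: draw the same histograms in one loop (assumes df already has the renamed columns)
for feature in ['GRE', 'TOEFL', 'University Rating', 'SOP', 'LOR', 'CGPA', 'Research']:
    plt.hist(df[feature], rwidth=0.7)
    plt.title('Distribution of {}'.format(feature))
    plt.xlabel(feature)
    plt.ylabel('Count')
    plt.show()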
# Drop the Serial No. column, which is only a row identifier
df.drop('Serial No.', axis='columns', inplace=True)
df.head()
| | GRE | TOEFL | University Rating | SOP | LOR | CGPA | Research | Probability |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 337 | 118 | 4 | 4.5 | 4.5 | 9.65 | 1 | 0.92 |
| 1 | 324 | 107 | 4 | 4.0 | 4.5 | 8.87 | 1 | 0.76 |
| 2 | 316 | 104 | 3 | 3.0 | 3.5 | 8.00 | 1 | 0.72 |
| 3 | 322 | 110 | 3 | 3.5 | 2.5 | 8.67 | 1 | 0.80 |
| 4 | 314 | 103 | 2 | 2.0 | 3.0 | 8.21 | 0 | 0.65 |
# Replace 0 values in ['GRE','TOEFL','University Rating','SOP','LOR','CGPA'] with NaN
df_copy = df.copy(deep=True)
df_copy[['GRE','TOEFL','University Rating','SOP','LOR','CGPA']] = df_copy[['GRE','TOEFL','University Rating','SOP','LOR','CGPA']].replace(0, np.nan)
df_copy.isnull().sum()
GRE                  0
TOEFL                0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Probability          0
dtype: int64
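Every count is zero, so no imputation is needed here. If zeros had appeared in these columns, one possible follow-up (a sketch, not part of the original analysis) would be to fill the resulting NaN values with each column's mean:

# Sketch: mean-impute any NaN produced by the 0 -> NaN replacement above
# (hypothetical for this dataset, which has no zeros in these columns)
for col in ['GRE', 'TOEFL', 'University Rating', 'SOP', 'LOR', 'CGPA']:
    df_copy[col] = df_copy[col].fillna(df_copy[col].mean())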
# Split the dataset into features and label
X = df_copy.drop('Probability', axis='columns')
y = df_copy['Probability']
# Use GridSearchCV to find the best algorithm for this problem
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
# Define a function that finds the best model for this problem
def find_best_model(X, y):
    # Candidate regressors and the hyperparameter grids to search over
    models = {
        'linear_regression': {
            'model': LinearRegression(),
            'parameters': {
                'fit_intercept': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'parameters': {
                'alpha': [1, 2],
                'selection': ['random', 'cyclic']
            }
        },
        'svr': {
            'model': SVR(),
            'parameters': {
                'gamma': ['auto', 'scale']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'parameters': {
                'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
                'splitter': ['best', 'random']
            }
        },
        'random_forest': {
            'model': RandomForestRegressor(),
            'parameters': {
                'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
                'n_estimators': [5, 10, 15, 20]
            }
        },
        'knn': {
            'model': KNeighborsRegressor(algorithm='auto'),
            'parameters': {
                'n_neighbors': [2, 5, 10, 20]
            }
        }
    }

    # Run a 5-fold grid search for each candidate and record its best parameters and score
    scores = []
    for model_name, model_params in models.items():
        gs = GridSearchCV(model_params['model'], model_params['parameters'], cv=5, return_train_score=False)
        gs.fit(X, y)
        scores.append({
            'model': model_name,
            'best_parameters': gs.best_params_,
            'score': gs.best_score_
        })

    return pd.DataFrame(scores, columns=['model', 'best_parameters', 'score'])
find_best_model(X, y)
| | model | best_parameters | score |
| --- | --- | --- | --- |
| 0 | linear_regression | {'fit_intercept': True} | 0.810802 |
| 1 | lasso | {'alpha': 1, 'selection': 'random'} | 0.215088 |
| 2 | svr | {'gamma': 'scale'} | 0.654099 |
| 3 | decision_tree | {'criterion': 'absolute_error', 'splitter': 'b... | 0.606752 |
| 4 | random_forest | {'criterion': 'absolute_error', 'n_estimators'... | 0.780361 |
| 5 | knn | {'n_neighbors': 20} | 0.723017 |
# Linear regression scored best above, so estimate its accuracy with cross_val_score
from sklearn.model_selection import cross_val_score
scores = cross_val_score(LinearRegression(), X, y, cv=5)
print('Highest Accuracy : {}%'.format(round(scores.mean() * 100)))
Highest Accuracy : 81%
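cross_val_score returns one R² value per fold; a quick sketch (not in the original notebook) for inspecting the spread across folds:

# Sketch: look at the individual fold scores and their spread
print('Fold scores:', np.round(scores, 3))
print('Mean +/- std: {:.3f} +/- {:.3f}'.format(scores.mean(), scores.std()))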
# Split the dataset into training and test samples
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=5)
print(len(X_train), len(X_test))
400 100
# Create and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)
model.score(X_test, y_test)
0.8214787365889655
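model.score reports R² on the test set. As an optional sketch (not part of the original notebook), a few additional error metrics from sklearn.metrics could be computed on the same held-out data:

# Sketch: additional regression metrics on the held-out test set
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_pred = model.predict(X_test)
print('MAE:', round(mean_absolute_error(y_test, y_pred), 4))
print('MSE:', round(mean_squared_error(y_test, y_pred), 4))
print('R2 :', round(r2_score(y_test, y_pred), 4))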
# Predict the chance of admission for a new applicant with the trained model
# The column names must match the features used for training
columns = ['GRE', 'TOEFL', 'University Rating', 'SOP', 'LOR', 'CGPA', 'Research']
# Build a single-row DataFrame for the applicant's scores
input_data = pd.DataFrame([[337, 118, 4, 4.5, 4.5, 9.65, 0]], columns=columns)
# Make the prediction
predicted_chance = model.predict(input_data)[0]
# Display the result
print('Chance of getting into UCLA is {}%'.format(round(predicted_chance * 100, 3)))
Chance of getting into UCLA is 92.855%
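To reuse the trained model outside this notebook, it could be persisted with joblib; this is a sketch, and the file name admission_model.joblib is arbitrary:

# Sketch: persist and reload the trained model (file name chosen arbitrarily)
import joblib
joblib.dump(model, 'admission_model.joblib')
loaded_model = joblib.load('admission_model.joblib')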