导入模块¶

In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

加载数据集¶

In [18]:
df = pd.read_csv('Mall_Customers.csv')
df.head()
Out[18]:
CustomerID Gender Age Annual Income (k$) Spending Score (1-100)
0 1 Male 19 15 39
1 2 Male 21 15 81
2 3 Female 20 16 6
3 4 Female 23 16 77
4 5 Female 31 17 40
In [19]:
# 统计信息
df.describe()
Out[19]:
CustomerID Age Annual Income (k$) Spending Score (1-100)
count 200.000000 200.000000 200.000000 200.000000
mean 100.500000 38.850000 60.560000 50.200000
std 57.879185 13.969007 26.264721 25.823522
min 1.000000 18.000000 15.000000 1.000000
25% 50.750000 28.750000 41.500000 34.750000
50% 100.500000 36.000000 61.500000 50.000000
75% 150.250000 49.000000 78.000000 73.000000
max 200.000000 70.000000 137.000000 99.000000
In [20]:
# 数据类型信息
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Gender                  200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB

探索性数据分析¶

In [21]:
gender_counts = df['Gender'].value_counts()
sns.barplot(x=gender_counts.index, y=gender_counts.values)
plt.xlabel('Gender')
plt.ylabel('Count')
plt.title('Gender Distribution')
plt.show()
In [22]:
sns.displot(df['Age'], kde=True)  # kde=True添加核密度估计曲线
plt.xlabel('Age')
plt.ylabel('Density')
plt.title('Age Distribution')
plt.show()
In [23]:
sns.displot(df['Annual Income (k$)'], kde=True)  # kde=True添加核密度估计曲线
plt.xlabel('Annual Income (k$)')
plt.ylabel('Density')
plt.title('Annual Income (k$) Distribution')
plt.show()
In [24]:
sns.displot(df['Spending Score (1-100)'], kde=True)  # kde=True添加核密度估计曲线
plt.xlabel('Spending Score (1-100)')
plt.ylabel('Density')
plt.title('Spending Score (1-100) Distribution')
plt.show()

相关矩阵¶

In [25]:
corr = df.corr(numeric_only=True)
sns.heatmap(corr, annot=True, cmap='coolwarm')
Out[25]:
<Axes: >

聚类¶

In [26]:
df.head()
Out[26]:
CustomerID Gender Age Annual Income (k$) Spending Score (1-100)
0 1 Male 19 15 39
1 2 Male 21 15 81
2 3 Female 20 16 6
3 4 Female 23 16 77
4 5 Female 31 17 40
In [27]:
# 基于 2 个特征的聚类
df1 = df[['Annual Income (k$)', 'Spending Score (1-100)']]
df1.head()
Out[27]:
Annual Income (k$) Spending Score (1-100)
0 15 39
1 15 81
2 16 6
3 16 77
4 17 40
In [28]:
# 散点图
sns.scatterplot(x='Annual Income (k$)', y='Spending Score (1-100)', data=df1)
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.title('Scatter Plot of Annual Income vs. Spending Score')
plt.show()
In [29]:
import warnings 
warnings.filterwarnings("ignore", category=UserWarning) 
from sklearn.cluster import KMeans

errors = []
for i in range(1, 11): 
    kmeans = KMeans(n_clusters=i, random_state=0,n_init=10, max_iter=300 )
    kmeans.fit(df1)
    errors.append(kmeans.inertia_) 
In [30]:
# 绘制肘部法的结果
plt.figure(figsize=(13,6))
plt.plot(range(1,11), errors)
plt.plot(range(1,11), errors, linewidth=3, color='red', marker='8')
plt.xlabel('No. of clusters')
plt.ylabel('WCSS')
plt.xticks(np.arange(1,11,1))
plt.show()
In [31]:
km = KMeans(n_clusters=5, n_init=10)
km.fit(df1)

y = km.predict(df1)
df1.loc[:, 'Label'] = y

df1.head()
C:\Users\1\AppData\Local\Temp\ipykernel_21088\2186447201.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.loc[:, 'Label'] = y
Out[31]:
Annual Income (k$) Spending Score (1-100) Label
0 15 39 0
1 15 81 3
2 16 6 0
3 16 77 3
4 17 40 0
In [32]:
sns.scatterplot(x='Annual Income (k$)', y='Spending Score (1-100)', data=df1, hue='Label', s=50, palette=['red', 'green', 'brown', 'blue', 'orange'])
Out[32]:
<Axes: xlabel='Annual Income (k$)', ylabel='Spending Score (1-100)'>
In [33]:
# 基于 3 个特征的聚类
df2 = df[['Annual Income (k$)', 'Spending Score (1-100)', 'Age']]
df2.head()
Out[33]:
Annual Income (k$) Spending Score (1-100) Age
0 15 39 19
1 15 81 21
2 16 6 20
3 16 77 23
4 17 40 31
In [34]:
import warnings 
warnings.filterwarnings("ignore", category=UserWarning) 
from sklearn.cluster import KMeans

errors = []
for i in range(1, 11): 
    kmeans = KMeans(n_clusters=i, random_state=0,n_init=10, max_iter=300 )
    kmeans.fit(df2)
    errors.append(kmeans.inertia_) 
In [35]:
# 绘制肘部法的结果
plt.figure(figsize=(13,6))
plt.plot(range(1,11), errors)
plt.plot(range(1,11), errors, linewidth=3, color='red', marker='8')
plt.xlabel('No. of clusters')
plt.ylabel('WCSS')
plt.xticks(np.arange(1,11,1))
plt.show()
In [39]:
km = KMeans(n_clusters=5, n_init=10)
km.fit(df2)

y = km.predict(df2)
df2.loc[:, 'Label'] = y

df2.head()
C:\Users\1\AppData\Local\Temp\ipykernel_21088\4218386864.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.loc[:, 'Label'] = y
Out[39]:
Annual Income (k$) Spending Score (1-100) Age Label
0 15 39 19 4
1 15 81 21 0
2 16 6 20 4
3 16 77 23 0
4 17 40 31 4
In [40]:
# 3d 散点图
fig = plt.figure(figsize=(20,15))
ax = fig.add_subplot(111, projection='3d')

ax.scatter(df2['Age'][df2['Label']==0], df2['Annual Income (k$)'][df2['Label']==0], df2['Spending Score (1-100)'][df2['Label']==0], c='red', s=50)
ax.scatter(df2['Age'][df2['Label']==1], df2['Annual Income (k$)'][df2['Label']==1], df2['Spending Score (1-100)'][df2['Label']==1], c='green', s=50)
ax.scatter(df2['Age'][df2['Label']==2], df2['Annual Income (k$)'][df2['Label']==2], df2['Spending Score (1-100)'][df2['Label']==2], c='blue', s=50)
ax.scatter(df2['Age'][df2['Label']==3], df2['Annual Income (k$)'][df2['Label']==3], df2['Spending Score (1-100)'][df2['Label']==3], c='brown', s=50)
ax.scatter(df2['Age'][df2['Label']==4], df2['Annual Income (k$)'][df2['Label']==4], df2['Spending Score (1-100)'][df2['Label']==4], c='orange', s=50)
ax.view_init(30, 190)
ax.set_xlabel('Age')
ax.set_ylabel('Annual Income')
ax.set_zlabel('Spending Score')
plt.show()