import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# Load the mall customer dataset from the working directory and preview it.
csv_path = 'Mall_Customers.csv'
df = pd.read_csv(csv_path)
df.head()


# Summary statistics of the numeric columns
df.describe()


# Column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Gender                  200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


# Bar chart of the number of rows per value of the 'Gender' column.
gender_counts = df['Gender'].value_counts()
ax = sns.barplot(x=gender_counts.index, y=gender_counts.values)
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
ax.set_title('Gender Distribution')
plt.show()


# Distribution of each numeric feature: a histogram with a kernel density
# estimate curve overlaid (kde=True).
# stat='density' normalises the histogram so that the y-axis really is a
# density; with seaborn's default stat ('count') the 'Density' axis label
# would be describing raw counts.
for column in ('Age', 'Annual Income (k$)', 'Spending Score (1-100)'):
    sns.displot(data=df, x=column, kde=True, stat='density')
    plt.xlabel(column)
    plt.ylabel('Density')
    plt.title(f'{column} Distribution')
    plt.show()


# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
corr = df.corr(numeric_only=True)
sns.heatmap(corr, annot=True, cmap='coolwarm')
# Show the figure here, like every other plot in this file; otherwise the
# heatmap axes stay current and the next axes-level seaborn call draws on them
# when this file is run as a plain script.
plt.show()

<Axes: >


df.head()


# Clustering on 2 features.
# Take an explicit copy: a 'Label' column is written into df1 further down,
# and writing into a plain column slice of `df` triggers pandas'
# SettingWithCopyWarning.
df1 = df[['Annual Income (k$)', 'Spending Score (1-100)']].copy()
df1.head()


# Scatter plot of the two features used for clustering
ax = sns.scatterplot(data=df1, x='Annual Income (k$)', y='Spending Score (1-100)')
ax.set(
    xlabel='Annual Income (k$)',
    ylabel='Spending Score (1-100)',
    title='Scatter Plot of Annual Income vs. Spending Score',
)
plt.show()


import warnings
# Silence UserWarning messages before scikit-learn is imported and used.
warnings.filterwarnings("ignore", category=UserWarning)
from sklearn.cluster import KMeans

# Within-cluster sum of squares (KMeans.inertia_) for k = 1..10 on df1,
# collected for the elbow plot.
errors = []
for n_clusters in range(1, 11):
    kmeans = KMeans(
        n_clusters=n_clusters,
        random_state=0,
        n_init=10,
        max_iter=300,
    ).fit(df1)
    errors.append(kmeans.inertia_)


# Plot the elbow-method result: WCSS against the number of clusters.
# The x values are derived from len(errors) so the plot stays aligned with
# however many cluster counts were evaluated, and the curve is drawn once
# (the original drew an identical default-styled line underneath the red one).
cluster_counts = np.arange(1, len(errors) + 1)
plt.figure(figsize=(13, 6))
plt.plot(cluster_counts, errors, linewidth=3, color='red', marker='8')
plt.xlabel('No. of clusters')
plt.ylabel('WCSS')
plt.xticks(cluster_counts)
plt.show()


# Final 2-feature model. n_clusters=5 is hard-coded; presumably it was read
# off the elbow plot -- confirm.
# random_state is fixed (same seed as the elbow loop) so the cluster ids, and
# therefore the colours assigned to them when plotting, are reproducible.
feature_cols = ['Annual Income (k$)', 'Spending Score (1-100)']
km = KMeans(n_clusters=5, n_init=10, random_state=0)

# Fit on the feature columns only, so re-running this cell after 'Label' has
# been added does not feed the old labels back into the model.
y = km.fit_predict(df1[feature_cols])

# assign() returns a new frame, which avoids the SettingWithCopyWarning that
# an in-place .loc write raises when df1 is a slice of df.
df1 = df1.assign(Label=y)

df1.head()

C:\Users\1\AppData\Local\Temp\ipykernel_21088\2186447201.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.loc[:, 'Label'] = y


# Scatter plot of the 2-feature clustering, one colour per cluster label.
sns.scatterplot(
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    data=df1,
    hue='Label',
    s=50,
    palette=['red', 'green', 'brown', 'blue', 'orange'],
)
# Show the figure explicitly, consistent with the other plots in this file,
# instead of leaving it pending until a later plt.show().
plt.show()

<Axes: xlabel='Annual Income (k$)', ylabel='Spending Score (1-100)'>


# Clustering on 3 features.
# Take an explicit copy: a 'Label' column is written into df2 further down,
# and writing into a plain column slice of `df` triggers pandas'
# SettingWithCopyWarning.
df2 = df[['Annual Income (k$)', 'Spending Score (1-100)', 'Age']].copy()
df2.head()


import warnings
# Silence UserWarning messages before scikit-learn is imported and used.
warnings.filterwarnings("ignore", category=UserWarning)
from sklearn.cluster import KMeans

# Within-cluster sum of squares (KMeans.inertia_) for k = 1..10 on df2,
# collected for the elbow plot.
errors = []
for n_clusters in range(1, 11):
    kmeans = KMeans(
        n_clusters=n_clusters,
        random_state=0,
        n_init=10,
        max_iter=300,
    ).fit(df2)
    errors.append(kmeans.inertia_)


# Plot the elbow-method result: WCSS against the number of clusters.
# The x values are derived from len(errors) so the plot stays aligned with
# however many cluster counts were evaluated, and the curve is drawn once
# (the original drew an identical default-styled line underneath the red one).
cluster_counts = np.arange(1, len(errors) + 1)
plt.figure(figsize=(13, 6))
plt.plot(cluster_counts, errors, linewidth=3, color='red', marker='8')
plt.xlabel('No. of clusters')
plt.ylabel('WCSS')
plt.xticks(cluster_counts)
plt.show()


# Final 3-feature model. n_clusters=5 is hard-coded; presumably it was read
# off the elbow plot -- confirm.
# random_state is fixed (same seed as the elbow loop) so the cluster ids, and
# therefore the colours assigned to them when plotting, are reproducible.
feature_cols = ['Annual Income (k$)', 'Spending Score (1-100)', 'Age']
km = KMeans(n_clusters=5, n_init=10, random_state=0)

# Fit on the feature columns only, so re-running this cell after 'Label' has
# been added does not feed the old labels back into the model.
y = km.fit_predict(df2[feature_cols])

# assign() returns a new frame, which avoids the SettingWithCopyWarning that
# an in-place .loc write raises when df2 is a slice of df.
df2 = df2.assign(Label=y)

df2.head()

C:\Users\1\AppData\Local\Temp\ipykernel_21088\4218386864.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.loc[:, 'Label'] = y


# 3D scatter plot of the 3-feature clustering, one colour per cluster label
fig = plt.figure(figsize=(20, 15))
ax = fig.add_subplot(111, projection='3d')

# The position in this list is the cluster label drawn in that colour.
cluster_colors = ['red', 'green', 'blue', 'brown', 'orange']
for label, color in enumerate(cluster_colors):
    members = df2[df2['Label'] == label]
    ax.scatter(
        members['Age'],
        members['Annual Income (k$)'],
        members['Spending Score (1-100)'],
        c=color,
        s=50,
    )

# Camera position: elevation 30, azimuth 190.
ax.view_init(30, 190)
ax.set_xlabel('Age')
ax.set_ylabel('Annual Income')
ax.set_zlabel('Spending Score')
plt.show()

	CustomerID	Gender	Age	Annual Income (k$)	Spending Score (1-100)
0	1	Male	19	15	39
1	2	Male	21	15	81
2	3	Female	20	16	6
3	4	Female	23	16	77
4	5	Female	31	17	40

	CustomerID	Age	Annual Income (k$)	Spending Score (1-100)
count	200.000000	200.000000	200.000000	200.000000
mean	100.500000	38.850000	60.560000	50.200000
std	57.879185	13.969007	26.264721	25.823522
min	1.000000	18.000000	15.000000	1.000000
25%	50.750000	28.750000	41.500000	34.750000
50%	100.500000	36.000000	61.500000	50.000000
75%	150.250000	49.000000	78.000000	73.000000
max	200.000000	70.000000	137.000000	99.000000

	CustomerID	Gender	Age	Annual Income (k$)	Spending Score (1-100)
0	1	Male	19	15	39
1	2	Male	21	15	81
2	3	Female	20	16	6
3	4	Female	23	16	77
4	5	Female	31	17	40

	Annual Income (k$)	Spending Score (1-100)
0	15	39
1	15	81
2	16	6
3	16	77
4	17	40

	Annual Income (k$)	Spending Score (1-100)	Label
0	15	39	0
1	15	81	3
2	16	6	0
3	16	77	3
4	17	40	0

导入模块

加载数据集

探索性数据分析

相关矩阵

聚类