import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
加载数据集
# Load the Netflix catalogue from a CSV file expected in the working directory.
netflix = pd.read_csv('netflix_titles.csv')
# Preview the first rows to check the columns that were read.
netflix.head()
show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 81145628 | Movie | Norm of the North: King Sized Adventure | Richard Finn, Tim Maltby | Alan Marriott, Andrew Toth, Brian Dobson, Cole... | United States, India, South Korea, China | September 9, 2019 | 2019 | TV-PG | 90 min | Children & Family Movies, Comedies | Before planning an awesome wedding for his gra... |
1 | 80117401 | Movie | Jandino: Whatever it Takes | NaN | Jandino Asporaat | United Kingdom | September 9, 2016 | 2016 | TV-MA | 94 min | Stand-Up Comedy | Jandino Asporaat riffs on the challenges of ra... |
2 | 70234439 | TV Show | Transformers Prime | NaN | Peter Cullen, Sumalee Montano, Frank Welker, J... | United States | September 8, 2018 | 2013 | TV-Y7-FV | 1 Season | Kids' TV | With the help of three human allies, the Autob... |
3 | 80058654 | TV Show | Transformers: Robots in Disguise | NaN | Will Friedle, Darren Criss, Constance Zimmer, ... | United States | September 8, 2018 | 2016 | TV-Y7 | 1 Season | Kids' TV | When a prison ship crash unleashes hundreds of... |
4 | 80125979 | Movie | #realityhigh | Fernando Lebrija | Nesta Cooper, Kate Walsh, John Michael Higgins... | United States | September 8, 2017 | 2017 | TV-14 | 99 min | Comedies | When nerdy high schooler Dani finally attracts... |
让我们检查一下数据集的形状
netflix.shape
(6234, 12)
netflix.count()
show_id 6234 type 6234 title 6234 director 4265 cast 5664 country 5758 date_added 6223 release_year 6234 rating 6224 duration 6234 listed_in 6234 description 6234 dtype: int64
# Split the catalogue by content type. The 'type' column holds the singular
# labels 'TV Show' and 'Movie'; filtering on 'TV Shows' matched no rows and
# left netflix_shows empty.
# .copy() makes each subset an independent frame, so columns can be
# reassigned on it later without writing into a filtered view of `netflix`.
netflix_shows = netflix[netflix['type'] == 'TV Show'].copy()
netflix_movies = netflix[netflix['type'] == 'Movie'].copy()

# Count of titles per content type.
sns.set(style='darkgrid')
ax = sns.countplot(x='type', data=netflix, palette='Set2')
很明显,Netflix 上的电影比电视节目多
# Bar chart of the 15 most frequent ratings among movies; value_counts()
# orders them from most to least common, and the bars follow that order.
top_movie_ratings = netflix_movies['rating'].value_counts().index[0:15]
plt.figure(figsize=(12, 10))
sns.set(style="darkgrid")
ax = sns.countplot(
    x="rating",
    data=netflix_movies,
    palette="Set2",
    order=top_movie_ratings,
)
数量最多的电影采用“TV-MA”评级。“TV-MA”是电视家长指南为专为成熟观众设计的电视节目指定的评级。
第二大的是“TV-14”,代表可能不适合 14 岁以下儿童的内容。
第三大是非常流行的“R”级。R级电影是指被评估为含有可能不适合17岁以下儿童观看的内容的电影
# Load only the needed columns from the two IMDb CSV files.
# NOTE(review): the two frames are combined row-by-row further down, which
# presumably relies on both files listing the same movies in the same
# order — confirm against the data source.
imdb_ratings=pd.read_csv('IMDb ratings.csv',usecols=['weighted_average_vote'])
imdb_titles=pd.read_csv('IMDb movies.csv', usecols=['title','year','genre'])
# Preview the ratings column.
imdb_ratings.head()
weighted_average_vote | |
---|---|
0 | 5.9 |
1 | 6.1 |
2 | 5.8 |
3 | 5.2 |
4 | 7.0 |
imdb_titles.head()
title | year | genre | |
---|---|---|---|
0 | Miss Jerry | 1894 | Romance |
1 | The Story of the Kelly Gang | 1906 | Biography, Crime, Drama |
2 | Den sorte drøm | 1911 | Drama |
3 | Cleopatra | 1912 | Drama, History |
4 | L'Inferno | 1911 | Adventure, Drama, Fantasy |
# Assemble a single ratings table from the two IMDb frames; pandas lines the
# source columns up on their row index.
ratings = pd.DataFrame(
    {
        'Title': imdb_titles['title'],
        'Release Year': imdb_titles['year'],
        'Rating': imdb_ratings['weighted_average_vote'],
        'Genre': imdb_titles['genre'],
    }
)
# Keep only the first row of every (title, year, rating) combination.
ratings = ratings.drop_duplicates(subset=['Title', 'Release Year', 'Rating'])
ratings.shape
(85852, 4)
对评分数据集和 Netflix 数据集执行内连接,以获取在 IMDB 上有评分且在 Netflix 上可用的内容。
# Drop IMDb rows with missing values. DataFrame.dropna() returns a new frame,
# so the result must be assigned back; the bare call discarded it.
ratings = ratings.dropna()

# The inner join keeps only titles that appear in both tables.
# NOTE(review): rows are matched on the title text alone, so different
# productions that share a title are paired up as well.
joint_data = ratings.merge(netflix, left_on='Title', right_on='title', how='inner')
joint_data = joint_data.sort_values(by='Rating', ascending=False)

import plotly.express as px

# Sunburst of the ten highest-rated matches: title on the inner ring,
# country on the outer ring, sized and coloured by rating.
top_rated = joint_data.head(10)
fig = px.sunburst(
    top_rated,
    path=['title', 'country'],
    values='Rating',
    color='Rating',
)
fig.show()
# Horizontal bar chart of the 15 release years with the most movies,
# listed from the most to the least frequent year.
busiest_years = netflix_movies['release_year'].value_counts().index[0:15]
plt.figure(figsize=(12, 10))
sns.set(style='darkgrid')
ax = sns.countplot(
    y='release_year',
    data=netflix_movies,
    palette='Set2',
    order=busiest_years,
)
2017年和2018年是大部分电影上映的年份
from collections import Counter

# Replace missing countries with a placeholder so every entry can be split.
# assign() builds a new frame rather than writing into a filtered view.
netflix_movies = netflix_movies.assign(
    country=netflix_movies['country'].fillna('Unknown')
)

# Raw tally: one count per comma-separated fragment of the 'country' field.
# The earlier loop tested a *list* against the dict keys for single-country
# titles; that test never matched, so each such title reset the country's
# count to 1 instead of incrementing it.
countries = Counter(
    fragment
    for entry in netflix_movies['country']
    for fragment in entry.split(',')
)

# Merge fragments that differ only by spaces (e.g. ' India' and 'India').
# Keys keep the space-free form used before; empty fragments produced by
# stray commas are skipped.
merged_counts = Counter()
for country, no in countries.items():
    name = country.replace(' ', '')
    if name:
        merged_counts[name] += no

# Plain dict ordered from the most to the least frequent country.
countries_fin = dict(merged_counts.most_common())

# Bar chart of the ten most frequent countries.
top_names = list(countries_fin)[:10]
top_counts = [countries_fin[name] for name in top_names]
plt.figure(figsize=(8, 8))
ax = sns.barplot(x=top_names, y=top_counts)
ax.set_xticklabels(top_names, rotation=90)
[Text(0, 0, 'UnitedStates'), Text(1, 0, 'France'), Text(2, 0, 'UnitedKingdom'), Text(3, 0, 'Canada'), Text(4, 0, 'Germany'), Text(5, 0, 'Belgium'), Text(6, 0, 'China'), Text(7, 0, 'Spain'), Text(8, 0, 'India'), Text(9, 0, 'Australia')]
netflix_movies['duration'].isnull().sum()
0
# Convert "90 min"-style strings into float minutes.
# - astype(str) first keeps the cell re-runnable after the column has
#   already been converted to numbers (the .str accessor needs strings).
# - regex=False treats ' min' as literal text, not a pattern.
# - assign() builds a new frame rather than writing into a filtered view.
netflix_movies = netflix_movies.assign(
    duration=netflix_movies['duration']
    .astype(str)
    .str.replace(' min', '', regex=False)
    .astype(float)
)
netflix_movies['duration']
0 90.0 1 94.0 4 99.0 6 110.0 7 60.0 ... 5577 70.0 5578 102.0 5579 88.0 5580 109.0 6231 60.0 Name: duration, Length: 4265, dtype: float64
# Density estimate of movie durations with the area under the curve filled.
# `fill=True` replaces the deprecated `shade=True` keyword.
sns.set(style='darkgrid')
sns.kdeplot(netflix_movies['duration'], fill=True)
<Axes: xlabel='duration', ylabel='Density'>
因此,Netflix 上的大量电影时长都在 75-120 分钟之间。考虑到相当多的观众无法一口气看完3小时的电影,这是可以接受的。
from collections import Counter

# Flatten the comma-separated 'listed_in' field into one list of genre
# labels with all spaces removed, then count how often each label occurs.
genres = list(netflix_movies['listed_in'])
gen = [
    label.replace(' ', "")
    for entry in genres
    for label in entry.split(',')
]
g = Counter(gen)
from wordcloud import WordCloud

# One entry per distinct genre label; sorted so the generated text does not
# depend on set iteration order.
text = sorted(set(gen))
plt.rcParams['figure.figsize'] = (5, 5)
# Join the labels with spaces. str(text) would hand the list's brackets,
# commas and quote characters to the word-cloud tokenizer as part of the text.
wordcloud = WordCloud(max_words=1000000, background_color="white").generate(' '.join(text))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
# Order the genres from the most to the least frequent.
g = {k: v for k, v in sorted(g.items(), key=lambda item: item[1], reverse=True)}
x = list(g.keys())
y = list(g.values())

# Create ONE figure of the intended size. Calling plt.figure() after
# plt.subplots() opened a second, empty 14x10 figure and left the chart on
# the default-sized one.
fig, ax = plt.subplots(figsize=(14, 10))

# Lollipop chart: a stem per genre with a marker at its count.
ax.vlines(x, ymin=0, ymax=y, color='green')
ax.plot(x, y, "o", color='maroon')
# Rotate the existing tick labels instead of overwriting them.
ax.tick_params(axis='x', labelrotation=90)
ax.set_ylabel("Count of movies")
# Set the title
ax.set_title("Genres")
Text(0.5, 1.0, 'Genres')
<Figure size 1400x1000 with 0 Axes>
因此,很明显,国际电影、戏剧和喜剧是 Netflix 上内容量最高的三大类型。
TF-IDF(词频-逆文档频率)分数衡量一个词在某篇文档中出现的频率,并按包含该词的文档数量进行反向加权。这样可以降低在各条剧情简介中都频繁出现的词的权重,从而减小它们对最终相似度分数的影响。
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing descriptions become empty strings so the vectorizer receives text
# for every row.
netflix['description'] = netflix['description'].fillna('')

# Learn the vocabulary from all descriptions (English stop words removed)
# and build the TF-IDF matrix in one step.
tfidf = TfidfVectorizer(stop_words='english')
descriptions = netflix['description']
tfidf_matrix = tfidf.fit_transform(descriptions)

# Shape of the resulting matrix: (number of titles, vocabulary size).
tfidf_matrix.shape
(6234, 16151)
# Linear kernel (plain dot product between rows)
from sklearn.metrics.pairwise import linear_kernel

# TfidfVectorizer L2-normalises each row by default, so the dot product of
# two rows is their cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Lookup from title to row label. Series.drop_duplicates() compares the
# VALUES (row labels, which are all distinct), so it never removed a repeated
# title; de-duplicate on the index instead and keep the first occurrence.
indices = pd.Series(netflix.index, index=netflix['title'])
indices = indices[~indices.index.duplicated(keep='first')]
indices
title Norm of the North: King Sized Adventure 0 Jandino: Whatever it Takes 1 Transformers Prime 2 Transformers: Robots in Disguise 3 #realityhigh 4 ... Red vs. Blue 6229 Maron 6230 Little Baby Bum: Nursery Rhyme Friends 6231 A Young Doctor's Notebook and Other Stories 6232 Friends 6233 Length: 6234, dtype: int64
def get_recommendation(title, cosine_sim=cosine_sim):
    """Return the titles of the 10 entries most similar to *title*.

    Args:
        title: Title to look up in the module-level ``indices`` Series.
        cosine_sim: Square pairwise similarity matrix whose rows follow the
            row order of ``netflix``.

    Returns:
        A Series of up to 10 titles from ``netflix['title']``, most similar
        first.

    Raises:
        KeyError: If *title* is not present in ``indices``.
    """
    if title not in indices.index:
        raise KeyError(f"Title {title!r} not found in the catalogue")

    idx = indices[title]
    # A title that occurs more than once yields a Series of row numbers;
    # use the first occurrence so a single similarity row is selected.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Rank every row by its similarity to the requested title.
    sim_scores = sorted(
        enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True
    )

    # Exclude the query row explicitly. Slicing off position 0 is only right
    # when the query sorts first, which fails if another row ties with it.
    movie_indices = [row for row, _ in sim_scores if row != idx][:10]

    return netflix['title'].iloc[movie_indices]
get_recommendation('Peaky Blinders')
296 Our Godfather 4491 Don 2015 The Fear 4852 Jonathan Strange & Mr Norrell 1231 The Prison 3737 Power Rangers Zeo 5986 The Tudors 1753 Once Upon a Time in Mumbaai 5494 The Legend of Michael Mishra 1142 Shelby American Name: title, dtype: object
get_recommendation('Friends')
5659 BoJack Horseman 5987 Episodes 20 Manhattan Romance 3923 Studio 54 5830 Dad's Army 5843 Trailer Park Boys 4381 Single Ladies Senior 1524 Warehoused 5445 O-Negative, Love Can’t Be Designed 2594 Life Story Name: title, dtype: object
get_recommendation('Narcos')
1583 Miss Dynamite 4857 El Cartel 2 1257 Narcos: Mexico 5939 El Chapo 5773 Top Boy 5162 Cocaine 732 Street Flow 1480 Raja Natwarlal 1833 Two Graves 3444 Historia de un clan Name: title, dtype: object
根据以下因素进行过滤:
用空字符串填充空值
# Work on a filled copy of the catalogue: every missing value becomes an
# empty string so the text columns can be processed uniformly.
filledna=netflix.fillna('')
# Preview the filled frame.
filledna.head()
show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 81145628 | Movie | Norm of the North: King Sized Adventure | Richard Finn, Tim Maltby | Alan Marriott, Andrew Toth, Brian Dobson, Cole... | United States, India, South Korea, China | September 9, 2019 | 2019 | TV-PG | 90 min | Children & Family Movies, Comedies | Before planning an awesome wedding for his gra... |
1 | 80117401 | Movie | Jandino: Whatever it Takes | Jandino Asporaat | United Kingdom | September 9, 2016 | 2016 | TV-MA | 94 min | Stand-Up Comedy | Jandino Asporaat riffs on the challenges of ra... | |
2 | 70234439 | TV Show | Transformers Prime | Peter Cullen, Sumalee Montano, Frank Welker, J... | United States | September 8, 2018 | 2013 | TV-Y7-FV | 1 Season | Kids' TV | With the help of three human allies, the Autob... | |
3 | 80058654 | TV Show | Transformers: Robots in Disguise | Will Friedle, Darren Criss, Constance Zimmer, ... | United States | September 8, 2018 | 2016 | TV-Y7 | 1 Season | Kids' TV | When a prison ship crash unleashes hundreds of... | |
4 | 80125979 | Movie | #realityhigh | Fernando Lebrija | Nesta Cooper, Kate Walsh, John Michael Higgins... | United States | September 8, 2017 | 2017 | TV-14 | 99 min | Comedies | When nerdy high schooler Dani finally attracts... |
清理数据:去除所有空格并将文本转换为小写。
def clean_data(x):
    """Return *x* lower-cased with every space character removed.

    Missing values (``None`` or a float NaN) yield an empty string instead of
    raising ``AttributeError``; any other non-string value is converted with
    ``str()`` before cleaning.
    """
    # NaN is the only float that is not equal to itself.
    if x is None or (isinstance(x, float) and x != x):
        return ''
    return str(x).replace(' ', '').lower()
# Columns that feed the metadata-based model.
features = ['title', 'director', 'cast', 'listed_in', 'description']

# .copy() yields an independent frame, so the column assignments below do not
# target a column-subset view of the filled catalogue.
filledna = filledna[features].copy()

# NOTE(review): clean_data strips every space, so multi-word titles and
# descriptions are fused into long strings as well as names — confirm that
# this is intended for the free-text columns.
for feature in features:
    filledna[feature] = filledna[feature].apply(clean_data)
filledna.head()
title | director | cast | listed_in | description | |
---|---|---|---|---|---|
0 | normofthenorth:kingsizedadventure | richardfinn,timmaltby | alanmarriott,andrewtoth,briandobson,colehoward... | children&familymovies,comedies | beforeplanninganawesomeweddingforhisgrandfathe... |
1 | jandino:whateverittakes | jandinoasporaat | stand-upcomedy | jandinoasporaatriffsonthechallengesofraisingki... | |
2 | transformersprime | petercullen,sumaleemontano,frankwelker,jeffrey... | kids'tv | withthehelpofthreehumanallies,theautobotsoncea... | |
3 | transformers:robotsindisguise | willfriedle,darrencriss,constancezimmer,kharyp... | kids'tv | whenaprisonshipcrashunleasheshundredsofdecepti... | |
4 | #realityhigh | fernandolebrija | nestacooper,katewalsh,johnmichaelhiggins,keith... | comedies | whennerdyhighschoolerdanifinallyattractstheint... |
创建所有行的“词袋”
def create_soup(x):
    """Join the title, director, cast, genre and description fields of *x*
    into one string, separated by single spaces."""
    fields = ('title', 'director', 'cast', 'listed_in', 'description')
    return ' '.join(x[name] for name in fields)
# Build the combined text for every row (axis=1 passes one row at a time).
filledna['soup'] = filledna.apply(create_soup, axis=1)
从这里开始,代码基本上与上面的模型类似,除了使用count vectorizer而不是tfidf
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Raw term counts over the combined 'soup' text, English stop words removed.
count = CountVectorizer(stop_words='english')
soup_texts = filledna['soup']
count_matrix = count.fit_transform(soup_texts)

# Pairwise cosine similarity between every pair of rows.
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)
filledna = filledna.reset_index()

# Lookup from cleaned title to row number for the metadata-based model.
# It is stored under its own name: rebinding `indices` here would make
# get_recommendation, which reads `indices` at call time, look raw titles up
# among cleaned ones. Repeated titles keep their first row.
indices_soup = pd.Series(filledna.index, index=filledna['title'])
indices_soup = indices_soup[~indices_soup.index.duplicated(keep='first')]


def get_recommendation_new(title, cosine_sim=cosine_sim2):
    """Return the 10 titles most similar to *title* by combined metadata.

    Args:
        title: Title as displayed; spaces are removed and the text is
            lower-cased before the lookup, mirroring the cleaned titles.
        cosine_sim: Square pairwise similarity matrix whose rows follow the
            row order of ``netflix``. Defaults to the count-vectorizer
            matrix ``cosine_sim2`` (it previously defaulted to the TF-IDF
            matrix of the description-only model).

    Returns:
        A Series of up to 10 titles from ``netflix['title']``, most similar
        first.

    Raises:
        KeyError: If the cleaned title is not present in the lookup.
    """
    key = title.replace(' ', '').lower()
    if key not in indices_soup.index:
        raise KeyError(f"Title {title!r} not found in the catalogue")
    idx = indices_soup[key]

    # Rank every row by its similarity to the requested title.
    sim_scores = sorted(
        enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True
    )

    # Exclude the query row explicitly rather than assuming it sorts first.
    movie_indices = [row for row, _ in sim_scores if row != idx][:10]

    return netflix['title'].iloc[movie_indices]
get_recommendation_new('Friends', cosine_sim2)
5987 Episodes 5980 The Andy Griffith Show 6225 Frasier 5699 Still Game 5639 Toast of London 5830 Dad's Army 6094 Pee-wee's Playhouse 5976 Cheers 5981 The Twilight Zone (Original Series) 6179 The IT Crowd Name: title, dtype: object
get_recommendation_new('Narcos', cosine_sim2)
1257 Narcos: Mexico 5586 Marvel's Iron Fist 5777 Queen of the South 5610 Person of Interest 5897 Shooter 5915 Marvel's Jessica Jones 2836 Smoking 3891 Altered Carbon 5713 Wild District 6085 El señor de los Cielos Name: title, dtype: object