In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

加载数据集

In [40]:
netflix = pd.read_csv('netflix_titles.csv')
netflix.head()
Out[40]:
show_id type title director cast country date_added release_year rating duration listed_in description
0 81145628 Movie Norm of the North: King Sized Adventure Richard Finn, Tim Maltby Alan Marriott, Andrew Toth, Brian Dobson, Cole... United States, India, South Korea, China September 9, 2019 2019 TV-PG 90 min Children & Family Movies, Comedies Before planning an awesome wedding for his gra...
1 80117401 Movie Jandino: Whatever it Takes NaN Jandino Asporaat United Kingdom September 9, 2016 2016 TV-MA 94 min Stand-Up Comedy Jandino Asporaat riffs on the challenges of ra...
2 70234439 TV Show Transformers Prime NaN Peter Cullen, Sumalee Montano, Frank Welker, J... United States September 8, 2018 2013 TV-Y7-FV 1 Season Kids' TV With the help of three human allies, the Autob...
3 80058654 TV Show Transformers: Robots in Disguise NaN Will Friedle, Darren Criss, Constance Zimmer, ... United States September 8, 2018 2016 TV-Y7 1 Season Kids' TV When a prison ship crash unleashes hundreds of...
4 80125979 Movie #realityhigh Fernando Lebrija Nesta Cooper, Kate Walsh, John Michael Higgins... United States September 8, 2017 2017 TV-14 99 min Comedies When nerdy high schooler Dani finally attracts...

让我们检查一下数据集的形状

In [41]:
netflix.shape
Out[41]:
(6234, 12)
In [42]:
netflix.count()
Out[42]:
show_id         6234
type            6234
title           6234
director        4265
cast            5664
country         5758
date_added      6223
release_year    6234
rating          6224
duration        6234
listed_in       6234
description     6234
dtype: int64
  • 让我们检查一下表演与电影
In [43]:
netflix_shows = netflix[netflix['type']=='TV Shows']
In [44]:
netflix_movies = netflix[netflix['type']=='Movie']

可视化时间¶

In [45]:
sns.set(style='darkgrid')
ax = sns.countplot(x='type', data=netflix, palette='Set2')

很明显,Netflix 上的电影比电视节目多

电影收视率分析¶

In [46]:
plt.figure(figsize=(12,10))
sns.set(style="darkgrid")
ax = sns.countplot(x="rating", data=netflix_movies, palette="Set2", order=netflix_movies['rating'].value_counts().index[0:15])
  • 数量最多的电影采用“TV-MA”评级。“TV-MA”是电视家长指南为专为成熟观众设计的电视节目指定的评级。

  • 第二大的是“TV-14”,代表可能不适合 14 岁以下儿童的内容。

  • 第三大是非常流行的“R”级。R级电影是指被评估为含有可能不适合17岁以下儿童观看的内容的电影

分析 IMDB 评分以获得 Netflix 上评分最高的电影¶

In [47]:
imdb_ratings=pd.read_csv('IMDb ratings.csv',usecols=['weighted_average_vote'])
imdb_titles=pd.read_csv('IMDb movies.csv', usecols=['title','year','genre'])
In [48]:
imdb_ratings.head()
Out[48]:
weighted_average_vote
0 5.9
1 6.1
2 5.8
3 5.2
4 7.0
In [49]:
imdb_titles.head()
Out[49]:
title year genre
0 Miss Jerry 1894 Romance
1 The Story of the Kelly Gang 1906 Biography, Crime, Drama
2 Den sorte drøm 1911 Drama
3 Cleopatra 1912 Drama, History
4 L'Inferno 1911 Adventure, Drama, Fantasy
In [50]:
ratings = pd.DataFrame({'Title':imdb_titles.title,
                       'Release Year':imdb_titles.year,
                       'Rating':imdb_ratings.weighted_average_vote,
                       'Genre':imdb_titles.genre})
ratings.drop_duplicates(subset=['Title', 'Release Year', 'Rating'],inplace=True)
ratings.shape
Out[50]:
(85852, 4)

对评分数据集和 Netflix 数据集执行内连接,以获取在 IMDB 上有评分且在 Netflix 上可用的内容。

In [51]:
ratings.dropna()
joint_data = ratings.merge(netflix, left_on='Title', right_on='title', how='inner')
joint_data = joint_data.sort_values(by='Rating', ascending=False)
In [52]:
import plotly.express as px
top_rated = joint_data[0:10]
fig = px.sunburst(
top_rated,path=['title', 'country'],values='Rating', color='Rating')
fig.show()
In [53]:
plt.figure(figsize=(12,10))
sns.set(style='darkgrid')
ax = sns.countplot(y='release_year', data=netflix_movies, palette='Set2', order=netflix_movies['release_year'].value_counts().index[0:15])

2017年和2018年是大部分电影上映的年份

In [54]:
countries={}
netflix_movies['country']=netflix_movies['country'].fillna('Unknown')
cou=list(netflix_movies['country'])
for i in cou:
#打印(一)
    i=list(i.split(','))
    if len(i)==1:
        if i in list(countries.keys()):
            countries[i]+=1
        else:
            countries[i[0]]=1
    else:
        for j in i:
            if j in list(countries.keys()):
                countries[j]+=1
            else:
                countries[j]=1
In [55]:
countries_fin={}
for country,no in countries.items():
    country=country.replace(' ','')
    if country in list(countries_fin.keys()):
        countries_fin[country]+=no
    else:
        countries_fin[country]=no
        
countries_fin={k: v for k, v in sorted(countries_fin.items(), key=lambda item: item[1], reverse= True)}

十大电影内容创作国¶

In [56]:
plt.figure(figsize=(8,8))
ax = sns.barplot(x=list(countries_fin.keys())[0:10],y=list(countries_fin.values())[0:10])
ax.set_xticklabels(list(countries_fin.keys())[0:10],rotation = 90)
Out[56]:
[Text(0, 0, 'UnitedStates'),
 Text(1, 0, 'France'),
 Text(2, 0, 'UnitedKingdom'),
 Text(3, 0, 'Canada'),
 Text(4, 0, 'Germany'),
 Text(5, 0, 'Belgium'),
 Text(6, 0, 'China'),
 Text(7, 0, 'Spain'),
 Text(8, 0, 'India'),
 Text(9, 0, 'Australia')]

分析电影时长¶

In [57]:
netflix_movies['duration'].isnull().sum()
Out[57]:
0
In [58]:
netflix_movies['duration']=netflix_movies['duration'].str.replace(' min','')
In [59]:
netflix_movies['duration']=netflix_movies['duration'].astype(str).astype(float)
In [60]:
netflix_movies['duration']
Out[60]:
0        90.0
1        94.0
4        99.0
6       110.0
7        60.0
        ...  
5577     70.0
5578    102.0
5579     88.0
5580    109.0
6231     60.0
Name: duration, Length: 4265, dtype: float64

绘制电影的时长¶

In [61]:
sns.set(style='darkgrid')
sns.kdeplot(netflix_movies['duration'], shade=True)
Out[61]:
<Axes: xlabel='duration', ylabel='Density'>

因此,Netflix 上的大量电影时长都在 75-120 分钟之间。考虑到相当多的观众无法一口气看完3小时的电影,这是可以接受的。

In [62]:
from collections import Counter

genres = list(netflix_movies['listed_in'])
gen=[]

for i in genres:
    i = list(i.split(','))
    for j in i:
        gen.append(j.replace(' ',""))
        
g = Counter(gen)

WorldCloud 流派¶

In [63]:
from wordcloud import WordCloud 

text = list(set(gen))
plt.rcParams['figure.figsize'] = (5, 5)

wordcloud = WordCloud(max_words=1000000,background_color="white").generate(str(text))

plt.imshow(wordcloud,interpolation="bilinear")
plt.axis("off")
plt.show()

分析不同影视流派数量¶

In [64]:
g={k: v for k, v in sorted(g.items(), key=lambda item: item[1], reverse= True)}


fig, ax = plt.subplots()

fig = plt.figure(figsize = (14, 10))
x=list(g.keys())
y=list(g.values())
ax.vlines(x, ymin=0, ymax=y, color='green')
ax.plot(x,y, "o", color='maroon')
ax.set_xticklabels(x, rotation = 90)
ax.set_ylabel("Count of movies")
# 设置标题
ax.set_title("Genres")
Out[64]:
Text(0.5, 1.0, 'Genres')
<Figure size 1400x1000 with 0 Axes>

因此,很明显,国际电影、戏剧和喜剧是 Netflix 上内容量最高的三大类型。

推荐系统¶

TF-IDF(词频-逆文档频率(TF-IDF))分数是单词在文档中出现的频率,根据该单词出现的文档数量进行加权。这样做是为了降低情节概述中频繁出现的单词的重要性,从而降低它们在计算最终相似度分数时的重要性。

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer
In [66]:
tfidf = TfidfVectorizer(stop_words='english')

#用空字符串替换NaN
netflix['description'] = netflix['description'].fillna('')

#通过拟合变换构建所需的TF-IDF矩阵
# 数据
tfidf_matrix = tfidf.fit_transform(netflix['description'])

#O/p tfidf_matrix 的形状
tfidf_matrix.shape
Out[66]:
(6234, 16151)
In [67]:
#线性核
from sklearn.metrics.pairwise import linear_kernel

#余弦相似度
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
In [68]:
indices = pd.Series(netflix.index, index = netflix['title']).drop_duplicates()
In [69]:
indices
Out[69]:
title
Norm of the North: King Sized Adventure           0
Jandino: Whatever it Takes                        1
Transformers Prime                                2
Transformers: Robots in Disguise                  3
#realityhigh                                      4
                                               ... 
Red vs. Blue                                   6229
Maron                                          6230
Little Baby Bum: Nursery Rhyme Friends         6231
A Young Doctor's Notebook and Other Stories    6232
Friends                                        6233
Length: 6234, dtype: int64
In [70]:
def get_recommendation(title, cosine_sim=cosine_sim):
    idx = indices[title]
    
#获取所有电影与该电影的成对相似度分数
    sim_scores = list(enumerate(cosine_sim[idx]))
    
#根据相似度分数对电影进行排序
    sim_scores = sorted(sim_scores, key=lambda x:x[1], reverse=True)
    
#获取10部最相似的电影的分数
    sim_scores = sim_scores[1:11]
    
#获取电影索引
    movie_indices = [i[0] for i in sim_scores]
    
#返回前10部相似电影
    return netflix['title'].iloc[movie_indices]

关键词“浴血黑帮” 的推荐¶

In [71]:
get_recommendation('Peaky Blinders')
Out[71]:
296                     Our Godfather
4491                              Don
2015                         The Fear
4852    Jonathan Strange & Mr Norrell
1231                       The Prison
3737                Power Rangers Zeo
5986                       The Tudors
1753      Once Upon a Time in Mumbaai
5494     The Legend of Michael Mishra
1142                  Shelby American
Name: title, dtype: object

关键词“朋友”的推荐¶

In [72]:
get_recommendation('Friends')
Out[72]:
5659                       BoJack Horseman
5987                              Episodes
20                       Manhattan Romance
3923                             Studio 54
5830                            Dad's Army
5843                     Trailer Park Boys
4381                  Single Ladies Senior
1524                            Warehoused
5445    O-Negative, Love Can’t Be Designed
2594                            Life Story
Name: title, dtype: object

关键词“毒枭”的推荐¶

In [73]:
get_recommendation('Narcos')
Out[73]:
1583          Miss Dynamite
4857            El Cartel 2
1257         Narcos: Mexico
5939               El Chapo
5773                Top Boy
5162                Cocaine
732             Street Flow
1480         Raja Natwarlal
1833             Two Graves
3444    Historia de un clan
Name: title, dtype: object

可以看到模型表现不错,但是不太准确。因此,模型中加入了更多的指标来提高性能。¶

基于多个指标的内容过滤¶

根据以下因素进行过滤:

  • Title
  • Cast
  • Director
  • Listed in
  • Plot

用空字符串填充空值

In [74]:
filledna=netflix.fillna('')
filledna.head()
Out[74]:
show_id type title director cast country date_added release_year rating duration listed_in description
0 81145628 Movie Norm of the North: King Sized Adventure Richard Finn, Tim Maltby Alan Marriott, Andrew Toth, Brian Dobson, Cole... United States, India, South Korea, China September 9, 2019 2019 TV-PG 90 min Children & Family Movies, Comedies Before planning an awesome wedding for his gra...
1 80117401 Movie Jandino: Whatever it Takes Jandino Asporaat United Kingdom September 9, 2016 2016 TV-MA 94 min Stand-Up Comedy Jandino Asporaat riffs on the challenges of ra...
2 70234439 TV Show Transformers Prime Peter Cullen, Sumalee Montano, Frank Welker, J... United States September 8, 2018 2013 TV-Y7-FV 1 Season Kids' TV With the help of three human allies, the Autob...
3 80058654 TV Show Transformers: Robots in Disguise Will Friedle, Darren Criss, Constance Zimmer, ... United States September 8, 2018 2016 TV-Y7 1 Season Kids' TV When a prison ship crash unleashes hundreds of...
4 80125979 Movie #realityhigh Fernando Lebrija Nesta Cooper, Kate Walsh, John Michael Higgins... United States September 8, 2017 2017 TV-14 99 min Comedies When nerdy high schooler Dani finally attracts...

清理数据,使所有单词小写

In [75]:
def clean_data(x):
    return str.lower(x.replace(" ",""))
In [76]:
#要过滤模型的特征
features=['title', 'director', 'cast', 'listed_in', 'description']
filledna=filledna[features]
In [77]:
for feature in features:
    filledna[feature] = filledna[feature].apply(clean_data)
    
filledna.head()
Out[77]:
title director cast listed_in description
0 normofthenorth:kingsizedadventure richardfinn,timmaltby alanmarriott,andrewtoth,briandobson,colehoward... children&familymovies,comedies beforeplanninganawesomeweddingforhisgrandfathe...
1 jandino:whateverittakes jandinoasporaat stand-upcomedy jandinoasporaatriffsonthechallengesofraisingki...
2 transformersprime petercullen,sumaleemontano,frankwelker,jeffrey... kids'tv withthehelpofthreehumanallies,theautobotsoncea...
3 transformers:robotsindisguise willfriedle,darrencriss,constancezimmer,kharyp... kids'tv whenaprisonshipcrashunleasheshundredsofdecepti...
4 #realityhigh fernandolebrija nestacooper,katewalsh,johnmichaelhiggins,keith... comedies whennerdyhighschoolerdanifinallyattractstheint...

创建所有行的“词袋”

In [78]:
def create_soup(x):
    return x['title']+ ' '+ x['director']+ ' '+ x['cast']+ ' ' + x['listed_in']+ ' ' +x['description']
In [79]:
filledna['soup'] = filledna.apply(create_soup, axis=1)

从这里开始,代码基本上与上面的模型类似,除了使用count vectorizer而不是tfidf

In [80]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(filledna['soup'])

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)
In [81]:
filledna = filledna.reset_index()
indices = pd.Series(filledna.index, index=filledna['title'])
In [82]:
def get_recommendation_new(title, cosine_sim=cosine_sim):
    title = title.replace(' ', '').lower()
    idx = indices[title]
    
#获取所有电影与该电影的成对相似度分数
    sim_scores = list(enumerate(cosine_sim[idx]))
    
#根据相似度分数对电影进行排序
    sim_scores = sorted(sim_scores, key=lambda x:x[1], reverse=True)
    
#获取10部最相似的电影的分数
    sim_scores = sim_scores[1:11]
    
#获取电影索引
    movie_indices = [i[0] for i in sim_scores]
    
#返回前10部相似电影
    return netflix['title'].iloc[movie_indices]
In [84]:
get_recommendation_new('Friends', cosine_sim2)
Out[84]:
5987                               Episodes
5980                 The Andy Griffith Show
6225                                Frasier
5699                             Still Game
5639                        Toast of London
5830                             Dad's Army
6094                    Pee-wee's Playhouse
5976                                 Cheers
5981    The Twilight Zone (Original Series)
6179                           The IT Crowd
Name: title, dtype: object
In [85]:
get_recommendation_new('Narcos', cosine_sim2)
Out[85]:
1257            Narcos: Mexico
5586        Marvel's Iron Fist
5777        Queen of the South
5610        Person of Interest
5897                   Shooter
5915    Marvel's Jessica Jones
2836                   Smoking
3891            Altered Carbon
5713             Wild District
6085    El señor de los Cielos
Name: title, dtype: object