import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')


# Load the Netflix catalogue from a CSV in the working directory.
netflix = pd.read_csv('netflix_titles.csv')
# Preview the first rows.
netflix.head()


# (rows, columns) of the loaded frame.
netflix.shape

(6234, 12)


# Non-null count per column; a count below the row total means missing values.
netflix.count()

show_id         6234
type            6234
title           6234
director        4265
cast            5664
country         5758
date_added      6223
release_year    6234
rating          6224
duration        6234
listed_in       6234
description     6234
dtype: int64


# Split the catalogue by content type. The data labels series as 'TV Show'
# (singular); comparing against 'TV Shows' selects no rows at all.
# .copy() makes each subset an independent frame, so later column
# assignments on it do not act on a slice of `netflix`.
netflix_shows = netflix[netflix['type'] == 'TV Show'].copy()


netflix_movies = netflix[netflix['type'] == 'Movie'].copy()


# Number of titles per content type.
sns.set(style='darkgrid')
ax = sns.countplot(data=netflix, x='type', palette='Set2')


# Movie counts for the 15 most frequent rating labels, most common first.
plt.figure(figsize=(12, 10))
sns.set(style="darkgrid")
top_movie_ratings = netflix_movies['rating'].value_counts().index[0:15]
ax = sns.countplot(
    data=netflix_movies,
    x="rating",
    order=top_movie_ratings,
    palette="Set2",
)


# Load only the needed columns from the two IMDb extracts.
imdb_ratings=pd.read_csv('IMDb ratings.csv',usecols=['weighted_average_vote'])
imdb_titles=pd.read_csv('IMDb movies.csv', usecols=['title','year','genre'])


# Preview the ratings extract.
imdb_ratings.head()


# Preview the titles extract.
imdb_titles.head()


# Combine the two IMDb extracts column-wise into one frame. Columns are
# aligned on index labels, so this presumably relies on both CSVs listing the
# same movies in the same order -- TODO confirm.
ratings = pd.DataFrame({
    'Title': imdb_titles['title'],
    'Release Year': imdb_titles['year'],
    'Rating': imdb_ratings['weighted_average_vote'],
    'Genre': imdb_titles['genre'],
})
# Keep the first row of each (Title, Release Year, Rating) combination.
ratings = ratings.drop_duplicates(subset=['Title', 'Release Year', 'Rating'])
ratings.shape

(85852, 4)


# Drop IMDb rows with any missing value. dropna() returns a new frame rather
# than modifying in place, so the result has to be assigned back.
ratings = ratings.dropna()
# Keep only IMDb entries whose title also appears in the Netflix catalogue.
# NOTE(review): the join uses the title text alone, so different works that
# share a title are paired with each other -- confirm whether the release
# year should be part of the key.
joint_data = ratings.merge(netflix, left_on='Title', right_on='title', how='inner')
# Highest IMDb rating first.
joint_data = joint_data.sort_values(by='Rating', ascending=False)


import plotly.express as px

# Sunburst of the ten highest-rated matches: title on the inner ring, country
# on the outer ring, with segment size and colour both taken from the rating.
top_rated = joint_data[0:10]
sunburst_levels = ['title', 'country']
fig = px.sunburst(top_rated, path=sunburst_levels, values='Rating', color='Rating')
fig.show()


# Movie counts for the 15 release years with the most titles.
plt.figure(figsize=(12, 10))
sns.set(style='darkgrid')
busiest_years = netflix_movies['release_year'].value_counts().index[0:15]
ax = sns.countplot(
    data=netflix_movies,
    y='release_year',
    order=busiest_years,
    palette='Set2',
)


# Count how many movies list each production country. A movie with several
# comma-separated countries adds one to each of them.
countries = {}
netflix_movies['country'] = netflix_movies['country'].fillna('Unknown')
cou = list(netflix_movies['country'])
for entry in cou:
    # One code path handles both single- and multi-country rows. Testing the
    # split list itself against the string keys never matches, which would
    # reset a single-country row's count to 1 instead of incrementing it.
    for name in entry.split(','):
        # Remove the space that follows each comma so ' India' and 'India'
        # share one key.
        name = name.strip()
        if not name:
            # Ignore empty pieces left by stray or trailing commas.
            continue
        countries[name] = countries.get(name, 0) + 1


# Merge keys that differ only by spaces by removing every space from the
# country name and summing the counts of the names that collapse together.
countries_fin = {}
for country, no in countries.items():
    country = country.replace(' ', '')
    countries_fin[country] = countries_fin.get(country, 0) + no

# Rebuild the dict ordered by count, largest first.
ranked_pairs = sorted(countries_fin.items(), key=lambda item: item[1], reverse=True)
countries_fin = dict(ranked_pairs)


# Bar chart of the first ten entries of countries_fin.
plt.figure(figsize=(8, 8))
top_names = list(countries_fin.keys())[0:10]
top_counts = list(countries_fin.values())[0:10]
ax = sns.barplot(x=top_names, y=top_counts)
# Draw the country names vertically so long names do not overlap.
ax.set_xticklabels(top_names, rotation=90)

[Text(0, 0, 'UnitedStates'),
 Text(1, 0, 'France'),
 Text(2, 0, 'UnitedKingdom'),
 Text(3, 0, 'Canada'),
 Text(4, 0, 'Germany'),
 Text(5, 0, 'Belgium'),
 Text(6, 0, 'China'),
 Text(7, 0, 'Spain'),
 Text(8, 0, 'India'),
 Text(9, 0, 'Australia')]


# Number of movies with no duration value.
netflix_movies['duration'].isnull().sum()

0


# Turn durations such as '90 min' into numeric minutes: remove the unit
# suffix, then cast what is left to float.
minutes_text = netflix_movies['duration'].str.replace(' min', '')
netflix_movies['duration'] = minutes_text.astype(str).astype(float)


netflix_movies['duration']

0        90.0
1        94.0
4        99.0
6       110.0
7        60.0
        ...  
5577     70.0
5578    102.0
5579     88.0
5580    109.0
6231     60.0
Name: duration, Length: 4265, dtype: float64


# Density estimate of movie durations.
sns.set(style='darkgrid')
# `fill` replaces the deprecated `shade` argument for filling the area under
# the curve.
sns.kdeplot(netflix_movies['duration'], fill=True)

<Axes: xlabel='duration', ylabel='Density'>


from collections import Counter

# Each movie lists one or more comma-separated genres. Flatten them into one
# list, removing spaces so the same genre always has the same spelling.
genres = list(netflix_movies['listed_in'])
gen = [
    label.replace(' ', "")
    for entry in genres
    for label in entry.split(',')
]

# Occurrences of each genre across all movies.
g = Counter(gen)


from wordcloud import WordCloud

# Word cloud built from the distinct genre names; duplicates are removed
# first, so each name appears once in the text passed to the generator.
text = list(set(gen))
plt.rcParams['figure.figsize'] = (5, 5)

cloud_maker = WordCloud(background_color="white", max_words=1000000)
wordcloud = cloud_maker.generate(str(text))

plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()


# Order genres by movie count, largest first.
g = dict(sorted(g.items(), key=lambda item: item[1], reverse=True))


# Lollipop chart of movie counts per genre. The size is given to
# plt.subplots so it applies to the figure that owns `ax`; calling
# plt.figure() afterwards would only open a second, empty figure.
fig, ax = plt.subplots(figsize=(14, 10))
x = list(g.keys())
y = list(g.values())
ax.vlines(x, ymin=0, ymax=y, color='green')
ax.plot(x, y, "o", color='maroon')
ax.set_xticklabels(x, rotation=90)
ax.set_ylabel("Count of movies")
# Set the title
ax.set_title("Genres")

Text(0.5, 1.0, 'Genres')

<Figure size 1400x1000 with 0 Axes>


from sklearn.feature_extraction.text import TfidfVectorizer


# TF-IDF vectorizer that ignores common English stop words.
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty string
netflix['description'] = netflix['description'].fillna('')

# Build the required TF-IDF matrix by fitting and transforming the
# description text
tfidf_matrix = tfidf.fit_transform(netflix['description'])

# Output the shape of tfidf_matrix
tfidf_matrix.shape

(6234, 16151)


# Linear kernel (dot product between rows)
from sklearn.metrics.pairwise import linear_kernel

# Pairwise similarity of every description with every other description.
# With a single argument the kernel is computed between the matrix and
# itself. TfidfVectorizer L2-normalises rows by default, so this dot product
# equals the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix)


# Map each title to its row label in `netflix`.
indices = pd.Series(netflix.index, index=netflix['title'])
# Series.drop_duplicates() compares the values (row numbers, all distinct),
# so it cannot remove repeated titles. Deduplicate on the title labels
# instead, keeping the first row of each, so a lookup yields one position.
indices = indices[~indices.index.duplicated(keep='first')]


indices

title
Norm of the North: King Sized Adventure           0
Jandino: Whatever it Takes                        1
Transformers Prime                                2
Transformers: Robots in Disguise                  3
#realityhigh                                      4
                                               ... 
Red vs. Blue                                   6229
Maron                                          6230
Little Baby Bum: Nursery Rhyme Friends         6231
A Young Doctor's Notebook and Other Stories    6232
Friends                                        6233
Length: 6234, dtype: int64


def get_recommendation(title, cosine_sim=cosine_sim):
    """Return the titles of the ten entries most similar to ``title``.

    Args:
        title: Title to look up in the module-level ``indices`` Series.
        cosine_sim: Similarity matrix indexed by row position; row ``i`` is
            read as the similarities of entry ``i`` to every entry. Defaults
            to the module-level ``cosine_sim`` bound when the function is
            defined.

    Returns:
        A Series with up to ten titles from ``netflix``, most similar first.

    Raises:
        KeyError: If ``title`` is not a key of ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once yields a Series of positions; use
    # the first one so the row lookup below stays one-dimensional.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every other row with its similarity score. The requested row is
    # excluded by position rather than by assuming it sorts first, which
    # does not hold when another row ties with it.
    sim_scores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]

    # Most similar first; sorted() is stable, so ties keep row order.
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Row positions of the ten best matches.
    movie_indices = [i for i, _ in sim_scores[:10]]

    # Titles of the ten most similar entries.
    return netflix['title'].iloc[movie_indices]


# Example: recommendations for 'Peaky Blinders' using the default similarity matrix.
get_recommendation('Peaky Blinders')

296                     Our Godfather
4491                              Don
2015                         The Fear
4852    Jonathan Strange & Mr Norrell
1231                       The Prison
3737                Power Rangers Zeo
5986                       The Tudors
1753      Once Upon a Time in Mumbaai
5494     The Legend of Michael Mishra
1142                  Shelby American
Name: title, dtype: object


# Example: recommendations for 'Friends' using the default similarity matrix.
get_recommendation('Friends')

5659                       BoJack Horseman
5987                              Episodes
20                       Manhattan Romance
3923                             Studio 54
5830                            Dad's Army
5843                     Trailer Park Boys
4381                  Single Ladies Senior
1524                            Warehoused
5445    O-Negative, Love Can’t Be Designed
2594                            Life Story
Name: title, dtype: object


# Example: recommendations for 'Narcos' using the default similarity matrix.
get_recommendation('Narcos')

1583          Miss Dynamite
4857            El Cartel 2
1257         Narcos: Mexico
5939               El Chapo
5773                Top Boy
5162                Cocaine
732             Street Flow
1480         Raja Natwarlal
1833             Two Graves
3444    Historia de un clan
Name: title, dtype: object


# Copy of the catalogue with every missing value replaced by an empty string.
filledna=netflix.fillna('')
filledna.head()


def clean_data(x):
    """Return ``x`` lower-cased with every space character removed."""
    without_spaces = x.replace(" ", "")
    return without_spaces.lower()


# Features the model is filtered on
features = ['title', 'director', 'cast', 'listed_in', 'description']
filledna = filledna[features]


# Normalise every selected column with clean_data.
for feature in features:
    cleaned_column = filledna[feature].apply(clean_data)
    filledna[feature] = cleaned_column

filledna.head()


def create_soup(x):
    """Join the row's title, director, cast, listed_in and description with single spaces."""
    fields = ('title', 'director', 'cast', 'listed_in', 'description')
    return ' '.join(x[name] for name in fields)


# One combined text field per title.
filledna['soup'] = filledna.apply(create_soup, axis=1)


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Token counts over the combined text, ignoring English stop words.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(filledna['soup'])

# Pairwise cosine similarity of the rows; with a single argument the matrix
# is compared against itself.
cosine_sim2 = cosine_similarity(count_matrix)


# Move the current index into a column and renumber the rows from 0.
filledna = filledna.reset_index()
# Map each cleaned title to its row position.
# NOTE(review): this rebinds the module-level `indices` that
# get_recommendation also reads, so from here on that function is keyed by
# cleaned titles (lower-case, no spaces) -- confirm this is intended.
indices = pd.Series(filledna.index, index=filledna['title'])


def get_recommendation_new(title, cosine_sim=cosine_sim):
    """Return the titles of the ten entries most similar to ``title``.

    The title is normalised (spaces removed, lower-cased) before it is looked
    up in the module-level ``indices`` Series.

    Args:
        title: Human-readable title, e.g. ``'Peaky Blinders'``.
        cosine_sim: Similarity matrix indexed by row position; row ``i`` is
            read as the similarities of entry ``i`` to every entry. Defaults
            to the module-level ``cosine_sim`` bound when the function is
            defined.

    Returns:
        A Series with up to ten titles from ``netflix``, most similar first.

    Raises:
        KeyError: If the normalised title is not a key of ``indices``.
    """
    title = title.replace(' ', '').lower()
    idx = indices[title]
    # A title that occurs more than once yields a Series of positions; use
    # the first one so the row lookup below stays one-dimensional.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every other row with its similarity score. The requested row is
    # excluded by position rather than by assuming it sorts first, which
    # does not hold when another row ties with it.
    sim_scores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]

    # Most similar first; sorted() is stable, so ties keep row order.
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Row positions of the ten best matches.
    movie_indices = [i for i, _ in sim_scores[:10]]

    # Titles of the ten most similar entries.
    return netflix['title'].iloc[movie_indices]


# Example: recommendations for 'Friends' using the count-based similarity matrix.
get_recommendation_new('Friends', cosine_sim2)

5987                               Episodes
5980                 The Andy Griffith Show
6225                                Frasier
5699                             Still Game
5639                        Toast of London
5830                             Dad's Army
6094                    Pee-wee's Playhouse
5976                                 Cheers
5981    The Twilight Zone (Original Series)
6179                           The IT Crowd
Name: title, dtype: object


# Example: recommendations for 'Narcos' using the count-based similarity matrix.
get_recommendation_new('Narcos', cosine_sim2)

1257            Narcos: Mexico
5586        Marvel's Iron Fist
5777        Queen of the South
5610        Person of Interest
5897                   Shooter
5915    Marvel's Jessica Jones
2836                   Smoking
3891            Altered Carbon
5713             Wild District
6085    El señor de los Cielos
Name: title, dtype: object

	show_id	type	title	director	cast	country	date_added	release_year	rating	duration	listed_in	description
0	81145628	Movie	Norm of the North: King Sized Adventure	Richard Finn, Tim Maltby	Alan Marriott, Andrew Toth, Brian Dobson, Cole...	United States, India, South Korea, China	September 9, 2019	2019	TV-PG	90 min	Children & Family Movies, Comedies	Before planning an awesome wedding for his gra...
1	80117401	Movie	Jandino: Whatever it Takes	NaN	Jandino Asporaat	United Kingdom	September 9, 2016	2016	TV-MA	94 min	Stand-Up Comedy	Jandino Asporaat riffs on the challenges of ra...
2	70234439	TV Show	Transformers Prime	NaN	Peter Cullen, Sumalee Montano, Frank Welker, J...	United States	September 8, 2018	2013	TV-Y7-FV	1 Season	Kids' TV	With the help of three human allies, the Autob...
3	80058654	TV Show	Transformers: Robots in Disguise	NaN	Will Friedle, Darren Criss, Constance Zimmer, ...	United States	September 8, 2018	2016	TV-Y7	1 Season	Kids' TV	When a prison ship crash unleashes hundreds of...
4	80125979	Movie	#realityhigh	Fernando Lebrija	Nesta Cooper, Kate Walsh, John Michael Higgins...	United States	September 8, 2017	2017	TV-14	99 min	Comedies	When nerdy high schooler Dani finally attracts...

	weighted_average_vote
0	5.9
1	6.1
2	5.8
3	5.2
4	7.0

	title	year	genre
0	Miss Jerry	1894	Romance
1	The Story of the Kelly Gang	1906	Biography, Crime, Drama
2	Den sorte drøm	1911	Drama
3	Cleopatra	1912	Drama, History
4	L'Inferno	1911	Adventure, Drama, Fantasy

	show_id	type	title	director	cast	country	date_added	release_year	rating	duration	listed_in	description
0	81145628	Movie	Norm of the North: King Sized Adventure	Richard Finn, Tim Maltby	Alan Marriott, Andrew Toth, Brian Dobson, Cole...	United States, India, South Korea, China	September 9, 2019	2019	TV-PG	90 min	Children & Family Movies, Comedies	Before planning an awesome wedding for his gra...
1	80117401	Movie	Jandino: Whatever it Takes		Jandino Asporaat	United Kingdom	September 9, 2016	2016	TV-MA	94 min	Stand-Up Comedy	Jandino Asporaat riffs on the challenges of ra...
2	70234439	TV Show	Transformers Prime		Peter Cullen, Sumalee Montano, Frank Welker, J...	United States	September 8, 2018	2013	TV-Y7-FV	1 Season	Kids' TV	With the help of three human allies, the Autob...
3	80058654	TV Show	Transformers: Robots in Disguise		Will Friedle, Darren Criss, Constance Zimmer, ...	United States	September 8, 2018	2016	TV-Y7	1 Season	Kids' TV	When a prison ship crash unleashes hundreds of...
4	80125979	Movie	#realityhigh	Fernando Lebrija	Nesta Cooper, Kate Walsh, John Michael Higgins...	United States	September 8, 2017	2017	TV-14	99 min	Comedies	When nerdy high schooler Dani finally attracts...

	title	director	cast	listed_in	description
0	normofthenorth:kingsizedadventure	richardfinn,timmaltby	alanmarriott,andrewtoth,briandobson,colehoward...	children&familymovies,comedies	beforeplanninganawesomeweddingforhisgrandfathe...
1	jandino:whateverittakes		jandinoasporaat	stand-upcomedy	jandinoasporaatriffsonthechallengesofraisingki...
2	transformersprime		petercullen,sumaleemontano,frankwelker,jeffrey...	kids'tv	withthehelpofthreehumanallies,theautobotsoncea...
3	transformers:robotsindisguise		willfriedle,darrencriss,constancezimmer,kharyp...	kids'tv	whenaprisonshipcrashunleasheshundredsofdecepti...
4	#realityhigh	fernandolebrija	nestacooper,katewalsh,johnmichaelhiggins,keith...	comedies	whennerdyhighschoolerdanifinallyattractstheint...

可视化时间

电影分级分析

分析 IMDB 评分以获得 Netflix 上评分最高的电影

十大电影内容创作国

分析电影时长

绘制电影的时长

WordCloud 流派

分析不同影视流派数量

推荐系统

关键词“浴血黑帮”（Peaky Blinders）的推荐

关键词“老友记”（Friends）的推荐

关键词“毒枭”（Narcos）的推荐

可以看到模型表现不错，但是不太准确。因此，模型中加入了更多的指标来提高性能。

基于多个指标的内容过滤