import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
# Load the dataset from data.csv into a DataFrame.
data = pd.read_csv('data.csv')
# Show the column names, then the (rows, columns) shape as the cell's displayed value.
print(data.columns)
data.shape
Index(['index', 'title', 'genre', 'summary'], dtype='object')
(4657, 4)
# Preview the first rows of the raw dataset.
data.head()
index | title | genre | summary | |
---|---|---|---|---|
0 | 0 | Drowned Wednesday | fantasy | Drowned Wednesday is the first Trustee among ... |
1 | 1 | The Lost Hero | fantasy | As the book opens, Jason awakens on a school ... |
2 | 2 | The Eyes of the Overworld | fantasy | Cugel is easily persuaded by the merchant Fia... |
3 | 3 | Magic's Promise | fantasy | The book opens with Herald-Mage Vanyel return... |
4 | 4 | Taran Wanderer | fantasy | Taran and Gurgi have returned to Caer Dallben... |
# Horizontal bar chart of the number of rows per genre (shows the class imbalance).
data['genre'].value_counts().plot(kind='barh')
<Axes: >
# Encode each genre string as an integer code; factorize() returns (codes, uniques),
# and only the codes are kept here.
data['genre_id'] = data['genre'].factorize()[0]
# Number of rows per integer genre code.
data['genre_id'].value_counts()
5 1023 0 876 1 647 3 600 4 600 2 500 7 111 6 100 8 100 9 100 Name: genre_id, dtype: int64
# Lower-case every summary in place using the vectorised string accessor
# instead of a per-row Python lambda. For string values the result is the
# same; missing values are propagated as NaN rather than raising
# AttributeError inside the lambda.
data['summary'] = data['summary'].str.lower()
data.head()
index | title | genre | summary | genre_id | |
---|---|---|---|---|---|
0 | 0 | Drowned Wednesday | fantasy | drowned wednesday is the first trustee among ... | 0 |
1 | 1 | The Lost Hero | fantasy | as the book opens, jason awakens on a school ... | 0 |
2 | 2 | The Eyes of the Overworld | fantasy | cugel is easily persuaded by the merchant fia... | 0 |
3 | 3 | Magic's Promise | fantasy | the book opens with herald-mage vanyel return... | 0 |
4 | 4 | Taran Wanderer | fantasy | taran and gurgi have returned to caer dallben... | 0 |
from sentence_transformers import SentenceTransformer

# Sentence encoder used to turn summaries into embedding vectors.
model = SentenceTransformer('all-mpnet-base-v2')


def getEmbedding(sentence, _encoder=model):
    """Return the embedding that the sentence encoder produces for *sentence*.

    The encoder is captured as a default argument at definition time because
    the global name ``model`` is rebound to scikit-learn estimators later in
    this file; looking it up at call time would then call ``.encode`` on a
    classifier and fail.

    Args:
        sentence: Text to embed; passed straight to ``encode``.
        _encoder: Object exposing ``encode``. Defaults to the
            SentenceTransformer created above; callers normally omit it.

    Returns:
        Whatever ``_encoder.encode(sentence)`` returns.
    """
    return _encoder.encode(sentence)
.gitattributes: 0%| | 0.00/1.18k [00:00<?, ?B/s]
1_Pooling/config.json: 0%| | 0.00/190 [00:00<?, ?B/s]
README.md: 0%| | 0.00/10.6k [00:00<?, ?B/s]
config.json: 0%| | 0.00/571 [00:00<?, ?B/s]
config_sentence_transformers.json: 0%| | 0.00/116 [00:00<?, ?B/s]
data_config.json: 0%| | 0.00/39.3k [00:00<?, ?B/s]
pytorch_model.bin: 0%| | 0.00/438M [00:00<?, ?B/s]
sentence_bert_config.json: 0%| | 0.00/53.0 [00:00<?, ?B/s]
special_tokens_map.json: 0%| | 0.00/239 [00:00<?, ?B/s]
tokenizer.json: 0%| | 0.00/466k [00:00<?, ?B/s]
tokenizer_config.json: 0%| | 0.00/363 [00:00<?, ?B/s]
train_script.py: 0%| | 0.00/13.1k [00:00<?, ?B/s]
vocab.txt: 0%| | 0.00/232k [00:00<?, ?B/s]
modules.json: 0%| | 0.00/349 [00:00<?, ?B/s]
# Embed every summary; `embeddings` is a Series holding one vector per row.
# getEmbedding is passed directly -- wrapping it in a lambda adds nothing.
embeddings = data['summary'].apply(getEmbedding)

# Convert each embedding to a plain Python list so the collection can be
# handed to scikit-learn as a list of equal-length feature rows.
input_data = [vector.tolist() for vector in embeddings]
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over unigrams and bigrams, ignoring English stop words and terms
# that occur in fewer than 5 summaries; term frequencies are log-scaled.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    ngram_range=(1, 2),
    stop_words='english',
)

# Transform each summary into a TF-IDF vector and densify the result.
tfidf_matrix = tfidf.fit_transform(data.summary)
features = tfidf_matrix.toarray()

# Integer genre codes are the classification targets.
labels = data.genre_id

n_summaries, n_features = features.shape
print("Each of the %d synopsis is represented by %d features (TF-IDF score of unigrams and bigrams)" % (n_summaries, n_features))
Each of the 4657 synopsis is represented by 20152 features (TF-IDF score of unigrams and bigrams)
# Candidate classifiers compared on the TF-IDF features.
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

# 5-fold cross-validation
CV = 5

# Collect one (model name, fold index, accuracy) row per fold and classifier.
# The loop variable is deliberately not called `model`: that global name
# holds the sentence encoder used by getEmbedding and must not be rebound.
entries = []
for clf in models:
    model_name = clf.__class__.__name__
    accuracies = cross_val_score(clf, features, labels, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(accuracies)
    )

cv_df_tfidf = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
cv_df_tfidf
model_name | fold_idx | accuracy | |
---|---|---|---|
0 | RandomForestClassifier | 0 | 0.353004 |
1 | RandomForestClassifier | 1 | 0.362661 |
2 | RandomForestClassifier | 2 | 0.365199 |
3 | RandomForestClassifier | 3 | 0.352309 |
4 | RandomForestClassifier | 4 | 0.303974 |
5 | LinearSVC | 0 | 0.701717 |
6 | LinearSVC | 1 | 0.689914 |
7 | LinearSVC | 2 | 0.711063 |
8 | LinearSVC | 3 | 0.718582 |
9 | LinearSVC | 4 | 0.616541 |
10 | MultinomialNB | 0 | 0.542918 |
11 | MultinomialNB | 1 | 0.524678 |
12 | MultinomialNB | 2 | 0.517723 |
13 | MultinomialNB | 3 | 0.519871 |
14 | MultinomialNB | 4 | 0.470462 |
15 | LogisticRegression | 0 | 0.642704 |
16 | LogisticRegression | 1 | 0.635193 |
17 | LogisticRegression | 2 | 0.670247 |
18 | LogisticRegression | 3 | 0.682062 |
19 | LogisticRegression | 4 | 0.562836 |
# Candidate classifiers compared on the sentence-embedding features.
# MultinomialNB is not part of this list (presumably because embeddings can
# contain negative values, which that estimator rejects -- confirm).
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    LinearSVC(),
    LogisticRegression(random_state=0, max_iter=1000),
]

# 5-fold cross-validation
CV = 5

# Integer genre codes; computed once instead of on every loop iteration.
genre_codes = data['genre'].factorize()[0]

# Collect one (model name, fold index, accuracy) row per fold and classifier.
# The loop variable is deliberately not called `model`: that global name
# holds the sentence encoder used by getEmbedding and must not be rebound.
entries = []
for clf in models:
    model_name = clf.__class__.__name__
    accuracies = cross_val_score(clf, input_data, genre_codes, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(accuracies)
    )

cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
cv_df
model_name | fold_idx | accuracy | |
---|---|---|---|
0 | RandomForestClassifier | 0 | 0.524678 |
1 | RandomForestClassifier | 1 | 0.498927 |
2 | RandomForestClassifier | 2 | 0.503759 |
3 | RandomForestClassifier | 3 | 0.502685 |
4 | RandomForestClassifier | 4 | 0.479055 |
5 | LinearSVC | 0 | 0.713519 |
6 | LinearSVC | 1 | 0.688841 |
7 | LinearSVC | 2 | 0.737916 |
8 | LinearSVC | 3 | 0.738990 |
9 | LinearSVC | 4 | 0.654135 |
10 | LogisticRegression | 0 | 0.695279 |
11 | LogisticRegression | 1 | 0.663090 |
12 | LogisticRegression | 2 | 0.730397 |
13 | LogisticRegression | 3 | 0.726101 |
14 | LogisticRegression | 4 | 0.662728 |
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Hold out 25% of the embedded summaries for evaluation.
X_train, X_test, y_train, y_test = train_test_split(input_data,
                                                    labels,
                                                    test_size=0.25,
                                                    random_state=1)
model = LinearSVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# `labels` holds the factorize() codes stored in data['genre_id'], which are
# assigned in order of first appearance -- the same order unique() returns.
# Passing the codes explicitly keeps every genre name attached to its own
# code even if a rare genre is absent from the test split; without `labels=`
# the report would have fewer classes than `target_names`.
genre_names = data['genre'].unique()
genre_codes = list(range(len(genre_names)))

# Classification report
print('\t\t\t\t\tCLASSIFICATIION METRICS\n')
print(classification_report(y_test, y_pred,
                            labels=genre_codes,
                            target_names=genre_names))
CLASSIFICATIION METRICS precision recall f1-score support fantasy 0.77 0.77 0.77 233 science 0.74 0.72 0.73 167 crime 0.66 0.59 0.62 126 history 0.76 0.77 0.76 145 horror 0.66 0.59 0.62 132 thriller 0.67 0.79 0.72 247 psychology 0.79 0.76 0.78 25 romance 0.63 0.34 0.44 35 sports 0.86 0.83 0.85 30 travel 0.89 1.00 0.94 25 accuracy 0.72 1165 macro avg 0.74 0.72 0.72 1165 weighted avg 0.72 0.72 0.72 1165
# One row per genre with its integer code, ordered by code so that position i
# in the tick labels corresponds to code i in the matrix.
genre_id_df = (
    data[['genre', 'genre_id']]
    .drop_duplicates()
    .sort_values('genre_id')
)

# Fix the row/column order (and count) explicitly. Without `labels=` the
# matrix only covers codes present in y_test/y_pred, so a genre missing from
# the split would shift the remaining rows against the tick labels.
conf_mat = confusion_matrix(y_test, y_pred, labels=genre_id_df.genre_id.values)

fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(conf_mat, annot=True, cmap="Blues", fmt='d',
            xticklabels=genre_id_df.genre.values,
            yticklabels=genre_id_df.genre.values)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title("CONFUSION MATRIX - LinearSVC\n", size=16);
import pickle
# Write the embeddings to embeddings.pkl ...
with open('embeddings.pkl', 'wb') as out_fh:
    pickle.dump(embeddings, out_fh)

# ... and read them straight back into `embeddings`.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# were produced by a trusted source such as the dump above.
with open('embeddings.pkl', 'rb') as in_fh:
    embeddings = pickle.load(in_fh)
from sentence_transformers import SentenceTransformer

# Initialize the SentenceTransformer model
model = SentenceTransformer('all-mpnet-base-v2')


def getEmbedding(sentence, _encoder=model):
    """Generate and return the embedding for the provided sentence.

    The encoder is captured as a default argument at definition time so the
    function keeps working even if the global name ``model`` is later
    rebound to something else (this file also uses ``model`` for classifiers).

    Args:
        sentence: Text to embed; passed straight to ``encode``.
        _encoder: Object exposing ``encode``. Defaults to the
            SentenceTransformer created above; callers normally omit it.

    Returns:
        Whatever ``_encoder.encode(sentence)`` returns.
    """
    return _encoder.encode(sentence)


# Example usage
test_summary = "Teenager Max McGrath (Ben Winchell) discovers that his body can generate the most powerful energy in the universe. Steel (Josh Brener) is a funny, slightly rebellious, techno-organic extraterrestrial who wants to utilize Max's skills. When the two meet, they combine together to become Max Steel, a superhero with unmatched strength on Earth. They soon learn to rely on each other when Max Steel must square off against an unstoppable enemy from another galaxy."

# The training summaries were lower-cased before being embedded, so apply the
# same preprocessing to the text being classified.
test_summary_embed = getEmbedding(test_summary.lower())

# Train on 75% of the embedded summaries, using the genre names as targets so
# the prediction is printed as a readable label.
X_train, X_test, y_train, y_test = train_test_split(input_data, data['genre'],
                                                    test_size=0.25,
                                                    random_state=0)
trained_model = LinearSVC().fit(X_train, y_train)

# predict() expects a 2-D input, hence the single-row list.
print(trained_model.predict([test_summary_embed]))
['science']