import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score


# Load the dataset from 'data.csv' (resolved relative to the working directory).
data = pd.read_csv('data.csv')


# Sanity check: print the column names, then evaluate the (rows, columns) shape.
# NOTE: the bare `data.shape` expression is only displayed when it is the last
# statement of a notebook cell; in a plain script it has no visible effect.
print(data.columns)
data.shape

Index(['index', 'title', 'genre', 'summary'], dtype='object')

(4657, 4)


# Preview the first rows of the frame.
data.head()


# Horizontal bar chart of the number of rows per genre.
data['genre'].value_counts().plot(kind='barh')

<Axes: >


# Encode every genre string as an integer code and keep it next to the text label.
# factorize() returns (codes, uniques); only the codes are stored here.
data['genre_id'] = data['genre'].factorize()[0]


# Number of rows per integer genre code.
data['genre_id'].value_counts()

5    1023
0     876
1     647
3     600
4     600
2     500
7     111
6     100
8     100
9     100
Name: genre_id, dtype: int64


# Lower-case every summary with the vectorised string accessor. Unlike the
# previous per-row lambda, a missing summary stays NaN here instead of raising
# AttributeError on a float.
data['summary'] = data['summary'].str.lower()


# Preview the frame after normalisation.
data.head()


from sentence_transformers import SentenceTransformer
# Sentence-embedding model used by getEmbedding below.
model = SentenceTransformer('all-mpnet-base-v2')


def getEmbedding(sentence):
    """Encode *sentence* with the module-level ``model`` and return the result.

    ``model`` is looked up when the function is called, so rebinding that
    global name changes which object this function uses.
    """
    vector = model.encode(sentence)
    return vector

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]


# Run getEmbedding over every summary, one row at a time.
embeddings = data['summary'].apply(getEmbedding)


# Turn each embedding into a plain list via .tolist(), giving a list-of-lists
# feature matrix.
input_data = [vec.tolist() for vec in embeddings]


from sklearn.feature_extraction.text import TfidfVectorizer


# TF-IDF over unigrams and bigrams with sub-linear term-frequency scaling,
# min_df=5 and the built-in English stop-word list.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    ngram_range=(1, 2),
    stop_words='english',
)
# Fit on the summaries and convert the result to a dense array: one row per
# summary, one column per TF-IDF feature.
summary_matrix = tfidf.fit_transform(data.summary)
features = summary_matrix.toarray()
labels = data.genre_id
print("Each of the %d synopsis is represented by %d features (TF-IDF score of unigrams and bigrams)" % features.shape)

Each of the 4657 synopsis is represented by 20152 features (TF-IDF score of unigrams and bigrams)


# Candidate classifiers compared on the TF-IDF features.
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
# 5-fold cross-validation
CV = 5
entries = []
# The loop variable is named `clf`, not `model`, so that it does not rebind the
# module-level `model` that getEmbedding() looks up at call time.
for clf in models:
    model_name = clf.__class__.__name__
    accuracies = cross_val_score(clf, features, labels, scoring='accuracy', cv=CV)
    # One (model, fold, accuracy) record per cross-validation fold.
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(accuracies)
    )
cv_df_tfidf = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])


# Per-fold accuracies for the TF-IDF representation.
cv_df_tfidf


# Candidate classifiers compared on the sentence-embedding features.
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    LinearSVC(),
    LogisticRegression(random_state=0, max_iter=1000)
]
# 5-fold cross-validation
CV = 5
# Integer genre codes are the same for every model, so compute them once
# instead of re-factorizing inside the loop.
genre_codes = data['genre'].factorize()[0]
entries = []
# The loop variable is named `clf`, not `model`, so that it does not rebind the
# module-level `model` that getEmbedding() looks up at call time.
for clf in models:
    model_name = clf.__class__.__name__
    accuracies = cross_val_score(clf, input_data, genre_codes, scoring='accuracy', cv=CV)
    # One (model, fold, accuracy) record per cross-validation fold.
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(accuracies)
    )
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])


# Per-fold accuracies for the sentence-embedding representation.
cv_df


from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


# Hold out 25% of the rows for evaluation; the seed is fixed via random_state.
X_train, X_test, y_train, y_test = train_test_split(
    input_data,
    labels,
    test_size=0.25,
    random_state=1,
)
# Bound to `clf` rather than `model` so this cell does not rebind the
# module-level `model` that getEmbedding() looks up at call time.
clf = LinearSVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Classification report
# NOTE(review): target_names relies on data['genre'].unique() listing genres in
# the same order in which factorize() assigned the integer codes — confirm.
print('\t\t\t\t\tCLASSIFICATIION METRICS\n')
print(classification_report(y_test, y_pred,
                            target_names=data['genre'].unique()))

					CLASSIFICATIION METRICS

              precision    recall  f1-score   support

     fantasy       0.77      0.77      0.77       233
     science       0.74      0.72      0.73       167
       crime       0.66      0.59      0.62       126
     history       0.76      0.77      0.76       145
      horror       0.66      0.59      0.62       132
    thriller       0.67      0.79      0.72       247
  psychology       0.79      0.76      0.78        25
     romance       0.63      0.34      0.44        35
      sports       0.86      0.83      0.85        30
      travel       0.89      1.00      0.94        25

    accuracy                           0.72      1165
   macro avg       0.74      0.72      0.72      1165
weighted avg       0.72      0.72      0.72      1165


# One row per distinct (genre, genre_id) pair; its genre column labels the axes.
genre_id_df = data[['genre', 'genre_id']].drop_duplicates()


# Confusion matrix of the held-out predictions, drawn as an annotated heatmap.
conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 8))
genre_names = genre_id_df.genre.values
sns.heatmap(
    conf_mat,
    annot=True,
    cmap="Blues",
    fmt='d',
    xticklabels=genre_names,
    yticklabels=genre_names,
)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title("CONFUSION MATRIX - LinearSVC\n", size=16);


import pickle


# Write the computed embeddings to disk.
with open('embeddings.pkl', 'wb') as out_file:
    pickle.dump(embeddings, out_file)


# Read the embeddings back from the same file.
# NOTE: pickle.load executes arbitrary code from the file; only load trusted files.
with open('embeddings.pkl', 'rb') as in_file:
    embeddings = pickle.load(in_file)


from sentence_transformers import SentenceTransformer

# Re-create the SentenceTransformer and bind it to the module-level `model`.
model = SentenceTransformer('all-mpnet-base-v2')


def getEmbedding(sentence):
    """Encode *sentence* with the module-level ``model`` and return the result."""
    encoded = model.encode(sentence)
    return encoded

# Example usage: embed a single unseen summary.
test_summary = "Teenager Max McGrath (Ben Winchell) discovers that his body can generate the most powerful energy in the universe. Steel (Josh Brener) is a funny, slightly rebellious, techno-organic extraterrestrial who wants to utilize Max's skills. When the two meet, they combine together to become Max Steel, a superhero with unmatched strength on Earth. They soon learn to rely on each other when Max Steel must square off against an unstoppable enemy from another galaxy."
# The training summaries were lower-cased before being encoded, so apply the
# same normalisation here to keep training and inference inputs consistent.
test_summary_embed = getEmbedding(test_summary.lower())


# Split again, this time with the string genre labels as the target, so the
# classifier predicts genre names directly.
X_train, X_test, y_train, y_test = train_test_split(
    input_data,
    data['genre'],
    test_size=0.25,
    random_state=0,
)
# fit() returns the estimator itself, so fitting in a separate statement leaves
# `trained_model` bound to the same fitted LinearSVC.
trained_model = LinearSVC()
trained_model.fit(X_train, y_train)


# Predict the genre of the example summary embedded earlier.
print(trained_model.predict([test_summary_embed]))

['science']

	index	title	genre	summary
0	0	Drowned Wednesday	fantasy	Drowned Wednesday is the first Trustee among ...
1	1	The Lost Hero	fantasy	As the book opens, Jason awakens on a school ...
2	2	The Eyes of the Overworld	fantasy	Cugel is easily persuaded by the merchant Fia...
3	3	Magic's Promise	fantasy	The book opens with Herald-Mage Vanyel return...
4	4	Taran Wanderer	fantasy	Taran and Gurgi have returned to Caer Dallben...

使用 pandas 读取训练数据集¶

数据预处理¶

使用句子转换器模型将文本转换为数值向量表示¶

使用 Tfidf 向量器与深度学习模型sentence embedding进行比较¶

Tfidf 和sentence embedding模型之间的精度比较¶

使用 Tfidf 向量器的文本嵌入：¶

使用sentence embedding模型的文本嵌入¶

注：从上面的比较可以看出，使用 sentence embedding 生成的文本嵌入时，所有模型的准确率都更高。因此，我们将使用这些嵌入向量进行后续的模型训练¶

此外，与其他模型相比，LinearSVC 的准确率最高。因此，我们使用 LinearSVC 来训练体裁（genre）预测模型¶

模型训练¶

将 embedding 向量保存到文件¶

进行预测¶

	model_name	fold_idx	accuracy
0	RandomForestClassifier	0	0.353004
1	RandomForestClassifier	1	0.362661
2	RandomForestClassifier	2	0.365199
3	RandomForestClassifier	3	0.352309
4	RandomForestClassifier	4	0.303974
5	LinearSVC	0	0.701717
6	LinearSVC	1	0.689914
7	LinearSVC	2	0.711063
8	LinearSVC	3	0.718582
9	LinearSVC	4	0.616541
10	MultinomialNB	0	0.542918
11	MultinomialNB	1	0.524678
12	MultinomialNB	2	0.517723
13	MultinomialNB	3	0.519871
14	MultinomialNB	4	0.470462
15	LogisticRegression	0	0.642704
16	LogisticRegression	1	0.635193
17	LogisticRegression	2	0.670247
18	LogisticRegression	3	0.682062
19	LogisticRegression	4	0.562836