In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

使用 pandas 读取训练数据集¶

In [2]:
# Read the dataset from data.csv (path is relative to the notebook's working directory).
data = pd.read_csv('data.csv')
In [3]:
# Inspect the schema: print the column names, then show (rows, columns) as the cell result.
print(data.columns)
data.shape
Index(['index', 'title', 'genre', 'summary'], dtype='object')
Out[3]:
(4657, 4)
In [4]:
# Preview the first rows of the raw data.
data.head()
Out[4]:
index title genre summary
0 0 Drowned Wednesday fantasy Drowned Wednesday is the first Trustee among ...
1 1 The Lost Hero fantasy As the book opens, Jason awakens on a school ...
2 2 The Eyes of the Overworld fantasy Cugel is easily persuaded by the merchant Fia...
3 3 Magic's Promise fantasy The book opens with Herald-Mage Vanyel return...
4 4 Taran Wanderer fantasy Taran and Gurgi have returned to Caer Dallben...
In [5]:
# Horizontal bar chart of the number of rows per genre, to check class balance.
data['genre'].value_counts().plot(kind='barh')
Out[5]:
<Axes: >

数据预处理¶

In [6]:
# Encode each genre name as an integer code; factorize() numbers the genres
# in order of first appearance, so the first genre in the file becomes 0.
data['genre_id'] = data['genre'].factorize()[0]
In [7]:
# Number of rows per integer genre code.
data['genre_id'].value_counts()
Out[7]:
5    1023
0     876
1     647
3     600
4     600
2     500
7     111
6     100
8     100
9     100
Name: genre_id, dtype: int64
In [8]:
# Lower-case every summary with the vectorized string accessor. Unlike calling
# x.lower() row by row, .str.lower() passes missing values through as NaN
# instead of raising AttributeError on a float.
data['summary'] = data['summary'].str.lower()
In [9]:
# Preview again to confirm the lower-cased summaries and the new genre_id column.
data.head()
Out[9]:
index title genre summary genre_id
0 0 Drowned Wednesday fantasy drowned wednesday is the first trustee among ... 0
1 1 The Lost Hero fantasy as the book opens, jason awakens on a school ... 0
2 2 The Eyes of the Overworld fantasy cugel is easily persuaded by the merchant fia... 0
3 3 Magic's Promise fantasy the book opens with herald-mage vanyel return... 0
4 4 Taran Wanderer fantasy taran and gurgi have returned to caer dallben... 0

使用句子转换器模型将文本转换为数值向量表示¶

In [10]:
from sentence_transformers import SentenceTransformer

# Keep the encoder under a dedicated name: other cells in this notebook bind
# `model` to scikit-learn classifiers, which would silently break getEmbedding
# if it looked the encoder up through that generic name.
sentence_encoder = SentenceTransformer('all-mpnet-base-v2')
model = sentence_encoder  # backward-compatible alias for the original name


def getEmbedding(sentence):
    """Return the sentence-transformer embedding of ``sentence``.

    Args:
        sentence: Text to embed; it is forwarded unchanged to
            ``SentenceTransformer.encode``.

    Returns:
        Whatever ``encode`` returns for the input (the notebook later calls
        ``.tolist()`` on each result).
    """
    return sentence_encoder.encode(sentence)
.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]
1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]
README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]
config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]
config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]
data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]
pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]
sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]
special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]
tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]
tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]
train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]
vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]
modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]
In [12]:
# Embed every summary one row at a time; the result is a Series aligned with
# `data` whose elements are the per-summary embeddings.
embeddings = data['summary'].apply(getEmbedding)
In [13]:
# Convert each embedding to a plain Python list so the feature matrix can be
# handed to scikit-learn as a list of equal-length rows.
input_data = [vector.tolist() for vector in embeddings]

使用 Tfidf 向量器与深度学习模型sentence embedding进行比较¶

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
In [15]:
# TF-IDF baseline over unigrams and bigrams: sublinear term-frequency scaling,
# terms appearing in fewer than 5 summaries dropped, English stop words removed.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                        ngram_range=(1, 2),
                        stop_words='english')
# Transform each book summary into a TF-IDF vector. The result is kept as the
# SciPy sparse matrix returned by fit_transform: densifying it with .toarray()
# would allocate roughly 750 MB (4657 x 20152 float64), and the classifiers
# used in this notebook accept sparse input directly.
# NOTE(review): the vectorizer is fitted on every row before cross-validation,
# so the vocabulary and IDF weights also see the validation folds; wrapping the
# vectorizer and classifier in a Pipeline would remove that leakage.
features = tfidf.fit_transform(data.summary)
labels = data.genre_id
print("Each of the %d synopsis is represented by %d features (TF-IDF score of unigrams and bigrams)" %(features.shape))
Each of the 4657 synopsis is represented by 20152 features (TF-IDF score of unigrams and bigrams)

Tfidf 和sentence embedding模型之间的精度比较¶

使用 Tfidf 向量器的文本嵌入:¶

In [16]:
# Candidate classifiers evaluated on the TF-IDF features.
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
# 5-fold cross-validation
CV = 5
entries = []
# The loop variable is deliberately not named `model`: getEmbedding depends on
# that global being the SentenceTransformer, and iterating with it would leave
# a scikit-learn classifier bound to the name after this cell.
for clf in models:
    model_name = clf.__class__.__name__
    accuracies = cross_val_score(clf, features, labels, scoring='accuracy', cv=CV)
    # One (model, fold, accuracy) record per fold.
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(accuracies)
    )
cv_df_tfidf = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
In [17]:
# Per-fold accuracy of every classifier on the TF-IDF features.
cv_df_tfidf
Out[17]:
model_name fold_idx accuracy
0 RandomForestClassifier 0 0.353004
1 RandomForestClassifier 1 0.362661
2 RandomForestClassifier 2 0.365199
3 RandomForestClassifier 3 0.352309
4 RandomForestClassifier 4 0.303974
5 LinearSVC 0 0.701717
6 LinearSVC 1 0.689914
7 LinearSVC 2 0.711063
8 LinearSVC 3 0.718582
9 LinearSVC 4 0.616541
10 MultinomialNB 0 0.542918
11 MultinomialNB 1 0.524678
12 MultinomialNB 2 0.517723
13 MultinomialNB 3 0.519871
14 MultinomialNB 4 0.470462
15 LogisticRegression 0 0.642704
16 LogisticRegression 1 0.635193
17 LogisticRegression 2 0.670247
18 LogisticRegression 3 0.682062
19 LogisticRegression 4 0.562836

使用sentence embedding模型的文本嵌入¶

In [20]:
# Candidate classifiers evaluated on the sentence embeddings.
# NOTE(review): MultinomialNB is not evaluated here, presumably because
# embeddings can contain negative values, which it rejects - confirm.
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    LinearSVC(),
    LogisticRegression(random_state=0, max_iter=1000)
]
# 5-fold cross-validation
CV = 5
entries = []
# The loop variable is deliberately not named `model`: getEmbedding depends on
# that global being the SentenceTransformer, and iterating with it would leave
# a scikit-learn classifier bound to the name after this cell.
for clf in models:
    model_name = clf.__class__.__name__
    # Reuse the genre codes computed during preprocessing instead of
    # factorizing the genre column a second time.
    accuracies = cross_val_score(clf, input_data, data['genre_id'], scoring='accuracy', cv=CV)
    # One (model, fold, accuracy) record per fold.
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(accuracies)
    )
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
In [22]:
# Per-fold accuracy of every classifier on the sentence-embedding features.
cv_df
Out[22]:
model_name fold_idx accuracy
0 RandomForestClassifier 0 0.524678
1 RandomForestClassifier 1 0.498927
2 RandomForestClassifier 2 0.503759
3 RandomForestClassifier 3 0.502685
4 RandomForestClassifier 4 0.479055
5 LinearSVC 0 0.713519
6 LinearSVC 1 0.688841
7 LinearSVC 2 0.737916
8 LinearSVC 3 0.738990
9 LinearSVC 4 0.654135
10 LogisticRegression 0 0.695279
11 LogisticRegression 1 0.663090
12 LogisticRegression 2 0.730397
13 LogisticRegression 3 0.726101
14 LogisticRegression 4 0.662728

注:从上面的比较中可以看出,使用 sentence embedding 生成的文本嵌入时,各模型的平均交叉验证准确率都高于使用 TF-IDF 特征时的结果(个别折除外,例如 LinearSVC 的第 1 折略低)。因此,我们将使用这些嵌入向量进行后续的模型训练¶

此外,与其他模型相比,LinearSVC 的准确率最高。因此,让我们使用 LinearSVC 来训练我们的体裁(genre)预测模型吧¶

模型训练¶

In [23]:
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
In [24]:
# Hold out 25% of the embedded summaries for evaluation; targets are the
# integer genre codes.
X_train, X_test, y_train, y_test = train_test_split(input_data,
                                                    labels,
                                                    test_size=0.25,
                                                    random_state=1)
# Use a classifier-specific name rather than rebinding the generic `model`,
# which getEmbedding relies on being the sentence encoder.
genre_clf = LinearSVC()
genre_clf.fit(X_train, y_train)
y_pred = genre_clf.predict(X_test)

# Classification report. factorize() and unique() both follow order of first
# appearance, so code i corresponds to genre_names[i]; passing `labels`
# explicitly keeps rows and names aligned even if a genre is missing from the
# test split.
genre_names = data['genre'].unique()
print('\t\t\t\t\tCLASSIFICATIION METRICS\n')
print(classification_report(y_test, y_pred,
                            labels=list(range(len(genre_names))),
                            target_names=genre_names))
					CLASSIFICATIION METRICS

              precision    recall  f1-score   support

     fantasy       0.77      0.77      0.77       233
     science       0.74      0.72      0.73       167
       crime       0.66      0.59      0.62       126
     history       0.76      0.77      0.76       145
      horror       0.66      0.59      0.62       132
    thriller       0.67      0.79      0.72       247
  psychology       0.79      0.76      0.78        25
     romance       0.63      0.34      0.44        35
      sports       0.86      0.83      0.85        30
      travel       0.89      1.00      0.94        25

    accuracy                           0.72      1165
   macro avg       0.74      0.72      0.72      1165
weighted avg       0.72      0.72      0.72      1165

In [25]:
# Lookup table with one row per genre, pairing each genre name with its integer code.
genre_id_df = data[['genre', 'genre_id']].drop_duplicates()
In [26]:
# Order the lookup by genre code and pass those codes to confusion_matrix so
# the matrix rows/columns and the tick labels are guaranteed to line up, even
# if some genre does not occur in the test split.
genre_lookup = genre_id_df.sort_values('genre_id')
conf_mat = confusion_matrix(y_test, y_pred, labels=genre_lookup.genre_id.values)
fig, ax = plt.subplots(figsize=(8, 8))
# Draw on the explicitly created axes instead of relying on the implicit
# "current axes" of the pyplot state machine.
sns.heatmap(conf_mat, annot=True, cmap="Blues", fmt='d',
            xticklabels=genre_lookup.genre.values,
            yticklabels=genre_lookup.genre.values,
            ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.set_title("CONFUSION MATRIX - LinearSVC\n", size=16);

将 embedding 向量保存到文件¶

In [27]:
import pickle
In [28]:
# Persist the computed embeddings so they can be reloaded without re-encoding
# every summary.
with open('embeddings.pkl', mode='wb') as pkl_out:
    pickle.dump(embeddings, pkl_out)
In [29]:
# Reload the embeddings written to embeddings.pkl.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('embeddings.pkl', mode='rb') as pkl_in:
    embeddings = pickle.load(pkl_in)

进行预测¶

In [34]:
from sentence_transformers import SentenceTransformer

# Initialize the SentenceTransformer model under a dedicated name. Earlier
# cells rebind the generic `model` to scikit-learn classifiers, so the encoder
# has to be re-created here and should not share that name again.
sentence_encoder = SentenceTransformer('all-mpnet-base-v2')
model = sentence_encoder  # backward-compatible alias for the original name


def getEmbedding(sentence):
    """Return the sentence-transformer embedding of ``sentence``.

    Args:
        sentence: Text to embed; it is forwarded unchanged to
            ``SentenceTransformer.encode``.

    Returns:
        Whatever ``encode`` returns for the input.
    """
    return sentence_encoder.encode(sentence)


# Example usage
test_summary = "Teenager Max McGrath (Ben Winchell) discovers that his body can generate the most powerful energy in the universe. Steel (Josh Brener) is a funny, slightly rebellious, techno-organic extraterrestrial who wants to utilize Max's skills. When the two meet, they combine together to become Max Steel, a superhero with unmatched strength on Earth. They soon learn to rely on each other when Max Steel must square off against an unstoppable enemy from another galaxy."
# The training summaries were lower-cased during preprocessing, so apply the
# same normalization before embedding text at prediction time.
test_summary_embed = getEmbedding(test_summary.lower())

 
In [35]:
# Refit on a fresh 75/25 split, this time with the genre names themselves as
# targets so that predict() returns readable labels instead of integer codes.
X_train, X_test, y_train, y_test = train_test_split(
    input_data,
    data['genre'],
    test_size=0.25,
    random_state=0,
)
trained_model = LinearSVC()
trained_model.fit(X_train, y_train)
In [36]:
# The single embedding is wrapped in a list to form a one-row batch; prints the
# predicted genre name for the example summary.
print(trained_model.predict([test_summary_embed]))
['science']
In [ ]: