We have a dataset of tweets from Twitter, each labeled with the type of cyberbullying attack it represents. Our task is to preprocess the text with NLP techniques and build an ML model that classifies the target variable on this dataset with high accuracy.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv("cyberbullying_tweets.csv")
df.head()
| | tweet_text | cyberbullying_type |
|---|---|---|
| 0 | In other words #katandandre, your food was cra... | not_cyberbullying |
| 1 | Why is #aussietv so white? #MKR #theblock #ImA... | not_cyberbullying |
| 2 | @XochitlSuckkks a classy whore? Or more red ve... | not_cyberbullying |
| 3 | @Jason_Gio meh. :P thanks for the heads up, b... | not_cyberbullying |
| 4 | @RudhoeEnglish This is an ISIS account pretend... | not_cyberbullying |
df['tweet_text'].iloc[47000]
'Call me disgusting then! Because if any non BLACK person, especially a white one, calls me a nigger I’m going to slap the dogshit out of them too...it’s a new day for the Karens around the world.'
df.shape
(47692, 2)
# Check for null values
df.isna().sum()
tweet_text            0
cyberbullying_type    0
dtype: int64
# Check for duplicate rows
df.duplicated().sum()
36
df = df[~df.duplicated()]
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 47656 entries, 0 to 47691
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   tweet_text          47656 non-null  object
 1   cyberbullying_type  47656 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB
df['cyberbullying_type'].value_counts()
religion               7997
age                    7992
ethnicity              7959
gender                 7948
not_cyberbullying      7937
other_cyberbullying    7823
Name: cyberbullying_type, dtype: int64
# There are six label types; plot their distribution
sns.displot(data=df,x='cyberbullying_type')
plt.xticks(rotation=90)
The distribution shows that the dataset is very well balanced across the six classes, which is good news for training a classifier.
def remove_punctuation(text):
    '''Remove all punctuation from the text'''
    import string
    # Map every punctuation character to None,
    # which effectively deletes it
    translator = str.maketrans('', '', string.punctuation)
    # Return the text with punctuation stripped
    return text.translate(translator)
# Inspect which characters count as punctuation
import string
string.punctuation
'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
# Apply the function to remove punctuation
df['tweet_text'] = df['tweet_text'].apply(remove_punctuation)
df.head(10)
| | tweet_text | cyberbullying_type |
|---|---|---|
| 0 | In other words katandandre your food was crapi... | not_cyberbullying |
| 1 | Why is aussietv so white MKR theblock ImACeleb... | not_cyberbullying |
| 2 | XochitlSuckkks a classy whore Or more red velv... | not_cyberbullying |
| 3 | JasonGio meh P thanks for the heads up but no... | not_cyberbullying |
| 4 | RudhoeEnglish This is an ISIS account pretendi... | not_cyberbullying |
| 5 | Raja5aab Quickieleaks Yes the test of god is t... | not_cyberbullying |
| 6 | Itu sekolah ya bukan tempat bully Ga jauh kaya... | not_cyberbullying |
| 7 | Karma I hope it bites Kat on the butt She is j... | not_cyberbullying |
| 8 | stockputout everything but mostly my priest | not_cyberbullying |
| 9 | Rebecca Black Drops Out of School Due to Bullying | not_cyberbullying |
df['tweet_text'].iloc[29966]
'SERIOUSLY not ANOTHER instant restaurant round 😡 MKR'
# Function to remove emojis from the text
import re
def remove_emojis(text):
    '''Strip emoji and pictograph characters by Unicode range'''
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # box drawing, arrows, misc symbols
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2-\U0001F251"  # enclosed characters
        u"\U0001f926-\U0001f937"  # gesture emojis
        u"\U00010000-\U0010ffff"  # supplementary planes
        u"\u2640-\u2642"          # gender signs
        u"\u2600-\u2B55"          # misc symbols
        u"\u200d"                 # zero-width joiner
        u"\u23cf"                 # eject symbol
        u"\u23e9"                 # fast-forward
        u"\u231a"                 # watch
        u"\ufe0f"                 # variation selector
        u"\u3030"                 # wavy dash
        "]+", re.UNICODE)
    return re.sub(emoj, '', text)
# Apply the function to remove emojis
df['tweet_text'] = df['tweet_text'].apply(remove_emojis)
df['tweet_text'].iloc[29966]
'SERIOUSLY not ANOTHER instant restaurant round MKR'
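For reference, the third-party emoji package offers a shorter alternative to the hand-built regex. This is a sketch only, assuming emoji >= 2.0 is installed; it is not used in this pipeline.

# Hypothetical alternative using the emoji package (pip install emoji)
import emoji
def remove_emojis_alt(text):
    # replace_emoji removes every character the library recognizes as an emoji
    return emoji.replace_emoji(text, replace='')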
Removing stopwords
# Download the stopword list
import nltk
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
True
# Load the English stopwords from the nltk corpus
from nltk.corpus import stopwords
sw = stopwords.words('english')
# Display the stopwords
np.array(sw)
array(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"], dtype='<U10')
# Negation words carry signal for abuse detection, so take them out of
# the stopword list (i.e. keep them in the tweets)
elem_remove = ['no', 'nor', 'not',"aren't", 'couldn',
"couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't",
'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma',
'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",
'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't",
'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
for elem in elem_remove:
    sw.remove(elem)
np.array(sw)
array(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren'], dtype='<U10')
def remove_stopwords(text):
    '''Remove stopwords and lowercase the remaining words'''
    # Lowercase each word and keep it only if it is not a stopword
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    # Join the remaining words back together with spaces
    return " ".join(text)
# Apply the function to remove stopwords
df['tweet_text'] = df['tweet_text'].apply(remove_stopwords)
df.head(10)
| | tweet_text | cyberbullying_type |
|---|---|---|
| 0 | words katandandre food crapilicious mkr | not_cyberbullying |
| 1 | aussietv white mkr theblock imacelebrityau tod... | not_cyberbullying |
| 2 | xochitlsuckkks classy whore red velvet cupcakes | not_cyberbullying |
| 3 | jasongio meh p thanks heads not concerned anot... | not_cyberbullying |
| 4 | rudhoeenglish isis account pretending kurdish ... | not_cyberbullying |
| 5 | raja5aab quickieleaks yes test god good bad in... | not_cyberbullying |
| 6 | itu sekolah ya bukan tempat bully ga jauh kaya... | not_cyberbullying |
| 7 | karma hope bites kat butt nasty mkr | not_cyberbullying |
| 8 | stockputout everything mostly priest | not_cyberbullying |
| 9 | rebecca black drops school due bullying | not_cyberbullying |
df['tweet_text'].iloc[29966]
'seriously not another instant restaurant round mkr'
# A further cleaning pass: strip mentions, URLs, non-ASCII characters, digits, and very long tokens
def clean(text):
    text = text.replace('\r', '').replace('\n', ' ')
    # Remove @mentions and http(s) URLs
    text = re.sub(r"(?:@|https?://)\S+", "", text)
    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7f]', '', text)
    # Remove www- and http-style URLs
    text = re.sub(r"((www\.\S+)|(http\S+))", "", text)
    # Remove digits
    text = re.sub('[0-9]+', '', text)
    # Keep only words shorter than 14 characters
    text = [word for word in text.split() if len(word) < 14]
    text = " ".join(text)
    return text
# Apply the cleaning function
df['tweet_text'] = df['tweet_text'].apply(clean)
df.head(10)
| | tweet_text | cyberbullying_type |
|---|---|---|
| 0 | words katandandre food crapilicious mkr | not_cyberbullying |
| 1 | aussietv white mkr theblock today sunrise stud... | not_cyberbullying |
| 2 | classy whore red velvet cupcakes | not_cyberbullying |
| 3 | jasongio meh p thanks heads not concerned anot... | not_cyberbullying |
| 4 | rudhoeenglish isis account pretending kurdish ... | not_cyberbullying |
| 5 | rajaaab quickieleaks yes test god good bad ind... | not_cyberbullying |
| 6 | itu sekolah ya bukan tempat bully ga jauh kaya... | not_cyberbullying |
| 7 | karma hope bites kat butt nasty mkr | not_cyberbullying |
| 8 | stockputout everything mostly priest | not_cyberbullying |
| 9 | rebecca black drops school due bullying | not_cyberbullying |
# Use regular expressions to expand common English contractions.
# Note that remove_punctuation already stripped apostrophes earlier in this
# pipeline, so these patterns rarely match here; in a revised pipeline,
# decontraction should run before punctuation removal.
def decontract(text):
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    return text
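A quick check on a hypothetical raw string shows the intended behavior; it only works while apostrophes are still present:

# Illustrative only: apostrophes were already stripped from the tweets above
decontract("they're saying you can't win")   # -> 'they are saying you can not win'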
df['tweet_text'] = df['tweet_text'].apply(decontract)
# Collapse any run of a repeated character down to a single occurrence
def clean_repeating_characters(text):
    # \1 is a backreference to the character captured by (.)
    return re.sub(r'(.)\1+', r'\1', text)
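Note that this rule is aggressive: it also collapses legitimate double letters, as these illustrative strings show.

clean_repeating_characters("coooool")   # -> 'col'
clean_repeating_characters("good")      # -> 'god' (double letters collapse too)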
df['tweet_text'] = df['tweet_text'].apply(lambda x: clean_repeating_characters(x))
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
df['tweet_text'] = df['tweet_text'].apply(tokenizer.tokenize)
df['tweet_text'][100]
['mirebotan', 'ramiallolah', 'support', 'dont', 'like', 'people', 'like', 'rami', 'complain', 'things', 'supports', 'done', 'others']
import nltk
st = nltk.PorterStemmer()
def text_stemming(text):
    '''Apply Porter stemming to each token'''
    text = [st.stem(word) for word in text]
    return text
df['tweet_text'] = df['tweet_text'].apply(lambda x: text_stemming(x))
df['tweet_text'][100]
['mirebotan', 'ramiallolah', 'support', 'dont', 'like', 'peopl', 'like', 'rami', 'complain', 'thing', 'support', 'done', 'other']
# Lemmatization (applied here on top of the stemmed tokens, so its effect is
# limited; typically you would pick either stemming or lemmatization)
nltk.download('wordnet')
nltk.download('omw-1.4')
lm = nltk.WordNetLemmatizer()
def text_lemmatization(text):
    '''Apply WordNet lemmatization to each token'''
    text = [lm.lemmatize(word) for word in text]
    return text
df['tweet_text'] = df['tweet_text'].apply(lambda x: text_lemmatization(x))
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\1\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\1\AppData\Roaming\nltk_data...
df['tweet_text'].head()
0             [word, katandandr, food, crapilici, mkr]
1    [aussietv, white, mkr, theblock, today, sunris...
2                 [classi, whore, red, velvet, cupcak]
3    [jasongio, meh, p, thank, head, not, concern, ...
4    [rudhoeenglish, isi, account, pretend, kurdish...
Name: tweet_text, dtype: object
# Build a dataframe that excludes not_cyberbullying tweets
new_df = df.copy()
new_df = new_df[new_df['cyberbullying_type'] != 'not_cyberbullying']
word_df = new_df['tweet_text'].apply(lambda x: " ".join(x))
word_df.head()
7945    rape realzvasiyana nema joke drunk gay lesbian...
7946    never saw celebr say anyth like obama b maher ...
7947          mean he gay use gender slur make rape joke
7948                rt raulnovoa alexaim mecaesm feminazi
7949    rape rape fact read one post guy get rape comm...
Name: tweet_text, dtype: object
# Draw a word cloud
from wordcloud import WordCloud
plt.figure(figsize=(20, 20))
wc = WordCloud(max_words=1000, width=1600, height=800,
               collocations=False).generate(' '.join(word_df))
plt.imshow(wc)
plt.axis('off')
(Word cloud of the most frequent terms in the cyberbullying tweets)
# Back in the full dataframe, join the token lists back into strings
df['tweet_text'] = df['tweet_text'].apply(lambda x : " ".join(x))
df.head()
| | tweet_text | cyberbullying_type |
|---|---|---|
| 0 | word katandandr food crapilici mkr | not_cyberbullying |
| 1 | aussietv white mkr theblock today sunris studi... | not_cyberbullying |
| 2 | classi whore red velvet cupcak | not_cyberbullying |
| 3 | jasongio meh p thank head not concern anoth an... | not_cyberbullying |
| 4 | rudhoeenglish isi account pretend kurdish acco... | not_cyberbullying |
Check again for duplicate entries, since the cleaning steps may have collapsed different raw tweets into identical strings.
df.duplicated().sum()
780
df = df[~df.duplicated()]
df.duplicated().sum()
0
df['cyberbullying_type'].value_counts()
religion               7953
age                    7895
ethnicity              7834
not_cyberbullying      7803
gender                 7740
other_cyberbullying    7651
Name: cyberbullying_type, dtype: int64
# Create a sentiment feature (a copy of cyberbullying_type that keeps the text labels)
df['sentiment'] = df['cyberbullying_type']
# Map the text labels to integer codes
df['cyberbullying_type'] = df['cyberbullying_type'].replace({'not_cyberbullying':0,'religion':1,'age':2,'gender':3,'ethnicity':4,'other_cyberbullying':5})
df.head()
| | tweet_text | cyberbullying_type | sentiment |
|---|---|---|---|
| 0 | word katandandr food crapilici mkr | 0 | not_cyberbullying |
| 1 | aussietv white mkr theblock today sunris studi... | 0 | not_cyberbullying |
| 2 | classi whore red velvet cupcak | 0 | not_cyberbullying |
| 3 | jasongio meh p thank head not concern anoth an... | 0 | not_cyberbullying |
| 4 | rudhoeenglish isi account pretend kurdish acco... | 0 | not_cyberbullying |
df['cyberbullying_type'].value_counts()
1    7953
2    7895
4    7834
0    7803
3    7740
5    7651
Name: cyberbullying_type, dtype: int64
# Apply CountVectorizer followed by TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
cv = CountVectorizer()
X_cv = cv.fit_transform(df['tweet_text'])
tf_transformer = TfidfTransformer(use_idf=True).fit(X_cv)
X_tf = tf_transformer.transform(X_cv)
X_tf
<46876x44700 sparse matrix of type '<class 'numpy.float64'>' with 602387 stored elements in Compressed Sparse Row format>
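As an aside, the CountVectorizer + TfidfTransformer pair can be collapsed into a single object; with default settings, scikit-learn documents TfidfVectorizer as equivalent. A minimal sketch (tfidf_vec and X_tf_alt are names introduced here):

# One-step equivalent of CountVectorizer followed by TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer()
X_tf_alt = tfidf_vec.fit_transform(df['tweet_text'])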
# train and test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_tf, df['cyberbullying_type'], test_size=0.20, stratify=df['cyberbullying_type'], random_state=10)
For this multiclass classification task, three commonly used ML models are:
1. Naive Bayes
2. K-Nearest Neighbors
3. Random Forest
We fit each model in turn and compare the results to pick the best one (a reusable evaluation helper is sketched below; the per-model cells that follow are kept as they ran).
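Since the same evaluation block recurs for every model, it could be wrapped in a small helper; a sketch (evaluate_model is a name introduced here):

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

def evaluate_model(name, model, X_test, y_test):
    # Predict on the held-out set and print the standard reports
    y_pred = model.predict(X_test)
    print(f"Evaluation of {name} model\n")
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
    print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
    print(f"Accuracy: {round(accuracy_score(y_test, y_pred), 2)}")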
#Fitting the model
from sklearn.naive_bayes import MultinomialNB
nb_clf = MultinomialNB()
nb_clf.fit(X_train, y_train)
MultinomialNB()
#Evaluating the model
print("Evaluation of Naive Bayes model\n\n")
y_pred_nb = nb_clf.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
result = confusion_matrix(y_test, y_pred_nb)
print(f"Confusion Matrix:\n{result}")
result1 = classification_report(y_test, y_pred_nb)
print(f"Classification Report:\n{result1}")
result2 = accuracy_score(y_test,y_pred_nb)
print(f"Accuracy:{round(result2,2)}\n\n")
Evaluation of Naive Bayes model

Confusion Matrix:
[[ 512  206  315  119   70  339]
 [   8 1555   11    9    7    1]
 [   1   11 1559    2    5    1]
 [  72   36   48 1332   32   28]
 [   3   55   88   10 1403    8]
 [ 151  105  263  153  148  710]]
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.33      0.44      1561
           1       0.79      0.98      0.87      1591
           2       0.68      0.99      0.81      1579
           3       0.82      0.86      0.84      1548
           4       0.84      0.90      0.87      1567
           5       0.65      0.46      0.54      1530

    accuracy                           0.75      9376
   macro avg       0.75      0.75      0.73      9376
weighted avg       0.75      0.75      0.73      9376

Accuracy:0.75
#Fitting the model
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(X_train, y_train)
KNeighborsClassifier()
#Evaluating the model
print("Evaluation of K Nearest Neighbor model\n\n")
y_pred_knn = knn_clf.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
result = confusion_matrix(y_test, y_pred_knn)
print(f"Confusion Matrix:\n{result}")
result1 = classification_report(y_test, y_pred_knn)
print(f"Classification Report:\n{result1}")
result2 = accuracy_score(y_test,y_pred_knn)
print(f"Accuracy:{round(result2,2)}\n\n")
Evaluation of K Nearest Neighbor model

Confusion Matrix:
[[1469   10    3   19    1   59]
 [1502   86    0    1    0    2]
 [1434    0  139    0    1    5]
 [1329    0    1  212    3    3]
 [1243    0    1    0  317    6]
 [1435    0    3   21    3   68]]
Classification Report:
              precision    recall  f1-score   support

           0       0.17      0.94      0.29      1561
           1       0.90      0.05      0.10      1591
           2       0.95      0.09      0.16      1579
           3       0.84      0.14      0.24      1548
           4       0.98      0.20      0.34      1567
           5       0.48      0.04      0.08      1530

    accuracy                           0.24      9376
   macro avg       0.72      0.24      0.20      9376
weighted avg       0.72      0.24      0.20      9376

Accuracy:0.24
An accuracy of 0.24 means the K-Nearest Neighbors model has essentially failed here. With six target classes, random guessing on this balanced test set would score about 0.17, so KNN barely beats chance, and the confusion matrix shows it dumps almost every tweet into class 0. This is a known weakness of Euclidean-distance KNN on high-dimensional, sparse tf-idf vectors.
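A variant worth trying (not evaluated in this run) is cosine distance, which generally suits sparse tf-idf vectors much better than the default Euclidean metric:

# Hypothetical follow-up experiment, not run here
from sklearn.neighbors import KNeighborsClassifier
knn_cos = KNeighborsClassifier(n_neighbors=5, metric='cosine')
knn_cos.fit(X_train, y_train)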
#Fitting the model
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)
RandomForestClassifier()
#Evaluating the model
print("Evaluation of Random Forest model\n\n")
y_pred_rf = rf_clf.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
result = confusion_matrix(y_test, y_pred_rf)
print(f"Confusion Matrix:\n{result}")
result1 = classification_report(y_test, y_pred_rf)
print(f"Classification Report:\n{result1}")
result2 = accuracy_score(y_test,y_pred_rf)
print(f"Accuracy:{round(result2,2)}\n\n")
Evaluation of Random Forest model

Confusion Matrix:
[[ 790   66   31   61    9  604]
 [  33 1522    0    6    1   29]
 [  19    1 1546    1    0   12]
 [ 124    6    4 1293    5  116]
 [   8    7    0    2 1532   18]
 [ 442   10   19   84   11  964]]
Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.51      0.53      1561
           1       0.94      0.96      0.95      1591
           2       0.97      0.98      0.97      1579
           3       0.89      0.84      0.86      1548
           4       0.98      0.98      0.98      1567
           5       0.55      0.63      0.59      1530

    accuracy                           0.82      9376
   macro avg       0.82      0.81      0.81      9376
weighted avg       0.82      0.82      0.82      9376

Accuracy:0.82
The best model is therefore the Random Forest classifier, with the strongest precision, recall, and F1 scores and an overall accuracy of 82%. Its main remaining weakness is separating not_cyberbullying (class 0) from other_cyberbullying (class 5), where F1 stays at 0.53 and 0.59.
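To close the loop, a raw tweet can be pushed through the same preprocessing chain and classified with the fitted vectorizer and Random Forest. This is a minimal sketch assuming all the objects defined above are still in scope; predict_tweet and label_names are names introduced here.

# Inverse of the mapping applied to cyberbullying_type above
label_names = {0: 'not_cyberbullying', 1: 'religion', 2: 'age',
               3: 'gender', 4: 'ethnicity', 5: 'other_cyberbullying'}

def predict_tweet(raw_text):
    # Apply the preprocessing steps in the same order as above
    text = remove_punctuation(raw_text)
    text = remove_emojis(text)
    text = remove_stopwords(text)
    text = clean(text)
    text = decontract(text)
    text = clean_repeating_characters(text)
    tokens = text_lemmatization(text_stemming(tokenizer.tokenize(text)))
    # Vectorize with the fitted CountVectorizer + TfidfTransformer and classify
    vec = tf_transformer.transform(cv.transform([" ".join(tokens)]))
    return label_names[rf_clf.predict(vec)[0]]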