In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
fake = pd.read_csv("Fake.csv")
fake['label'] = 0 # 假新闻标记为0
true = pd.read_csv("True.csv")
true['label'] = 1 # 真新闻标记为1
df = pd.concat([fake, true], axis=0) # 合并真假数据集
df.head()
Out[1]:
| title | text | subject | date | label | |
|---|---|---|---|---|---|
| 0 | Donald Trump Sends Out Embarrassing New Year’... | Donald Trump just couldn t wish all Americans ... | News | December 31, 2017 | 0 |
| 1 | Drunk Bragging Trump Staffer Started Russian ... | House Intelligence Committee Chairman Devin Nu... | News | December 31, 2017 | 0 |
| 2 | Sheriff David Clarke Becomes An Internet Joke... | On Friday, it was revealed that former Milwauk... | News | December 30, 2017 | 0 |
| 3 | Trump Is So Obsessed He Even Has Obama’s Name... | On Christmas day, Donald Trump announced that ... | News | December 29, 2017 | 0 |
| 4 | Pope Francis Just Called Out Donald Trump Dur... | Pope Francis used his annual Christmas Day mes... | News | December 25, 2017 | 0 |
In [2]:
#查看数据大小
df.shape
Out[2]:
(44898, 5)
In [3]:
#查看数据基本信息
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 44898 entries, 0 to 21416 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 title 44898 non-null object 1 text 44898 non-null object 2 subject 44898 non-null object 3 date 44898 non-null object 4 label 44898 non-null int64 dtypes: int64(1), object(4) memory usage: 2.1+ MB
In [4]:
#查看数据的描述性统计
df.describe(include='O')
Out[4]:
| title | text | subject | date | |
|---|---|---|---|---|
| count | 44898 | 44898 | 44898 | 44898 |
| unique | 38729 | 38646 | 8 | 2397 |
| top | Factbox: Trump fills top jobs for his administ... | politicsNews | December 20, 2017 | |
| freq | 14 | 627 | 11272 | 182 |
数据预处理¶
统计数据缺失值情况¶
In [6]:
df.isnull().sum()
#可以发现原始数据集并不存在缺失值
Out[6]:
title 0 text 0 subject 0 date 0 label 0 dtype: int64
In [7]:
# 统计数据重复值情况
df.duplicated().sum()
Out[7]:
209
In [8]:
# 删除重复值
df.drop_duplicates(inplace=True)
df.duplicated().sum()
Out[8]:
0
数据可视化¶
In [9]:
# 可视化真假新闻个数
sns.countplot(x='label' ,data=df)
plt.show()
In [10]:
# 可视化各类别新闻个数
plt.figure(figsize=(12,6))
sns.countplot(x='subject' ,data=df)
plt.show()
In [11]:
# 对text变量进行处理:小写字母转换 删除停止词 删除标点符号 删除标签 去除特殊字符
import re
from nltk.corpus import stopwords
import string
def clean_text(text):
'''Make text lowercase, remove text in square brackets,remove links,remove punctuation and remove words containing numbers.'''
text = str(text).lower()
text = re.sub('\[.*?\]', '', text)
text = re.sub('https?://\S+|www\.\S+', '', text)
text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)
text = re.sub('<.*?>+', '', text)
text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
text = re.sub('\n', '', text)
text = re.sub('\w*\d\w*', '', text)
punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
for p in punctuations:
text = text.replace(p, '') # 删除标点符号
return text
def remove_stopword(x):
return [y for y in x if y not in stopwords.words('english')]
df['text'] = df['text'].apply(lambda x: clean_text(x))
In [15]:
import nltk
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\黄清枫\AppData\Roaming\nltk_data... [nltk_data] Unzipping corpora\stopwords.zip.
Out[15]:
True
In [16]:
from nltk.tokenize import word_tokenize
def remove_stopwords_from_sentence(sentence):
stop_words = set(stopwords.words('english'))
tokens = word_tokenize(sentence)
filtered_tokens = [token for token in tokens if token not in stop_words]
return ' '.join(filtered_tokens)
# 将函数应用于'sentence'列
df['text'] = df['text'].apply(remove_stopwords_from_sentence)
df.head()
Out[16]:
| title | text | subject | date | label | |
|---|---|---|---|---|---|
| 0 | Donald Trump Sends Out Embarrassing New Year’... | donald trump wish americans happy new year lea... | News | December 31, 2017 | 0 |
| 1 | Drunk Bragging Trump Staffer Started Russian ... | house intelligence committee chairman devin nu... | News | December 31, 2017 | 0 |
| 2 | Sheriff David Clarke Becomes An Internet Joke... | friday revealed former milwaukee sheriff david... | News | December 30, 2017 | 0 |
| 3 | Trump Is So Obsessed He Even Has Obama’s Name... | christmas day donald trump announced would bac... | News | December 29, 2017 | 0 |
| 4 | Pope Francis Just Called Out Donald Trump Dur... | pope francis used annual christmas day message... | News | December 25, 2017 | 0 |
In [14]:
# 让我们用词云可视化真假新闻关键词
from wordcloud import WordCloud
text = " ".join(i for i in df[df['label']==1]['text'])
wordcloud = WordCloud( background_color="white").generate(text)
plt.figure( figsize=(15,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('wordcloud for True News')
plt.show()
In [17]:
text = " ".join(i for i in df[df['label']==0]['text'])
wordcloud = WordCloud( background_color="white").generate(text)
plt.figure( figsize=(15,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('wordcloud for Fake News')
plt.show()
特征工程¶
In [18]:
#首先准备建模用到的数据,自变量X和因变量y,接着拆分数据集为训练集和测试集
from sklearn.model_selection import train_test_split
X = df['text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
构建模型¶
CountVectorizer¶
In [19]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix,classification_report,ConfusionMatrixDisplay,accuracy_score
# Count_Vectorizer
# 创建管道
pipe = Pipeline([('cv', CountVectorizer()),('svc', MultinomialNB())])
# 拟合管道
pipe.fit(X_train, y_train)
# 创建分类报告并绘制混淆矩阵
preds = pipe.predict(X_test)
print(classification_report(y_test, preds))
ConfusionMatrixDisplay.from_estimator(pipe, X_test, y_test)
# Count Vectorizer给出了96%的准确率
precision recall f1-score support
0 0.96 0.95 0.96 4738
1 0.95 0.96 0.95 4200
accuracy 0.96 8938
macro avg 0.95 0.96 0.96 8938
weighted avg 0.96 0.96 0.96 8938
Out[19]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1b567e83100>
TF-IDF¶
In [20]:
# 创建管道
pipe = Pipeline([('tfidf', TfidfVectorizer()),('svc', MultinomialNB())])
# 拟合管道
pipe.fit(X_train, y_train)
# 创建分类报告并绘制混淆矩阵
preds = pipe.predict(X_test)
print(classification_report(y_test, preds))
ConfusionMatrixDisplay.from_estimator(pipe, X_test, y_test)
precision recall f1-score support
0 0.95 0.94 0.94 4738
1 0.93 0.94 0.94 4200
accuracy 0.94 8938
macro avg 0.94 0.94 0.94 8938
weighted avg 0.94 0.94 0.94 8938
Out[20]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1b553fe7e50>
In [21]:
predict_train = pipe.fit(X_train, y_train).predict(X_train)
# 训练数据集的准确得分
accuracy_train = accuracy_score(y_train,predict_train)
print('accuracy_score on train dataset : ', accuracy_train)
# 在测试数据集上预测目标
predict_test = pipe.predict(X_test)
#测试数据集上的准确度得分
accuracy_test = accuracy_score(y_test,predict_test)
print('accuracy_score on test dataset : ', accuracy_test)
# 与TF-IDF(0.94)相比,CountVectorizer提供了更好的精度(0.96)
accuracy_score on train dataset : 0.9471623171379822 accuracy_score on test dataset : 0.9407026180353547
In [ ]: