import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
fake = pd.read_csv("Fake.csv")
fake['label'] = 0  # 假新闻标记为0
true = pd.read_csv("True.csv")
true['label'] = 1  # 真新闻标记为1
df = pd.concat([fake, true], axis=0)  # 合并真假数据集
df.head()

#查看数据大小
df.shape

(44898, 5)

#查看数据基本信息
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44898 entries, 0 to 21416
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.1+ MB

#查看数据的描述性统计
df.describe(include='O')

df.isnull().sum()
#可以发现原始数据集并不存在缺失值

title      0
text       0
subject    0
date       0
label      0
dtype: int64

# 统计数据重复值情况
df.duplicated().sum()

209

# 删除重复值
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

# 可视化真假新闻个数
sns.countplot(x='label' ,data=df)
plt.show()

# 可视化各类别新闻个数
plt.figure(figsize=(12,6))
sns.countplot(x='subject' ,data=df)
plt.show()

# 对text变量进行处理：小写字母转换   删除停止词   删除标点符号   删除标签   去除特殊字符
import re
from nltk.corpus import stopwords
import string


def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p, '')  # 删除标点符号
        return text
def remove_stopword(x):
    return [y for y in x if y not in stopwords.words('english')]


df['text'] = df['text'].apply(lambda x: clean_text(x))

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\黄清枫\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.

True

from nltk.tokenize import word_tokenize

def remove_stopwords_from_sentence(sentence):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(sentence)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(filtered_tokens)
# 将函数应用于'sentence'列
df['text'] = df['text'].apply(remove_stopwords_from_sentence)
df.head()

# 让我们用词云可视化真假新闻关键词
from wordcloud import WordCloud

text = " ".join(i for i in df[df['label']==1]['text'])
wordcloud = WordCloud( background_color="white").generate(text)
plt.figure( figsize=(15,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('wordcloud for True News')
plt.show()

text = " ".join(i for i in df[df['label']==0]['text'])
wordcloud = WordCloud( background_color="white").generate(text)
plt.figure( figsize=(15,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('wordcloud for Fake News')
plt.show()

#首先准备建模用到的数据，自变量X和因变量y，接着拆分数据集为训练集和测试集
from sklearn.model_selection import train_test_split
X = df['text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix,classification_report,ConfusionMatrixDisplay,accuracy_score
# Count_Vectorizer
# 创建管道 
pipe = Pipeline([('cv', CountVectorizer()),('svc', MultinomialNB())])

# 拟合管道 
pipe.fit(X_train, y_train)
# 创建分类报告并绘制混淆矩阵
preds = pipe.predict(X_test)
print(classification_report(y_test, preds))
ConfusionMatrixDisplay.from_estimator(pipe, X_test, y_test)
# Count Vectorizer给出了96%的准确率

              precision    recall  f1-score   support

           0       0.96      0.95      0.96      4738
           1       0.95      0.96      0.95      4200

    accuracy                           0.96      8938
   macro avg       0.95      0.96      0.96      8938
weighted avg       0.96      0.96      0.96      8938

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1b567e83100>

# 创建管道
pipe = Pipeline([('tfidf', TfidfVectorizer()),('svc', MultinomialNB())])
# 拟合管道 
pipe.fit(X_train, y_train)
# 创建分类报告并绘制混淆矩阵
preds = pipe.predict(X_test)
print(classification_report(y_test, preds))
ConfusionMatrixDisplay.from_estimator(pipe, X_test, y_test)

              precision    recall  f1-score   support

           0       0.95      0.94      0.94      4738
           1       0.93      0.94      0.94      4200

    accuracy                           0.94      8938
   macro avg       0.94      0.94      0.94      8938
weighted avg       0.94      0.94      0.94      8938

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1b553fe7e50>

predict_train = pipe.fit(X_train, y_train).predict(X_train)
# 训练数据集的准确得分
accuracy_train = accuracy_score(y_train,predict_train)
print('accuracy_score on train dataset : ', accuracy_train)
# 在测试数据集上预测目标
predict_test = pipe.predict(X_test)
#测试数据集上的准确度得分
accuracy_test = accuracy_score(y_test,predict_test)
print('accuracy_score on test dataset : ', accuracy_test)
# 与TF-IDF(0.94)相比，CountVectorizer提供了更好的精度(0.96)

accuracy_score on train dataset :  0.9471623171379822
accuracy_score on test dataset :  0.9407026180353547

	title	text	subject	date
0	Donald Trump Sends Out Embarrassing New Year’...	Donald Trump just couldn t wish all Americans ...	News	December 31, 2017
1	Drunk Bragging Trump Staffer Started Russian ...	House Intelligence Committee Chairman Devin Nu...	News	December 31, 2017
2	Sheriff David Clarke Becomes An Internet Joke...	On Friday, it was revealed that former Milwauk...	News	December 30, 2017
3	Trump Is So Obsessed He Even Has Obama’s Name...	On Christmas day, Donald Trump announced that ...	News	December 29, 2017
4	Pope Francis Just Called Out Donald Trump Dur...	Pope Francis used his annual Christmas Day mes...	News	December 25, 2017

	title	text	subject	date
0	Donald Trump Sends Out Embarrassing New Year’...	donald trump wish americans happy new year lea...	News	December 31, 2017
1	Drunk Bragging Trump Staffer Started Russian ...	house intelligence committee chairman devin nu...	News	December 31, 2017
2	Sheriff David Clarke Becomes An Internet Joke...	friday revealed former milwaukee sheriff david...	News	December 30, 2017
3	Trump Is So Obsessed He Even Has Obama’s Name...	christmas day donald trump announced would bac...	News	December 29, 2017
4	Pope Francis Just Called Out Donald Trump Dur...	pope francis used annual christmas day message...	News	December 25, 2017

📘 朴素贝叶斯算法构建真假新闻分类模型/News.ipynb

数据预处理¶

统计数据缺失值情况¶

数据可视化¶

特征工程¶

构建模型¶

CountVectorizer¶

TF-IDF¶

	title	text	subject	date
count	44898	44898	44898	44898
unique	38729	38646	8	2397
top	Factbox: Trump fills top jobs for his administ...		politicsNews	December 20, 2017
freq	14	627	11272	182