In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(font='SimHei')
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('互联网防火墙数据集.csv')
In [2]:
df.head()
df.shape
df.info()
df.describe().T
df.isnull().sum()
df.duplicated().sum()
df.drop_duplicates(inplace=True)
df.duplicated().sum()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 65532 entries, 0 to 65531 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Source Port 65532 non-null int64 1 Destination Port 65532 non-null int64 2 NAT Source Port 65532 non-null int64 3 NAT Destination Port 65532 non-null int64 4 Action 65532 non-null object 5 Bytes 65532 non-null int64 6 Bytes Sent 65532 non-null int64 7 Bytes Received 65532 non-null int64 8 Packets 65532 non-null int64 9 Elapsed Time (sec) 65532 non-null int64 10 pkts_sent 65532 non-null int64 11 pkts_received 65532 non-null int64 dtypes: int64(11), object(1) memory usage: 6.0+ MB
Out[2]:
0
In [6]:
# df.head(5)
In [7]:
sns.countplot(data=df,x='Action')
plt.show()
In [11]:
df['Action'].value_counts()
df['Action'].replace(to_replace={'allow':0,'drop':1,'deny':2,'reset-both':3},inplace=True)
df['Action'].value_counts()
fig = plt.figure(figsize=(18,18))
sns.heatmap(df.corr(),vmax=1,annot=True,linewidths=0.5,cbar=False,cmap='YlGnBu',annot_kws={'fontsize':18})
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.title('各个因素之间的相关系数',fontsize=20)
plt.show()
In [12]:
# 准备用于创建模型的数据
from sklearn.model_selection import train_test_split
X = df.drop('Action', axis=1)
y = df["Action"]
# 划分数据集
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
print('训练集大小:',X_train.shape[0])
print('测试集大小:',X_test.shape[0])
训练集大小: 45736 测试集大小: 11434
In [13]:
# 构建逻辑回归模型
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train,y_train)
print('逻辑回归模型准确率:',lr.score(X_test,y_test))
逻辑回归模型准确率: 0.9816337239811089
In [14]:
# 构建KNN模型
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
print('KNN模型准确率:',knn.score(X_test,y_test))
KNN模型准确率: 0.993877907993703
In [15]:
# 构建决策树模型
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()
tree.fit(X_train,y_train)
print('决策树模型准确率:',tree.score(X_test,y_test))
决策树模型准确率: 0.9971138709113171
In [17]:
# 构建lightgbm模型
from lightgbm import LGBMClassifier
gbm = LGBMClassifier()
gbm.fit(X_train,y_train)
print('lightgbm模型准确率:',gbm.score(X_test,y_test))
[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001507 seconds. You can set `force_col_wise=true` to remove the overhead. [LightGBM] [Info] Total Bins 2805 [LightGBM] [Info] Number of data points in the train set: 45736, number of used features: 11 [LightGBM] [Info] Start training from score -0.421388 [LightGBM] [Info] Start training from score -1.597830 [LightGBM] [Info] Start training from score -1.962378 [LightGBM] [Info] Start training from score -6.902000 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf lightgbm模型准确率: 0.9983382893125765
In [18]:
# 构建xgboost模型
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train,y_train)
print('xgboost模型准确率:',xgb.score(X_test,y_test))
xgboost模型准确率: 0.998600664684275
In [19]:
from sklearn.metrics import r2_score,confusion_matrix,classification_report,auc,roc_curve
# 模型评估
y_pred = xgb.predict(X_test)
print('模型的R方值:',r2_score(y_test,y_pred))
print('模型混淆矩阵:','\n',confusion_matrix(y_test,y_pred))
print('模型分类报告:','\n',classification_report(y_test,y_pred))
模型的R方值: 0.9969034757638603
模型混淆矩阵:
[[7430 0 0 0]
[ 0 2381 0 0]
[ 1 7 1606 1]
[ 0 0 7 1]]
模型分类报告:
precision recall f1-score support
0 1.00 1.00 1.00 7430
1 1.00 1.00 1.00 2381
2 1.00 0.99 1.00 1615
3 0.50 0.12 0.20 8
accuracy 1.00 11434
macro avg 0.87 0.78 0.80 11434
weighted avg 1.00 1.00 1.00 11434
In [20]:
# 混淆矩阵可视化
fig = plt.figure(figsize=(8,8))
sns.heatmap(confusion_matrix(y_test,y_pred),vmax=1,annot=True,linewidths=0.5,cbar=False,cmap='YlGnBu',annot_kws={'fontsize':18})
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.title('XGBoost模型混淆矩阵',fontsize=20)
plt.show()
In [21]:
#打印特征重要性评分
feat_labels = X_train.columns[0:]
importances = xgb.feature_importances_
indices = np.argsort(importances)[::-1]
index_list = []
value_list = []
for f,j in zip(range(X_train.shape[1]),indices):
index_list.append(feat_labels[j])
value_list.append(importances[j])
print(f + 1, feat_labels[j], importances[j])
plt.figure(figsize=(10,6))
plt.barh(index_list[::-1],value_list[::-1])
plt.yticks(fontsize=12)
plt.title('各特征重要程度排序',fontsize=14)
plt.show()
11 Source Port 0.00026755093
In [22]:
# 模型预测
res = pd.DataFrame()
res['真实值'] = y_test
res['预测值'] = y_pred
res.sample(10)
Out[22]:
| 真实值 | 预测值 | |
|---|---|---|
| 4035 | 0 | 0 |
| 48232 | 0 | 0 |
| 55197 | 2 | 2 |
| 18073 | 1 | 1 |
| 9346 | 0 | 0 |
| 28252 | 0 | 0 |
| 48257 | 0 | 0 |
| 59796 | 0 | 0 |
| 61184 | 0 | 0 |
| 25549 | 1 | 1 |
In [ ]: