import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(font='SimHei')
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('互联网防火墙数据集.csv')

df.head()
df.shape
df.info()
df.describe().T
df.isnull().sum()
df.duplicated().sum()
df.drop_duplicates(inplace=True)
df.duplicated().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65532 entries, 0 to 65531
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Source Port           65532 non-null  int64 
 1   Destination Port      65532 non-null  int64 
 2   NAT Source Port       65532 non-null  int64 
 3   NAT Destination Port  65532 non-null  int64 
 4   Action                65532 non-null  object
 5   Bytes                 65532 non-null  int64 
 6   Bytes Sent            65532 non-null  int64 
 7   Bytes Received        65532 non-null  int64 
 8   Packets               65532 non-null  int64 
 9   Elapsed Time (sec)    65532 non-null  int64 
 10  pkts_sent             65532 non-null  int64 
 11  pkts_received         65532 non-null  int64 
dtypes: int64(11), object(1)
memory usage: 6.0+ MB

0

# df.head(5)

sns.countplot(data=df,x='Action')
plt.show()

df['Action'].value_counts()
df['Action'].replace(to_replace={'allow':0,'drop':1,'deny':2,'reset-both':3},inplace=True)
df['Action'].value_counts()
fig = plt.figure(figsize=(18,18))
sns.heatmap(df.corr(),vmax=1,annot=True,linewidths=0.5,cbar=False,cmap='YlGnBu',annot_kws={'fontsize':18})
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.title('各个因素之间的相关系数',fontsize=20)
plt.show()

# 准备用于创建模型的数据
from sklearn.model_selection import train_test_split
X = df.drop('Action', axis=1)
y = df["Action"]
# 划分数据集
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
print('训练集大小：',X_train.shape[0])
print('测试集大小：',X_test.shape[0])

训练集大小： 45736
测试集大小： 11434

# 构建逻辑回归模型
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train,y_train)
print('逻辑回归模型准确率：',lr.score(X_test,y_test))

逻辑回归模型准确率： 0.9816337239811089

# 构建KNN模型
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
print('KNN模型准确率：',knn.score(X_test,y_test))

KNN模型准确率： 0.993877907993703

# 构建决策树模型
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()
tree.fit(X_train,y_train)
print('决策树模型准确率：',tree.score(X_test,y_test))

决策树模型准确率： 0.9971138709113171

# 构建lightgbm模型
from lightgbm import LGBMClassifier
gbm = LGBMClassifier()
gbm.fit(X_train,y_train)
print('lightgbm模型准确率：',gbm.score(X_test,y_test))

[LightGBM] [Warning] Found whitespace in feature_names, replace with underlines
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001507 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 45736, number of used features: 11
[LightGBM] [Info] Start training from score -0.421388
[LightGBM] [Info] Start training from score -1.597830
[LightGBM] [Info] Start training from score -1.962378
[LightGBM] [Info] Start training from score -6.902000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
lightgbm模型准确率： 0.9983382893125765

# 构建xgboost模型
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train,y_train)
print('xgboost模型准确率：',xgb.score(X_test,y_test))

xgboost模型准确率： 0.998600664684275

from sklearn.metrics import r2_score,confusion_matrix,classification_report,auc,roc_curve
# 模型评估
y_pred = xgb.predict(X_test)
print('模型的R方值：',r2_score(y_test,y_pred))
print('模型混淆矩阵:','\n',confusion_matrix(y_test,y_pred))
print('模型分类报告:','\n',classification_report(y_test,y_pred))

模型的R方值： 0.9969034757638603
模型混淆矩阵: 
 [[7430    0    0    0]
 [   0 2381    0    0]
 [   1    7 1606    1]
 [   0    0    7    1]]
模型分类报告: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7430
           1       1.00      1.00      1.00      2381
           2       1.00      0.99      1.00      1615
           3       0.50      0.12      0.20         8

    accuracy                           1.00     11434
   macro avg       0.87      0.78      0.80     11434
weighted avg       1.00      1.00      1.00     11434

# 混淆矩阵可视化
fig = plt.figure(figsize=(8,8))
sns.heatmap(confusion_matrix(y_test,y_pred),vmax=1,annot=True,linewidths=0.5,cbar=False,cmap='YlGnBu',annot_kws={'fontsize':18})
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.title('XGBoost模型混淆矩阵',fontsize=20)
plt.show()

#打印特征重要性评分
feat_labels = X_train.columns[0:]
importances = xgb.feature_importances_
indices = np.argsort(importances)[::-1]
index_list = []
value_list = []
for f,j in zip(range(X_train.shape[1]),indices):
    index_list.append(feat_labels[j])
    value_list.append(importances[j])
print(f + 1, feat_labels[j], importances[j])
plt.figure(figsize=(10,6))
plt.barh(index_list[::-1],value_list[::-1])
plt.yticks(fontsize=12)
plt.title('各特征重要程度排序',fontsize=14)
plt.show()

11 Source Port 0.00026755093

# 模型预测
res = pd.DataFrame()
res['真实值'] = y_test
res['预测值'] = y_pred
res.sample(10)

	真实值	预测值
4035	0	0
48232	0	0
55197	2	2
18073	1	1
9346	0	0
28252	0	0
48257	0	0
59796	0	0
61184	0	0
25549	1	1

📘 基于xgboost算法构建互联网防火墙异常行为识别模型/internetfirewall.ipynb