In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore", "is_categorical_dtype")
warnings.filterwarnings("ignore", "use_inf_as_na")
data = pd.read_csv('onlinefoods.csv')
data.head()
Out[1]:
| Age | Gender | Marital Status | Occupation | Monthly Income | Educational Qualifications | Family size | latitude | longitude | Pin code | Output | Feedback | Unnamed: 12 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 20 | Female | Single | Student | No Income | Post Graduate | 4 | 12.9766 | 77.5993 | 560001 | Yes | Positive | Yes |
| 1 | 24 | Female | Single | Student | Below Rs.10000 | Graduate | 3 | 12.9770 | 77.5773 | 560009 | Yes | Positive | Yes |
| 2 | 22 | Male | Single | Student | Below Rs.10000 | Post Graduate | 3 | 12.9551 | 77.6593 | 560017 | Yes | Negative | Yes |
| 3 | 22 | Female | Single | Student | No Income | Graduate | 6 | 12.9473 | 77.5616 | 560019 | Yes | Positive | Yes |
| 4 | 22 | Male | Single | Student | Below Rs.10000 | Post Graduate | 4 | 12.9850 | 77.5533 | 560010 | Yes | Positive | Yes |
In [2]:
data.shape
data.info()
data.describe() # 查看数值型变量的描述性统计
data.describe(include='O') # 查看非数值型变量的描述性统计
data.isnull().sum() # 统计缺失值情况
data.duplicated().sum() # 统计重复值情况
data.drop_duplicates(inplace=True)
data.duplicated().sum()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 388 entries, 0 to 387 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 388 non-null int64 1 Gender 388 non-null object 2 Marital Status 388 non-null object 3 Occupation 388 non-null object 4 Monthly Income 388 non-null object 5 Educational Qualifications 388 non-null object 6 Family size 388 non-null int64 7 latitude 388 non-null float64 8 longitude 388 non-null float64 9 Pin code 388 non-null int64 10 Output 388 non-null object 11 Feedback 388 non-null object 12 Unnamed: 12 388 non-null object dtypes: float64(2), int64(3), object(8) memory usage: 39.5+ KB
Out[2]:
0
In [3]:
# 年龄和家庭规模的分布
sns.set(style="whitegrid")
fig, ax = plt.subplots(1, 2, figsize=(14, 6))
sns.histplot(data['Age'], bins=15, kde=True, ax=ax[0])
ax[0].set_title('Age Distribution')
ax[0].set_xlabel('Age')
ax[0].set_ylabel('Frequency')
sns.histplot(data['Family size'], bins=6, kde=True, ax=ax[1])
ax[1].set_title('Family Size Distribution')
ax[1].set_xlabel('Family Size')
ax[1].set_ylabel('Frequency')
plt.tight_layout()
plt.show()
In [4]:
# 统计图显示了数据集中的性别、婚姻状况、月收入和教育程度的分布
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
sns.countplot(x='Gender', data=data, ax=axes[0, 0])
axes[0, 0].set_title('Gender Distribution')
sns.countplot(x='Marital Status', data=data, ax=axes[0, 1])
axes[0, 1].set_title('Marital Status Distribution')
sns.countplot(x='Monthly Income', data=data, ax=axes[1, 0])
axes[1, 0].set_title('Monthly Income Distribution')
axes[1, 0].tick_params(axis='x', rotation=45)
sns.countplot(x='Educational Qualifications', data=data, ax=axes[1, 1])
axes[1, 1].set_title('Educational Qualifications Distribution')
axes[1, 1].tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
In [5]:
# 年龄与家庭规模的散点图
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Age', y='Family size', data=data, hue='Gender', style='Marital Status', s=100)
plt.title('Relationship between Age and Family Size by Gender and Marital Status')
plt.xlabel('Age')
plt.ylabel('Family Size')
plt.legend(title='Gender / Marital Status', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
In [6]:
# 该图表显示了不同收入阶层的人下的订单数量
plt.figure(figsize=(12, 6))
sns.countplot(x='Monthly Income', hue='Output', data=data)
plt.title('Income Level vs. Online Ordering Behavior')
plt.xlabel('Monthly Income')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Ordered Online')
plt.tight_layout()
plt.show()
D:\Python310\lib\site-packages\seaborn\_base.py:948: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key) D:\Python310\lib\site-packages\seaborn\_base.py:948: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key)
In [7]:
# 图表显示,根据他们的反馈,不同性别的人数以及根据他们的反馈,不同教育程度的人数
plt.figure(figsize=(12, 6))
sns.countplot(x='Feedback', hue='Gender', data=data)
plt.title('Feedback by Gender')
plt.xlabel('Feedback')
plt.ylabel('Count')
plt.legend(title='Gender')
plt.tight_layout()
plt.show()
D:\Python310\lib\site-packages\seaborn\_base.py:948: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key) D:\Python310\lib\site-packages\seaborn\_base.py:948: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key)
In [8]:
plt.figure(figsize=(12, 6))
sns.countplot(x='Feedback', hue='Educational Qualifications', data=data)
plt.title('Feedback by Educational Qualifications')
plt.xlabel('Feedback')
plt.ylabel('Count')
plt.legend(title='Educational Qualifications', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
D:\Python310\lib\site-packages\seaborn\_base.py:948: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key) D:\Python310\lib\site-packages\seaborn\_base.py:948: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key) D:\Python310\lib\site-packages\seaborn\_base.py:948: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key) D:\Python310\lib\site-packages\seaborn\_base.py:948: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key) D:\Python310\lib\site-packages\seaborn\_base.py:948: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key)
In [9]:
# 盒子图显示了不同婚姻状况和年龄的顾客的订购行为
plt.figure(figsize=(12, 6))
sns.boxplot(x='Marital Status', y='Age', hue='Output', data=data)
plt.title('Ordering Behavior by Marital Status and Age')
plt.xlabel('Marital Status')
plt.ylabel('Age')
plt.legend(title='Ordered Online')
plt.tight_layout()
plt.show()
D:\Python310\lib\site-packages\seaborn\_base.py:948: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key) D:\Python310\lib\site-packages\seaborn\categorical.py:632: FutureWarning: SeriesGroupBy.grouper is deprecated and will be removed in a future version of pandas. positions = grouped.grouper.result_index.to_numpy(dtype=float) D:\Python310\lib\site-packages\seaborn\_base.py:948: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key) D:\Python310\lib\site-packages\seaborn\categorical.py:632: FutureWarning: SeriesGroupBy.grouper is deprecated and will be removed in a future version of pandas. positions = grouped.grouper.result_index.to_numpy(dtype=float)
In [10]:
# 该图表显示了不同收入水平人群的正面和负面反馈数量
plt.figure(figsize=(14, 7))
sns.countplot(x='Monthly Income', hue='Feedback', data=data)
plt.title('Income Level and Feedback Sentiment')
plt.xlabel('Monthly Income')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Feedback', loc='upper right')
plt.tight_layout()
plt.show()
D:\Python310\lib\site-packages\seaborn\_base.py:948: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key) D:\Python310\lib\site-packages\seaborn\_base.py:948: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key)
In [11]:
# 图表显示了不同学历的人在网上订餐的数量
plt.figure(figsize=(14, 7))
sns.countplot(x='Educational Qualifications', hue='Output', data=data)
plt.title('Educational Qualifications and Online Ordering')
plt.xlabel('Educational Qualifications')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Ordered Online', loc='upper right')
plt.tight_layout()
plt.show()
D:\Python310\lib\site-packages\seaborn\_base.py:948: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key) D:\Python310\lib\site-packages\seaborn\_base.py:948: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key)
In [12]:
# 数据集中不同特征之间的热图
correlation_matrix = data[['Age', 'Family size', 'latitude', 'longitude', 'Pin code']].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=.5)
plt.title('Heatmap of Correlations Among Numerical Features')
plt.show()
In [13]:
# 小提琴图显示月收入与年龄的关系
plt.figure(figsize=(14, 8))
sns.violinplot(x='Monthly Income', y='Age', data=data)
plt.title('Violin Plots for Monthly Income vs. Age')
plt.xlabel('Monthly Income')
plt.ylabel('Age')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
D:\Python310\lib\site-packages\seaborn\_base.py:948: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key) D:\Python310\lib\site-packages\seaborn\_base.py:948: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key) D:\Python310\lib\site-packages\seaborn\_base.py:948: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key) D:\Python310\lib\site-packages\seaborn\_base.py:948: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key) D:\Python310\lib\site-packages\seaborn\_base.py:948: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key)
In [14]:
# 雷达图显示正面反馈数、负面反馈数、教育程度和平均家庭规模之间的关系
data = pd.read_csv('onlinefoods.csv')
data['Output_Numeric'] = data['Output'].map({'Yes': 1, 'No': 0})
data['Positive_Feedback'] = (data['Feedback'] == 'Positive').astype(int)
radar_df_new = data.groupby('Educational Qualifications').agg(
Average_Age=('Age', 'mean'),
Average_Family_Size=('Family size', 'mean'),
Proportion_Positive_Feedback=('Positive_Feedback', 'mean'),
Proportion_Ordering_Online=('Output_Numeric', 'mean')
).reset_index()
scaler = MinMaxScaler()
radar_df_normalized = pd.DataFrame(scaler.fit_transform(radar_df_new.iloc[:, 1:]), columns=radar_df_new.columns[1:])
radar_df_normalized['Educational Qualifications'] = radar_df_new['Educational Qualifications']
categories_new = list(radar_df_normalized)[1:]
N_new = len(categories_new)
angles_new = [n / float(N_new) * 2 * 3.14159265359 for n in range(N_new)]
angles_new += angles_new[:1]
fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True))
plt.xticks(angles_new[:-1], categories_new)
ax.set_rlabel_position(0)
plt.yticks([0.2, 0.4, 0.6, 0.8], ["0.2", "0.4", "0.6", "0.8"], color="grey", size=7)
plt.ylim(0,1)
for i, row in radar_df_normalized.iterrows():
data = radar_df_normalized.loc[i].drop('Educational Qualifications').tolist()
data += data[:1]
ax.plot(angles_new, data, linewidth=2, linestyle='solid', label=radar_df_normalized['Educational Qualifications'][i])
ax.fill(angles_new, data, alpha=0.1)
plt.title('Enhanced Radar Chart for Educational Qualifications', size=20, y=1.1)
plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
plt.show()
In [16]:
# 下面的条形图显示了正面和负面反馈的数量
data = pd.read_csv('onlinefoods.csv')
sentiment_counts = data['Feedback'].value_counts()
plt.figure(figsize=(8, 6))
sentiment_counts.plot(kind='bar')
plt.title('Feedback Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
for a,b in zip(range(2),sentiment_counts.values):
plt.text(a,b,'%d'%b,ha='center',va='bottom',fontsize=14)
plt.show()
In [18]:
# 随机森林机器学习(ML)分类模型预测反馈是积极的还是消极的
# 编码处理
encoder = LabelEncoder()
categorical_features = ['Gender', 'Marital Status', 'Occupation', 'Monthly Income', 'Educational Qualifications', 'Output', 'Unnamed: 12']
for feature in categorical_features:
data[feature] = encoder.fit_transform(data[feature])
# 准备建模数据
X = data.drop(['Feedback', 'latitude', 'longitude', 'Pin code'], axis=1)
y = encoder.fit_transform(data['Feedback'])
# 拆分数据集为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# 构建随机森林模型
rf_model = RandomForestClassifier(n_estimators=5, random_state=42)
rf_model.fit(X_train, y_train)
# 模型预测
y_pred = rf_model.predict(X_test)
# 模型评估
accuracy_rf = accuracy_score(y_test, y_pred)
report_rf = classification_report(y_test, y_pred)
print(accuracy_rf)
print(report_rf)
# 我们可以看到,我们得到了相当不错的90%的准确率
0.8974358974358975
precision recall f1-score support
0 0.62 0.73 0.67 11
1 0.95 0.93 0.94 67
accuracy 0.90 78
macro avg 0.78 0.83 0.80 78
weighted avg 0.91 0.90 0.90 78