In [1]:
# Numerical and tabular data libraries.
import numpy as np
import pandas as pd
# Plotting libraries.
import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's "whitegrid" style to the figures drawn below.
sns.set_style("whitegrid")
# One-way ANOVA test from SciPy.
from scipy.stats import f_oneway
import warnings
# NOTE(review): this suppresses every warning for the whole session, which can
# hide deprecation notices from pandas/seaborn - consider narrowing the filter.
warnings.filterwarnings('ignore')
# Load the smartphone dataset from a CSV file in the working directory.
df = pd.read_csv("smartphonesnew_file.csv")

# A notebook cell only renders its LAST bare expression, so intermediate
# summaries must be passed to display() explicitly or they are silently
# discarded. display() is provided by IPython in the notebook namespace.
display(df.head())                  # first rows
display(df.shape)                   # (rows, columns)
df.info()                           # prints dtypes / non-null counts itself
display(df.describe())              # statistics for numeric columns
display(df.describe(include='O'))   # statistics for object (string) columns
display(df.isnull().sum())          # missing values per column
# Number of fully duplicated rows; left as the final bare expression so it
# remains the cell's Out[] value.
df.duplicated().sum()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 980 entries, 0 to 979 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Brand 980 non-null object 1 Model 980 non-null object 2 Final Price 980 non-null int64 3 Color 879 non-null float64 4 Free 980 non-null int64 5 Smartphone 960 non-null object 6 num_cores 974 non-null float64 7 processor_speed 938 non-null float64 8 battery_capacity 969 non-null float64 9 fast_charging_available 980 non-null int64 10 fast_charging 769 non-null float64 11 RAM 980 non-null int64 12 Storage 980 non-null int64 13 screen_size 980 non-null float64 14 refresh_rate 980 non-null int64 15 num_rear_cameras 980 non-null int64 16 os 966 non-null object 17 primary_camera_rear 980 non-null float64 18 primary_camera_front 975 non-null float64 19 extended_memory_available 980 non-null int64 20 resolution_height 980 non-null int64 21 resolution_width 980 non-null int64 dtypes: float64(8), int64(10), object(4) memory usage: 168.6+ KB
Out[1]:
0
In [2]:
# Distribution of the final price: histogram with a KDE curve overlaid.
fig, ax = plt.subplots()
sns.histplot(x='Final Price', data=df, kde=True, ax=ax)
plt.show()
In [3]:
# Box plot of the final price for each brand (one horizontal box per brand).
fig, ax = plt.subplots(figsize=(12, 14))
sns.boxplot(y='Brand', x='Final Price', data=df, ax=ax)
plt.show()
In [4]:
# Box plot of the final price for each colour (one horizontal box per colour).
fig, ax = plt.subplots(figsize=(10, 12))
sns.boxplot(y='Color', x='Final Price', data=df, ax=ax)
plt.show()
# Count plots for the RAM and Storage columns. Both figures share the same
# recipe, so they are driven from one table of (column, title, x-label).
count_plot_specs = (
    ("RAM", "Distribute of RAM", "Type of Ram"),
    ("Storage", "Distribute of Storage", "Type of Storage"),
)
for column, title_text, x_label in count_plot_specs:
    sns.set_palette("RdBu")
    plt.title(title_text)
    sns.countplot(x=column, data=df)
    plt.xlabel(x_label)
    plt.ylabel("Count")
    plt.show()
In [5]:
# How are smartphone brands distributed in the dataset?
# Count the rows per brand once and keep the ten most frequent brands, rather
# than recomputing value_counts() separately for the x and y arguments.
top_brands = df.Brand.value_counts().head(10)

plt.figure(figsize=(10,5))
plt.title("Top 10 Brands")
sns.barplot(x=top_brands.index, y=top_brands.values)
plt.xlabel("Brands")
plt.ylabel("Frequency")
plt.show()
# The top five brands are:
# 1 - Samsung
# 2 - Xiaomi
# 3 - Apple
# 4 - Realme
# 5 - OPPO
In [7]:
# Which smartphone brands have the highest average price?
# Compute the five highest per-brand mean prices once and reuse the result
# for both panels.
top5_avg_price = (
    df.groupby("Brand")["Final Price"]
    .mean()
    .sort_values(ascending=False)
    .round(3)
    .head(5)
)

fig, ax = plt.subplots(nrows=1, ncols=2, sharex=True, figsize=(20, 10))
# Figure.set_label only sets the artist's label and draws nothing on the
# canvas; suptitle renders the text as a visible figure heading.
fig.suptitle("avg.Price of Brand", fontsize=30)

# Right panel (ax[1]): bar chart of the average prices.
sns.barplot(x=top5_avg_price.index, y=top5_avg_price.values, ax=ax[1])
ax[1].set_ylabel("avg.Price", fontsize=30)
ax[1].set_xlabel("Brand", fontsize=30)

# Left panel (ax[0]): the same values as a dashed line with circle markers.
ax[0].plot(top5_avg_price.index, top5_avg_price.values, "o--")
ax[0].set_ylabel("avg.Price", fontsize=30)
ax[0].set_xlabel("Brand", fontsize=30)
plt.show()
# The five brands with the highest average price are:
# 1 - Apple
# 2 - Lenovo
# 3 - Asus
# 4 - Realme
# 5 - Samsung
In [8]:
# Is there an association between RAM and smartphone price?
# f_oneway is already imported in the notebook's first cell, so it is not
# re-imported here.
# Keep only rows where both the grouping column and the price are present.
cleaned_df = df.dropna(subset=['RAM', 'Final Price'])
# Build one price sample per distinct RAM value in a single pass; sort=False
# keeps the groups in order of first appearance, matching unique().
ram_price_groups = cleaned_df.groupby('RAM', sort=False)['Final Price']
price_arrays = [prices for _, prices in ram_price_groups]
# One-way ANOVA across the RAM groups.
f_statistic, p_value = f_oneway(*price_arrays)
print("F-statistic:", f_statistic)
print("P-value:", p_value)
# Smartphone prices differ significantly across RAM categories.
F-statistic: 24.559712806532797 P-value: 1.3274207824101188e-34
In [9]:
# How does the colour distribution differ between brands?
# Rows are brands, columns are colours, cells are the number of phones;
# brand/colour pairs that never occur are filled with 0.
brand_color_sizes = df.groupby(['Brand', 'Color']).size()
color_counts = brand_color_sizes.unstack(fill_value=0)

# Stacked bars: one bar per brand, one segment per colour.
ax = color_counts.plot.bar(stacked=True, figsize=(10, 6))
ax.set_title('Distribution of Smartphone Colors by Brand')
ax.set_xlabel('Brand')
ax.set_ylabel('Count')
plt.xticks(rotation=45, ha='right')
# Anchor the legend outside the axes, to the right of the plot area.
ax.legend(title='Color', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
In [ ]: