In [1]:
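# Data dictionary for the raw CSV (column - raw type - meaning):
# 1 Company - string - laptop manufacturer
# 2 Product - string - brand and model
# 3 TypeName - string - type (Notebook, Ultrabook, Gaming, etc.)
# 4 Inches - numeric - screen size
# 5 ScreenResolution - string - screen resolution
# 6 Cpu - string - central processing unit (CPU)
# 7 Ram - string - laptop RAM
# 8 Memory - string - hard disk / SSD storage
# 9 Gpu - string - graphics processing unit (GPU)
# 10 OpSys - string - operating system
# 11 Weight - string - laptop weight
# 12 Price_euros - numeric - price (euros)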
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import plot_importance
import warnings
# Suppress every warning for the rest of the notebook.
warnings.filterwarnings('ignore')

# Load the raw listings; the file is decoded as latin-1.
laptop_price = pd.read_csv('laptop_price.csv', encoding='latin-1')

# A notebook cell only renders its *last* expression, so each intermediate
# summary is passed to display() explicitly; otherwise it is silently
# discarded. display() is injected by the IPython/Jupyter kernel, so no
# import is needed when this runs as a notebook.
display(laptop_price.head())
display(laptop_price.shape)
laptop_price.info()  # prints directly, returns None
display(laptop_price.describe())
display(laptop_price.describe(include='O'))
display(laptop_price.isnull().sum())      # missing values per column
display(laptop_price.duplicated().sum())  # number of fully duplicated rows

# Working frame: the raw frame without the identifier and model-name columns.
# drop() returns a new frame, so laptop_price itself stays untouched.
df = laptop_price.drop(['laptop_ID', 'Product'], axis=1)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1303 entries, 0 to 1302 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 laptop_ID 1303 non-null int64 1 Company 1303 non-null object 2 Product 1303 non-null object 3 TypeName 1303 non-null object 4 Inches 1303 non-null float64 5 ScreenResolution 1303 non-null object 6 Cpu 1303 non-null object 7 Ram 1303 non-null object 8 Memory 1303 non-null object 9 Gpu 1303 non-null object 10 OpSys 1303 non-null object 11 Weight 1303 non-null object 12 Price_euros 1303 non-null float64 dtypes: float64(2), int64(1), object(10) memory usage: 132.5+ KB
In [2]:
# Keep only the 'WIDTHxHEIGHT' token from the ScreenResolution text.
resolution = df['ScreenResolution'].str.extract(r'(\d+x\d+)', expand=False)

# Split the token into integer pixel width (column 0) and height (column 1).
pixels = resolution.str.split('x', expand=True).astype('int')

# PPI = diagonal length in pixels divided by the diagonal size in inches.
diagonal_px = (pixels[0] ** 2 + pixels[1] ** 2) ** 0.5
df['PPI'] = (diagonal_px / df['Inches']).round(2)

# Flag rows whose resolution text mentions "Touchscreen" (case-insensitive).
df['Touchscreen'] = (
    df['ScreenResolution']
    .str.contains('Touchscreen', case=False, regex=True)
    .astype(int)
)

# The raw resolution text is no longer needed.
df = df.drop(columns=['ScreenResolution'])

# Clock speed: the number immediately before 'GHz' (capture group 0).
cpu_speed = df['Cpu'].str.extract(r'(\d+(\.\d+)?)GHz')
df['Cpu_Ghz'] = cpu_speed[0].astype(float)

# Manufacturer: the first space-separated word of the CPU description.
df['Cpu_Manufacturer'] = df['Cpu'].str.split(' ').str[0]
df['Cpu_Manufacturer'].value_counts()
Out[2]:
Cpu_Manufacturer Intel 1240 AMD 62 Samsung 1 Name: count, dtype: int64
In [3]:
# The single Samsung-CPU row is treated as an outlier: remove it, then drop
# the raw Cpu text column.
is_samsung = df['Cpu_Manufacturer'].eq('Samsung')
df = df.loc[~is_samsung].drop(columns=['Cpu'])

# Ram: cut off the last two characters and keep the leading integer.
df['Ram'] = df['Ram'].str.slice(stop=-2).astype(int)

# Inspect the distinct Memory descriptions before parsing them.
df['Memory'].unique()
Out[3]:
array(['128GB SSD', '128GB Flash Storage', '256GB SSD', '512GB SSD',
'500GB HDD', '256GB Flash Storage', '1TB HDD',
'32GB Flash Storage', '128GB SSD + 1TB HDD',
'256GB SSD + 256GB SSD', '64GB Flash Storage',
'256GB SSD + 1TB HDD', '256GB SSD + 2TB HDD', '32GB SSD',
'2TB HDD', '64GB SSD', '1.0TB Hybrid', '512GB SSD + 1TB HDD',
'1TB SSD', '256GB SSD + 500GB HDD', '128GB SSD + 2TB HDD',
'512GB SSD + 512GB SSD', '16GB SSD', '16GB Flash Storage',
'512GB SSD + 256GB SSD', '512GB SSD + 2TB HDD',
'64GB Flash Storage + 1TB HDD', '180GB SSD', '1TB HDD + 1TB HDD',
'32GB HDD', '1TB SSD + 1TB HDD', '512GB Flash Storage',
'128GB HDD', '240GB SSD', '8GB SSD', '508GB Hybrid', '1.0TB HDD',
'512GB SSD + 1.0TB Hybrid', '256GB SSD + 1.0TB Hybrid'],
dtype=object)
In [4]:
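# The Memory column mixes four storage types (SSD, HDD, Flash Storage and Hybrid), sometimes two drives joined by '+'. We extract the storage type and the capacity of each drive.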
In [6]:
# Parse the free-text Memory column (e.g. '128GB SSD + 1TB HDD') into a type
# and a capacity in GB for up to two drives.
#
# Each drive is parsed by its position in the string (text before / after the
# first '+'), so the result does not depend on the order in which storage
# types are tried, and the size pattern tolerates any whitespace between the
# number, the unit and the type.
STORAGE_PATTERN = (
    r'(?i)(?P<size>\d+(?:\.\d+)?)\s*(?P<unit>GB|TB)\s*'
    r'(?P<kind>SSD|HDD|Flash|Hybrid)'
)
# Canonical spelling for each recognised storage type.
STORAGE_LABELS = {'SSD': 'SSD', 'HDD': 'HDD', 'FLASH': 'Flash', 'HYBRID': 'Hybrid'}
# Multiplier that converts a size in the given unit to GB.
GB_PER_UNIT = {'GB': 1, 'TB': 1024}


def _parse_drive(spec):
    """Parse one drive description per row into its type and capacity.

    Parameters
    ----------
    spec : pandas.Series of str
        One drive description per row, e.g. '256GB SSD' or ' 1.0TB Hybrid'.
        Missing values are treated as "no drive".

    Returns
    -------
    pandas.DataFrame
        Indexed like ``spec`` with two columns: ``Type`` (one of 'SSD',
        'HDD', 'Flash', 'Hybrid', or '' when nothing matched) and ``Memory``
        (capacity in GB as an integer, 0 when nothing matched).
    """
    parts = spec.fillna('').str.extract(STORAGE_PATTERN)
    # Rows without a match yield NaN here and fall back to 0 / '' below.
    size_gb = parts['size'].astype(float) * parts['unit'].str.upper().map(GB_PER_UNIT)
    return pd.DataFrame(
        {
            'Type': parts['kind'].str.upper().map(STORAGE_LABELS).fillna(''),
            'Memory': size_gb.fillna(0).round().astype(int),
        },
        index=spec.index,
    )


# Column 0 holds the first drive; column 1 (when any row contains '+') the second.
drive_specs = df['Memory'].str.split('+', n=1, expand=True)
second_spec = drive_specs[1] if 1 in drive_specs else pd.Series('', index=df.index)

first_drive = _parse_drive(drive_specs[0])
second_drive = _parse_drive(second_spec)

# Add the four parsed columns and drop the raw text column.
df = df.assign(
    Storage_1_Type=first_drive['Type'],
    Storage_1_Memory=first_drive['Memory'],
    Storage_2_Type=second_drive['Type'],
    Storage_2_Memory=second_drive['Memory'],
).drop(columns=['Memory'])
In [7]:
# 提取制造商名称
# GPU manufacturer: the first space-separated word of the Gpu description;
# the raw Gpu text is dropped once the manufacturer has been extracted.
gpu_brand = df['Gpu'].str.split(' ').str[0]
df = df.assign(Gpu_Manufacturer=gpu_brand).drop(columns=['Gpu'])
In [8]:
# 提取出重量数字
# Weight: cut off the last two characters and convert the rest to float.
weight_text = df['Weight'].str.slice(stop=-2)
df['Weight'] = weight_text.astype(float)
df.head()
Out[8]:
| Company | TypeName | Inches | Ram | OpSys | Weight | Price_euros | PPI | Touchscreen | Cpu_Ghz | Cpu_Manufacturer | Storage_1_Type | Storage_1_Memory | Storage_2_Type | Storage_2_Memory | Gpu_Manufacturer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Apple | Ultrabook | 13.3 | 8 | macOS | 1.37 | 1339.69 | 226.98 | 0 | 2.3 | Intel | SSD | 128 | 0 | Intel | |
| 1 | Apple | Ultrabook | 13.3 | 8 | macOS | 1.34 | 898.94 | 127.68 | 0 | 1.8 | Intel | Flash | 128 | 0 | Intel | |
| 2 | HP | Notebook | 15.6 | 8 | No OS | 1.86 | 575.00 | 141.21 | 0 | 2.5 | Intel | SSD | 256 | 0 | Intel | |
| 3 | Apple | Ultrabook | 15.4 | 16 | macOS | 1.83 | 2537.45 | 220.53 | 0 | 2.7 | Intel | SSD | 512 | 0 | AMD | |
| 4 | Apple | Ultrabook | 13.3 | 8 | macOS | 1.37 | 1803.60 | 226.98 | 0 | 3.1 | Intel | SSD | 256 | 0 | Intel |
In [9]:
# 按公司划分的产品数量
# Number of products per company, most frequent company first.
company_order = df['Company'].value_counts().index

fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='Company', data=df, order=company_order, palette='crest', ax=ax)
ax.set_title('Number of Products by Company', fontsize=10)
ax.set_xlabel('Company', fontsize=8)
ax.set_ylabel('Number of Products', fontsize=8)
# Slanted, right-aligned company names; small font on both axes.
plt.xticks(rotation=45, ha='right', fontsize=8)
plt.yticks(fontsize=8)
plt.show()
In [10]:
# 按TypeName, Ram, OpSys划分的产品数量
# Number of products by TypeName, Ram and OpSys, one panel per column.
columns_to_plot = ['TypeName', 'Ram', 'OpSys']
fig, axes = plt.subplots(1, 3, figsize=(10, 4))
for position, (ax, feature) in enumerate(zip(axes, columns_to_plot)):
    sns.countplot(
        x=feature,
        data=df,
        order=df[feature].value_counts().index,
        palette='crest',
        ax=ax,
    )
    ax.set_title(f'Number of Products by {feature}', fontsize=10)
    ax.set_xlabel(feature, fontsize=8)
    ax.set_ylabel('Number of Products', fontsize=8)
    # The first and third panels get tick labels rotated by 45 degrees.
    tick_style = {'axis': 'both', 'labelsize': 8}
    if position in (0, 2):
        tick_style['rotation'] = 45
    ax.tick_params(**tick_style)
plt.tight_layout()
plt.show()
In [11]:
# 重量分布
# Distribution of laptop weight, in 8 bins.
fig, ax = plt.subplots(figsize=(8, 3))
sns.histplot(df['Weight'], color='navy', bins=8, alpha=0.6, ax=ax)
ax.set_title('Distribution of Weight')
ax.set_xlabel('Weight')
ax.set_ylabel('Frequency')
plt.show()
In [12]:
# CPU和GPU厂商
# Product counts per CPU and GPU manufacturer, one panel each.
features = ['Cpu_Manufacturer', 'Gpu_Manufacturer']
plt.figure(figsize=(8, 3))
for i, feature in enumerate(features, 1):
    plt.subplot(1, 2, i)
    sns.countplot(x=feature, data=df,
                  order=df[feature].value_counts().index, palette='crest')
    plt.title(f'Count of Products by {feature}', fontsize=10)
    # countplot(x=feature) puts the categories on the x axis and the counts
    # on the y axis, so the axis labels must follow that orientation.
    plt.xlabel(feature, fontsize=8)
    plt.ylabel('Number of Products', fontsize=8)
    plt.xticks(fontsize=8)
    plt.yticks(fontsize=8)
plt.tight_layout()
plt.show()
In [13]:
# 按内存类型划分的产品数量
# Product counts per storage type for the first and second drive.
features = ['Storage_1_Type', 'Storage_2_Type']
plt.figure(figsize=(8, 3))
for i, feature in enumerate(features, 1):
    plt.subplot(1, 2, i)
    sns.countplot(x=feature, data=df,
                  order=df[feature].value_counts().index, palette='crest')
    plt.title(f'Count of Products by {feature}', fontsize=10)
    # countplot(x=feature) puts the categories on the x axis and the counts
    # on the y axis, so the axis labels must follow that orientation.
    plt.xlabel(feature, fontsize=8)
    plt.ylabel('Number of Products', fontsize=8)
    plt.xticks(fontsize=8)
    plt.yticks(fontsize=8)
plt.tight_layout()
plt.show()
In [14]:
# 存储内存,重量,PPI,英寸和价格的分布
# Distributions of storage capacity, weight, PPI, screen size and price,
# laid out on a 2x3 grid (filled row by row).
features = ['Storage_1_Memory', 'Storage_2_Memory', 'Weight', 'PPI', 'Inches', 'Price_euros']
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
for ax, feature in zip(axes.flat, features):
    values = df[feature].dropna()
    sns.histplot(values, kde=True, color='navy', bins='auto', alpha=0.6, ax=ax)
    ax.set_title(f'Histogram of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
plt.tight_layout()
plt.show()
In [15]:
# 各公司平均价格
# Mean price per company, bars sorted from most to least expensive.
mean_price = df.groupby('Company')['Price_euros'].mean()
order_by_mean_price = mean_price.sort_values(ascending=False).index

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x='Company',
    y='Price_euros',
    data=df,
    palette='crest',
    order=order_by_mean_price,
    ax=ax,
)
ax.set_title('Mean Price by Company')
ax.set_xlabel('Company')
ax.set_ylabel('Mean Price (euros)')
plt.xticks(rotation=45, ha='right')
plt.show()
In [17]:
# 价格与PPI、Ram和Cpu_Ghz的关系
# Price against PPI, Ram and Cpu_Ghz, one scatter panel per feature.
columns_to_plot = ['PPI', 'Ram', 'Cpu_Ghz']
fig, axes = plt.subplots(1, 3, figsize=(8, 3))
for position, (ax, feature) in enumerate(zip(axes, columns_to_plot)):
    sns.scatterplot(x=feature, y='Price_euros', data=df, color='navy', alpha=0.6, ax=ax)
    ax.set_title(f'{feature}', fontsize=10)
    ax.set_xlabel(feature, fontsize=8)
    # Only the left-most panel carries the y-axis label.
    if position:
        ax.set_ylabel('')
    else:
        ax.set_ylabel('Price euros', fontsize=8)
    ax.tick_params(axis='both', labelsize=8)
plt.tight_layout()
plt.show()
In [18]:
# 箱线图:Cpu_Manufacturer和Gpu_Manufacturer与价格
# Box plots of price grouped by CPU and by GPU manufacturer.
manufacturer_columns = ['Cpu_Manufacturer', 'Gpu_Manufacturer']
fig, axes = plt.subplots(1, 2, figsize=(8, 4))
for position, (ax, feature) in enumerate(zip(axes, manufacturer_columns)):
    sns.boxplot(x=feature, y='Price_euros', data=df, palette='crest', ax=ax)
    ax.set_title(f'{feature} vs. Price', fontsize=10)
    ax.set_xlabel(feature, fontsize=8)
    # Only the left-most panel carries the y-axis label.
    if position:
        ax.set_ylabel('')
    else:
        ax.set_ylabel('Price euros', fontsize=8)
    ax.tick_params(axis='both', labelsize=8)
plt.tight_layout()
plt.show()
In [19]:
# 相关矩阵
# Correlation matrix over the int/float columns, drawn as an annotated heatmap.
numeric_frame = df.select_dtypes(include=['int', 'float'])
numerical_columns = numeric_frame.columns
corr_matrix = numeric_frame.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='crest', fmt=".2f", linewidths=.5, ax=ax)
ax.set_title('Correlation Matrix', fontsize=10)
plt.xticks(rotation=45, fontsize=8)
plt.yticks(fontsize=8)
plt.show()
In [20]:
import plotly.express as px
# Sunburst of price nested Company -> Ram -> TypeName; each sector is sized
# and coloured by Price_euros.
sunburst_levels = ['Company', 'Ram', 'TypeName']
fig = px.sunburst(
    df,
    path=sunburst_levels,
    values='Price_euros',
    color='Price_euros',
)
# Remove all outer margins around the figure.
fig.update_layout(margin={'t': 0, 'l': 0, 'r': 0, 'b': 0})
fig.show()
In [ ]: