In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = (15, 8)
import seaborn as sns
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
df = pd.read_csv("CO2 Emissions_Canada.csv")
df.head()
Out[1]:
| Make | Model | Vehicle Class | Engine Size(L) | Cylinders | Transmission | Fuel Type | Fuel Consumption City (L/100 km) | Fuel Consumption Hwy (L/100 km) | Fuel Consumption Comb (L/100 km) | Fuel Consumption Comb (mpg) | CO2 Emissions(g/km) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ACURA | ILX | COMPACT | 2.0 | 4 | AS5 | Z | 9.9 | 6.7 | 8.5 | 33 | 196 |
| 1 | ACURA | ILX | COMPACT | 2.4 | 4 | M6 | Z | 11.2 | 7.7 | 9.6 | 29 | 221 |
| 2 | ACURA | ILX HYBRID | COMPACT | 1.5 | 4 | AV7 | Z | 6.0 | 5.8 | 5.9 | 48 | 136 |
| 3 | ACURA | MDX 4WD | SUV - SMALL | 3.5 | 6 | AS6 | Z | 12.7 | 9.1 | 11.1 | 25 | 255 |
| 4 | ACURA | RDX AWD | SUV - SMALL | 3.5 | 6 | AS6 | Z | 12.1 | 8.7 | 10.6 | 27 | 244 |
In [2]:
# Rename the columns to short snake_case identifiers.
df = df.rename(
    columns={
        "Make": "make",
        "Model": "model",
        "Vehicle Class": "vehicle_class",
        "Engine Size(L)": "engine_size",
        "Cylinders": "cylinders",
        "Transmission": "transmission",
        "Fuel Type": "fuel_type",
        "Fuel Consumption City (L/100 km)": "fuel_cons_city",
        "Fuel Consumption Hwy (L/100 km)": "fuel_cons_hwy",
        "Fuel Consumption Comb (L/100 km)": "fuel_cons_comb",
        "Fuel Consumption Comb (mpg)": "fuel_cons_comb_mpg",
        "CO2 Emissions(g/km)": "co2",
    }
)

# A notebook cell renders only its final bare expression, so each intermediate
# summary is passed to display() (provided by the IPython kernel) to make it
# visible. df.info() prints on its own and needs no wrapper.
display(df.sample(10, random_state=0))  # fixed seed: same rows on every run
display(df.shape)
df.info()
display(df.describe())  # numeric columns
display(df.describe(include="O"))  # object (string) columns
display(df.isna().sum())  # missing values per column

# Count fully duplicated rows, drop them, then confirm that none remain.
display(df.duplicated().sum())
df = df.drop_duplicates()
df.duplicated().sum()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7385 entries, 0 to 7384 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 make 7385 non-null object 1 model 7385 non-null object 2 vehicle_class 7385 non-null object 3 engine_size 7385 non-null float64 4 cylinders 7385 non-null int64 5 transmission 7385 non-null object 6 fuel_type 7385 non-null object 7 fuel_cons_city 7385 non-null float64 8 fuel_cons_hwy 7385 non-null float64 9 fuel_cons_comb 7385 non-null float64 10 fuel_cons_comb_mpg 7385 non-null int64 11 co2 7385 non-null int64 dtypes: float64(4), int64(3), object(5) memory usage: 692.5+ KB
Out[2]:
0
In [3]:
# Box plot of CO2 emissions for each vehicle make.
ax = sns.boxplot(data=df, x="make", y="co2")
# Title and readable axis labels so the figure stands on its own, matching the
# other plotting cells in this notebook.
ax.set_title("CO2 Emissions by Make", fontsize=14)
ax.set_xlabel("Make")
ax.set_ylabel("CO2 Emissions (g/km)")
# Rotate the make names so they do not overlap.
plt.xticks(rotation=90)
plt.show()
In [4]:
# Bar chart of the 20 most frequent model names, most common first.
top_models = df["model"].value_counts().head(20)
ax = sns.countplot(data=df, x="model", order=top_models.index)
ax.bar_label(ax.containers[0])  # annotate each bar with its count
ax.set_title("Top 20 Model")
ax.set_xlabel("Model")
ax.set_ylabel("Count")
# Rotate the model names so they do not overlap.
plt.xticks(rotation=90)
plt.show()
In [5]:
# Number of vehicles in each class, most common class first.
class_order = df["vehicle_class"].value_counts().index
ax = sns.countplot(data=df, x="vehicle_class", order=class_order)
# Annotate every bar with its count; iterating over all containers keeps this
# correct even if the bars end up spread across several containers.
for container in ax.containers:
    ax.bar_label(container)
# Title and readable axis labels, matching the other plotting cells.
ax.set_title("Vehicles per Class", fontsize=14)
ax.set_xlabel("Vehicle Class")
ax.set_ylabel("Count")
plt.xticks(rotation=90)
plt.show()
In [6]:
# Box plot of CO2 emissions for each vehicle class.
ax = sns.boxplot(data=df, x="vehicle_class", y="co2")
# Title and readable axis labels so the figure stands on its own, matching the
# other plotting cells in this notebook.
ax.set_title("CO2 Emissions by Vehicle Class", fontsize=14)
ax.set_xlabel("Vehicle Class")
ax.set_ylabel("CO2 Emissions (g/km)")
# Rotate the class names so they do not overlap.
plt.xticks(rotation=90)
plt.show()
In [8]:
# Histogram (20 bins) with a KDE overlay of engine size.
sns.displot(data=df, x="engine_size", kde=True, bins=20)
plt.title("Engine Size Distribution", fontsize=14)
plt.xlabel("Engine Size (L)")
plt.ylabel("Count")
# Ending on plt.show() keeps the FacetGrid repr from being emitted as the
# cell's output.
plt.show()
Out[8]:
<seaborn.axisgrid.FacetGrid at 0x1fb20983430>
In [9]:
# Scatter plot with CO2 emissions on the x-axis and engine size on the y-axis.
ax = sns.scatterplot(
    data=df,
    x="co2",
    y="engine_size",
    s=50,
    color="pink",
    edgecolor="k",
    alpha=0.6,
)
ax.set_title("CO2 Emissions vs Engine Size")
ax.set_xlabel("CO2 Emissions")
ax.set_ylabel("Engine Size")
# Faint horizontal grid lines only.
ax.grid(axis="y", alpha=0.5)
plt.show()
In [10]:
# Number of vehicles for each cylinder count.
ax = sns.countplot(data=df, x="cylinders", palette="Blues")
# When `palette` is given without `hue`, newer seaborn releases (0.13+) draw
# each bar in its own container, so labelling only containers[0] would annotate
# a single bar. Labelling every container works on old and new versions alike.
for container in ax.containers:
    ax.bar_label(container)
ax.set_title("Distribution of Cylinders")
ax.set_xlabel("Cylinders")
ax.set_ylabel("Count")
# Dashed, faint horizontal grid lines; drop the top and right spines.
ax.grid(axis="y", linestyle="--", alpha=0.4)
sns.despine()
plt.show()
In [11]:
# Scatter plot with CO2 emissions on the x-axis and cylinder count on the y-axis.
ax = sns.scatterplot(
    data=df,
    x="co2",
    y="cylinders",
    s=50,
    color="pink",
    edgecolor="k",
    alpha=0.6,
)
ax.set_title("CO2 Emissions vs Cylinders")
ax.set_xlabel("CO2 Emissions")
ax.set_ylabel("Cylinders")
# Dashed, faint horizontal grid lines only.
ax.grid(axis="y", linestyle="--", alpha=0.5)
plt.show()
In [12]:
# Split the combined transmission code (e.g. "AS5") into a transmission type
# (the first run of letters, "AS") and a gear count (the first run of digits,
# "5"). Codes without any digits get the placeholder "No Gears".
transmission_codes = df["transmission"].astype(str)
if "gears" not in df.columns:
    # Guarded: once the digits have been stripped from `transmission` below,
    # re-running this cell would otherwise overwrite every gear count with
    # "No Gears".
    df["gears"] = (
        transmission_codes.str.extract(r"(\d+)", expand=False).fillna("No Gears")
    )
# Extracting the letters from an already-stripped value returns it unchanged,
# so this line is safe to re-run.
df["transmission"] = transmission_codes.str.extract(r"([A-Za-z]+)", expand=False)

# Put `gears` right after `transmission` in the column order.
df = df[['make', 'model', 'vehicle_class', 'engine_size', 'cylinders',
         'transmission', 'gears', 'fuel_type', 'fuel_cons_city', 'fuel_cons_hwy',
         'fuel_cons_comb', 'fuel_cons_comb_mpg', 'co2']]

# Donut chart of the share of each transmission type.
transmission_counts = df["transmission"].value_counts()
transmission_labels = transmission_counts.index
transmission_values = transmission_counts.values
colors = ['#FF6384', '#36A2EB', '#FFCE56', '#8BC34A', '#FF9800']

fig, ax = plt.subplots(figsize=(10, 8))
ax.pie(
    transmission_values,
    labels=transmission_labels,
    autopct="%.2f%%",
    colors=colors,
    wedgeprops={'linewidth': 2, 'edgecolor': 'white'},
)
# A white disc drawn over the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc="white")
ax.add_artist(centre_circle)
ax.set_title("Transmission Value Counts", fontsize=14, fontweight='bold')
ax.axis("equal")  # equal aspect ratio so the pie is drawn as a circle
plt.show()
In [13]:
# Number of vehicles for each gear count.
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=df, x="gears", palette="viridis", ax=ax)
# When `palette` is given without `hue`, newer seaborn releases (0.13+) draw
# each bar in its own container, so labelling only containers[0] would annotate
# a single bar. Labelling every container works on old and new versions alike.
for container in ax.containers:
    ax.bar_label(container)
ax.set_title("Gears Value Counts", fontsize=14)
ax.set_xlabel("Gears")
ax.set_ylabel("Count")
sns.despine()  # drop the top and right spines
plt.show()
In [14]:
# Box plot of CO2 emissions for each gear count.
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=df, x="gears", y="co2", palette='Set3', ax=ax)
ax.set_title("CO2 Emission by Gears", fontsize=14)
ax.set_xlabel("Gears")
ax.set_ylabel("CO2 Emission")
plt.show()
In [15]:
# Box plot of CO2 emissions for each transmission type.
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=df, x="transmission", y="co2", palette='husl', ax=ax)
ax.set_title("CO2 Emission by Transmission", fontsize=14)
ax.set_xlabel("Transmission")
ax.set_ylabel("CO2 Emission")
plt.show()
In [16]:
# Box plot of CO2 emissions for each fuel type.
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=df, x="fuel_type", y="co2", palette="muted", ax=ax)
ax.set_title("CO2 Emission by Fuel Type", fontsize=14)
ax.set_xlabel("Fuel Type")
ax.set_ylabel("CO2 Emission")
plt.show()
In [17]:
# Histogram with a KDE overlay of CO2 emissions.
co2_grid = sns.displot(data=df, x="co2", kde=True, color="brown")
co2_grid.ax.set_title("CO2 Emission Distribution", fontsize=14)
co2_grid.set_axis_labels("CO2 Emission", "Count")
plt.show()
In [ ]: