In [1]:
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly. subplots import make_subplots
import plotly.express as px
import country_converter as coco
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
df = pd.read_csv('2024 QS World University Rankings.csv')
df.head()
Out[1]:
| 2024 RANK | 2023 RANK | Institution Name | Country Code | Country | SIZE | FOCUS | RES. | AGE | STATUS | ... | International Faculty Rank | International Students Score | International Students Rank | International Research Network Score | International Research Network Rank | Employment Outcomes Score | Employment Outcomes Rank | Sustainability Score | Sustainability Rank | Overall SCORE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | rank display | rank display2 | institution | location code | location | size | focus | research | age band | status | ... | ifr rank | isr score | isr rank | irn score | irn rank | ger score | ger rank | SUS SCORE | SUS RANK | Overall Score |
| 1 | 1 | 1 | Massachusetts Institute of Technology (MIT) | US | United States | M | CO | VH | 5 | B | ... | 56 | 88.2 | 128 | 94.3 | 58 | 100 | 4 | 95.2 | 51 | 100.0 |
| 2 | 2 | 2 | University of Cambridge | UK | United Kingdom | L | FC | VH | 5 | A | ... | 64 | 95.8 | 85 | 99.9 | 7 | 100 | 6 | 97.3 | 33= | 99.2 |
| 3 | 3 | 4 | University of Oxford | UK | United Kingdom | L | FC | VH | 5 | A | ... | 110 | 98.2 | 60 | 100.0 | 1 | 100 | 3 | 97.8 | 26= | 98.9 |
| 4 | 4 | 5 | Harvard University | US | United States | L | FC | VH | 5 | B | ... | 210 | 66.8 | 223 | 100.0 | 5 | 100 | 1 | 96.7 | 39 | 98.3 |
5 rows × 29 columns
In [2]:
#查看数据大小
df.shape
Out[2]:
(1498, 29)
In [3]:
#查看数据基本信息
df.info
Out[3]:
<bound method DataFrame.info of 2024 RANK 2023 RANK \
0 rank display rank display2
1 1 1
2 2 2
3 3 4
4 4 5
... ... ...
1493 1401+ 1401+
1494 1401+ NaN
1495 1401+ 1201-1400
1496 1401+ 1201-1400
1497 1401+ NaN
Institution Name Country Code \
0 institution location code
1 Massachusetts Institute of Technology (MIT) US
2 University of Cambridge UK
3 University of Oxford UK
4 Harvard University US
... ... ...
1493 University of Oradea RO
1494 University of Stavanger NO
1495 University Politehnica of Timisoara, UPT RO
1496 Western Washington University US
1497 Zürcher Hochschule Winterthur CH
Country SIZE FOCUS RES. AGE STATUS ... \
0 location size focus research age band status ...
1 United States M CO VH 5 B ...
2 United Kingdom L FC VH 5 A ...
3 United Kingdom L FC VH 5 A ...
4 United States L FC VH 5 B ...
... ... ... ... ... ... ... ...
1493 Romania L FC MD 4 A ...
1494 Norway NaN CO NaN 2 NaN ...
1495 Romania M FO VH 5 A ...
1496 United States L CO HI 5 NaN ...
1497 Switzerland NaN CO NaN NaN NaN ...
International Faculty Rank International Students Score \
0 ifr rank isr score
1 56 88.2
2 64 95.8
3 110 98.2
4 210 66.8
... ... ...
1493 701+ 7.0
1494 NaN NaN
1495 701+ 2.9
1496 676 1.4
1497 NaN NaN
International Students Rank International Research Network Score \
0 isr rank irn score
1 128 94.3
2 85 99.9
3 60 100.0
4 223 100.0
... ... ...
1493 701+ 1.8
1494 NaN 26.3
1495 701+ 1.3
1496 701+ 1.0
1497 NaN 1.7
International Research Network Rank Employment Outcomes Score \
0 irn rank ger score
1 58 100
2 7 100
3 1 100
4 5 100
... ... ...
1493 701+ 5.9
1494 489 NaN
1495 701+ 8.5
1496 701+ 7.3
1497 701+ NaN
Employment Outcomes Rank Sustainability Score Sustainability Rank \
0 ger rank SUS SCORE SUS RANK
1 4 95.2 51
2 6 97.3 33=
3 3 97.8 26=
4 1 96.7 39
... ... ... ...
1493 701+ 2.2 701+
1494 NaN NaN NaN
1495 701+ 1 701+
1496 701+ NaN NaN
1497 NaN NaN NaN
Overall SCORE
0 Overall Score
1 100.0
2 99.2
3 98.9
4 98.3
... ...
1493 -
1494 -
1495 -
1496 -
1497 -
[1498 rows x 29 columns]>
In [4]:
#查看数据的描述性统计
df.describe().T
df.columns
Out[4]:
Index(['2024 RANK', '2023 RANK', 'Institution Name', 'Country Code', 'Country',
'SIZE', 'FOCUS', 'RES.', 'AGE', 'STATUS', 'Academic Reputation Score',
'Academic Reputation Rank', 'Employer Reputation Score',
'Employer Reputation Rank', 'Faculty Student Score',
'Faculty Student Rank', 'Citations per Faculty Score',
'Citations per Faculty Rank', 'International Faculty Score',
'International Faculty Rank', 'International Students Score',
'International Students Rank', 'International Research Network Score',
'International Research Network Rank', 'Employment Outcomes Score',
'Employment Outcomes Rank', 'Sustainability Score',
'Sustainability Rank', 'Overall SCORE'],
dtype='object')
In [5]:
categorical_cols = ['Institution Name', 'Country', 'SIZE', 'FOCUS', 'RES.']
def convert_to_int(df):
for col in df.columns:
if (col not in (categorical_cols)):
df[col] = df[col].str.extract('(\d+)').astype(int)
return df
def clean_data(df):
# 删除第一行数据
# df.drop(index=0,axis=0, inplace=True)
df = df.iloc[1:].copy()
# 删除缺失值
df = df.dropna()
df = df.loc[df['Overall SCORE'] != '-']
# 连接排名列、分类列和分数列
rank_df = df[['2024 RANK', '2023 RANK']]
cat_df = df[['Institution Name', 'Country',
'SIZE', 'FOCUS', 'RES.', 'AGE']]
score_cols = [col for col in df.columns if 'score' in col.lower()]
score_df = df[score_cols]
df = pd.concat([rank_df, cat_df, score_df], axis=1)
# 将对象类型转换为int类型
df = convert_to_int(df)
# 创建等级变更列
df['Rank Change'] = df['2023 RANK'] - df['2024 RANK']
# 乘以-1的值RANK 2023和RANK2024较低的排名=更好
df['2023 RANK'] = -1 * df['2023 RANK']
df['2024 RANK'] = -1 * df['2024 RANK']
return df
# df.drop?
df = clean_data(df)
df.head()
Out[5]:
| 2024 RANK | 2023 RANK | Institution Name | Country | SIZE | FOCUS | RES. | AGE | Academic Reputation Score | Employer Reputation Score | Faculty Student Score | Citations per Faculty Score | International Faculty Score | International Students Score | International Research Network Score | Employment Outcomes Score | Sustainability Score | Overall SCORE | Rank Change | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | -1 | -1 | Massachusetts Institute of Technology (MIT) | United States | M | CO | VH | 5 | 100 | 100 | 100 | 100 | 100 | 88 | 94 | 100 | 95 | 100 | 0 |
| 2 | -2 | -2 | University of Cambridge | United Kingdom | L | FC | VH | 5 | 100 | 100 | 100 | 92 | 100 | 95 | 99 | 100 | 97 | 99 | 0 |
| 3 | -3 | -4 | University of Oxford | United Kingdom | L | FC | VH | 5 | 100 | 100 | 100 | 90 | 98 | 98 | 100 | 100 | 97 | 98 | 1 |
| 4 | -4 | -5 | Harvard University | United States | L | FC | VH | 5 | 100 | 100 | 98 | 100 | 84 | 66 | 100 | 100 | 96 | 98 | 1 |
| 5 | -5 | -3 | Stanford University | United States | L | FC | VH | 5 | 100 | 100 | 100 | 99 | 99 | 51 | 95 | 100 | 94 | 98 | -2 |
In [6]:
#大多数大学位于美国、中国、澳大利亚、加拿大和俄罗斯
In [7]:
# 大学位置分布图
country_codes = coco.convert(df['Country'],to='ISO3')
country_codes = pd.Series(country_codes)
university_location = country_codes.value_counts()
fig = px.choropleth(locations=university_location.index,
color=university_location.values,
color_continuous_scale=px.colors.sequential.Viridis, template='plotly_white',
title='University Location Distribution Map')
fig.show()
In [8]:
#查看可使用的所有颜色
import plotly.express as px
# 打印出 sequential 模块中的所有颜色图谱名称
for color_scale in dir(px.colors.sequential):
print(color_scale)
Aggrnyl Aggrnyl_r Agsunset Agsunset_r Blackbody Blackbody_r Bluered Bluered_r Blues Blues_r Blugrn Blugrn_r Bluyl Bluyl_r Brwnyl Brwnyl_r BuGn BuGn_r BuPu BuPu_r Burg Burg_r Burgyl Burgyl_r Cividis Cividis_r Darkmint Darkmint_r Electric Electric_r Emrld Emrld_r GnBu GnBu_r Greens Greens_r Greys Greys_r Hot Hot_r Inferno Inferno_r Jet Jet_r Magenta Magenta_r Magma Magma_r Mint Mint_r OrRd OrRd_r Oranges Oranges_r Oryel Oryel_r Peach Peach_r Pinkyl Pinkyl_r Plasma Plasma_r Plotly3 Plotly3_r PuBu PuBuGn PuBuGn_r PuBu_r PuRd PuRd_r Purp Purp_r Purples Purples_r Purpor Purpor_r Rainbow Rainbow_r RdBu RdBu_r RdPu RdPu_r Redor Redor_r Reds Reds_r Sunset Sunset_r Sunsetdark Sunsetdark_r Teal Teal_r Tealgrn Tealgrn_r Turbo Turbo_r Viridis Viridis_r YlGn YlGnBu YlGnBu_r YlGn_r YlOrBr YlOrBr_r YlOrRd YlOrRd_r __all__ __builtins__ __cached__ __doc__ __file__ __loader__ __name__ __package__ __spec__ _swatches _swatches_continuous algae algae_r amp amp_r deep deep_r dense dense_r gray gray_r haline haline_r ice ice_r matter matter_r solar solar_r speed speed_r swatches swatches_continuous tempo tempo_r thermal thermal_r turbid turbid_r
In [9]:
# 分类变量的分布
print(df.columns)
row_n = 2
col_n = 2
fig = make_subplots(rows=row_n, cols=col_n)
size_counts = df['SIZE'].value_counts().sort_values(ascending=True)
focus_counts = df['FOCUS'].value_counts().sort_values(ascending=True)
rs_counts = df['RES.'].value_counts().sort_values(ascending=True)
age_counts = df['AGE'].value_counts().sort_values(ascending=True)
categorical_1st = [size_counts, focus_counts, rs_counts, age_counts]
categorical_1abels = ['Size', 'Focus', 'Research', 'Age']
for i, col in enumerate(categorical_1abels):
fig.add_trace(
go.Bar(
x=categorical_1st[i].index.astype('str'),
y=categorical_1st[i].values, name=col,
text=np.round(categorical_1st[i].values, 3),
textposition='auto',
marker_color=px.colors.sequential.Rainbow, textfont=dict(size=12)),
row=int(i/col_n)+1, col=int(i % col_n)+1
)
fig.update_layout(
title_text='Distribution of Categorical Variables', template='plotly_white')
fig.update_traces(marker_line_color='black',
marker_line_width=1.5, opacity=0.8)
fig.show()
Index(['2024 RANK', '2023 RANK', 'Institution Name', 'Country', 'SIZE',
'FOCUS', 'RES.', 'AGE', 'Academic Reputation Score',
'Employer Reputation Score', 'Faculty Student Score',
'Citations per Faculty Score', 'International Faculty Score',
'International Students Score', 'International Research Network Score',
'Employment Outcomes Score', 'Sustainability Score', 'Overall SCORE',
'Rank Change'],
dtype='object')
In [10]:
# 前20名大学分析
# 排名前20的院校名称
data = dict(zip(df['Institution Name'].tolist(), df['2024 RANK'].tolist()))
plt.figure(figsize=(20, 15))
wc = WordCloud(width=800, height=500, max_words=20, colormap='autumn',
background_color='white').generate_from_frequencies(data)
plt.axis('off')
plt.imshow(wc, interpolation='bilinear')
plt.show()
In [ ]:
top20_df = df.iloc[:20] # 只选前20名大学
top20_rank_change = (top20_df.groupby('Institution Name')['Rank Change']
.mean()
.sort_values(ascending=True)) # 或者 False
fig = px.bar(
x=top20_rank_change.index,
y=top20_rank_change.values,
text=np.round(top20_rank_change.values, 2), # 取两位小数
color=top20_rank_change.values,
color_continuous_scale='mrybm'
)
fig.update_layout(
title_text='Top 20 Universities Rank Changes (2023-2024)', # 修改标题
template='plotly_white',
xaxis=dict(title='Institution Name', tickfont=dict(size=16)), # 调整字体大小
yaxis=dict(title='Rank Change', tickfont=dict(size=16))
)
fig.update_traces(
marker_line_color='black',
marker_line_width=1.5,
opacity=0.8
)
fig.show()
In [ ]:
# 排名前20的大学位置分布图
country_codes = coco.convert(top20_df['Country'], to='ISO3')
country_codes = pd.Series(country_codes)
university_location = country_codes.value_counts()
fig = px.choropleth(locations=university_location.index, color=university_location.values,
color_continuous_scale=px.colors.sequential.Rainbow, template='plotly_white', title='University Location Distribution Map')
fig.show()
In [ ]:
## 计算国家平均排名变化
rank_change_avg = df.groupby('Country')['Rank Change'].mean().sort_values()
# 取排名下降最多的 10 个国家(均值最高)
rank_change_down = rank_change_avg.tail(10)
# 取排名上升最多的 10 个国家(均值最低)
rank_change_up = rank_change_avg.head(10)
# 合并数据
rank_change = pd.concat([rank_change_down, rank_change_up])
# 画图
fig = px.bar(
x=rank_change.index,
y=rank_change.values,
text=np.round(rank_change.values, 2), # 显示小数点后 2 位
color=rank_change.values,
color_continuous_scale='oranges' # 选择合适的颜色方案
)
# 更新布局
fig.update_layout(
title_text='Top 10 Countries Rank Up & Down From 2023 to 2024',
template='simple_white',
xaxis=dict(title='Country', tickfont=dict(size=16)), # 修正 tickfont
yaxis=dict(title='Average Rank Change', tickfont=dict(size=16))
)
fig.show()
# One of the following named colorscales:
# ['aggrnyl', 'agsunset', 'algae', 'amp', 'armyrose', 'balance',
# 'blackbody', 'bluered', 'blues', 'blugrn', 'bluyl', 'brbg',
# 'brwnyl', 'bugn', 'bupu', 'burg', 'burgyl', 'cividis', 'curl',
# 'darkmint', 'deep', 'delta', 'dense', 'earth', 'edge', 'electric',
# 'emrld', 'fall', 'geyser', 'gnbu', 'gray', 'greens', 'greys',
# 'haline', 'hot', 'hsv', 'ice', 'icefire', 'inferno', 'jet',
# 'magenta', 'magma', 'matter', 'mint', 'mrybm', 'mygbm', 'oranges',
# 'orrd', 'oryel', 'oxy', 'peach', 'phase', 'picnic', 'pinkyl',
# 'piyg', 'plasma', 'plotly3', 'portland', 'prgn', 'pubu', 'pubugn',
# 'puor', 'purd', 'purp', 'purples', 'purpor', 'rainbow', 'rdbu',
# 'rdgy', 'rdpu', 'rdylbu', 'rdylgn', 'redor', 'reds', 'solar',
# 'spectral', 'speed', 'sunset', 'sunsetdark', 'teal', 'tealgrn',
# 'tealrose', 'tempo', 'temps', 'thermal', 'tropic', 'turbid',
# 'turbo', 'twilight', 'viridis', 'ylgn', 'ylgnbu', 'ylorbr',
# 'ylorrd'].
In [ ]:
# 平均等级变化和分类变量
row_n = 2
col_n = 2
fig = make_subplots(rows=row_n, cols=col_n)
change_by_size = df.groupby('SIZE')['Rank Change'].mean().sort_values(ascending=True)
change_by_focus = df.groupby('FOCUS')['Rank Change'].mean().sort_values(ascending=True)
change_by_rs = df.groupby('RES.')['Rank Change'].mean().sort_values(ascending=True)
change_by_age = df.groupby('AGE')['Rank Change'].mean().sort_values(ascending=True)
change_by_categories = [change_by_size, change_by_focus, change_by_rs, change_by_age]
categorical_1abels = ['Size', 'Focus', 'Research', 'Age']
for i, col in enumerate(categorical_1abels):
fig.add_trace(
go.Bar(
x=change_by_categories[i].index.astype('str'),
y=change_by_categories[i].values, name=col,
text=np.round(change_by_categories[i].values),
textposition='inside',
marker_color=px.colors.sequential.amp,
textfont=dict(size=12)),
row=int(i/col_n)+1,
col=int(i % col_n)+1)
fig.update_layout(
title_text='Average Rank Change and Categorical Variables', template='plotly_white')
fig.update_traces(marker_line_color='black',
marker_line_width=1.5, opacity=0.8)
fig.show()
In [15]:
# 总分分析
# 总分分布图
score_by_country = df.groupby('Country')['Overall SCORE'].mean()
score_by_country.index = coco.convert(score_by_country.index, to='ISO3')
fig = px.choropleth(locations=score_by_country.index,
color=score_by_country.values,
color_continuous_scale=px.colors.sequential.YlOrRd_r,
template='plotly_white',
title='University Average Overall Score Distributino')
fig.show()
In [16]:
# 计算前10和后10国家的平均总分
top10_score_country = (df.groupby('Country')['Overall SCORE']
.mean()
.sort_values(ascending=False)
.head(10)) # 用 head(10) 代替 [:10]
low10_score_country = (df.groupby('Country')['Overall SCORE']
.mean()
.sort_values(ascending=True) # 这里改为 ascending=True
.head(10)) # 用 head(10) 代替 [-10:]
# 合并数据
score_country = pd.concat([top10_score_country, low10_score_country])
# 画图
fig = px.bar(
x=score_country.index,
y=score_country.values,
text=np.round(score_country.values, 2), # 取两位小数
color=score_country.values,
color_continuous_scale='icefire'
)
# 更新布局
fig.update_layout(
title_text='Top 10 & Bottom 10 Average Overall Score by Country', # 修正标题
template='plotly_white',
xaxis=dict(title='Country', tickfont=dict(size=16)), # 确保格式正确
yaxis=dict(title='Average Overall Score', tickfont=dict(size=16))
)
# 更新条形样式
fig.update_traces(
marker_line_color='black',
marker_line_width=1.5,
opacity=0.8
)
fig.show()
In [17]:
# 获取前10和后10国家
score_country_lst = score_country.index.tolist()
# 过滤出这些国家的数据
score_country_df = df[df['Country'].isin(score_country_lst)]
# 绘制箱线图
fig = px.box(score_country_df,
x='Country',
y='Overall SCORE',
color='Country',
category_orders={'Country': score_country_lst}) # 修正 category_orders
# 更新布局
fig.update_layout(
title_text='Top 10 & Bottom 10 Average Overall Score Countries',
template='plotly_white',
xaxis=dict(title='Country', tickfont=dict(size=16)), # 修正 tickfont
yaxis=dict(title='Average Overall Score', tickfont=dict(size=16))
)
# 更新样式
fig.update_traces(
marker_line_color='black',
marker_line_width=1.5,
opacity=0.8
)
fig.show()
In [18]:
# 分数和分类变量
row_n = 2
col_n = 2
fig = make_subplots(rows=row_n, cols=col_n)
score_by_size = df. groupby(
'SIZE')['Overall SCORE'].mean().sort_values(ascending=True)
score_by_focus = df.groupby(
'FOCUS')['Overall SCORE']. mean().sort_values(ascending=True)
score_by_rs = df.groupby(
'RES.')['Overall SCORE']. mean().sort_values(ascending=True)
score_by_age = df.groupby(
'AGE')['Overall SCORE'].mean().sort_values(ascending=True)
score_by_categories = [score_by_size,
score_by_focus, score_by_rs, score_by_age]
categorical_1abels = ['Size','Focus', 'Research', 'Age']
for i, col in enumerate(categorical_1abels):
fig.add_trace(
go.Bar(
x=score_by_categories[i].index.astype('str'), y=score_by_categories[i].values, name=col,
text=np.round(score_by_categories[i].values), textposition='auto',
marker_color=px.colors.sequential.haline_r, textfont=dict(size=12)),
row=int(i/col_n)+1, col=int(i % col_n)+1)
fig.update_layout(
title_text='Score and Categorical Variables', template='plotly_white')
fig.update_traces(marker_line_color='black',
marker_line_width=1.5, opacity=0.8)
fig.show()
In [19]:
# 只保留数值型列
numeric_df = df.select_dtypes(include=['number'])
# 计算相关性
df_corr = numeric_df.corr()['Overall SCORE'].drop('Overall SCORE').sort_values(ascending=True)
# 画图
fig = px.bar(
x=df_corr.index,
y=df_corr.values,
color=df_corr.values,
text=np.round(df_corr.values, 2),
color_continuous_scale='plasma'
)
# 更新布局
fig.update_layout(
title_text='Correlation between Overall Scores',
template='plotly_white',
xaxis=dict(title='Variables', tickfont=dict(size=14)), # 修正 titlefont_size
yaxis=dict(title='Correlation', tickfont=dict(size=14))
)
# 更新样式
fig.update_traces(
marker_line_color='black',
marker_line_width=1.5,
opacity=0.8
)
fig.show()
In [ ]: