import plotly.graph_objects as go 
import plotly.figure_factory as ff
from plotly. subplots import make_subplots 
import plotly.express as px
import country_converter as coco 
from wordcloud import WordCloud 
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore") 
import pandas as pd 
import numpy as np
df = pd.read_csv('2024 QS World University Rankings.csv')
df.head()

#查看数据大小
df.shape

(1498, 29)

#查看数据基本信息
df.info

<bound method DataFrame.info of          2024 RANK      2023 RANK  \
0     rank display  rank display2   
1                1              1   
2                2              2   
3                3              4   
4                4              5   
...            ...            ...   
1493         1401+          1401+   
1494         1401+            NaN   
1495         1401+      1201-1400   
1496         1401+      1201-1400   
1497         1401+            NaN   

                                  Institution Name   Country Code  \
0                                      institution  location code   
1     Massachusetts Institute of Technology (MIT)              US   
2                          University of Cambridge             UK   
3                             University of Oxford             UK   
4                               Harvard University             US   
...                                            ...            ...   
1493                          University of Oradea             RO   
1494                       University of Stavanger             NO   
1495      University Politehnica of Timisoara, UPT             RO   
1496                 Western Washington University             US   
1497                 Zürcher Hochschule Winterthur             CH   

             Country  SIZE  FOCUS      RES.       AGE  STATUS  ...  \
0           location  size  focus  research  age band  status  ...   
1      United States     M     CO        VH         5       B  ...   
2     United Kingdom     L     FC        VH         5       A  ...   
3     United Kingdom     L     FC        VH         5       A  ...   
4      United States     L     FC        VH         5       B  ...   
...              ...   ...    ...       ...       ...     ...  ...   
1493         Romania     L     FC        MD         4       A  ...   
1494          Norway   NaN     CO       NaN         2     NaN  ...   
1495         Romania     M     FO        VH         5       A  ...   
1496   United States     L     CO        HI         5     NaN  ...   
1497     Switzerland   NaN     CO       NaN       NaN     NaN  ...   

     International Faculty Rank International Students Score  \
0                      ifr rank                    isr score   
1                            56                         88.2   
2                            64                         95.8   
3                           110                         98.2   
4                           210                         66.8   
...                         ...                          ...   
1493                       701+                          7.0   
1494                        NaN                          NaN   
1495                       701+                          2.9   
1496                        676                          1.4   
1497                        NaN                          NaN   

     International Students Rank International Research Network Score  \
0                       isr rank                            irn score   
1                            128                                 94.3   
2                             85                                 99.9   
3                             60                                100.0   
4                            223                                100.0   
...                          ...                                  ...   
1493                        701+                                  1.8   
1494                         NaN                                 26.3   
1495                        701+                                  1.3   
1496                        701+                                  1.0   
1497                         NaN                                  1.7   

     International Research Network Rank Employment Outcomes Score  \
0                               irn rank                 ger score   
1                                     58                       100   
2                                      7                       100   
3                                      1                       100   
4                                      5                       100   
...                                  ...                       ...   
1493                                701+                       5.9   
1494                                 489                       NaN   
1495                                701+                       8.5   
1496                                701+                       7.3   
1497                                701+                       NaN   

     Employment Outcomes Rank Sustainability Score Sustainability Rank  \
0                    ger rank            SUS SCORE            SUS RANK   
1                           4                 95.2                  51   
2                           6                 97.3                 33=   
3                           3                 97.8                 26=   
4                           1                 96.7                  39   
...                       ...                  ...                 ...   
1493                     701+                  2.2                701+   
1494                      NaN                  NaN                 NaN   
1495                     701+                    1                701+   
1496                     701+                  NaN                 NaN   
1497                      NaN                  NaN                 NaN   

      Overall SCORE  
0     Overall Score  
1             100.0  
2              99.2  
3              98.9  
4              98.3  
...             ...  
1493              -  
1494              -  
1495              -  
1496              -  
1497              -  

[1498 rows x 29 columns]>

#查看数据的描述性统计
df.describe().T
df.columns

Index(['2024 RANK', '2023 RANK', 'Institution Name', 'Country Code', 'Country',
       'SIZE', 'FOCUS', 'RES.', 'AGE', 'STATUS', 'Academic Reputation Score',
       'Academic Reputation Rank', 'Employer Reputation Score',
       'Employer Reputation Rank', 'Faculty Student Score',
       'Faculty Student Rank', 'Citations per Faculty Score',
       'Citations per Faculty Rank', 'International Faculty Score',
       'International Faculty Rank', 'International Students Score',
       'International Students Rank', 'International Research Network Score',
       'International Research Network Rank', 'Employment Outcomes Score',
       'Employment Outcomes Rank', 'Sustainability Score',
       'Sustainability Rank', 'Overall SCORE'],
      dtype='object')

categorical_cols = ['Institution Name', 'Country', 'SIZE', 'FOCUS', 'RES.']


def convert_to_int(df):
    for col in df.columns:
        if (col not in (categorical_cols)):
            df[col] = df[col].str.extract('(\d+)').astype(int)
    return df


def clean_data(df):
    # 删除第一行数据
#     df.drop(index=0,axis=0, inplace=True)
    df = df.iloc[1:].copy()
    # 删除缺失值
    df = df.dropna()
    df = df.loc[df['Overall SCORE'] != '-']
    # 连接排名列、分类列和分数列
    rank_df = df[['2024 RANK', '2023 RANK']]
    cat_df = df[['Institution Name', 'Country',
                 'SIZE', 'FOCUS', 'RES.', 'AGE']]
    score_cols = [col for col in df.columns if 'score' in col.lower()]
    score_df = df[score_cols]
    df = pd.concat([rank_df, cat_df, score_df], axis=1)
    # 将对象类型转换为int类型
    df = convert_to_int(df)
    # 创建等级变更列
    df['Rank Change'] = df['2023 RANK'] - df['2024 RANK']
    # 乘以-1的值RANK 2023和RANK2024较低的排名=更好
    df['2023 RANK'] = -1 * df['2023 RANK']
    df['2024 RANK'] = -1 * df['2024 RANK']
    return df

# df.drop?
df = clean_data(df)
df.head()

#大多数大学位于美国、中国、澳大利亚、加拿大和俄罗斯

# 大学位置分布图
country_codes = coco.convert(df['Country'],to='ISO3')
country_codes = pd.Series(country_codes)
university_location = country_codes.value_counts()
fig = px.choropleth(locations=university_location.index,
                    color=university_location.values,
                    color_continuous_scale=px.colors.sequential.Viridis, template='plotly_white',
                    title='University Location Distribution Map')

fig.show()

#查看可使用的所有颜色
import plotly.express as px

# 打印出 sequential 模块中的所有颜色图谱名称
for color_scale in dir(px.colors.sequential):
    print(color_scale)

Aggrnyl
Aggrnyl_r
Agsunset
Agsunset_r
Blackbody
Blackbody_r
Bluered
Bluered_r
Blues
Blues_r
Blugrn
Blugrn_r
Bluyl
Bluyl_r
Brwnyl
Brwnyl_r
BuGn
BuGn_r
BuPu
BuPu_r
Burg
Burg_r
Burgyl
Burgyl_r
Cividis
Cividis_r
Darkmint
Darkmint_r
Electric
Electric_r
Emrld
Emrld_r
GnBu
GnBu_r
Greens
Greens_r
Greys
Greys_r
Hot
Hot_r
Inferno
Inferno_r
Jet
Jet_r
Magenta
Magenta_r
Magma
Magma_r
Mint
Mint_r
OrRd
OrRd_r
Oranges
Oranges_r
Oryel
Oryel_r
Peach
Peach_r
Pinkyl
Pinkyl_r
Plasma
Plasma_r
Plotly3
Plotly3_r
PuBu
PuBuGn
PuBuGn_r
PuBu_r
PuRd
PuRd_r
Purp
Purp_r
Purples
Purples_r
Purpor
Purpor_r
Rainbow
Rainbow_r
RdBu
RdBu_r
RdPu
RdPu_r
Redor
Redor_r
Reds
Reds_r
Sunset
Sunset_r
Sunsetdark
Sunsetdark_r
Teal
Teal_r
Tealgrn
Tealgrn_r
Turbo
Turbo_r
Viridis
Viridis_r
YlGn
YlGnBu
YlGnBu_r
YlGn_r
YlOrBr
YlOrBr_r
YlOrRd
YlOrRd_r
__all__
__builtins__
__cached__
__doc__
__file__
__loader__
__name__
__package__
__spec__
_swatches
_swatches_continuous
algae
algae_r
amp
amp_r
deep
deep_r
dense
dense_r
gray
gray_r
haline
haline_r
ice
ice_r
matter
matter_r
solar
solar_r
speed
speed_r
swatches
swatches_continuous
tempo
tempo_r
thermal
thermal_r
turbid
turbid_r

# 分类变量的分布
print(df.columns)
row_n = 2
col_n = 2
fig = make_subplots(rows=row_n, cols=col_n)
size_counts = df['SIZE'].value_counts().sort_values(ascending=True)
focus_counts = df['FOCUS'].value_counts().sort_values(ascending=True)
rs_counts = df['RES.'].value_counts().sort_values(ascending=True)
age_counts = df['AGE'].value_counts().sort_values(ascending=True)
categorical_1st = [size_counts, focus_counts, rs_counts, age_counts]
categorical_1abels = ['Size', 'Focus', 'Research', 'Age']
for i, col in enumerate(categorical_1abels):
    fig.add_trace(
        go.Bar(
            x=categorical_1st[i].index.astype('str'),
            y=categorical_1st[i].values, name=col,
            text=np.round(categorical_1st[i].values, 3),
            textposition='auto',
            marker_color=px.colors.sequential.Rainbow, textfont=dict(size=12)),
        row=int(i/col_n)+1, col=int(i % col_n)+1
    )
fig.update_layout(
    title_text='Distribution of Categorical Variables', template='plotly_white')

fig.update_traces(marker_line_color='black',
                  marker_line_width=1.5, opacity=0.8)
fig.show()

Index(['2024 RANK', '2023 RANK', 'Institution Name', 'Country', 'SIZE',
       'FOCUS', 'RES.', 'AGE', 'Academic Reputation Score',
       'Employer Reputation Score', 'Faculty Student Score',
       'Citations per Faculty Score', 'International Faculty Score',
       'International Students Score', 'International Research Network Score',
       'Employment Outcomes Score', 'Sustainability Score', 'Overall SCORE',
       'Rank Change'],
      dtype='object')

# 前20名大学分析
# 排名前20的院校名称
data = dict(zip(df['Institution Name'].tolist(), df['2024 RANK'].tolist()))
plt.figure(figsize=(20, 15))
wc = WordCloud(width=800, height=500, max_words=20, colormap='autumn',
               background_color='white').generate_from_frequencies(data)

plt.axis('off')
plt.imshow(wc, interpolation='bilinear')
plt.show()

top20_df = df.iloc[:20]  # 只选前20名大学
top20_rank_change = (top20_df.groupby('Institution Name')['Rank Change']
                     .mean()
                     .sort_values(ascending=True))  # 或者 False

fig = px.bar(
    x=top20_rank_change.index,
    y=top20_rank_change.values,
    text=np.round(top20_rank_change.values, 2),  # 取两位小数
    color=top20_rank_change.values,
    color_continuous_scale='mrybm'
)

fig.update_layout(
    title_text='Top 20 Universities Rank Changes (2023-2024)',  # 修改标题
    template='plotly_white',
    xaxis=dict(title='Institution Name', tickfont=dict(size=16)),  # 调整字体大小
    yaxis=dict(title='Rank Change', tickfont=dict(size=16))
)

fig.update_traces(
    marker_line_color='black',
    marker_line_width=1.5,
    opacity=0.8
)

fig.show()

# 排名前20的大学位置分布图
country_codes = coco.convert(top20_df['Country'], to='ISO3')
country_codes = pd.Series(country_codes)
university_location = country_codes.value_counts()
fig = px.choropleth(locations=university_location.index, color=university_location.values,
                    color_continuous_scale=px.colors.sequential.Rainbow, template='plotly_white', title='University Location Distribution Map')
fig.show()

## 计算国家平均排名变化
rank_change_avg = df.groupby('Country')['Rank Change'].mean().sort_values()

# 取排名下降最多的 10 个国家（均值最高）
rank_change_down = rank_change_avg.tail(10)

# 取排名上升最多的 10 个国家（均值最低）
rank_change_up = rank_change_avg.head(10)

# 合并数据
rank_change = pd.concat([rank_change_down, rank_change_up])

# 画图
fig = px.bar(
    x=rank_change.index, 
    y=rank_change.values, 
    text=np.round(rank_change.values, 2),  # 显示小数点后 2 位
    color=rank_change.values, 
    color_continuous_scale='oranges'  # 选择合适的颜色方案
)

# 更新布局
fig.update_layout(
    title_text='Top 10 Countries Rank Up & Down From 2023 to 2024',
    template='simple_white',
    xaxis=dict(title='Country', tickfont=dict(size=16)),  # 修正 tickfont
    yaxis=dict(title='Average Rank Change', tickfont=dict(size=16))
)

fig.show()
# One of the following named colorscales:
#             ['aggrnyl', 'agsunset', 'algae', 'amp', 'armyrose', 'balance',
#              'blackbody', 'bluered', 'blues', 'blugrn', 'bluyl', 'brbg',
#              'brwnyl', 'bugn', 'bupu', 'burg', 'burgyl', 'cividis', 'curl',
#              'darkmint', 'deep', 'delta', 'dense', 'earth', 'edge', 'electric',
#              'emrld', 'fall', 'geyser', 'gnbu', 'gray', 'greens', 'greys',
#              'haline', 'hot', 'hsv', 'ice', 'icefire', 'inferno', 'jet',
#              'magenta', 'magma', 'matter', 'mint', 'mrybm', 'mygbm', 'oranges',
#              'orrd', 'oryel', 'oxy', 'peach', 'phase', 'picnic', 'pinkyl',
#              'piyg', 'plasma', 'plotly3', 'portland', 'prgn', 'pubu', 'pubugn',
#              'puor', 'purd', 'purp', 'purples', 'purpor', 'rainbow', 'rdbu',
#              'rdgy', 'rdpu', 'rdylbu', 'rdylgn', 'redor', 'reds', 'solar',
#              'spectral', 'speed', 'sunset', 'sunsetdark', 'teal', 'tealgrn',
#              'tealrose', 'tempo', 'temps', 'thermal', 'tropic', 'turbid',
#              'turbo', 'twilight', 'viridis', 'ylgn', 'ylgnbu', 'ylorbr',
#              'ylorrd'].

# 平均等级变化和分类变量
row_n = 2
col_n = 2
fig = make_subplots(rows=row_n, cols=col_n)
change_by_size = df.groupby('SIZE')['Rank Change'].mean().sort_values(ascending=True)
change_by_focus = df.groupby('FOCUS')['Rank Change'].mean().sort_values(ascending=True)
change_by_rs = df.groupby('RES.')['Rank Change'].mean().sort_values(ascending=True)
change_by_age = df.groupby('AGE')['Rank Change'].mean().sort_values(ascending=True)
change_by_categories = [change_by_size, change_by_focus, change_by_rs, change_by_age] 
categorical_1abels = ['Size', 'Focus', 'Research', 'Age']
for i, col in enumerate(categorical_1abels):
    fig.add_trace(
        go.Bar(
            x=change_by_categories[i].index.astype('str'),
            y=change_by_categories[i].values, name=col,
            text=np.round(change_by_categories[i].values),
            textposition='inside',
            marker_color=px.colors.sequential.amp,
            textfont=dict(size=12)),
        row=int(i/col_n)+1,
        col=int(i % col_n)+1)

fig.update_layout(
    title_text='Average Rank Change and Categorical Variables', template='plotly_white')
fig.update_traces(marker_line_color='black',
                  marker_line_width=1.5, opacity=0.8)
fig.show()

# 总分分析
# 总分分布图
score_by_country = df.groupby('Country')['Overall SCORE'].mean()
score_by_country.index = coco.convert(score_by_country.index, to='ISO3')
fig = px.choropleth(locations=score_by_country.index,
                    color=score_by_country.values,
                    color_continuous_scale=px.colors.sequential.YlOrRd_r,
                    template='plotly_white',
                    title='University Average Overall Score Distributino')
fig.show()

# 计算前10和后10国家的平均总分
top10_score_country = (df.groupby('Country')['Overall SCORE']
                       .mean()
                       .sort_values(ascending=False)
                       .head(10))  # 用 head(10) 代替 [:10]

low10_score_country = (df.groupby('Country')['Overall SCORE']
                        .mean()
                        .sort_values(ascending=True)  # 这里改为 ascending=True
                        .head(10))  # 用 head(10) 代替 [-10:]

# 合并数据
score_country = pd.concat([top10_score_country, low10_score_country])

# 画图
fig = px.bar(
    x=score_country.index,
    y=score_country.values,
    text=np.round(score_country.values, 2),  # 取两位小数
    color=score_country.values,
    color_continuous_scale='icefire'
)

# 更新布局
fig.update_layout(
    title_text='Top 10 & Bottom 10 Average Overall Score by Country',  # 修正标题
    template='plotly_white',
    xaxis=dict(title='Country', tickfont=dict(size=16)),  # 确保格式正确
    yaxis=dict(title='Average Overall Score', tickfont=dict(size=16))
)

# 更新条形样式
fig.update_traces(
    marker_line_color='black',
    marker_line_width=1.5,
    opacity=0.8
)

fig.show()

# 获取前10和后10国家
score_country_lst = score_country.index.tolist()

# 过滤出这些国家的数据
score_country_df = df[df['Country'].isin(score_country_lst)]

# 绘制箱线图
fig = px.box(score_country_df,
             x='Country',
             y='Overall SCORE',
             color='Country',
             category_orders={'Country': score_country_lst})  # 修正 category_orders

# 更新布局
fig.update_layout(
    title_text='Top 10 & Bottom 10 Average Overall Score Countries',
    template='plotly_white',
    xaxis=dict(title='Country', tickfont=dict(size=16)),  # 修正 tickfont
    yaxis=dict(title='Average Overall Score', tickfont=dict(size=16))
)

# 更新样式
fig.update_traces(
    marker_line_color='black',
    marker_line_width=1.5,
    opacity=0.8
)

fig.show()

# 分数和分类变量
row_n = 2
col_n = 2
fig = make_subplots(rows=row_n, cols=col_n)
score_by_size = df. groupby(
    'SIZE')['Overall SCORE'].mean().sort_values(ascending=True)
score_by_focus = df.groupby(
    'FOCUS')['Overall SCORE']. mean().sort_values(ascending=True)
score_by_rs = df.groupby(
    'RES.')['Overall SCORE']. mean().sort_values(ascending=True)
score_by_age = df.groupby(
    'AGE')['Overall SCORE'].mean().sort_values(ascending=True)
score_by_categories = [score_by_size,
                       score_by_focus, score_by_rs, score_by_age]
categorical_1abels = ['Size','Focus', 'Research', 'Age'] 
for i, col in enumerate(categorical_1abels):
    fig.add_trace(
        go.Bar(
            x=score_by_categories[i].index.astype('str'), y=score_by_categories[i].values, name=col,
            text=np.round(score_by_categories[i].values), textposition='auto',
            marker_color=px.colors.sequential.haline_r, textfont=dict(size=12)),
        row=int(i/col_n)+1, col=int(i % col_n)+1)
fig.update_layout(
    title_text='Score and Categorical Variables', template='plotly_white')

fig.update_traces(marker_line_color='black',
                  marker_line_width=1.5, opacity=0.8)
fig.show()

# 只保留数值型列
numeric_df = df.select_dtypes(include=['number'])  

# 计算相关性
df_corr = numeric_df.corr()['Overall SCORE'].drop('Overall SCORE').sort_values(ascending=True)

# 画图
fig = px.bar(
    x=df_corr.index,
    y=df_corr.values,
    color=df_corr.values,
    text=np.round(df_corr.values, 2),
    color_continuous_scale='plasma'
)

# 更新布局
fig.update_layout(
    title_text='Correlation between Overall Scores',
    template='plotly_white',
    xaxis=dict(title='Variables', tickfont=dict(size=14)),  # 修正 titlefont_size
    yaxis=dict(title='Correlation', tickfont=dict(size=14))
)

# 更新样式
fig.update_traces(
    marker_line_color='black',
    marker_line_width=1.5,
    opacity=0.8
)

fig.show()

	2024 RANK	2023 RANK	Institution Name	Country Code	Country	SIZE	FOCUS	RES.	AGE	STATUS	...	International Faculty Rank	International Students Score	International Students Rank	International Research Network Score	International Research Network Rank	Employment Outcomes Score	Employment Outcomes Rank	Sustainability Score	Sustainability Rank	Overall SCORE
0	rank display	rank display2	institution	location code	location	size	focus	research	age band	status	...	ifr rank	isr score	isr rank	irn score	irn rank	ger score	ger rank	SUS SCORE	SUS RANK	Overall Score
1	1	1	Massachusetts Institute of Technology (MIT)	US	United States	M	CO	VH	5	B	...	56	88.2	128	94.3	58	100	4	95.2	51	100.0
2	2	2	University of Cambridge	UK	United Kingdom	L	FC	VH	5	A	...	64	95.8	85	99.9	7	100	6	97.3	33=	99.2
3	3	4	University of Oxford	UK	United Kingdom	L	FC	VH	5	A	...	110	98.2	60	100.0	1	100	3	97.8	26=	98.9
4	4	5	Harvard University	US	United States	L	FC	VH	5	B	...	210	66.8	223	100.0	5	100	1	96.7	39	98.3

📘 2024QS大学排名/QS_Rank.ipynb