import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Read the dataset from the working directory, then preview its first rows.
df = pd.read_csv(filepath_or_buffer='sentimentdataset.csv')
df.head(n=5)

# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(732, 15)

# Print the frame summary: column names, dtypes, non-null counts and memory
# usage. The call parentheses are required -- a bare `df.info` only evaluates
# to the bound-method repr (which embeds a dump of the frame) and never runs
# the summary.
df.info()

<bound method DataFrame.info of      Unnamed: 0.1  Unnamed: 0  \
0               0           0   
1               1           1   
2               2           2   
3               3           3   
4               4           4   
..            ...         ...   
727           728         732   
728           729         733   
729           730         734   
730           731         735   
731           732         736   

                                                  Text    Sentiment  \
0     Enjoying a beautiful day at the park!        ...   Positive     
1     Traffic was terrible this morning.           ...   Negative     
2     Just finished an amazing workout! 💪          ...   Positive     
3     Excited about the upcoming weekend getaway!  ...   Positive     
4     Trying out a new recipe for dinner tonight.  ...   Neutral      
..                                                 ...          ...   
727  Collaborating on a science project that receiv...       Happy    
728  Attending a surprise birthday party organized ...       Happy    
729  Successfully fundraising for a school charity ...       Happy    
730  Participating in a multicultural festival, cel...       Happy    
731  Organizing a virtual talent show during challe...       Happy    

               Timestamp                                   User     Platform  \
0    2023-01-15 12:30:00                          User123          Twitter     
1    2023-01-15 08:45:00                          CommuterX        Twitter     
2    2023-01-15 15:45:00                          FitnessFan      Instagram    
3    2023-01-15 18:20:00                          AdventureX       Facebook    
4    2023-01-15 19:55:00                          ChefCook        Instagram    
..                   ...                                    ...          ...   
727  2017-08-18 18:20:00       ScienceProjectSuccessHighSchool     Facebook    
728  2018-06-22 14:15:00            BirthdayPartyJoyHighSchool    Instagram    
729  2019-04-05 17:30:00   CharityFundraisingTriumphHighSchool      Twitter    
730  2020-02-29 20:45:00    MulticulturalFestivalJoyHighSchool     Facebook    
731  2020-11-15 15:15:00    VirtualTalentShowSuccessHighSchool    Instagram    

                                          Hashtags  Retweets  Likes  \
0        #Nature #Park                                  15.0   30.0   
1        #Traffic #Morning                               5.0   10.0   
2        #Fitness #Workout                              20.0   40.0   
3        #Travel #Adventure                              8.0   15.0   
4        #Cooking #Food                                 12.0   25.0   
..                                             ...       ...    ...   
727         #ScienceFairWinner #HighSchoolScience       20.0   39.0   
728    #SurpriseCelebration #HighSchoolFriendship       25.0   48.0   
729      #CommunityGiving #HighSchoolPhilanthropy       22.0   42.0   
730         #CulturalCelebration #HighSchoolUnity       21.0   43.0   
731   #VirtualEntertainment #HighSchoolPositivity       24.0   47.0   

          Country  Year  Month  Day  Hour  
0       USA        2023      1   15    12  
1       Canada     2023      1   15     8  
2     USA          2023      1   15    15  
3       UK         2023      1   15    18  
4      Australia   2023      1   15    19  
..            ...   ...    ...  ...   ...  
727            UK  2017      8   18    18  
728           USA  2018      6   22    14  
729        Canada  2019      4    5    17  
730            UK  2020      2   29    20  
731           USA  2020     11   15    15  

[732 rows x 15 columns]>

# Summary statistics for the numeric columns (the default selection).
df.describe()

# Summary statistics for the non-numeric columns.
# Values accepted by `include`:
#   'all'          - every column, numeric and non-numeric
#   ['object']     - object-dtype (string-like) columns only
#   ['number']     - numeric columns only
#   ['bool']       - boolean columns only
#   ['category']   - categorical columns only
#   ['datetime']   - datetime columns only
# ['object'] is used here: ['number'] would simply repeat the default summary
# produced by the call above instead of describing the text columns.
df.describe(include=['object'])

# Trim leading and trailing whitespace from each text column, in place.
for text_col in ('Text', 'Sentiment', 'User', 'Platform', 'Hashtags', 'Country'):
    df[text_col] = df[text_col].str.strip()

# Bar chart of the ten most frequent values in the Sentiment column.
top_sentiments = df['Sentiment'].value_counts().nlargest(10)
ax = top_sentiments.plot.bar()
ax.set_title('Top 10 Sentiments based on Text')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()

# Pie chart of how the posts are split across platforms; each wedge is
# annotated with its percentage to one decimal place.
platform_share = df['Platform'].value_counts()
platform_share.plot.pie(autopct='%1.1f%%')
plt.title('Percentages of Platforms')
plt.legend()
plt.show()

# Count the posts for every (country, platform) pair. After unstacking, rows
# are countries, columns are platforms, and absent pairs are filled with 0.
platform_counts = df.groupby(['Country', 'Platform']).size().unstack(fill_value=0)

# Keep the ten countries with the largest totals across all platforms.
# Sorting ascending places the largest total last in the index.
top_countries = platform_counts.sum(axis=1).sort_values(ascending=True).tail(10)
top_platform_counts = platform_counts.loc[top_countries.index]

plt.figure(figsize=(12, 8))
top_platform_counts.plot(kind='barh', stacked=True, ax=plt.gca())
plt.title('Top 10 Countries by Platform Counts')
# Horizontal bars: the counts run along the x-axis and the countries are
# listed on the y-axis, so the labels must be assigned accordingly.
plt.xlabel('Count')
plt.ylabel('Country')
# Anchor the legend just outside the right edge of the axes.
plt.legend(title='Platform', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

# Bar chart of the ten most frequent values in the Country column.
country_counts = df['Country'].value_counts()
country_counts.nlargest(10).plot.bar()
plt.title('Top 10 Country')
plt.legend()
plt.show()

# Question 1: the ten Hashtags values with the highest retweet figures.
# Each distinct Hashtags value is ranked by the largest Retweets value found
# among its rows.
H_R = (
    df.groupby('Hashtags')['Retweets']
    .max()
    .nlargest(10)
    .sort_values(ascending=False)
    .plot.bar()
)
# H_R is the Axes returned by the plot call, so it is labelled directly.
H_R.set_title('Top 10 hashtags retweeted')
H_R.set_xlabel('Hashtags')
H_R.set_ylabel('count')
plt.show()

# Q2: which platforms collect the most likes?
# Likes are summed per platform and at most the ten largest totals are kept.
likes_by_platform = df.groupby('Platform')['Likes'].sum()
top_likes_platform = likes_by_platform.nlargest(10)
ax = top_likes_platform.plot.bar()
ax.set_title('Top Platforms by Total Likes')
ax.set_xlabel('Platform')
ax.set_ylabel('Total Likes')
plt.show()

# Question 3: which countries' posts received the most likes?
# Likes are summed per country and the ten largest totals are plotted.
top_country_likes = df.groupby('Country')['Likes'].sum().nlargest(10)
top_country_likes.plot(kind='bar')
plt.title('Top country likes')
plt.xlabel('Country')
# The bars show summed likes, not row counts, so label the axis to match the
# platform-level chart of the same quantity.
plt.ylabel('Total Likes')
plt.show()

from wordcloud import WordCloud

# Merge every post into one string for the word cloud. Missing values are
# dropped and the rest coerced to str so a NaN entry cannot make str.join
# raise a TypeError.
text = ' '.join(df['Text'].dropna().astype(str))
wordcloud = WordCloud(width=800, height=400, background_color='black').generate(text)
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
# The cloud is built from the 'Text' column, so the title names that column.
plt.title('Word Cloud for Text Column')
plt.tight_layout()
plt.show()

# One sub-frame per platform, selected by exact match on the Platform column.
Facebook, Twitter, Instagram = (
    df[df['Platform'] == platform_name]
    for platform_name in ('Facebook', 'Twitter', 'Instagram')
)

# Ten Hashtags values on Facebook with the highest retweet figures, each
# ranked by the largest Retweets value among its rows.
H_R_f = (
    Facebook.groupby('Hashtags')['Retweets']
    .max()
    .nlargest(10)
    .sort_values(ascending=False)
)
ax = H_R_f.plot.bar()
# NOTE(review): matplotlib parses the `$...$` span in this title as mathtext
# -- confirm the slashes and resulting formatting are intended.
ax.set_title('Top 10 hashtags retweeted in $/ Facebook $/')
ax.set_xlabel('Hashtags')
ax.set_ylabel('count')
plt.show()

# Which Facebook users are liked the most?
# Likes are summed per user and the ten largest totals are kept.
likes_per_user = Facebook.groupby('User')['Likes'].sum()
top_likes_platform_F = likes_per_user.nlargest(10)
ax = top_likes_platform_F.plot.bar()
ax.set_title('Top Users by Total Likes IN Facebook')
ax.set_xlabel('User')
ax.set_ylabel('Total Likes')
plt.show()

# Ten Hashtags values on Twitter with the highest retweet figures, each
# ranked by the largest Retweets value among its rows.
H_R_t = (
    Twitter.groupby('Hashtags')['Retweets']
    .max()
    .nlargest(10)
    .sort_values(ascending=False)
)
ax = H_R_t.plot.bar()
# NOTE(review): matplotlib parses the `$...$` span in this title as mathtext
# -- confirm the slashes and resulting formatting are intended.
ax.set_title('Top 10 hashtags retweeted in $/ Twitter $/')
ax.set_xlabel('Hashtags')
ax.set_ylabel('count')
plt.show()

# Which Twitter users are liked the most?
# Likes are summed per user and the ten largest totals are kept.
likes_per_user = Twitter.groupby('User')['Likes'].sum()
top_likes_platform_t = likes_per_user.nlargest(10)
ax = top_likes_platform_t.plot.bar()
ax.set_title('Top Users by Total Likes IN Twitter')
ax.set_xlabel('User')
ax.set_ylabel('Total Likes')
plt.show()

# Fifteen Hashtags values on Instagram with the highest retweet figures, each
# ranked by the largest Retweets value among its rows.
H_R_i = (
    Instagram.groupby('Hashtags')['Retweets']
    .max()
    .nlargest(15)
    .sort_values(ascending=False)
)
ax = H_R_i.plot.bar()
# NOTE(review): matplotlib parses the `$...$` span in this title as mathtext
# -- confirm the slashes and resulting formatting are intended.
ax.set_title('Top 15 hashtags retweeted in $/ Instagram $/')
ax.set_xlabel('Hashtags')
ax.set_ylabel('count')
plt.show()

# Which Instagram users are liked the most?
# Likes are summed per user and the ten largest totals are kept.
likes_per_user = Instagram.groupby('User')['Likes'].sum()
top_likes_platform_i = likes_per_user.nlargest(10)
ax = top_likes_platform_i.plot.bar()
ax.set_title('Top Users by Total Likes IN Instagram')
ax.set_xlabel('User')
ax.set_ylabel('Total Likes')
plt.show()

	Unnamed: 0.1	Unnamed: 0	Retweets	Likes	Year	Month	Day	Hour
count	732.000000	732.000000	732.000000	732.000000	732.000000	732.000000	732.000000	732.000000
mean	366.464481	369.740437	21.508197	42.901639	2020.471311	6.122951	15.497268	15.521858
std	211.513936	212.428936	7.061286	14.089848	2.802285	3.411763	8.474553	4.113414
min	0.000000	0.000000	5.000000	10.000000	2010.000000	1.000000	1.000000	0.000000
25%	183.750000	185.750000	17.750000	34.750000	2019.000000	3.000000	9.000000	13.000000
50%	366.500000	370.500000	22.000000	43.000000	2021.000000	6.000000	15.000000	16.000000
75%	549.250000	553.250000	25.000000	50.000000	2023.000000	9.000000	22.000000	19.000000
max	732.000000	736.000000	40.000000	80.000000	2023.000000	12.000000	31.000000	23.000000

	Unnamed: 0.1	Unnamed: 0	Retweets	Likes	Year	Month	Day	Hour
count	732.000000	732.000000	732.000000	732.000000	732.000000	732.000000	732.000000	732.000000
mean	366.464481	369.740437	21.508197	42.901639	2020.471311	6.122951	15.497268	15.521858
std	211.513936	212.428936	7.061286	14.089848	2.802285	3.411763	8.474553	4.113414
min	0.000000	0.000000	5.000000	10.000000	2010.000000	1.000000	1.000000	0.000000
25%	183.750000	185.750000	17.750000	34.750000	2019.000000	3.000000	9.000000	13.000000
50%	366.500000	370.500000	22.000000	43.000000	2021.000000	6.000000	15.000000	16.000000
75%	549.250000	553.250000	25.000000	50.000000	2023.000000	9.000000	22.000000	19.000000
max	732.000000	736.000000	40.000000	80.000000	2023.000000	12.000000	31.000000	23.000000

	Unnamed: 0.1	Unnamed: 0	Text	Sentiment	Timestamp	User	Platform	Hashtags	Retweets	Likes	Country	Year	Month	Day	Hour
0	0	0	Enjoying a beautiful day at the park! ...	Positive	2023-01-15 12:30:00	User123	Twitter	#Nature #Park	15.0	30.0	USA	2023	1	15	12
1	1	1	Traffic was terrible this morning. ...	Negative	2023-01-15 08:45:00	CommuterX	Twitter	#Traffic #Morning	5.0	10.0	Canada	2023	1	15	8
2	2	2	Just finished an amazing workout! 💪 ...	Positive	2023-01-15 15:45:00	FitnessFan	Instagram	#Fitness #Workout	20.0	40.0	USA	2023	1	15	15
3	3	3	Excited about the upcoming weekend getaway! ...	Positive	2023-01-15 18:20:00	AdventureX	Facebook	#Travel #Adventure	8.0	15.0	UK	2023	1	15	18
4	4	4	Trying out a new recipe for dinner tonight. ...	Neutral	2023-01-15 19:55:00	ChefCook	Instagram	#Cooking #Food	12.0	25.0	Australia	2023	1	15	19

📘 社交媒体情绪数据集可视化分析/Mode.ipynb