In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
df=pd.read_csv('sentimentdataset.csv')
df.head()
Out[7]:
| Unnamed: 0.1 | Unnamed: 0 | Text | Sentiment | Timestamp | User | Platform | Hashtags | Retweets | Likes | Country | Year | Month | Day | Hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | Enjoying a beautiful day at the park! ... | Positive | 2023-01-15 12:30:00 | User123 | #Nature #Park | 15.0 | 30.0 | USA | 2023 | 1 | 15 | 12 | |
| 1 | 1 | 1 | Traffic was terrible this morning. ... | Negative | 2023-01-15 08:45:00 | CommuterX | #Traffic #Morning | 5.0 | 10.0 | Canada | 2023 | 1 | 15 | 8 | |
| 2 | 2 | 2 | Just finished an amazing workout! 💪 ... | Positive | 2023-01-15 15:45:00 | FitnessFan | #Fitness #Workout | 20.0 | 40.0 | USA | 2023 | 1 | 15 | 15 | |
| 3 | 3 | 3 | Excited about the upcoming weekend getaway! ... | Positive | 2023-01-15 18:20:00 | AdventureX | #Travel #Adventure | 8.0 | 15.0 | UK | 2023 | 1 | 15 | 18 | |
| 4 | 4 | 4 | Trying out a new recipe for dinner tonight. ... | Neutral | 2023-01-15 19:55:00 | ChefCook | #Cooking #Food | 12.0 | 25.0 | Australia | 2023 | 1 | 15 | 19 |
In [9]:
df.shape
Out[9]:
(732, 15)
In [10]:
df.info
Out[10]:
<bound method DataFrame.info of Unnamed: 0.1 Unnamed: 0 \
0 0 0
1 1 1
2 2 2
3 3 3
4 4 4
.. ... ...
727 728 732
728 729 733
729 730 734
730 731 735
731 732 736
Text Sentiment \
0 Enjoying a beautiful day at the park! ... Positive
1 Traffic was terrible this morning. ... Negative
2 Just finished an amazing workout! 💪 ... Positive
3 Excited about the upcoming weekend getaway! ... Positive
4 Trying out a new recipe for dinner tonight. ... Neutral
.. ... ...
727 Collaborating on a science project that receiv... Happy
728 Attending a surprise birthday party organized ... Happy
729 Successfully fundraising for a school charity ... Happy
730 Participating in a multicultural festival, cel... Happy
731 Organizing a virtual talent show during challe... Happy
Timestamp User Platform \
0 2023-01-15 12:30:00 User123 Twitter
1 2023-01-15 08:45:00 CommuterX Twitter
2 2023-01-15 15:45:00 FitnessFan Instagram
3 2023-01-15 18:20:00 AdventureX Facebook
4 2023-01-15 19:55:00 ChefCook Instagram
.. ... ... ...
727 2017-08-18 18:20:00 ScienceProjectSuccessHighSchool Facebook
728 2018-06-22 14:15:00 BirthdayPartyJoyHighSchool Instagram
729 2019-04-05 17:30:00 CharityFundraisingTriumphHighSchool Twitter
730 2020-02-29 20:45:00 MulticulturalFestivalJoyHighSchool Facebook
731 2020-11-15 15:15:00 VirtualTalentShowSuccessHighSchool Instagram
Hashtags Retweets Likes \
0 #Nature #Park 15.0 30.0
1 #Traffic #Morning 5.0 10.0
2 #Fitness #Workout 20.0 40.0
3 #Travel #Adventure 8.0 15.0
4 #Cooking #Food 12.0 25.0
.. ... ... ...
727 #ScienceFairWinner #HighSchoolScience 20.0 39.0
728 #SurpriseCelebration #HighSchoolFriendship 25.0 48.0
729 #CommunityGiving #HighSchoolPhilanthropy 22.0 42.0
730 #CulturalCelebration #HighSchoolUnity 21.0 43.0
731 #VirtualEntertainment #HighSchoolPositivity 24.0 47.0
Country Year Month Day Hour
0 USA 2023 1 15 12
1 Canada 2023 1 15 8
2 USA 2023 1 15 15
3 UK 2023 1 15 18
4 Australia 2023 1 15 19
.. ... ... ... ... ...
727 UK 2017 8 18 18
728 USA 2018 6 22 14
729 Canada 2019 4 5 17
730 UK 2020 2 29 20
731 USA 2020 11 15 15
[732 rows x 15 columns]>
In [12]:
df.describe()
Out[12]:
| Unnamed: 0.1 | Unnamed: 0 | Retweets | Likes | Year | Month | Day | Hour | |
|---|---|---|---|---|---|---|---|---|
| count | 732.000000 | 732.000000 | 732.000000 | 732.000000 | 732.000000 | 732.000000 | 732.000000 | 732.000000 |
| mean | 366.464481 | 369.740437 | 21.508197 | 42.901639 | 2020.471311 | 6.122951 | 15.497268 | 15.521858 |
| std | 211.513936 | 212.428936 | 7.061286 | 14.089848 | 2.802285 | 3.411763 | 8.474553 | 4.113414 |
| min | 0.000000 | 0.000000 | 5.000000 | 10.000000 | 2010.000000 | 1.000000 | 1.000000 | 0.000000 |
| 25% | 183.750000 | 185.750000 | 17.750000 | 34.750000 | 2019.000000 | 3.000000 | 9.000000 | 13.000000 |
| 50% | 366.500000 | 370.500000 | 22.000000 | 43.000000 | 2021.000000 | 6.000000 | 15.000000 | 16.000000 |
| 75% | 549.250000 | 553.250000 | 25.000000 | 50.000000 | 2023.000000 | 9.000000 | 22.000000 | 19.000000 |
| max | 732.000000 | 736.000000 | 40.000000 | 80.000000 | 2023.000000 | 12.000000 | 31.000000 | 23.000000 |
In [15]:
# 查看非数值型变量描述性统计
# 'all':包含所有列(数值型和非数值型)
# ['object']:仅对对象类型(字符串等)列进行描述性统计
# ['number']:仅对数值型列进行描述性统计
# ['bool']:仅对布尔型列进行描述性统计
# ['category']:仅对分类类型列进行描述性统计
# ['datetime']:仅对日期时间类型列进行描述性统计
df.describe(include=['number'])
Out[15]:
| Unnamed: 0.1 | Unnamed: 0 | Retweets | Likes | Year | Month | Day | Hour | |
|---|---|---|---|---|---|---|---|---|
| count | 732.000000 | 732.000000 | 732.000000 | 732.000000 | 732.000000 | 732.000000 | 732.000000 | 732.000000 |
| mean | 366.464481 | 369.740437 | 21.508197 | 42.901639 | 2020.471311 | 6.122951 | 15.497268 | 15.521858 |
| std | 211.513936 | 212.428936 | 7.061286 | 14.089848 | 2.802285 | 3.411763 | 8.474553 | 4.113414 |
| min | 0.000000 | 0.000000 | 5.000000 | 10.000000 | 2010.000000 | 1.000000 | 1.000000 | 0.000000 |
| 25% | 183.750000 | 185.750000 | 17.750000 | 34.750000 | 2019.000000 | 3.000000 | 9.000000 | 13.000000 |
| 50% | 366.500000 | 370.500000 | 22.000000 | 43.000000 | 2021.000000 | 6.000000 | 15.000000 | 16.000000 |
| 75% | 549.250000 | 553.250000 | 25.000000 | 50.000000 | 2023.000000 | 9.000000 | 22.000000 | 19.000000 |
| max | 732.000000 | 736.000000 | 40.000000 | 80.000000 | 2023.000000 | 12.000000 | 31.000000 | 23.000000 |
In [16]:
# 去除空格
df['Text']= df['Text'].str.strip()
df['Sentiment']= df['Sentiment'].str.strip()
df['User']= df['User'].str.strip()
df['Platform']= df['Platform'].str.strip()
df['Hashtags']= df['Hashtags'].str.strip()
df['Country']= df['Country'].str.strip()
In [17]:
df['Sentiment'].value_counts().nlargest(10).plot(kind='bar')
plt.title('Top 10 Sentiments based on Text')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()
In [18]:
df['Platform'].value_counts().plot(kind='pie', autopct='%1.1f%%')
plt.title('Percentages of Platforms')
plt.legend()
plt.show()
In [19]:
# 计算每个国家/地区每个平台的出现次数
platform_counts = df.groupby(['Country', 'Platform']).size().unstack(fill_value=0)
top_countries = platform_counts.sum(axis=1).sort_values(ascending=True).tail(10)
top_platform_counts = platform_counts.loc[top_countries.index]
plt.figure(figsize=(12, 8))
top_platform_counts.plot(kind='barh', stacked=True, ax=plt.gca())
plt.title('Top 10 Countries by Platform Counts')
plt.xlabel('Country')
plt.ylabel('Count')
plt.legend(title='Platform', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
In [20]:
df['Country'].value_counts().nlargest(10).plot(kind='bar')
plt.title('Top 10 Country')
plt.legend()
plt.show()
In [21]:
# 问题一:转发次数最多的10个话题标签
H_R = df.groupby('Hashtags')['Retweets'].max().nlargest(10).sort_values(ascending=False).plot(kind='bar')
plt.title('Top 10 hashtags retweeted')
plt.xlabel('Hashtags')
plt.ylabel('count')
plt.show()
In [22]:
# Q2:用户喜欢的平台顶部是什么?
top_likes_platform = df.groupby('Platform')['Likes'].sum().nlargest(10)
top_likes_platform.plot(kind='bar')
plt.title('Top Platforms by Total Likes')
plt.xlabel('Platform')
plt.ylabel('Total Likes')
plt.show()
In [23]:
# 问题3:哪个国家的帖子被点赞最多?
top_country_likes=df.groupby('Country')['Likes'].sum().nlargest(10)
top_country_likes.plot(kind='bar')
plt.title('Top country likes')
plt.xlabel('Country')
plt.ylabel('count')
plt.show()
In [24]:
from wordcloud import WordCloud
text = ' '.join(df['Text'])
wordcloud = WordCloud(width=800, height=400, background_color='black').generate(text)
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud for PlayerLine Column')
plt.tight_layout()
plt.show()
In [25]:
Facebook=df[df['Platform']=='Facebook']
Twitter=df[df['Platform']=='Twitter']
Instagram=df[df['Platform']=='Instagram']
In [26]:
# 转发次数最多的10个标签
H_R_f=Facebook.groupby('Hashtags')['Retweets'].max().nlargest(10).sort_values(ascending=False)
H_R_f.plot(kind='bar')
plt.title('Top 10 hashtags retweeted in $/ Facebook $/')
plt.xlabel('Hashtags')
plt.ylabel('count')
plt.show()
In [27]:
# 用户最喜欢谁?
top_likes_platform_F = Facebook.groupby('User')['Likes'].sum().nlargest(10)
top_likes_platform_F.plot(kind='bar')
plt.title('Top Users by Total Likes IN Facebook')
plt.xlabel('User')
plt.ylabel('Total Likes')
plt.show()
In [28]:
# 转发次数最多的10个标签
H_R_t=Twitter.groupby('Hashtags')['Retweets'].max().nlargest(10).sort_values(ascending=False)
H_R_t.plot(kind='bar')
plt.title('Top 10 hashtags retweeted in $/ Twitter $/')
plt.xlabel('Hashtags')
plt.ylabel('count')
plt.show()
In [29]:
# 用户最喜欢谁?
top_likes_platform_t = Twitter.groupby('User')['Likes'].sum().nlargest(10)
top_likes_platform_t.plot(kind='bar')
plt.title('Top Users by Total Likes IN Twitter')
plt.xlabel('User')
plt.ylabel('Total Likes')
plt.show()
In [30]:
# 转发次数最多的10个标签
H_R_i=Instagram.groupby('Hashtags')['Retweets'].max().nlargest(15).sort_values(ascending=False)
H_R_i.plot(kind='bar')
plt.title('Top 15 hashtags retweeted in $/ Instagram $/')
plt.xlabel('Hashtags')
plt.ylabel('count')
plt.show()
In [31]:
# 用户最喜欢谁?
top_likes_platform_i = Instagram.groupby('User')['Likes'].sum().nlargest(10)
top_likes_platform_i.plot(kind='bar')
plt.title('Top Users by Total Likes IN Instagram')
plt.xlabel('User')
plt.ylabel('Total Likes')
plt.show()
In [ ]: