abdulrasool commited on
Commit
f6c6a41
·
1 Parent(s): 644d9c6

first commit

Browse files
Files changed (4) hide show
  1. app.py +171 -0
  2. helper.py +161 -0
  3. preprocessor.py +50 -0
  4. requirements.txt +10 -0
app.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import preprocessor
3
+ import helper
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+
7
+
8
+
9
+ def main():
10
+ st.sidebar.title("Whatsapp Chat Analyzer")
11
+ uploaded_file = st.sidebar.file_uploader("Choose a file")
12
+ if uploaded_file is not None:
13
+ # To read file as bytes:
14
+ bytes_data = uploaded_file.getvalue()
15
+ data = bytes_data.decode("utf-8")
16
+ df = preprocessor.preprocess(data)
17
+
18
+ #fetch unique users
19
+ user_list = df['user'].unique().tolist()
20
+ user_list.remove('group_notification')
21
+ user_list.sort()
22
+ user_list.insert(0,"Overall")
23
+ selected_user = st.sidebar.selectbox("Show analysis wrt",user_list)
24
+ if st.sidebar.button("Show Analysis"):
25
+ num_messages,words, num_media_messages ,num_links = helper.fetch_stats(selected_user,df)
26
+ st.title("Top Statistics")
27
+ col1, col2 , col3, col4 = st.columns(4)
28
+
29
+ with col1:
30
+ st.header("Total Messages")
31
+ st.title(num_messages)
32
+ with col2:
33
+ st.header("Total Words")
34
+ st.title(words)
35
+ with col3:
36
+ st.header("Media shared")
37
+ st.title(num_media_messages)
38
+ with col4:
39
+ st.header("Links shared")
40
+ st.title(num_links)
41
+
42
+ #monthly_timeline
43
+ st.title("Monthly Timeline")
44
+ timeline=helper.monthly_timeline(selected_user,df)
45
+ fig = plt.figure()
46
+ sns.set_style('darkgrid')
47
+ sns.lineplot(x=timeline['time'],y=timeline['message'],color='red')
48
+ plt.xticks(rotation='vertical')
49
+ st.pyplot(fig)
50
+
51
+ # daily timeline
52
+ st.title("Daily Timeline")
53
+ daily_timeline = helper.daily_timeline(selected_user, df)
54
+ fig = plt.figure(figsize=(10, 3))
55
+ sns.set_style('whitegrid')
56
+ sns.lineplot(x=daily_timeline['only_date'],y=daily_timeline['message'], color='purple')
57
+ plt.xticks(rotation='vertical')
58
+ plt.xlabel("date")
59
+ st.pyplot(fig)
60
+
61
+ #activity map
62
+ st.title('Activity Map')
63
+ col1,col2 = st.columns(2)
64
+
65
+ with col1:
66
+ st.header("Most busy day")
67
+ busy_day=helper.week_activity_map(selected_user,df)
68
+ fig = plt.figure()
69
+ sns.set_style('ticks')
70
+ pal=sns.cubehelix_palette(start=2, rot=0, dark=0.5, light=0.9, reverse=True)
71
+ sns.barplot(x=busy_day.index,y=busy_day.values,palette=pal)
72
+ plt.ylabel("messages")
73
+ plt.xticks(rotation='vertical')
74
+ st.pyplot(fig)
75
+
76
+ with col2:
77
+ st.header("Most busy month")
78
+ busy_month=helper.month_activity_map(selected_user,df)
79
+ fig=plt.figure()
80
+ sns.set_style('ticks')
81
+ pal=sns.cubehelix_palette(start=0, rot=0, dark=0.2, light=0.9, reverse=True)
82
+ sns.barplot(x=busy_month.index,y=busy_month.values,palette=pal)
83
+ plt.ylabel("messages")
84
+ plt.xticks(rotation='vertical')
85
+ st.pyplot(fig)
86
+
87
+ st.title("Weekly Activity Map")
88
+ user_heatmap = helper.activity_heatmap(selected_user, df)
89
+ fig = plt.figure(figsize=(13,4))
90
+ cmap = sns.color_palette("viridis", as_cmap=True)
91
+ sns.heatmap(user_heatmap,cmap=cmap,square=True)
92
+ st.pyplot(fig)
93
+
94
+ #finding active users
95
+ if(selected_user=='Overall'):
96
+ st.title('Most active users')
97
+ x,new_df=helper.most_busy_users(df)
98
+ fig=plt.figure()
99
+
100
+ col1,col2 = st.columns(2)
101
+
102
+ with col1:
103
+ pal = sns.color_palette("cubehelix")
104
+ sns.barplot(x=x.index,y=x.values,palette=pal)
105
+ plt.xticks(rotation='vertical')
106
+ plt.ylabel('messages')
107
+ st.pyplot(fig)
108
+ with col2:
109
+ st.dataframe(new_df)
110
+ #word cloud
111
+ st.title('WordCloud')
112
+ df_wc = helper.create_wordcloud(selected_user,df)
113
+ fig,ax = plt.subplots()
114
+ ax.imshow(df_wc)
115
+ st.pyplot(fig)
116
+
117
+ #most common words
118
+ st.title('Most Common Words')
119
+ most_common_df=helper.most_common_words(selected_user,df)
120
+ colors = ['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231',
121
+ '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe',
122
+ '#008080', '#e6beff', '#9a6324', '#fffac8', '#aaffc3',
123
+ '#808000', '#ffd8b1', '#808080', 'lightgreen', 'lightblue']
124
+ # explosion
125
+
126
+ fig = plt.figure()
127
+ # Pie Chart
128
+ plt.pie(most_common_df[1], labels=most_common_df[0], colors=colors,
129
+ autopct='%0.1f%%', pctdistance=0.9, labeldistance=1, rotatelabels=270, startangle=180,
130
+ counterclock=False)
131
+ # draw circle
132
+ centre_circle = plt.Circle((0, 0), 0.50, fc='white')
133
+ fig2 = plt.gcf()
134
+ # Adding Circle in Pie chart
135
+ fig2.gca().add_artist(centre_circle)
136
+ st.pyplot(fig)
137
+
138
+ #emoji analysys
139
+
140
+ emoji_df = helper.emoji_helper(selected_user,df)
141
+ if(emoji_df.shape[0]):
142
+ st.title("Emoji Analysis")
143
+ col1,col2 =st.columns(2)
144
+ with col1:
145
+ st.dataframe(emoji_df)
146
+ with col2:
147
+ fig,ax = plt.subplots()
148
+ plt.rcParams['font.family'] = 'Segoe UI Emoji'
149
+ ax.pie(emoji_df[1].head(min(5,emoji_df.shape[0])),labels=emoji_df[0].head(min(5,emoji_df.shape[0])),autopct="%0.2f")
150
+ st.pyplot(fig)
151
+
152
+ #birth_dates
153
+ if(selected_user=='Overall'):
154
+ birth_data = helper.birth_dates(df)
155
+ if(birth_data.shape[0]):
156
+ st.title("Birth dates of some users.")
157
+ st.dataframe(birth_data)
158
+
159
+ #Sentiment-analysis
160
+ if (selected_user != 'Overall'):
161
+ st.title("Sentiment Analysis")
162
+ sentiment_data,number=helper.sentiment_analysis(selected_user,df)
163
+ fig = plt.figure()
164
+ sns.set_style('ticks')
165
+ pal = sns.cubehelix_palette(start=0.5, rot=0, dark=0.2, light=0.9, reverse=True)
166
+ sns.barplot(x=sentiment_data.index,y=sentiment_data.values,palette=pal)
167
+ plt.xticks(rotation='vertical')
168
+ st.pyplot(fig)
169
+ st.header("Based on random "+str(number)+" messages.")
170
+ st.text("Note : Sentiment Analysis give good results if messages \nare in hinglish (hindi or english or both).")
171
+ main()
helper.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from urlextract import URLExtract
2
+ from wordcloud import WordCloud
3
+ import pandas as pd
4
+ from collections import Counter
5
+ import emoji
6
+ import re
7
+ import numpy as np
8
+ import torch
9
+
10
+ extract = URLExtract()
11
def fetch_stats(selected_user, df):
    """Return (message count, word count, media count, link count).

    When *selected_user* is not 'Overall' the DataFrame is first narrowed
    to that user's rows.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    word_total = 0
    link_total = 0
    # Single pass over the messages for both word and URL counting.
    for text in df['message']:
        word_total += len(text.split())
        link_total += len(extract.find_urls(text))

    # WhatsApp exports replace attachments with this literal placeholder line.
    media_total = df[df['message'] == '<Media omitted>\n'].shape[0]

    return df.shape[0], word_total, media_total, link_total
23
+
24
def most_busy_users(df):
    """Return the top-10 senders and a per-user percentage table.

    Returns:
        (Series, DataFrame): message counts of the ten chattiest users, and a
        DataFrame with columns 'name' and 'percent' (share of all messages,
        rounded to 2 decimals) for every user.
    """
    counts = df['user'].value_counts()
    # head(10) is already safe when there are fewer than 10 users.
    x = counts.head(10)
    percent = round(counts / df.shape[0] * 100, 2)
    # Build the table explicitly: reset_index()/rename column names differ
    # between pandas versions, so don't rely on them.
    new_df = pd.DataFrame({'name': percent.index, 'percent': percent.values})
    return x, new_df
29
+
30
def create_wordcloud(selected_user, df):
    """Build a WordCloud image from the user's (or everyone's) messages.

    Stop words from stop_hinglish.txt and media placeholders / group
    notifications are excluded.
    """
    # Use a context manager (the original leaked the file handle) and split
    # into a set of whole words: keeping the raw file string would make
    # `word in stop_words` a substring test that over-filters.
    with open('stop_hinglish.txt', 'r') as f:
        stop_words = set(f.read().split())

    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    # .copy() so the .apply() below mutates an independent frame, not a view.
    temp = df[(df['user'] != 'group_notification') &
              (df['message'] != '<Media omitted>\n')].copy()

    def remove_stop_words(message):
        # Drop stop words; comparison is case-insensitive via lower().
        return " ".join(w for w in message.lower().split() if w not in stop_words)

    wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
    temp['message'] = temp['message'].apply(remove_stop_words)
    return wc.generate(temp['message'].str.cat(sep=" "))
52
+
53
def most_common_words(selected_user, df):
    """Return the 20 most frequent tokens as a 2-column DataFrame.

    Stop words are excluded, as are tokens made up entirely of emoji;
    group notifications and media placeholders are ignored.
    """
    # Context manager fixes the leaked file handle; a set of whole words
    # fixes the substring-membership bug of testing against one big string.
    with open('stop_hinglish.txt', 'r') as f:
        stop_words = set(f.read().split())

    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    temp = df[(df['user'] != 'group_notification') &
              (df['message'] != '<Media omitted>\n')]

    words = []
    for message in temp['message']:
        for word in message.lower().split():
            if word in stop_words:
                continue
            # Keep the word unless every character is an emoji (same rule as
            # the original char-loop-with-break, written as a single any()).
            if any(c not in emoji.UNICODE_EMOJI_ENGLISH for c in word):
                words.append(word)

    return pd.DataFrame(Counter(words).most_common(20))
76
+
77
def emoji_helper(selected_user, df):
    """Return a DataFrame of every emoji used, most frequent first."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    emojis = []
    for message in df['message']:
        emojis.extend(c for c in message if c in emoji.UNICODE_EMOJI_ENGLISH)

    # Build the Counter once; most_common() with no argument already returns
    # all entries sorted by count (the original built the Counter twice).
    return pd.DataFrame(Counter(emojis).most_common())
85
+
86
def monthly_timeline(selected_user, df):
    """Message counts per (year, month) with a 'time' label like 'January-2023'.

    Expects the columns produced by preprocessor.preprocess:
    'year', 'month_num', 'month', 'message' (and 'user' for filtering).
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    timeline = df.groupby(['year', 'month_num', 'month']).count()['message'].reset_index()
    # Vectorized label construction instead of a Python loop over rows.
    timeline['time'] = timeline['month'] + "-" + timeline['year'].astype(str)
    return timeline
96
+
97
def daily_timeline(selected_user, df):
    """Per-day message counts for the chosen user (or the whole chat)."""
    frame = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    return frame.groupby('only_date').count()['message'].reset_index()
105
+
106
def week_activity_map(selected_user, df):
    """Message counts keyed by weekday name, busiest first."""
    frame = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    return frame['day_name'].value_counts()
110
+
111
def month_activity_map(selected_user, df):
    """Message counts keyed by month name, busiest first."""
    frame = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    return frame['month'].value_counts()
115
+
116
def activity_heatmap(selected_user, df):
    """day_name x period table of message counts, 0 where there is no activity."""
    frame = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    heat = frame.pivot_table(index='day_name', columns='period',
                             values='message', aggfunc='count')
    return heat.fillna(0)
124
def birth_dates(df):
    """Guess birth dates from messages wishing 'happy birthday' to @-mentions.

    Returns a DataFrame with columns 'contacts' (mention without the '@')
    and 'birthdates' ('<month name> <day>'); the first wish per user wins.
    """
    # Compile once instead of calling re.findall twice per matching row.
    mention_re = re.compile(r'@[A-Za-z0-9]+')

    names = []
    birthdates = []
    # zip over column values avoids positional df['col'][i] indexing, which
    # assumes a clean RangeIndex.
    for message, month, day in zip(df['message'], df['month'], df['day']):
        if 'happy birthday' not in message.lower():
            continue
        for mention in mention_re.findall(message):
            name = mention[1:]  # strip the leading '@'
            if name not in names:
                names.append(name)
                birthdates.append(str(month) + " " + str(day))
    return pd.DataFrame({'contacts': names, 'birthdates': birthdates})
137
+
138
def sentiment_analysis(selected_user, df):
    """Classify up to 600 randomly-sampled messages as Negative/Neutral/Positive.

    Uses the 'ganeshkharad/gk-hinglish-sentiment' BERT model (downloaded on
    first use). Model output: LABEL_0 = Negative, LABEL_1 = Neutral,
    LABEL_2 = Positive.

    Returns:
        (Series, int): label -> count, and the number of messages scored.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    # Local import keeps the heavy transformers dependency off the module path
    # until this feature is actually used.
    from transformers import BertTokenizer, BertForSequenceClassification
    tokenizer = BertTokenizer.from_pretrained("ganeshkharad/gk-hinglish-sentiment")
    model = BertForSequenceClassification.from_pretrained("ganeshkharad/gk-hinglish-sentiment")
    model.eval()  # inference only

    if df.shape[0] > 600:
        # Cap the work; the sample is intentionally random (no fixed seed).
        df = df.sample(n=600)

    labels = ['Negative-messages', 'Neutral-messages', 'Positive-messages']
    ans = []
    # no_grad: skip building the autograd graph — we never backpropagate,
    # and this keeps memory flat across the loop.
    with torch.no_grad():
        for message in df['message']:
            encoded_input = tokenizer(message, return_tensors='pt')
            logits = model(**encoded_input).logits
            ans.append(labels[int(np.argmax(logits.numpy()))])

    return pd.Series(Counter(ans)), df.shape[0]
preprocessor.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import pandas as pd
3
def preprocess(data):
    """Parse a raw WhatsApp export ('dd/mm/yy, hh:mm am - sender: text') into a DataFrame.

    Messages without a 'sender: ' prefix are attributed to 'group_notification'.

    Returns a DataFrame with columns: date, user, message, year, month,
    month_num, only_date, day, day_name, hour, minute, period
    (hour bucket label such as '13-14').
    """
    # Raw string: avoids invalid-escape SyntaxWarnings on modern Python.
    pattern = r'\d\d/\d\d/\d\d,\s[0-9]+:\d\d\s[a-z]m\s-\s'
    messages = re.split(pattern, data)[1:]  # drop the empty lead-in chunk
    # strptime's %p expects upper-case AM/PM; exports use lower-case.
    dates = [d.replace('am', 'AM').replace('pm', 'PM')
             for d in re.findall(pattern, data)]

    df = pd.DataFrame({'user_message': messages, 'date': dates})
    df['date'] = pd.to_datetime(df['date'], format="%d/%m/%y, %I:%M %p - ")

    users = []
    texts = []
    for entry in df['user_message']:
        # maxsplit=1: split only on the FIRST ': ' so message bodies that
        # themselves contain ': ' are not truncated.
        parts = re.split(r':\s', entry, maxsplit=1)
        if len(parts) > 1:
            users.append(parts[0])
            texts.append(parts[1])
        else:
            # No 'sender: ' prefix -> system message.
            users.append('group_notification')
            texts.append(parts[0])
    df['user'] = users
    df['message'] = texts
    df.drop(columns=['user_message'], inplace=True)

    # Derived calendar columns used by the analysis helpers.
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month_name()
    df['month_num'] = df['date'].dt.month
    df['only_date'] = df['date'].dt.date
    df['day'] = df['date'].dt.day
    df['day_name'] = df['date'].dt.day_name()
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute

    def _hour_bucket(hour):
        # One-hour window label; midnight edges keep the original formatting.
        if hour == 23:
            return '23-00'
        if hour == 0:
            return '00-1'
        return str(hour) + '-' + str(hour + 1)

    df['period'] = df['hour'].apply(_hour_bucket)

    return df
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ matplotlib
3
+ seaborn
4
+ urlextract
5
+ wordcloud
6
+ pandas
7
+ numpy
8
+ torch
9
+ transformers
10
+ emoji==1.7.0