Spaces:
Runtime error
Runtime error
Commit
·
f6c6a41
1
Parent(s):
644d9c6
first commit
Browse files- app.py +171 -0
- helper.py +161 -0
- preprocessor.py +50 -0
- requirements.txt +10 -0
app.py
ADDED
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import preprocessor
|
3 |
+
import helper
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
import seaborn as sns
|
6 |
+
|
7 |
+
|
8 |
+
|
9 |
+
def main():
    """Streamlit entry point for the WhatsApp chat analyzer.

    Sidebar: chat-export upload and a user selector.  On "Show Analysis" it
    renders summary stats, monthly/daily timelines, activity maps, a word
    cloud, common-word and emoji breakdowns, detected birthdays and
    (per-user only) sentiment analysis.
    """
    st.sidebar.title("Whatsapp Chat Analyzer")
    uploaded_file = st.sidebar.file_uploader("Choose a file")
    if uploaded_file is not None:
        # To read file as bytes:
        bytes_data = uploaded_file.getvalue()
        data = bytes_data.decode("utf-8")
        df = preprocessor.preprocess(data)

        # fetch unique users
        user_list = df['user'].unique().tolist()
        # BUGFIX: one-to-one chats may contain no system messages, so an
        # unconditional remove() used to raise ValueError here.
        if 'group_notification' in user_list:
            user_list.remove('group_notification')
        user_list.sort()
        user_list.insert(0,"Overall")
        selected_user = st.sidebar.selectbox("Show analysis wrt",user_list)
        if st.sidebar.button("Show Analysis"):
            num_messages,words, num_media_messages ,num_links = helper.fetch_stats(selected_user,df)
            st.title("Top Statistics")
            col1, col2 , col3, col4 = st.columns(4)

            with col1:
                st.header("Total Messages")
                st.title(num_messages)
            with col2:
                st.header("Total Words")
                st.title(words)
            with col3:
                st.header("Media shared")
                st.title(num_media_messages)
            with col4:
                st.header("Links shared")
                st.title(num_links)

            #monthly_timeline
            st.title("Monthly Timeline")
            timeline=helper.monthly_timeline(selected_user,df)
            fig = plt.figure()
            sns.set_style('darkgrid')
            sns.lineplot(x=timeline['time'],y=timeline['message'],color='red')
            plt.xticks(rotation='vertical')
            st.pyplot(fig)

            # daily timeline
            st.title("Daily Timeline")
            daily_timeline = helper.daily_timeline(selected_user, df)
            fig = plt.figure(figsize=(10, 3))
            sns.set_style('whitegrid')
            sns.lineplot(x=daily_timeline['only_date'],y=daily_timeline['message'], color='purple')
            plt.xticks(rotation='vertical')
            plt.xlabel("date")
            st.pyplot(fig)

            #activity map
            st.title('Activity Map')
            col1,col2 = st.columns(2)

            with col1:
                st.header("Most busy day")
                busy_day=helper.week_activity_map(selected_user,df)
                fig = plt.figure()
                sns.set_style('ticks')
                pal=sns.cubehelix_palette(start=2, rot=0, dark=0.5, light=0.9, reverse=True)
                sns.barplot(x=busy_day.index,y=busy_day.values,palette=pal)
                plt.ylabel("messages")
                plt.xticks(rotation='vertical')
                st.pyplot(fig)

            with col2:
                st.header("Most busy month")
                busy_month=helper.month_activity_map(selected_user,df)
                fig=plt.figure()
                sns.set_style('ticks')
                pal=sns.cubehelix_palette(start=0, rot=0, dark=0.2, light=0.9, reverse=True)
                sns.barplot(x=busy_month.index,y=busy_month.values,palette=pal)
                plt.ylabel("messages")
                plt.xticks(rotation='vertical')
                st.pyplot(fig)

            st.title("Weekly Activity Map")
            user_heatmap = helper.activity_heatmap(selected_user, df)
            fig = plt.figure(figsize=(13,4))
            cmap = sns.color_palette("viridis", as_cmap=True)
            sns.heatmap(user_heatmap,cmap=cmap,square=True)
            st.pyplot(fig)

            #finding active users
            if(selected_user=='Overall'):
                st.title('Most active users')
                x,new_df=helper.most_busy_users(df)
                fig=plt.figure()

                col1,col2 = st.columns(2)

                with col1:
                    pal = sns.color_palette("cubehelix")
                    sns.barplot(x=x.index,y=x.values,palette=pal)
                    plt.xticks(rotation='vertical')
                    plt.ylabel('messages')
                    st.pyplot(fig)
                with col2:
                    st.dataframe(new_df)
            #word cloud
            st.title('WordCloud')
            df_wc = helper.create_wordcloud(selected_user,df)
            fig,ax = plt.subplots()
            ax.imshow(df_wc)
            st.pyplot(fig)

            #most common words
            st.title('Most Common Words')
            most_common_df=helper.most_common_words(selected_user,df)
            colors = ['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231',
                      '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe',
                      '#008080', '#e6beff', '#9a6324', '#fffac8', '#aaffc3',
                      '#808000', '#ffd8b1', '#808080', 'lightgreen', 'lightblue']

            fig = plt.figure()
            # Pie Chart
            plt.pie(most_common_df[1], labels=most_common_df[0], colors=colors,
                    autopct='%0.1f%%', pctdistance=0.9, labeldistance=1, rotatelabels=270, startangle=180,
                    counterclock=False)
            # draw circle (donut hole); `fig` is already the current figure,
            # so the old plt.gcf() round-trip was redundant
            centre_circle = plt.Circle((0, 0), 0.50, fc='white')
            fig.gca().add_artist(centre_circle)
            st.pyplot(fig)

            #emoji analysis
            emoji_df = helper.emoji_helper(selected_user,df)
            if(emoji_df.shape[0]):
                st.title("Emoji Analysis")
                col1,col2 =st.columns(2)
                with col1:
                    st.dataframe(emoji_df)
                with col2:
                    fig,ax = plt.subplots()
                    plt.rcParams['font.family'] = 'Segoe UI Emoji'
                    ax.pie(emoji_df[1].head(min(5,emoji_df.shape[0])),labels=emoji_df[0].head(min(5,emoji_df.shape[0])),autopct="%0.2f")
                    st.pyplot(fig)

            #birth_dates
            if(selected_user=='Overall'):
                birth_data = helper.birth_dates(df)
                if(birth_data.shape[0]):
                    st.title("Birth dates of some users.")
                    st.dataframe(birth_data)

            #Sentiment-analysis
            if (selected_user != 'Overall'):
                st.title("Sentiment Analysis")
                sentiment_data,number=helper.sentiment_analysis(selected_user,df)
                fig = plt.figure()
                sns.set_style('ticks')
                pal = sns.cubehelix_palette(start=0.5, rot=0, dark=0.2, light=0.9, reverse=True)
                sns.barplot(x=sentiment_data.index,y=sentiment_data.values,palette=pal)
                plt.xticks(rotation='vertical')
                st.pyplot(fig)
                st.header("Based on random "+str(number)+" messages.")
                st.text("Note : Sentiment Analysis give good results if messages \nare in hinglish (hindi or english or both).")


main()
|
helper.py
ADDED
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from urlextract import URLExtract
|
2 |
+
from wordcloud import WordCloud
|
3 |
+
import pandas as pd
|
4 |
+
from collections import Counter
|
5 |
+
import emoji
|
6 |
+
import re
|
7 |
+
import numpy as np
|
8 |
+
import torch
|
9 |
+
|
10 |
+
extract = URLExtract()
|
11 |
+
def fetch_stats(selected_user,df):
    """Return (num_messages, num_words, num_media, num_links) for a user.

    Pass selected_user='Overall' to aggregate across every participant.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    num_messages = len(df)
    total_words = sum(len(msg.split()) for msg in df['message'])
    num_media_messages = df[df['message'] == '<Media omitted>\n'].shape[0]
    total_links = sum(len(extract.find_urls(msg)) for msg in df['message'])
    return num_messages, total_words, num_media_messages, total_links
23 |
+
|
24 |
+
def most_busy_users(df):
    """Top chatters: (count Series of up to 10 users, percent DataFrame)."""
    counts = df['user'].value_counts()
    top = counts.head(min(10, len(counts)))
    percent = round((counts / df.shape[0]) * 100, 2)
    summary = percent.reset_index().rename(columns={'user': 'name', 'count': 'percent'})
    return top, summary
|
29 |
+
|
30 |
+
def create_wordcloud(selected_user, df):
    """Build a WordCloud image from a user's (or everyone's) messages.

    Stop words come from 'stop_hinglish.txt'; system notifications and media
    placeholders are excluded before generation.
    """
    # BUGFIX: read stop words into a *set* — `word in <set>` is exact-word
    # membership, whereas the old `word in <whole file as one str>` matched
    # substrings and silently dropped legitimate words.  `with` also closes
    # the file handle, which the bare open() never did.
    with open('stop_hinglish.txt', 'r') as f:
        stop_words = set(f.read().split())

    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    temp = df[df['user'] != 'group_notification']
    temp = temp[temp['message'] != '<Media omitted>\n']

    def remove_stop_words(message):
        # Lower-case and keep only non-stop-words.
        return " ".join(word for word in message.lower().split()
                        if word not in stop_words)

    wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
    temp = temp.copy()  # write to an owned frame, not a view of df
    temp['message'] = temp['message'].apply(remove_stop_words)
    return wc.generate(temp['message'].str.cat(sep=" "))
|
52 |
+
|
53 |
+
def most_common_words(selected_user, df):
    """Return a DataFrame of the 20 most frequent meaningful words.

    Drops stop words (stop_hinglish.txt), system notifications, media
    placeholders and pure-emoji tokens.
    """
    # BUGFIX: use a set of stop words so membership is per-word, not a
    # substring test against the whole file; `with` closes the handle.
    with open('stop_hinglish.txt', 'r') as f:
        stop_words = set(f.read().split())

    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    temp = df[df['user'] != 'group_notification']
    temp = temp[temp['message'] != '<Media omitted>\n']

    words = []
    for message in temp['message']:
        for word in message.lower().split():
            if word not in stop_words:
                # Keep the word only if at least one character is not an
                # emoji (emoji-only tokens are handled by emoji_helper).
                if any(c not in emoji.UNICODE_EMOJI_ENGLISH for c in word):
                    words.append(word)

    return pd.DataFrame(Counter(words).most_common(20))
|
76 |
+
|
77 |
+
def emoji_helper(selected_user, df):
    """Count every emoji used, most frequent first, as a 2-column DataFrame."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    emojis = []
    for message in df['message']:
        emojis.extend(c for c in message if c in emoji.UNICODE_EMOJI_ENGLISH)

    # most_common() with no argument already returns *all* entries sorted by
    # count, so there is no need to build the Counter a second time for len().
    return pd.DataFrame(Counter(emojis).most_common())
|
85 |
+
|
86 |
+
def monthly_timeline(selected_user,df):
    """Message counts per (year, month) with a 'time' label like 'May-2021'."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    timeline = df.groupby(['year','month_num','month']).count()['message'].reset_index()
    timeline['time'] = [
        f"{timeline['month'][idx]}-{timeline['year'][idx]}"
        for idx in range(timeline.shape[0])
    ]
    return timeline
|
96 |
+
|
97 |
+
def daily_timeline(selected_user,df):
    """Message counts per calendar day ('only_date') for a user or everyone."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    return df.groupby('only_date').count()['message'].reset_index()
|
105 |
+
|
106 |
+
def week_activity_map(selected_user,df):
    """Message counts per weekday name for the chosen user (or everyone)."""
    data = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    return data['day_name'].value_counts()
|
110 |
+
|
111 |
+
def month_activity_map(selected_user,df):
    """Message counts per month name for the chosen user (or everyone)."""
    data = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    return data['month'].value_counts()
|
115 |
+
|
116 |
+
def activity_heatmap(selected_user,df):
    """Weekday x hour-period pivot of message counts (missing cells -> 0)."""
    data = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    heat = data.pivot_table(index='day_name', columns='period',
                            values='message', aggfunc='count')
    return heat.fillna(0)
|
124 |
+
def birth_dates(df):
    """Guess users' birthdays from 'happy birthday @mention' messages.

    Returns a DataFrame with 'contacts' (mentioned names, first occurrence
    wins) and 'birthdates' ('<month name> <day>' of the wishing message).
    """
    mention_re = re.compile(r'@[A-Za-z0-9]+')  # compiled once, was re-run twice per row
    birthdates = []
    names = []
    # itertuples is position-independent: the old df['message'][i] pattern
    # assumed a default 0..n-1 RangeIndex and broke on filtered frames.
    for row in df.itertuples(index=False):
        if re.search('happy birthday', row.message.lower()):
            for mention in mention_re.findall(row.message):
                name = mention[1:]  # strip the leading '@'
                if name not in names:
                    names.append(name)
                    birthdates.append(str(row.month) + " " + str(row.day))
    return pd.DataFrame({'contacts': names, 'birthdates': birthdates})
|
137 |
+
|
138 |
+
def sentiment_analysis(selected_user,df):
    """Classify up to 600 randomly sampled messages for a user.

    Returns (Series of label counts, number of messages scored) using the
    'ganeshkharad/gk-hinglish-sentiment' BERT model.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    # sample code
    from transformers import BertTokenizer, BertForSequenceClassification
    tokenizer = BertTokenizer.from_pretrained("ganeshkharad/gk-hinglish-sentiment")
    model = BertForSequenceClassification.from_pretrained("ganeshkharad/gk-hinglish-sentiment")
    if df.shape[0] > 600:
        df = df.sample(n=600)

    # output contains 3 lables LABEL_0 = Negative ,LABEL_1 = Nuetral ,LABEL_2 = Positive
    labels = ('Negative-messages', 'Neutral-messages', 'Positive-messages')
    results = []
    for message in df['message']:
        encoded_input = tokenizer(message, return_tensors='pt')
        predicted = np.argmax(model(**encoded_input).logits.detach().numpy())
        results.append(labels[2] if predicted > 1 else labels[int(predicted)])

    return pd.Series(Counter(results)), df.shape[0]
|
preprocessor.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import pandas as pd
|
3 |
+
def preprocess(data):
    """Parse a raw exported WhatsApp chat into a tidy DataFrame.

    Expects lines of the form 'dd/mm/yy, h:mm am - User: message'.  Returns
    one row per message with 'user' and 'message' columns plus derived date
    parts (year, month, month_num, only_date, day, day_name, hour, minute,
    period).
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+).
    pattern = r'\d\d/\d\d/\d\d,\s[0-9]+:\d\d\s[a-z]m\s-\s'
    messages = re.split(pattern, data)[1:]  # [0] is text before the first timestamp
    dates = re.findall(pattern, data)
    # strptime's %p directive needs upper-case AM/PM.
    dates = [d.replace('am', 'AM').replace('pm', 'PM') for d in dates]

    df = pd.DataFrame({'user_message': messages, 'message-date': dates})
    df['message-date'] = pd.to_datetime(df['message-date'], format="%d/%m/%y, %I:%M %p - ")
    df.rename(columns={'message-date': 'date'}, inplace=True)

    # separate users and messages
    users = []
    messages = []
    for message in df['user_message']:
        # BUGFIX: maxsplit=1 — splitting on every ': ' used to truncate any
        # message whose body itself contained a colon-space.
        entry = re.split(r':\s', message, maxsplit=1)
        if entry[1:]:  # 'User: text' -> a real sender
            users.append(entry[0])
            messages.append(entry[1])
        else:          # no 'User: ' prefix -> system/group notification
            users.append('group_notification')
            messages.append(entry[0])
    df['user'] = users
    df['message'] = messages
    df.drop(columns=['user_message'], inplace=True)

    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month_name()
    df['month_num'] = df['date'].dt.month
    df['only_date'] = df['date'].dt.date
    df['day'] = df['date'].dt.day
    df['day_name'] = df['date'].dt.day_name()
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute

    # Hour buckets like '9-10' for the weekly heatmap ('23-00' wraps midnight).
    period = []
    for hour in df['hour']:  # was df[['day_name','hour']]['hour'] — redundant slice
        if hour == 23:
            period.append(str(hour) + "-" + str('00'))
        elif hour == 0:
            period.append(str('00') + "-" + str(hour + 1))
        else:
            period.append(str(hour) + "-" + str(hour + 1))
    df['period'] = period

    return df
|
requirements.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
matplotlib
|
3 |
+
seaborn
|
4 |
+
urlextract
|
5 |
+
wordcloud
|
6 |
+
pandas
|
7 |
+
numpy
|
8 |
+
torch
|
9 |
+
transformers
|
10 |
+
emoji==1.7.0
|