Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
import seaborn as sns
|
6 |
+
import re
|
7 |
+
import nltk
|
8 |
+
from nltk.corpus import stopwords
|
9 |
+
from nltk.tokenize import word_tokenize
|
10 |
+
from nltk.stem import SnowballStemmer
|
11 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
12 |
+
from scipy.cluster.hierarchy import dendrogram, ward
|
13 |
+
from scipy.sparse import csr_matrix
|
14 |
+
|
15 |
+
# Download the NLTK resources used by preprocess_text.
# Recent NLTK releases make word_tokenize load 'punkt_tab' instead of the
# pickled 'punkt' models, so both are fetched to work on old and new versions.
for _resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_resource)
|
18 |
+
|
19 |
+
# Lazily-filled cache so the stop-word list and the stemmer are built once
# rather than once per processed text.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the cached ``(stop_words, stemmer)`` pair, building it on first use."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_TOOLS['stemmer'] = SnowballStemmer('english')
    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['stemmer']


def preprocess_text(text):
    """Normalise one piece of text into a space-separated string of stems.

    The text is lower-cased; URLs (``http...``), ``@`` mentions and
    ``#`` hashtags are removed; the remainder is tokenised, English stop
    words are dropped, and each surviving token is Snowball-stemmed.

    Args:
        text: The raw text. Any non-string value (for example ``None`` or a
            NaN from an empty table cell) is treated as empty text.

    Returns:
        The processed tokens joined by single spaces; ``''`` when ``text``
        is not a string.
    """
    # Empty cells arrive as None/NaN, which have no .lower().
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Strip URLs, @ mentions and hashtags before tokenising.
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\S+', '', text)
    text = re.sub(r'#\S+', '', text)

    stop_words, stemmer = _get_text_tools()
    stemmed_tokens = [
        stemmer.stem(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return ' '.join(stemmed_tokens)
|
39 |
+
|
40 |
+
def preprocess_and_plot(data):
    """Clean the text column and draw the two sentiment bar charts.

    Two figures are drawn: the overall count of each ``airline_sentiment``
    value, and the count of each sentiment broken down by ``airline``.

    Args:
        data: DataFrame with ``text``, ``airline`` and ``airline_sentiment``
            columns. A ``clean_text`` column holding the output of
            ``preprocess_text`` is added to it in place.

    Returns:
        None. The charts are shown with ``plt.show()``.
    """
    palette = ['indianred', 'deepskyblue', 'chartreuse']

    # Preprocess the text
    data['clean_text'] = data['text'].apply(preprocess_text)

    # Plot histogram of global sentiment.
    # The counts come straight from `data`; a dense document-term frame is
    # not needed to carry a single column.
    overall_sentiment = data['airline_sentiment'].value_counts().reset_index()
    overall_sentiment.columns = ['Sentiment', 'Freq']
    # Open a dedicated figure so this chart never lands on axes left over
    # from an earlier call.
    plt.figure()
    sns.barplot(data=overall_sentiment, x='Sentiment', y='Freq', palette=palette)
    plt.title('Summary Global Sentiment')
    plt.xlabel('Sentiment')
    plt.ylabel('Frequency')
    plt.show()

    # Plot sentiment distribution for each airline.
    # Group the input frame: 'airline' is a column of `data`, not of the
    # term counts.
    airline_sentiment = (
        data.groupby(['airline', 'airline_sentiment'])
        .size()
        .reset_index(name='Freq')
    )
    plt.figure(figsize=(10, 6))
    sns.barplot(data=airline_sentiment, x='airline', y='Freq', hue='airline_sentiment', palette=palette)
    plt.title('Number of Tweets and Sentiment for Each Airline')
    plt.xlabel('Airline')
    plt.ylabel('Frequency')
    plt.legend(title='Sentiment')
    plt.xticks(rotation=45)
    plt.show()
|
71 |
+
|
72 |
+
def plot_pie_chart(data):
    """Draw one sentiment pie chart per airline.

    Args:
        data: DataFrame with ``airline`` and ``airline_sentiment`` columns.

    Returns:
        None. Each pie is drawn in its own figure and shown with
        ``plt.show()``.
    """
    palette = ['indianred', 'deepskyblue', 'chartreuse']
    # Assign colours per sentiment label over the whole data set, so a label
    # keeps its colour in every pie regardless of per-airline frequency order.
    labels = sorted(data['airline_sentiment'].dropna().unique(), key=str)
    color_for = {label: palette[i % len(palette)] for i, label in enumerate(labels)}

    def plot_airline_sentiment_pie(data, airline_name):
        subset = data[data['airline'] == airline_name]
        sentiment_counts = subset['airline_sentiment'].value_counts()
        if sentiment_counts.empty:
            # Nothing to draw for an airline with no recorded sentiment.
            return
        colors = [color_for[label] for label in sentiment_counts.index]
        # A fresh figure per airline keeps the pies from overlapping.
        plt.figure()
        plt.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=140, colors=colors)
        plt.title(f'{airline_name} Sentiment Distribution')
        plt.show()

    # Rows without an airline cannot be matched by equality; skip them.
    for airline in data['airline'].dropna().unique():
        plot_airline_sentiment_pie(data, airline)
|
84 |
+
|
85 |
+
def plot_reasons(data):
    """Plot each complaint reason as a percentage of its airline's total.

    A cross-tabulation of ``negativereason`` against ``airline`` is reshaped
    to long form, every count is divided by the sum of counts for the same
    airline, and the percentages are drawn as a grouped bar chart.

    Args:
        data: DataFrame with ``negativereason`` and ``airline`` columns.

    Returns:
        None. The chart is shown with ``plt.show()``.
    """
    counts = pd.crosstab(data['negativereason'], data['airline'])
    long_form = counts.reset_index().melt(
        id_vars='negativereason',
        var_name='Airline',
        value_name='Freq',
    )

    # Per-airline totals broadcast back onto every row of that airline.
    long_form['Total'] = long_form.groupby('Airline')['Freq'].transform('sum')
    long_form['PercentOfTotal'] = (long_form['Freq'] / long_form['Total']) * 100

    plt.figure(figsize=(12, 8))
    sns.barplot(
        data=long_form,
        x='negativereason',
        y='PercentOfTotal',
        hue='Airline',
    )
    plt.title('Percentage of Total Complaints by Reason and Airline')
    plt.xlabel('Reason for Complaint')
    plt.ylabel('Percentage of Total Complaints')
    plt.xticks(rotation=45)
    # Anchor the legend outside the axes, to the upper right of the plot.
    plt.legend(title='Airline', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
|
102 |
+
|
103 |
+
def plot_word_frequency(data, min_freq=50):
    """Draw a bar chart of the most frequent terms in the cleaned text.

    Args:
        data: DataFrame with a ``clean_text`` column. When that column is
            absent, the ``text`` column is cleaned with ``preprocess_text``
            for this call (``data`` is not modified).
        min_freq: Only terms whose total count is strictly greater than this
            value are plotted. Defaults to 50.

    Returns:
        None. The chart is shown with ``plt.show()``.
    """
    def count_word_freq(text):
        vectorizer = CountVectorizer()
        dtm = vectorizer.fit_transform(text)
        # Summing over axis 0 gives a 1 x n_terms result; flatten it to 1-D.
        totals = np.asarray(dtm.sum(axis=0)).ravel()
        return pd.DataFrame({'word': vectorizer.get_feature_names_out(), 'freq': totals})

    if 'clean_text' in data.columns:
        clean_text = data['clean_text']
    else:
        clean_text = data['text'].apply(preprocess_text)

    words_freq = count_word_freq(clean_text)
    wf_filtered = words_freq[words_freq['freq'] > min_freq].sort_values(by='freq', ascending=False)
    plt.figure(figsize=(12, 6))
    plt.bar(wf_filtered['word'], wf_filtered['freq'], color='skyblue')
    plt.xlabel('Word', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    plt.title('Word Frequency', fontsize=14)
    plt.xticks(rotation=45, ha='right', fontsize=10)
    plt.yticks(fontsize=10)
    plt.show()
|
121 |
+
|
122 |
+
def plot_dendrogram(data, min_occurrences=3):
    """Draw a Ward-linkage dendrogram of the terms in the cleaned text.

    Each term is represented by its vector of per-document counts; terms
    are clustered with ``scipy.cluster.hierarchy.ward``.

    Args:
        data: DataFrame with a ``clean_text`` column. When that column is
            absent, the ``text`` column is cleaned with ``preprocess_text``
            for this call (``data`` is not modified).
        min_occurrences: Terms whose total count is below this value are
            dropped before clustering. Defaults to 3.

    Returns:
        None. The chart is shown with ``plt.show()``.

    Raises:
        ValueError: If fewer than two terms reach ``min_occurrences``.
    """
    if 'clean_text' in data.columns:
        clean_text = data['clean_text']
    else:
        clean_text = data['text'].apply(preprocess_text)

    vectorizer = CountVectorizer()
    dtm = vectorizer.fit_transform(clean_text)

    # Remove sparse terms while the matrix is still sparse, so only the
    # retained columns are ever converted to a dense array.
    term_totals = np.asarray(dtm.sum(axis=0)).ravel()
    kept_terms = np.flatnonzero(term_totals >= min_occurrences)
    if kept_terms.size < 2:
        raise ValueError(
            'At least two terms must occur '
            f'{min_occurrences} or more times to build a dendrogram.'
        )

    # One row per retained term, one column per document.
    term_vectors = dtm[:, kept_terms].T.toarray()

    # Compute the Ward linkage over the term vectors
    linkage_matrix = ward(term_vectors)

    # Plot the dendrogram
    plt.figure(figsize=(15, 6))
    dendrogram(linkage_matrix, leaf_rotation=90, leaf_font_size=10, labels=vectorizer.get_feature_names_out()[kept_terms])
    plt.xlabel('Terms')
    plt.ylabel('Distance')
    plt.title('Dendrogram')
    plt.show()
|
144 |
+
|
145 |
+
# Create the interface.
# gr.Dataframe is the top-level component; the legacy gr.inputs namespace is
# not available in current Gradio releases and fails at import time.
iface = gr.Interface(
    preprocess_and_plot,
    inputs=gr.Dataframe(label="Enter your DataFrame here"),
    outputs=None,
    title="Social Media Trend Analysis",
    description="Analyze sentiment and trends in your social media data.",
)
iface.launch()
|