Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -1,21 +1,19 @@
-import gradio as gr
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
-import re
 import nltk
+from sklearn.feature_extraction.text import CountVectorizer
+import re
 from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize
 from nltk.stem import SnowballStemmer
-
-from scipy.cluster.hierarchy import dendrogram, ward
-from scipy.sparse import csr_matrix
+import gradio as gr
 
-# Download NLTK
+# Download NLTK punkt tokenizer and stopwords
 nltk.download('punkt')
 nltk.download('stopwords')
 
+# Function to preprocess text
 def preprocess_text(text):
     # Convert text to lowercase
     text = text.lower()
@@ -25,8 +23,10 @@ def preprocess_text(text):
     text = re.sub(r'@\S+', '', text)
     # Remove hashtags
     text = re.sub(r'#\S+', '', text)
-    #
-
+    # Remove non-alphabetic characters
+    text = re.sub(r'[^a-zA-Z]', ' ', text)
+    # Tokenize text
+    tokens = nltk.word_tokenize(text)
     # Remove stopwords
     stop_words = set(stopwords.words('english'))
     filtered_tokens = [word for word in tokens if word not in stop_words]
@@ -37,115 +37,82 @@ def preprocess_text(text):
     processed_text = ' '.join(stemmed_tokens)
     return processed_text
 
-
-
-
-
-# CountVectorizer
-vectorizer = CountVectorizer()
-dtm = vectorizer.fit_transform(data['clean_text'])
-
-# Convert DTM to DataFrame
-tweets = pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names_out())
-tweets['airline_sentiment'] = data['airline_sentiment']
-
-# Plot histogram of global sentiment
-overall_sentiment = tweets['airline_sentiment'].value_counts().reset_index()
-overall_sentiment.columns = ['Sentiment', 'Freq']
-sns.barplot(data=overall_sentiment, x='Sentiment', y='Freq', palette=['indianred', 'deepskyblue', 'chartreuse'])
-plt.title('Summary Global Sentiment')
-plt.xlabel('Sentiment')
-plt.ylabel('Frequency')
-plt.show()
-
-# Plot sentiment distribution for each airline
-airline_sentiment = tweets.groupby(['airline', 'airline_sentiment']).size().reset_index(name='Freq')
-plt.figure(figsize=(10, 6))
-sns.barplot(data=airline_sentiment, x='airline', y='Freq', hue='airline_sentiment', palette=['indianred', 'deepskyblue', 'chartreuse'])
-plt.title('Number of Tweets and Sentiment for Each Airline')
-plt.xlabel('Airline')
-plt.ylabel('Frequency')
-plt.legend(title='Sentiment')
-plt.xticks(rotation=45)
-plt.show()
-
-def plot_pie_chart(data):
-    def plot_airline_sentiment_pie(data, airline_name):
-        subset = data[data['airline'] == airline_name]
-        sentiment_counts = subset['airline_sentiment'].value_counts()
-        colors = ['indianred', 'deepskyblue', 'chartreuse']
-        plt.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=140, colors=colors)
-        plt.title(f'{airline_name} Sentiment Distribution')
-        plt.show()
+# Load the DataFrame
+def load_dataframe(file_path):
+    return pd.read_csv(file_path)
 
-
-
-
+# Preprocess the DataFrame
+def preprocess_dataframe(dataframe):
+    dataframe['processed_text'] = dataframe['text'].apply(preprocess_text)
+    return dataframe
 
-
-
-
-
-
-globalSentReasons = globalSentReasons.merge(total_by_airline, on='Airline', how='left')
-globalSentReasons['PercentOfTotal'] = (globalSentReasons['Freq'] / globalSentReasons['Total']) * 100
+# Create CountVectorizer and fit_transform
+def create_dtm(dataframe):
+    vectorizer = CountVectorizer()
+    dtm = vectorizer.fit_transform(dataframe['processed_text'])
+    return dtm, vectorizer
 
-
-
-
-
-
-
-
+# Plot word frequency
+def plot_word_frequency(dtm, vectorizer):
+    # Sum word frequencies
+    word_freq = dtm.sum(axis=0)
+    words = vectorizer.get_feature_names_out()
+    # Create DataFrame
+    word_freq_df = pd.DataFrame({'word': words, 'frequency': np.ravel(word_freq)})
+    # Sort by frequency
+    word_freq_df = word_freq_df.sort_values(by='frequency', ascending=False)
+    # Plot
+    plt.figure(figsize=(10, 6))
+    sns.barplot(x='word', y='frequency', data=word_freq_df.head(20))
+    plt.title('Top 20 Words Frequency')
+    plt.xlabel('Words')
+    plt.ylabel('Frequency')
+    plt.xticks(rotation=45, ha='right')
     plt.tight_layout()
     plt.show()
 
-
-
-
-
-
-
-
+# Define Gradio Interface
+iface = gr.Interface(
+    fn=load_dataframe,
+    inputs=gr.inputs.File(label="Upload CSV file"),
+    outputs=["dataframe", "text"],
+    title="DataFrame Preprocessing",
+    description="Upload a CSV file containing tweet data and preprocess it.",
+    allow_flagging=False
+)
 
-
-
-
-
-
-
-
-
-    plt.show()
+# Preprocess DataFrame
+iface2 = gr.Interface(
+    fn=preprocess_dataframe,
+    inputs=gr.inputs.Dataframe(label="Enter your DataFrame here"),
+    outputs="dataframe",
+    title="DataFrame Preprocessing",
+    description="Preprocess the DataFrame by removing stopwords, URLs, and non-alphabetic characters, and perform stemming.",
+    allow_flagging=False
+)
 
-
-
-
-
-
-
-
-
-
-non_sparse_cols = (dense_dtm.sum(axis=0) >= min_occurrences).ravel()
-sparse = dense_dtm[:, non_sparse_cols]
+# Create DTM
+iface3 = gr.Interface(
+    fn=create_dtm,
+    inputs=gr.inputs.Dataframe(label="Enter your DataFrame here"),
+    outputs=["dataframe", "text"],
+    title="Create Document-Term Matrix",
+    description="Create a Document-Term Matrix (DTM) from the preprocessed DataFrame.",
+    allow_flagging=False
+)
 
-
-
-
-
-
-
-
-
-
-    plt.show()
+# Plot Word Frequency
+iface4 = gr.Interface(
+    fn=plot_word_frequency,
+    inputs=["dataframe", "text"],
+    outputs="plot",
+    title="Plot Word Frequency",
+    description="Plot the frequency of the top 20 words in the Document-Term Matrix.",
+    allow_flagging=False
+)
 
-#
-iface = gr.Interface(preprocess_and_plot,
-                     inputs=gr.inputs.Dataframe(label="Enter your DataFrame here"),
-                     outputs=None,
-                     title="Social Media Trend Analysis",
-                     description="Analyze sentiment and trends in your social media data.")
+# Launch the interfaces
 iface.launch()
+iface2.launch()
+iface3.launch()
+iface4.launch()
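
A quick sanity check of the new preprocess_text, for reference. The expected output below is an inference from the rules visible in the diff (mention and hashtag removal, non-alphabetic stripping, NLTK English stopwords, Snowball stemming), not an output recorded in the commit:

# Illustrative only: exercise preprocess_text as defined in the diff above.
sample = "@united Flight DELAYED again!!! #fail"
print(preprocess_text(sample))
# Roughly "flight delay": the mention and hashtag are stripped, "!!!"
# becomes whitespace, "again" is an NLTK English stopword, and the
# Snowball stemmer reduces "delayed" to "delay".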
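A plausible cause of the Space's "Runtime error" badge is the gr.inputs.* namespace and the boolean allow_flagging value, both of which are gone in modern Gradio; plt.show() also does nothing on a headless server, and four back-to-back launch() calls block on the first. Below is a minimal sketch of the same pipeline against a recent Gradio release (4.x). It is an assumption, not part of the commit: it collapses the four steps into one function, reuses preprocess_text from the diff above, and assumes the CSV has the 'text' column the commit uses.

import gradio as gr
import matplotlib
matplotlib.use("Agg")  # headless backend; Spaces have no display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer

def analyze(csv_path):
    # Load and preprocess the uploaded CSV (assumes a 'text' column).
    df = pd.read_csv(csv_path)
    df["processed_text"] = df["text"].apply(preprocess_text)
    # Build the document-term matrix and sum word frequencies.
    vectorizer = CountVectorizer()
    dtm = vectorizer.fit_transform(df["processed_text"])
    freq = pd.DataFrame({
        "word": vectorizer.get_feature_names_out(),
        "frequency": np.ravel(dtm.sum(axis=0)),
    }).nlargest(20, "frequency")
    # Return the figure instead of calling plt.show(); Gradio renders a
    # returned matplotlib figure, while plt.show() is a no-op server-side.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x="word", y="frequency", data=freq, ax=ax)
    ax.set_title("Top 20 Words Frequency")
    ax.tick_params(axis="x", rotation=45)
    fig.tight_layout()
    return df.head(20), fig

iface = gr.Interface(
    fn=analyze,
    inputs=gr.File(label="Upload CSV file", type="filepath"),  # not gr.inputs.File
    outputs=[gr.Dataframe(label="Preprocessed sample"), gr.Plot()],
    title="Social Media Trend Analysis",
    allow_flagging="never",  # a string in current Gradio, not False
)

iface.launch()  # a single launch; the original's four calls block on the first

If the four steps should stay separate pages, gr.TabbedInterface([iface, iface2, iface3, iface4], ["Load", "Preprocess", "DTM", "Plot"]) served from one launch() call would be the closer analogue.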