Ahtisham1583 committed on
Commit
de9149f
·
verified ·
1 Parent(s): df47b33

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -110
app.py CHANGED
@@ -1,21 +1,19 @@
1
- import gradio as gr
2
  import pandas as pd
3
  import numpy as np
4
  import matplotlib.pyplot as plt
5
  import seaborn as sns
6
- import re
7
  import nltk
 
 
8
  from nltk.corpus import stopwords
9
- from nltk.tokenize import word_tokenize
10
  from nltk.stem import SnowballStemmer
11
- from sklearn.feature_extraction.text import CountVectorizer
12
- from scipy.cluster.hierarchy import dendrogram, ward
13
- from scipy.sparse import csr_matrix
14
 
15
- # Download NLTK resources
16
  nltk.download('punkt')
17
  nltk.download('stopwords')
18
 
 
19
  def preprocess_text(text):
20
  # Convert text to lowercase
21
  text = text.lower()
@@ -25,8 +23,10 @@ def preprocess_text(text):
25
  text = re.sub(r'@\S+', '', text)
26
  # Remove hashtags
27
  text = re.sub(r'#\S+', '', text)
28
- # Tokenize
29
- tokens = word_tokenize(text)
 
 
30
  # Remove stopwords
31
  stop_words = set(stopwords.words('english'))
32
  filtered_tokens = [word for word in tokens if word not in stop_words]
@@ -37,115 +37,82 @@ def preprocess_text(text):
37
  processed_text = ' '.join(stemmed_tokens)
38
  return processed_text
39
 
40
- def preprocess_and_plot(data):
41
- # Preprocess the text
42
- data['clean_text'] = data['text'].apply(preprocess_text)
43
-
44
- # CountVectorizer
45
- vectorizer = CountVectorizer()
46
- dtm = vectorizer.fit_transform(data['clean_text'])
47
-
48
- # Convert DTM to DataFrame
49
- tweets = pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names_out())
50
- tweets['airline_sentiment'] = data['airline_sentiment']
51
-
52
- # Plot histogram of global sentiment
53
- overall_sentiment = tweets['airline_sentiment'].value_counts().reset_index()
54
- overall_sentiment.columns = ['Sentiment', 'Freq']
55
- sns.barplot(data=overall_sentiment, x='Sentiment', y='Freq', palette=['indianred', 'deepskyblue', 'chartreuse'])
56
- plt.title('Summary Global Sentiment')
57
- plt.xlabel('Sentiment')
58
- plt.ylabel('Frequency')
59
- plt.show()
60
-
61
- # Plot sentiment distribution for each airline
62
- airline_sentiment = tweets.groupby(['airline', 'airline_sentiment']).size().reset_index(name='Freq')
63
- plt.figure(figsize=(10, 6))
64
- sns.barplot(data=airline_sentiment, x='airline', y='Freq', hue='airline_sentiment', palette=['indianred', 'deepskyblue', 'chartreuse'])
65
- plt.title('Number of Tweets and Sentiment for Each Airline')
66
- plt.xlabel('Airline')
67
- plt.ylabel('Frequency')
68
- plt.legend(title='Sentiment')
69
- plt.xticks(rotation=45)
70
- plt.show()
71
-
72
- def plot_pie_chart(data):
73
- def plot_airline_sentiment_pie(data, airline_name):
74
- subset = data[data['airline'] == airline_name]
75
- sentiment_counts = subset['airline_sentiment'].value_counts()
76
- colors = ['indianred', 'deepskyblue', 'chartreuse']
77
- plt.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=140, colors=colors)
78
- plt.title(f'{airline_name} Sentiment Distribution')
79
- plt.show()
80
 
81
- airlines = data['airline'].unique()
82
- for airline in airlines:
83
- plot_airline_sentiment_pie(data, airline)
 
84
 
85
- def plot_reasons(data):
86
- reason_table = pd.crosstab(data['negativereason'], data['airline'])
87
- globalSentReasons = reason_table.reset_index().melt(id_vars='negativereason', var_name='Airline', value_name='Freq')
88
- total_by_airline = globalSentReasons.groupby('Airline')['Freq'].sum().reset_index()
89
- total_by_airline.columns = ['Airline', 'Total']
90
- globalSentReasons = globalSentReasons.merge(total_by_airline, on='Airline', how='left')
91
- globalSentReasons['PercentOfTotal'] = (globalSentReasons['Freq'] / globalSentReasons['Total']) * 100
92
 
93
- plt.figure(figsize=(12, 8))
94
- sns.barplot(data=globalSentReasons, x='negativereason', y='PercentOfTotal', hue='Airline')
95
- plt.xticks(rotation=45)
96
- plt.title('Percentage of Total Complaints by Reason and Airline')
97
- plt.ylabel('Percentage of Total Complaints')
98
- plt.xlabel('Reason for Complaint')
99
- plt.legend(title='Airline', bbox_to_anchor=(1.05, 1), loc='upper left')
 
 
 
 
 
 
 
 
 
100
  plt.tight_layout()
101
  plt.show()
102
 
103
- def plot_word_frequency(data):
104
- def count_word_freq(text):
105
- vectorizer = CountVectorizer()
106
- dtm = vectorizer.fit_transform(text)
107
- words = vectorizer.get_feature_names_out()
108
- freq = dtm.sum(axis=0)
109
- return pd.DataFrame({'word': words, 'freq': freq.tolist()[0]})
 
 
110
 
111
- words_freq = count_word_freq(data['clean_text'])
112
- wf_filtered = words_freq[words_freq['freq'] > 50].sort_values(by='freq', ascending=False)
113
- plt.figure(figsize=(12, 6))
114
- plt.bar(wf_filtered['word'], wf_filtered['freq'], color='skyblue')
115
- plt.xlabel('Word', fontsize=12)
116
- plt.ylabel('Frequency', fontsize=12)
117
- plt.title('Word Frequency', fontsize=14)
118
- plt.xticks(rotation=45, ha='right', fontsize=10)
119
- plt.yticks(fontsize=10)
120
- plt.show()
121
 
122
- def plot_dendrogram(data):
123
- vectorizer = CountVectorizer()
124
- dtm = vectorizer.fit_transform(data['clean_text'])
125
-
126
- # Convert dtm to a dense matrix
127
- dense_dtm = dtm.toarray()
128
-
129
- # Remove sparse terms
130
- min_occurrences = 3
131
- non_sparse_cols = (dense_dtm.sum(axis=0) >= min_occurrences).ravel()
132
- sparse = dense_dtm[:, non_sparse_cols]
133
 
134
- # Calculate the distance matrix
135
- dist = ward(sparse.T)
136
-
137
- # Plot the dendrogram
138
- plt.figure(figsize=(15, 6))
139
- dendrogram(dist, leaf_rotation=90, leaf_font_size=10, labels=vectorizer.get_feature_names_out()[non_sparse_cols])
140
- plt.xlabel('Terms')
141
- plt.ylabel('Distance')
142
- plt.title('Dendrogram')
143
- plt.show()
144
 
145
- # Create the interface
146
- iface = gr.Interface(preprocess_and_plot,
147
- inputs=gr.inputs.Dataframe(label="Enter your DataFrame here"),
148
- outputs=None,
149
- title="Social Media Trend Analysis",
150
- description="Analyze sentiment and trends in your social media data.")
151
  iface.launch()
 
 
 
 
 
1
  import pandas as pd
2
  import numpy as np
3
  import matplotlib.pyplot as plt
4
  import seaborn as sns
 
5
  import nltk
6
+ from sklearn.feature_extraction.text import CountVectorizer
7
+ import re
8
  from nltk.corpus import stopwords
 
9
  from nltk.stem import SnowballStemmer
10
+ import gradio as gr
 
 
11
 
12
+ # Download NLTK punkt tokenizer and stopwords
13
  nltk.download('punkt')
14
  nltk.download('stopwords')
15
 
16
+ # Function to preprocess text
17
  def preprocess_text(text):
18
  # Convert text to lowercase
19
  text = text.lower()
 
23
  text = re.sub(r'@\S+', '', text)
24
  # Remove hashtags
25
  text = re.sub(r'#\S+', '', text)
26
+ # Remove non-alphabetic characters
27
+ text = re.sub(r'[^a-zA-Z]', ' ', text)
28
+ # Tokenize text
29
+ tokens = nltk.word_tokenize(text)
30
  # Remove stopwords
31
  stop_words = set(stopwords.words('english'))
32
  filtered_tokens = [word for word in tokens if word not in stop_words]
 
37
  processed_text = ' '.join(stemmed_tokens)
38
  return processed_text
39
 
# Load the DataFrame
def load_dataframe(file_path):
    """Read a CSV source into a pandas DataFrame.

    Args:
        file_path: One of
            * a filesystem path (``str`` or ``os.PathLike``),
            * a file-like object, or
            * an upload wrapper that exposes the on-disk location of the
              file through a string ``.name`` attribute (this function is
              registered as the callback of a file-upload input).

    Returns:
        pandas.DataFrame: the parsed CSV contents.

    Raises:
        Whatever ``pandas.read_csv`` raises for an unreadable or malformed
        source.
    """
    import os

    if not isinstance(file_path, (str, os.PathLike)):
        # An uploaded temp-file wrapper may already have its read cursor at
        # end-of-file, in which case parsing the handle directly yields no
        # data. Re-opening by path always reads from the start.
        # Real paths are excluded above because ``Path.name`` is only the
        # final path component, not a usable location.
        backing_path = getattr(file_path, "name", None)
        if isinstance(backing_path, str) and os.path.isfile(backing_path):
            file_path = backing_path
    return pd.read_csv(file_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
# Preprocess the DataFrame
def preprocess_dataframe(dataframe):
    """Attach a ``processed_text`` column derived from the ``text`` column.

    Every entry of ``dataframe['text']`` is passed through
    ``preprocess_text``. The new column is written onto the DataFrame that
    was passed in, and that same object is returned.

    Args:
        dataframe: DataFrame containing a ``text`` column.

    Returns:
        The input DataFrame, now carrying ``processed_text``.
    """
    raw_text = dataframe['text']
    cleaned_text = raw_text.apply(preprocess_text)
    dataframe['processed_text'] = cleaned_text
    return dataframe
48
 
# Create CountVectorizer and fit_transform
def create_dtm(dataframe):
    """Build a document-term matrix from the ``processed_text`` column.

    A fresh ``CountVectorizer`` is fitted on ``dataframe['processed_text']``
    and used to transform that same column.

    Args:
        dataframe: DataFrame containing a ``processed_text`` column.

    Returns:
        tuple: ``(dtm, vectorizer)`` - the matrix produced by
        ``fit_transform`` and the fitted vectorizer that produced it.
    """
    documents = dataframe['processed_text']
    count_vectorizer = CountVectorizer()
    term_counts = count_vectorizer.fit_transform(documents)
    return term_counts, count_vectorizer
 
 
54
 
# Plot word frequency
def plot_word_frequency(dtm, vectorizer, top_n=20):
    """Draw a bar chart of the most frequent terms in a document-term matrix.

    Args:
        dtm: Document-term matrix; it is summed over axis 0 to obtain one
            total per term. Assumes its columns line up with
            ``vectorizer.get_feature_names_out()`` - confirm against caller.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        top_n: How many of the most frequent terms to plot. Defaults to 20,
            the value that was previously hard-coded.

    Returns:
        matplotlib.figure.Figure: the figure holding the chart. The figure
        is returned rather than shown with ``plt.show()`` so that a caller
        expecting a plot value (this function is registered with a "plot"
        output) receives something to render. Script callers can call
        ``plt.show()`` themselves on the result.
    """
    # Summing a sparse matrix yields a 1 x n_terms matrix; flatten it so it
    # can sit next to the 1-D array of term names.
    totals = np.asarray(dtm.sum(axis=0)).ravel()
    terms = vectorizer.get_feature_names_out()

    word_freq_df = pd.DataFrame({'word': terms, 'frequency': totals})
    top_words = word_freq_df.sort_values(by='frequency', ascending=False).head(top_n)

    # Draw on an explicit figure/axes pair instead of pyplot's implicit
    # "current figure", so the chart can be handed back to the caller.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x='word', y='frequency', data=top_words, ax=ax)
    ax.set_title(f'Top {top_n} Words Frequency')
    ax.set_xlabel('Words')
    ax.set_ylabel('Frequency')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    return fig
73
 
# Adapters between the analysis functions and Gradio components.
# create_dtm returns a (matrix, vectorizer) pair and plot_word_frequency
# expects that same pair; neither can be passed through "dataframe"/"text"
# components directly, so these helpers convert at the UI boundary.
def _dtm_preview(dataframe):
    """Run ``create_dtm`` and convert its result into displayable outputs.

    Returns:
        tuple: ``(dtm_frame, summary)`` - the matrix as a DataFrame with one
        column per term, and a one-line text description of its shape.
    """
    dtm, vectorizer = create_dtm(dataframe)
    terms = vectorizer.get_feature_names_out()
    # Densified for display; this can get large for big corpora.
    dtm_frame = pd.DataFrame(dtm.toarray(), columns=terms)
    summary = f"{dtm.shape[0]} documents x {dtm.shape[1]} terms"
    return dtm_frame, summary


def _word_frequency_figure(dataframe):
    """Build the DTM for ``dataframe`` and return the word-frequency figure."""
    dtm, vectorizer = create_dtm(dataframe)
    figure = plot_word_frequency(dtm, vectorizer)
    # If plot_word_frequency only draws on pyplot's current figure and
    # returns nothing, hand that current figure to the plot component.
    return figure if figure is not None else plt.gcf()


# Define Gradio Interface
iface = gr.Interface(
    fn=load_dataframe,
    inputs=gr.inputs.File(label="Upload CSV file"),
    # load_dataframe returns a single DataFrame, so declare a single output.
    outputs="dataframe",
    title="DataFrame Preprocessing",
    description="Upload a CSV file containing tweet data and preprocess it.",
    allow_flagging=False
)

# Preprocess DataFrame
iface2 = gr.Interface(
    fn=preprocess_dataframe,
    inputs=gr.inputs.Dataframe(label="Enter your DataFrame here"),
    outputs="dataframe",
    title="DataFrame Preprocessing",
    description="Preprocess the DataFrame by removing stopwords, URLs, and non-alphabetic characters, and perform stemming.",
    allow_flagging=False
)

# Create DTM
iface3 = gr.Interface(
    fn=_dtm_preview,
    inputs=gr.inputs.Dataframe(label="Enter your DataFrame here"),
    outputs=["dataframe", "text"],
    title="Create Document-Term Matrix",
    description="Create a Document-Term Matrix (DTM) from the preprocessed DataFrame.",
    allow_flagging=False
)

# Plot Word Frequency
# NOTE(review): this tab previously declared ["dataframe", "text"] inputs and
# passed them to plot_word_frequency as (dtm, vectorizer). It now takes the
# preprocessed DataFrame and builds the DTM itself - confirm this is the
# intended workflow.
iface4 = gr.Interface(
    fn=_word_frequency_figure,
    inputs=gr.inputs.Dataframe(label="Enter your DataFrame here"),
    outputs="plot",
    title="Plot Word Frequency",
    description="Plot the frequency of the top 20 words in the Document-Term Matrix.",
    allow_flagging=False
)

# Launch the interfaces
# Serve all four tools from one app: calling launch() on each interface in
# turn would only expose the first one while it is running.
app = gr.TabbedInterface(
    [iface, iface2, iface3, iface4],
    ["Load CSV", "Preprocess", "Document-Term Matrix", "Word Frequency"],
)
app.launch()