SantanuBanerjee committed on
Commit cc1cd60
1 Parent(s): a79720f

Update app.py

Files changed (1)
app.py  +228 -1
app.py CHANGED
@@ -1,14 +1,30 @@
import gradio as gr
import pandas as pd

def process_excel(file):
    try:
        # Ensure the file path is correct
        file_path = file.name if hasattr(file, 'name') else file
        # Read the Excel file
        df = pd.read_excel(file_path)
        # Perform any processing on the DataFrame here
-         return df.head()  # Return the first few rows as an example
    except Exception as e:
        return str(e)  # Return the error message

@@ -24,3 +40,214 @@ interface = gr.Interface(
# Launch the interface
if __name__ == "__main__":
    interface.launch()

import gradio as gr
import pandas as pd

+
+
+
+
+ def nlp_pipeline(original_df):
+     original_df['Sum'] = original_df['a'] + original_df['b']
+     return original_df
+
+
+
def process_excel(file):
    try:
        # Ensure the file path is correct
        file_path = file.name if hasattr(file, 'name') else file
        # Read the Excel file
        df = pd.read_excel(file_path)
+
        # Perform any processing on the DataFrame here
+         # Example: adding a new column with the sum of two other columns
+         # df['Sum'] = df['Column1'] + df['Column2']
+         result_df = nlp_pipeline(df)
+
+         return result_df  # Return the processed DataFrame
+
    except Exception as e:
        return str(e)  # Return the error message
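A quick round trip through a throwaway spreadsheet is enough to sanity-check the placeholder pipeline above. The snippet below is an illustrative sketch, not part of the commit: the column names 'a' and 'b' are simply what nlp_pipeline currently expects, any sheet without them falls through to the except branch and comes back as an error string, and "sample.xlsx" is a made-up file name.

# Hypothetical smoke test (not part of the commit); requires openpyxl for .xlsx I/O.
import pandas as pd
pd.DataFrame({'a': [1, 2], 'b': [3, 4]}).to_excel("sample.xlsx", index=False)
print(process_excel("sample.xlsx"))  # expected: the same frame plus a 'Sum' column [4, 6]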
 
 
# Launch the interface
if __name__ == "__main__":
    interface.launch()
+
+
+
+
+
+
+
+
+
+
+
+ # #!/usr/bin/env python
+ # # coding: utf-8
+
+ # import pandas as pd
+ # import string
+ # import nltk
+ # import seaborn as sns
+ # import matplotlib.pyplot as plt
+ # from nltk.corpus import stopwords
+ # from nltk.tokenize import word_tokenize
+ # from nltk.sentiment import SentimentIntensityAnalyzer
+ # from sklearn.feature_extraction.text import TfidfVectorizer
+ # from sklearn.cluster import KMeans
+ # from transformers import T5ForConditionalGeneration, T5Tokenizer
+ # from datasets import Dataset
+
+ # # Load the data
+ # file_responses = pd.read_excel("#TaxDirection (Responses).xlsx")
+
+ # # Process financial allocations
+ # def process_allocations(df, col_name):
+ #     return pd.to_numeric(df[col_name], errors='coerce').fillna(0)
+
+ # columns_to_process = [
+ #     '''Your financial allocation for Problem 1:
+ # Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a specific solution for your 1st problem.''',
+ #     '''Your financial allocation for Problem 2:
+ # Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 2nd problem.''',
+ #     '''Your financial allocation for Problem 3:
+ # Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 3rd problem.'''
+ # ]
+
+ # for col in columns_to_process:
+ #     file_responses[col] = process_allocations(file_responses, col)
+
+ # file_responses['How much was your latest Tax payment (in U$D)?'] = pd.to_numeric(
+ #     file_responses['How much was your latest Tax payment (in U$D)?'], errors='coerce').fillna(0)
+
+ # # Compute total allocation and financial weights
+ # file_responses['Total Allocation'] = file_responses[columns_to_process].apply(lambda x: x.clip(lower=10)).sum(axis=1)
+
+ # for i in range(1, 4):
+ #     file_responses[f'Financial Token Weight for Problem {i}'] = (
+ #         file_responses['How much was your latest Tax payment (in U$D)?'] *
+ #         file_responses[columns_to_process[i - 1]] /
+ #         file_responses['Total Allocation']
+ #     )
+
+ # # Create initial datasets
+ # initial_datasets = []
+ # for i in range(1, 4):
+ #     initial_datasets.append(
+ #         file_responses[[f'''Describe Problem {i}:
+ # Enter the context of the problem.
+ # What are the difficulties you are facing personally or as a part of an organization?
+ # You may briefly propose a solution idea as well.''',
+ #         f'''Problem {i}: Geographical Location :
+ # Where is the location you are facing this problem?
+ # You may mention the nearby geographical area of the proposed solution as:
+ # City/Town, State/Province, Country.''',
+ #         f'Financial Token Weight for Problem {i}']]
+ #     )
+
+ # # Rename columns
+ # for idx, df in enumerate(initial_datasets):
+ #     initial_datasets[idx] = df.rename(columns={
+ #         df.columns[0]: 'Problem_Description',
+ #         df.columns[1]: 'Geographical_Location',
+ #         df.columns[2]: 'Financial_Weight'
+ #     })
+
+ # # Merge datasets
+ # merged_dataset = pd.concat(initial_datasets, ignore_index=True)
+
+ # # Preprocess text
+ # nltk.download('stopwords')
+ # nltk.download('punkt')
+ # nltk.download('omw-1.4')
+
+ # def preprocess_text(text):
+ #     translator = str.maketrans("", "", string.punctuation)
+ #     text = text.translate(translator)
+ #     tokens = word_tokenize(text)
+ #     stop_words = set(stopwords.words('english'))
+ #     tokens = [word for word in tokens if word.lower() not in stop_words]
+ #     return ' '.join(tokens)
+
+ # merged_dataset['Problem_Description'] = merged_dataset['Problem_Description'].astype(str).apply(preprocess_text)
+ # merged_dataset['Problem_Description'] = merged_dataset['Problem_Description'].str.replace(r'\d+', '', regex=True)
+ # merged_dataset['Geographical_Location'] = merged_dataset['Geographical_Location'].str.replace(r'\d+', '', regex=True)
+ # merged_dataset['Problem_Description'] = merged_dataset['Problem_Description'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
+ # merged_dataset['Geographical_Location'] = merged_dataset['Geographical_Location'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
+
+ # # Lemmatize text
+ # lemmatizer = nltk.WordNetLemmatizer()
+ # merged_dataset['Problem_Description'] = merged_dataset['Problem_Description'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
+
+ # # Clustering
+ # corpus = merged_dataset['Problem_Description'].tolist()
+ # tfidf_vectorizer = TfidfVectorizer(max_features=77000)
+ # tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
+
+ # problem_cluster_count = 77
+ # kmeans = KMeans(n_clusters=problem_cluster_count)
+ # kmeans.fit(tfidf_matrix)
+
+ # terms = tfidf_vectorizer.get_feature_names_out()
+ # ordered_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
+
+ # cluster_representations = {}
+ # for i in range(kmeans.n_clusters):
+ #     cluster_representations[i] = [terms[ind] for ind in ordered_centroids[i, :17]]
+
+ # merged_dataset['Problem_Category_Numeric'] = kmeans.labels_
+ # merged_dataset['Problem_Category_Words'] = [cluster_representations[label] for label in kmeans.labels_]
+
+ # # Clustering geographical locations
+ # geographical_data = merged_dataset['Geographical_Location'].tolist()
+ # tfidf_vectorizer_geography = TfidfVectorizer(max_features=3000)
+ # tfidf_matrix_geography = tfidf_vectorizer_geography.fit_transform(geographical_data)
+
+ # location_cluster_count = 33
+ # kmeans_locations = KMeans(n_clusters=location_cluster_count)
+ # kmeans_locations.fit(tfidf_matrix_geography)
+
+ # terms_geography = tfidf_vectorizer_geography.get_feature_names_out()
+ # ordered_centroids_geography = kmeans_locations.cluster_centers_.argsort()[:, ::-1]
+
+ # cluster_representations_geography = {}
+ # for i in range(kmeans_locations.n_clusters):
+ #     cluster_representations_geography[i] = [terms_geography[ind] for ind in ordered_centroids_geography[i, :5]]
+
+ # merged_dataset['Location_Category_Numeric'] = kmeans_locations.labels_
+ # merged_dataset['Location_Category_Words'] = [cluster_representations_geography[label] for label in kmeans_locations.labels_]
+
+ # # Create 2D matrices for problem descriptions and financial weights
+ # matrix2Dfinances = [[[] for _ in range(location_cluster_count)] for _ in range(problem_cluster_count)]
+ # matrix2Dproblems = [[[] for _ in range(location_cluster_count)] for _ in range(problem_cluster_count)]
+
+ # for index, row in merged_dataset.iterrows():
+ #     location_index = row['Location_Category_Numeric']
+ #     problem_index = row['Problem_Category_Numeric']
+ #     problem_description = row['Problem_Description']
+ #     financial_wt = row['Financial_Weight']
+
+ #     matrix2Dproblems[problem_index][location_index].append(problem_description)
+ #     matrix2Dfinances[problem_index][location_index].append(financial_wt)
+
+ # # Aggregating financial weights
+ # aggregated_Financial_wts = {}
+ # un_aggregated_Financial_wts = {}
+
+ # for Financ_wt_index, Financ_wt_row in enumerate(matrix2Dfinances):
+ #     aggregated_Financial_wts[Financ_wt_index] = {}
+ #     un_aggregated_Financial_wts[Financ_wt_index] = {}
+
+ #     for location_index, cell_finances in enumerate(Financ_wt_row):
+ #         cell_sum = sum(cell_finances)
+ #         aggregated_Financial_wts[Financ_wt_index][location_index] = cell_sum
+ #         un_aggregated_Financial_wts[Financ_wt_index][location_index] = cell_finances
+
+ # matrix2Dfinances_df = pd.DataFrame(aggregated_Financial_wts)
+ # matrix2Dfinances_df.to_excel('matrix2Dfinances_HeatMap.xlsx', index=True)
+
+ # unagregated_finances_df = pd.DataFrame(un_aggregated_Financial_wts)
+ # unagregated_finances_df.to_excel('UNaggregated Financial Weights.xlsx', index=True)
+
+ # # Create heatmaps
+ # plt.figure(figsize=(15, 7))
+ # sns.heatmap(matrix2Dfinances_df, annot=False, cmap='RdYlGn')
+ # plt.title('Project Financial Weights')
+ # plt.ylabel('Location Clusters')
+ # plt.xlabel('Problem Clusters')
+ # plt.savefig('Project Financial Weights_HeatMap_GreenHigh.png')
+ # plt.show()
+
+ # plt.figure(figsize=(14, 6))
+ # sns.heatmap(matrix2Dfinances_df, annot=False, cmap='RdYlGn_r')
+ # plt.title('Project Financial Weights')
+ # plt.ylabel('Location Clusters')
+ # plt.xlabel('Problem Clusters')
+ # plt.savefig('Project Financial Weights_HeatMap_RedHigh.png')
+ # plt.show()
+
+ # # Summarizing problems using T5
+ # model = T5ForConditionalGeneration.from_pretrained('t5-small')
+ # tokenizer = T5Tokenizer.from_pretrained('t5-small')
+
+ # def t5_summarize(text):
+ #     input_text = "summarize: " + text
+ #     inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
+ #     summary_ids = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
+ #     return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+
+ # summarized_problems = [[t5_summarize(" ".join(cell)) for cell in row] for row in matrix2Dproblems]
+
+ # # Save summarized problems
+ # with open('summarized_problems.txt', 'w') as file:
+ #     for problem_row in summarized_problems:
+ #         file.write("\t".join(problem_row) + "\n")
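
For readers skimming the commented-out pipeline, the financial weighting step is a simple proportion: each problem's weight is the respondent's tax payment times that problem's allocation percentage, divided by the respondent's total allocation, where the total is computed from allocations clipped at a floor of 10. A standalone sketch with invented numbers (not data from the app) illustrates the arithmetic:

# Standalone illustration of the weighting arithmetic; all values are made up.
import pandas as pd

responses = pd.DataFrame({
    'Tax_Payment': [1000.0],
    'Alloc_P1': [50.0], 'Alloc_P2': [30.0], 'Alloc_P3': [5.0],  # P3 sits below the floor of 10
})
alloc_cols = ['Alloc_P1', 'Alloc_P2', 'Alloc_P3']

# Denominator: allocations clipped at 10, then summed per respondent -> 50 + 30 + 10 = 90
total_allocation = responses[alloc_cols].clip(lower=10).sum(axis=1)

# Numerator uses the raw (unclipped) allocation, mirroring the commented-out code.
for i, col in enumerate(alloc_cols, start=1):
    responses[f'Weight_P{i}'] = responses['Tax_Payment'] * responses[col] / total_allocation

print(responses[['Weight_P1', 'Weight_P2', 'Weight_P3']])
# Weight_P1 = 1000 * 50 / 90 ~ 555.6, Weight_P2 ~ 333.3, Weight_P3 = 1000 * 5 / 90 ~ 55.6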
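
Finally, the gr.Interface definition referenced by the second hunk header ("interface = gr.Interface(") lies outside the diff context, so only its opening line is visible. For orientation, here is a minimal sketch of how process_excel is commonly wired to such an interface; the component choices, labels, and title are assumptions, not the committed code.

# Hypothetical wiring; the actual interface definition is not shown in this diff.
interface = gr.Interface(
    fn=process_excel,                             # function defined in the hunk above
    inputs=gr.File(label="Upload an Excel file"),
    outputs=gr.Dataframe(label="Processed output"),
    title="Excel processor",
)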