SantanuBanerjee committed on
Commit
61e6b62
·
verified ·
1 Parent(s): 2d6a87c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -302
app.py CHANGED
@@ -1,99 +1,49 @@
1
  import gradio as gr
2
  import pandas as pd
3
 
4
-
5
-
6
  def data_pre_processing(file_responses):
7
  # Financial Weights are in per decas and NOT per cents
8
-
9
- ### GPT: Assuming 'Your financial allocation for Problem (in $)' column contains numerical values
10
-
11
- file_responses['''Your financial allocation for Problem 1:
12
- Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a specific solution for your 1st problem.'''] = pd.to_numeric(file_responses['''Your financial allocation for Problem 1:
13
- Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a specific solution for your 1st problem.'''], errors='coerce').fillna(0)
14
-
15
- file_responses['''Your financial allocation for Problem 2:
16
- Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 2nd problem.'''] = pd.to_numeric(file_responses['''Your financial allocation for Problem 2:
17
- Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 2nd problem.'''], errors='coerce').fillna(0)
18
-
19
- file_responses['''Your financial allocation for Problem 3:
20
- Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 3rd problem.'''] = pd.to_numeric(file_responses['''Your financial allocation for Problem 3:
21
- Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 3rd problem.'''], errors='coerce').fillna(0)
22
-
23
- file_responses['''How much was your latest Tax payment (in U$D) ?
24
-
25
- Please try to be as accurate as possible:
26
- Eg.: If your last tax amount was INR 25,785/-; then convert it in U$D and enter only the amount as: 310.
27
-
28
- If you have never paid tax, consider putting in a realistic donation amount which wish to contribute towards helping yourself obtain the desired relief.'''
29
- ] = pd.to_numeric(file_responses['''How much was your latest Tax payment (in U$D) ?
30
-
31
- Please try to be as accurate as possible:
32
- Eg.: If your last tax amount was INR 25,785/-; then convert it in U$D and enter only the amount as: 310.
33
-
34
- If you have never paid tax, consider putting in a realistic donation amount which wish to contribute towards helping yourself obtain the desired relief.'''
35
- ], errors='coerce').fillna(0)
36
-
37
-
38
-
39
-
40
-
41
- # Adding a new column 'Total Allocation' by summing specific columns by their names
42
- file_responses['Total Allocation'] = file_responses[['''Your financial allocation for Problem 1:
43
- Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a specific solution for your 1st problem.''' , '''Your financial allocation for Problem 2:
44
- Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 2nd problem.''' , '''Your financial allocation for Problem 3:
45
- Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 3rd problem.''']].apply(lambda x: x.clip(lower=10)).sum(axis=1)
46
-
47
-
48
-
49
-
50
-
51
- # Creating 'Financial Weight' column by dividing 'Your financial allocation for Problem 1' by 'Total Allocation' and multiplying this with the assigned decage (similar to percentage but for 10) for Problem 1
52
- file_responses['Financial Token Weight for Problem 1'] = file_responses['''How much was your latest Tax payment (in U$D) ?
53
-
54
- Please try to be as accurate as possible:
55
- Eg.: If your last tax amount was INR 25,785/-; then convert it in U$D and enter only the amount as: 310.
56
-
57
- If you have never paid tax, consider putting in a realistic donation amount which wish to contribute towards helping yourself obtain the desired relief.'''
58
- ] * file_responses['''Your financial allocation for Problem 1:
59
- Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a specific solution for your 1st problem.'''] / file_responses['Total Allocation']
60
-
61
-
62
- file_responses['Financial Token Weight for Problem 2'] = file_responses['''How much was your latest Tax payment (in U$D) ?
63
-
64
- Please try to be as accurate as possible:
65
- Eg.: If your last tax amount was INR 25,785/-; then convert it in U$D and enter only the amount as: 310.
66
-
67
- If you have never paid tax, consider putting in a realistic donation amount which wish to contribute towards helping yourself obtain the desired relief.'''
68
- ] * file_responses['''Your financial allocation for Problem 2:
69
- Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 2nd problem.'''] / file_responses['Total Allocation']
70
-
71
-
72
- file_responses['Financial Token Weight for Problem 3'] = file_responses['''How much was your latest Tax payment (in U$D) ?
73
-
74
- Please try to be as accurate as possible:
75
- Eg.: If your last tax amount was INR 25,785/-; then convert it in U$D and enter only the amount as: 310.
76
-
77
- If you have never paid tax, consider putting in a realistic donation amount which wish to contribute towards helping yourself obtain the desired relief.'''
78
- ] * file_responses['''Your financial allocation for Problem 3:
79
- Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 3rd problem.'''] / file_responses['Total Allocation']
80
-
81
- return file_responses
82
-
83
-
84
-
85
-
86
-
87
-
88
-
89
-
90
 
91
  def nlp_pipeline(original_df):
92
  processed_df = data_pre_processing(original_df)
93
- #original_df['Sum'] = original_df['a'] + original_df['b']
94
  return processed_df
95
-
96
-
97
 
98
  def process_excel(file):
99
  try:
@@ -102,12 +52,10 @@ def process_excel(file):
102
  # Read the Excel file
103
  df = pd.read_excel(file_path)
104
 
105
- # Perform any processing on the DataFrame here
106
- # Example: adding a new column with the sum of two other columns
107
  result_df = nlp_pipeline(df)
108
 
109
-
110
- return result_df # Return the first few rows as an example
111
 
112
  except Exception as e:
113
  return str(e) # Return the error message
@@ -124,214 +72,3 @@ interface = gr.Interface(
124
  # Launch the interface
125
  if __name__ == "__main__":
126
  interface.launch()
127
-
128
-
129
-
130
-
131
-
132
-
133
-
134
-
135
-
136
-
137
-
138
- # #!/usr/bin/env python
139
- # # coding: utf-8
140
-
141
- # import pandas as pd
142
- # import string
143
- # import nltk
144
- # import seaborn as sns
145
- # import matplotlib.pyplot as plt
146
- # from nltk.corpus import stopwords
147
- # from nltk.tokenize import word_tokenize
148
- # from nltk.sentiment import SentimentIntensityAnalyzer
149
- # from sklearn.feature_extraction.text import TfidfVectorizer
150
- # from sklearn.cluster import KMeans
151
- # from transformers import T5ForConditionalGeneration, T5Tokenizer
152
- # from datasets import Dataset
153
-
154
- # # Load the data
155
- # file_responses = pd.read_excel("#TaxDirection (Responses).xlsx")
156
-
157
- # # Process financial allocations
158
- # def process_allocations(df, col_name):
159
- # return pd.to_numeric(df[col_name], errors='coerce').fillna(0)
160
-
161
- # columns_to_process = [
162
- # '''Your financial allocation for Problem 1:
163
- # Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a specific solution for your 1st problem.''',
164
- # '''Your financial allocation for Problem 2:
165
- # Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 2nd problem.''',
166
- # '''Your financial allocation for Problem 3:
167
- # Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 3rd problem.'''
168
- # ]
169
-
170
- # for col in columns_to_process:
171
- # file_responses[col] = process_allocations(file_responses, col)
172
-
173
- # file_responses['How much was your latest Tax payment (in U$D)?'] = pd.to_numeric(
174
- # file_responses['How much was your latest Tax payment (in U$D)?'], errors='coerce').fillna(0)
175
-
176
- # # Compute total allocation and financial weights
177
- # file_responses['Total Allocation'] = file_responses[columns_to_process].apply(lambda x: x.clip(lower=10)).sum(axis=1)
178
-
179
- # for i in range(1, 4):
180
- # file_responses[f'Financial Token Weight for Problem {i}'] = (
181
- # file_responses['How much was your latest Tax payment (in U$D)?'] *
182
- # file_responses[columns_to_process[i - 1]] /
183
- # file_responses['Total Allocation']
184
- # )
185
-
186
- # # Create initial datasets
187
- # initial_datasets = []
188
- # for i in range(1, 4):
189
- # initial_datasets.append(
190
- # file_responses[[f'''Describe Problem {i}:
191
- # Enter the context of the problem.
192
- # What are the difficulties you are facing personally or as a part of an organization?
193
- # You may briefly propose a solution idea as well.''',
194
- # f'''Problem {i}: Geographical Location :
195
- # Where is the location you are facing this problem?
196
- # You may mention the nearby geographical area of the proposed solution as:
197
- # City/Town, State/Province, Country.''',
198
- # f'Financial Token Weight for Problem {i}']]
199
- # )
200
-
201
- # # Rename columns
202
- # for idx, df in enumerate(initial_datasets):
203
- # initial_datasets[idx] = df.rename(columns={
204
- # df.columns[0]: 'Problem_Description',
205
- # df.columns[1]: 'Geographical_Location',
206
- # df.columns[2]: 'Financial_Weight'
207
- # })
208
-
209
- # # Merge datasets
210
- # merged_dataset = pd.concat(initial_datasets, ignore_index=True)
211
-
212
- # # Preprocess text
213
- # nltk.download('stopwords')
214
- # nltk.download('punkt')
215
- # nltk.download('omw-1.4')
216
-
217
- # def preprocess_text(text):
218
- # translator = str.maketrans("", "", string.punctuation)
219
- # text = text.translate(translator)
220
- # tokens = word_tokenize(text)
221
- # stop_words = set(stopwords.words('english'))
222
- # tokens = [word for word in tokens if word.lower() not in stop_words]
223
- # return ' '.join(tokens)
224
-
225
- # merged_dataset['Problem_Description'] = merged_dataset['Problem_Description'].astype(str).apply(preprocess_text)
226
- # merged_dataset['Problem_Description'] = merged_dataset['Problem_Description'].str.replace(r'\d+', '', regex=True)
227
- # merged_dataset['Geographical_Location'] = merged_dataset['Geographical_Location'].str.replace(r'\d+', '', regex=True)
228
- # merged_dataset['Problem_Description'] = merged_dataset['Problem_Description'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
229
- # merged_dataset['Geographical_Location'] = merged_dataset['Geographical_Location'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
230
-
231
- # # Lemmatize text
232
- # lemmatizer = nltk.WordNetLemmatizer()
233
- # merged_dataset['Problem_Description'] = merged_dataset['Problem_Description'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
234
-
235
- # # Clustering
236
- # corpus = merged_dataset['Problem_Description'].tolist()
237
- # tfidf_vectorizer = TfidfVectorizer(max_features=77000)
238
- # tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
239
-
240
- # problem_cluster_count = 77
241
- # kmeans = KMeans(n_clusters=problem_cluster_count)
242
- # kmeans.fit(tfidf_matrix)
243
-
244
- # terms = tfidf_vectorizer.get_feature_names_out()
245
- # ordered_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
246
-
247
- # cluster_representations = {}
248
- # for i in range(kmeans.n_clusters):
249
- # cluster_representations[i] = [terms[ind] for ind in ordered_centroids[i, :17]]
250
-
251
- # merged_dataset['Problem_Category_Numeric'] = kmeans.labels_
252
- # merged_dataset['Problem_Category_Words'] = [cluster_representations[label] for label in kmeans.labels_]
253
-
254
- # # Clustering geographical locations
255
- # geographical_data = merged_dataset['Geographical_Location'].tolist()
256
- # tfidf_vectorizer_geography = TfidfVectorizer(max_features=3000)
257
- # tfidf_matrix_geography = tfidf_vectorizer_geography.fit_transform(geographical_data)
258
-
259
- # location_cluster_count = 33
260
- # kmeans_locations = KMeans(n_clusters=location_cluster_count)
261
- # kmeans_locations.fit(tfidf_matrix_geography)
262
-
263
- # terms_geography = tfidf_vectorizer_geography.get_feature_names_out()
264
- # ordered_centroids_geography = kmeans_locations.cluster_centers_.argsort()[:, ::-1]
265
-
266
- # cluster_representations_geography = {}
267
- # for i in range(kmeans_locations.n_clusters):
268
- # cluster_representations_geography[i] = [terms_geography[ind] for ind in ordered_centroids_geography[i, :5]]
269
-
270
- # merged_dataset['Location_Category_Numeric'] = kmeans_locations.labels_
271
- # merged_dataset['Location_Category_Words'] = [cluster_representations_geography[label] for label in kmeans_locations.labels_]
272
-
273
- # # Create 2D matrices for problem descriptions and financial weights
274
- # matrix2Dfinances = [[[] for _ in range(location_cluster_count)] for _ in range(problem_cluster_count)]
275
- # matrix2Dproblems = [[[] for _ in range(location_cluster_count)] for _ in range(problem_cluster_count)]
276
-
277
- # for index, row in merged_dataset.iterrows():
278
- # location_index = row['Location_Category_Numeric']
279
- # problem_index = row['Problem_Category_Numeric']
280
- # problem_description = row['Problem_Description']
281
- # financial_wt = row['Financial_Weight']
282
-
283
- # matrix2Dproblems[problem_index][location_index].append(problem_description)
284
- # matrix2Dfinances[problem_index][location_index].append(financial_wt)
285
-
286
- # # Aggregating financial weights
287
- # aggregated_Financial_wts = {}
288
- # un_aggregated_Financial_wts = {}
289
-
290
- # for Financ_wt_index, Financ_wt_row in enumerate(matrix2Dfinances):
291
- # aggregated_Financial_wts[Financ_wt_index] = {}
292
- # un_aggregated_Financial_wts[Financ_wt_index] = {}
293
-
294
- # for location_index, cell_finances in enumerate(Financ_wt_row):
295
- # cell_sum = sum(cell_finances)
296
- # aggregated_Financial_wts[Financ_wt_index][location_index] = cell_sum
297
- # un_aggregated_Financial_wts[Financ_wt_index][location_index] = cell_finances
298
-
299
- # matrix2Dfinances_df = pd.DataFrame(aggregated_Financial_wts)
300
- # matrix2Dfinances_df.to_excel('matrix2Dfinances_HeatMap.xlsx', index=True)
301
-
302
- # unagregated_finances_df = pd.DataFrame(un_aggregated_Financial_wts)
303
- # unagregated_finances_df.to_excel('UNaggregated Financial Weights.xlsx', index=True)
304
-
305
- # # Create heatmaps
306
- # plt.figure(figsize=(15, 7))
307
- # sns.heatmap(matrix2Dfinances_df, annot=False, cmap='RdYlGn')
308
- # plt.title('Project Financial Weights')
309
- # plt.ylabel('Location Clusters')
310
- # plt.xlabel('Problem Clusters')
311
- # plt.savefig('Project Financial Weights_HeatMap_GreenHigh.png')
312
- # plt.show()
313
-
314
- # plt.figure(figsize=(14, 6))
315
- # sns.heatmap(matrix2Dfinances_df, annot=False, cmap='RdYlGn_r')
316
- # plt.title('Project Financial Weights')
317
- # plt.ylabel('Location Clusters')
318
- # plt.xlabel('Problem Clusters')
319
- # plt.savefig('Project Financial Weights_HeatMap_RedHigh.png')
320
- # plt.show()
321
-
322
- # # Summarizing problems using T5
323
- # model = T5ForConditionalGeneration.from_pretrained('t5-small')
324
- # tokenizer = T5Tokenizer.from_pretrained('t5-small')
325
-
326
- # def t5_summarize(text):
327
- # input_text = "summarize: " + text
328
- # inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
329
- # summary_ids = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
330
- # return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
331
-
332
- # summarized_problems = [[t5_summarize(" ".join(cell)) for cell in row] for row in matrix2Dproblems]
333
-
334
- # # Save summarized problems
335
- # with open('summarized_problems.txt', 'w') as file:
336
- # for problem_row in summarized_problems:
337
- # file.write("\t".join(problem_row) + "\n")
 
1
  import gradio as gr
2
  import pandas as pd
3
 
 
 
4
  def data_pre_processing(file_responses):
5
  # Financial Weights are in per decas and NOT per cents
6
+ try:
7
+ # Define the columns to be processed
8
+ columns = [
9
+ '''Your financial allocation for Problem 1:
10
+ Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a specific solution for your 1st problem.''',
11
+ '''Your financial allocation for Problem 2:
12
+ Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 2nd problem.''',
13
+ '''Your financial allocation for Problem 3:
14
+ Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 3rd problem.'''
15
+ ]
16
+
17
+ # Convert columns to numeric and fill NaN values with 0
18
+ for col in columns:
19
+ file_responses[col] = pd.to_numeric(file_responses[col], errors='coerce').fillna(0)
20
+
21
+ # Calculate the Total Allocation
22
+ file_responses['Total Allocation'] = file_responses[columns].sum(axis=1)
23
+
24
+ # Convert the Tax Payment column to numeric
25
+ tax_payment_col = '''How much was your latest Tax payment (in U$D) ?
26
+
27
+ Please try to be as accurate as possible:
28
+ Eg.: If your last tax amount was INR 25,785/-; then convert it in U$D and enter only the amount as: 310.
29
+
30
+ If you have never paid tax, consider putting in a realistic donation amount which wish to contribute towards helping yourself obtain the desired relief.'''
31
+
32
+ file_responses[tax_payment_col] = pd.to_numeric(file_responses[tax_payment_col], errors='coerce').fillna(0)
33
+
34
+ # Calculate Financial Token Weights
35
+ for i, col in enumerate(columns, start=1):
36
+ file_responses[f'Financial Token Weight for Problem {i}'] = (
37
+ file_responses[tax_payment_col] * file_responses[col] / file_responses['Total Allocation']
38
+ ).fillna(0)
39
+
40
+ return file_responses
41
+ except Exception as e:
42
+ return str(e)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
  def nlp_pipeline(original_df):
45
  processed_df = data_pre_processing(original_df)
 
46
  return processed_df
 
 
47
 
48
  def process_excel(file):
49
  try:
 
52
  # Read the Excel file
53
  df = pd.read_excel(file_path)
54
 
55
+ # Process the DataFrame
 
56
  result_df = nlp_pipeline(df)
57
 
58
+ return result_df # Return the processed DataFrame
 
59
 
60
  except Exception as e:
61
  return str(e) # Return the error message
 
72
  # Launch the interface
73
  if __name__ == "__main__":
74
  interface.launch()