Spaces:

SantanuBanerjee
/

TaxDirection

Sleeping

App Files Files Community

SantanuBanerjee committed on Jul 30, 2024

Commit

61e6b62

verified ·

1 Parent(s): 2d6a87c

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -302

app.py CHANGED Viewed

@@ -1,99 +1,49 @@
 import gradio as gr
 import pandas as pd
 def data_pre_processing(file_responses):
     # Financial Weights are in per decas and NOT per cents
-    ### GPT: Assuming 'Your financial allocation for Problem (in $)' column contains numerical values
-    file_responses['''Your financial allocation for Problem 1:
-    Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a specific solution for your 1st problem.'''] = pd.to_numeric(file_responses['''Your financial allocation for Problem 1:
-    Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a specific solution for your 1st problem.'''], errors='coerce').fillna(0)
-    file_responses['''Your financial allocation for Problem 2:
-    Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 2nd problem.'''] = pd.to_numeric(file_responses['''Your financial allocation for Problem 2:
-    Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 2nd problem.'''], errors='coerce').fillna(0)
-    file_responses['''Your financial allocation for Problem 3:
-    Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 3rd problem.'''] = pd.to_numeric(file_responses['''Your financial allocation for Problem 3:
-    Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 3rd problem.'''], errors='coerce').fillna(0)
-    file_responses['''How much was your latest Tax payment (in U$D) ?
-    Please try to be as accurate as possible:
-    Eg.: If your last tax amount was INR 25,785/-; then convert it in U$D and enter only the amount as: 310.
-    If you have never paid tax, consider putting in a realistic donation amount which wish to contribute towards helping yourself obtain the desired relief.'''
-    ] = pd.to_numeric(file_responses['''How much was your latest Tax payment (in U$D) ?
-    Please try to be as accurate as possible:
-    Eg.: If your last tax amount was INR 25,785/-; then convert it in U$D and enter only the amount as: 310.
-    If you have never paid tax, consider putting in a realistic donation amount which wish to contribute towards helping yourself obtain the desired relief.'''
-    ], errors='coerce').fillna(0)
-    # Adding a new column 'Total Allocation' by summing specific columns by their names
-    file_responses['Total Allocation'] = file_responses[['''Your financial allocation for Problem 1:
-    Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a specific solution for your 1st problem.''' , '''Your financial allocation for Problem 2:
-    Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 2nd problem.''' , '''Your financial allocation for Problem 3:
-    Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 3rd problem.''']].apply(lambda x: x.clip(lower=10)).sum(axis=1)
-    # Creating 'Financial Weight' column by dividing 'Your financial allocation for Problem 1' by 'Total Allocation' and multiplying this with the assigned decage (similar to percentage but for 10) for Problem 1
-    file_responses['Financial Token Weight for Problem 1'] = file_responses['''How much was your latest Tax payment (in U$D) ?
-    Please try to be as accurate as possible:
-    Eg.: If your last tax amount was INR 25,785/-; then convert it in U$D and enter only the amount as: 310.
-    If you have never paid tax, consider putting in a realistic donation amount which wish to contribute towards helping yourself obtain the desired relief.'''
-    ] * file_responses['''Your financial allocation for Problem 1:
-    Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a specific solution for your 1st problem.'''] / file_responses['Total Allocation']
-    file_responses['Financial Token Weight for Problem 2'] = file_responses['''How much was your latest Tax payment (in U$D) ?
-    Please try to be as accurate as possible:
-    Eg.: If your last tax amount was INR 25,785/-; then convert it in U$D and enter only the amount as: 310.
-    If you have never paid tax, consider putting in a realistic donation amount which wish to contribute towards helping yourself obtain the desired relief.'''
-    ] * file_responses['''Your financial allocation for Problem 2:
-    Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 2nd problem.'''] / file_responses['Total Allocation']
-    file_responses['Financial Token Weight for Problem 3'] = file_responses['''How much was your latest Tax payment (in U$D) ?
-    Please try to be as accurate as possible:
-    Eg.: If your last tax amount was INR 25,785/-; then convert it in U$D and enter only the amount as: 310.
-    If you have never paid tax, consider putting in a realistic donation amount which wish to contribute towards helping yourself obtain the desired relief.'''
-    ] * file_responses['''Your financial allocation for Problem 3:
-    Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 3rd problem.'''] / file_responses['Total Allocation']
-    return file_responses
 def nlp_pipeline(original_df):
     processed_df = data_pre_processing(original_df)
-    #original_df['Sum'] = original_df['a'] + original_df['b']
     return processed_df
 def process_excel(file):
     try:
@@ -102,12 +52,10 @@ def process_excel(file):
         # Read the Excel file
         df = pd.read_excel(file_path)
-        # Perform any processing on the DataFrame here
-        # Example: adding a new column with the sum of two other columns
         result_df = nlp_pipeline(df)
-        return result_df  # Return the first few rows as an example
     except Exception as e:
         return str(e)  # Return the error message
@@ -124,214 +72,3 @@ interface = gr.Interface(
 # Launch the interface
 if __name__ == "__main__":
     interface.launch()
-# #!/usr/bin/env python
-# # coding: utf-8
-# import pandas as pd
-# import string
-# import nltk
-# import seaborn as sns
-# import matplotlib.pyplot as plt
-# from nltk.corpus import stopwords
-# from nltk.tokenize import word_tokenize
-# from nltk.sentiment import SentimentIntensityAnalyzer
-# from sklearn.feature_extraction.text import TfidfVectorizer
-# from sklearn.cluster import KMeans
-# from transformers import T5ForConditionalGeneration, T5Tokenizer
-# from datasets import Dataset
-# # Load the data
-# file_responses = pd.read_excel("#TaxDirection (Responses).xlsx")
-# # Process financial allocations
-# def process_allocations(df, col_name):
-#     return pd.to_numeric(df[col_name], errors='coerce').fillna(0)
-# columns_to_process = [
-#     '''Your financial allocation for Problem 1:
-# Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a specific solution for your 1st problem.''',
-#     '''Your financial allocation for Problem 2:
-# Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 2nd problem.''',
-#     '''Your financial allocation for Problem 3:
-# Mention the percentage of your Tax Amount which you wish the Government would allocate through their annual budget, to implement a solution specifically to your 3rd problem.'''
-# ]
-# for col in columns_to_process:
-#     file_responses[col] = process_allocations(file_responses, col)
-# file_responses['How much was your latest Tax payment (in U$D)?'] = pd.to_numeric(
-#     file_responses['How much was your latest Tax payment (in U$D)?'], errors='coerce').fillna(0)
-# # Compute total allocation and financial weights
-# file_responses['Total Allocation'] = file_responses[columns_to_process].apply(lambda x: x.clip(lower=10)).sum(axis=1)
-# for i in range(1, 4):
-#     file_responses[f'Financial Token Weight for Problem {i}'] = (
-#         file_responses['How much was your latest Tax payment (in U$D)?'] *
-#         file_responses[columns_to_process[i - 1]] /
-#         file_responses['Total Allocation']
-#     )
-# # Create initial datasets
-# initial_datasets = []
-# for i in range(1, 4):
-#     initial_datasets.append(
-#         file_responses[[f'''Describe Problem {i}:
-# Enter the context of the problem.
-# What are the difficulties you are facing personally or as a part of an organization?
-# You may briefly propose a solution idea as well.''',
-#             f'''Problem {i}: Geographical Location :
-# Where is the location you are facing this problem?
-# You may mention the nearby geographical area of the proposed solution as:
-# City/Town, State/Province, Country.''',
-#             f'Financial Token Weight for Problem {i}']]
-#     )
-# # Rename columns
-# for idx, df in enumerate(initial_datasets):
-#     initial_datasets[idx] = df.rename(columns={
-#         df.columns[0]: 'Problem_Description',
-#         df.columns[1]: 'Geographical_Location',
-#         df.columns[2]: 'Financial_Weight'
-#     })
-# # Merge datasets
-# merged_dataset = pd.concat(initial_datasets, ignore_index=True)
-# # Preprocess text
-# nltk.download('stopwords')
-# nltk.download('punkt')
-# nltk.download('omw-1.4')
-# def preprocess_text(text):
-#     translator = str.maketrans("", "", string.punctuation)
-#     text = text.translate(translator)
-#     tokens = word_tokenize(text)
-#     stop_words = set(stopwords.words('english'))
-#     tokens = [word for word in tokens if word.lower() not in stop_words]
-#     return ' '.join(tokens)
-# merged_dataset['Problem_Description'] = merged_dataset['Problem_Description'].astype(str).apply(preprocess_text)
-# merged_dataset['Problem_Description'] = merged_dataset['Problem_Description'].str.replace(r'\d+', '', regex=True)
-# merged_dataset['Geographical_Location'] = merged_dataset['Geographical_Location'].str.replace(r'\d+', '', regex=True)
-# merged_dataset['Problem_Description'] = merged_dataset['Problem_Description'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
-# merged_dataset['Geographical_Location'] = merged_dataset['Geographical_Location'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
-# # Lemmatize text
-# lemmatizer = nltk.WordNetLemmatizer()
-# merged_dataset['Problem_Description'] = merged_dataset['Problem_Description'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
-# # Clustering
-# corpus = merged_dataset['Problem_Description'].tolist()
-# tfidf_vectorizer = TfidfVectorizer(max_features=77000)
-# tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
-# problem_cluster_count = 77
-# kmeans = KMeans(n_clusters=problem_cluster_count)
-# kmeans.fit(tfidf_matrix)
-# terms = tfidf_vectorizer.get_feature_names_out()
-# ordered_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
-# cluster_representations = {}
-# for i in range(kmeans.n_clusters):
-#     cluster_representations[i] = [terms[ind] for ind in ordered_centroids[i, :17]]
-# merged_dataset['Problem_Category_Numeric'] = kmeans.labels_
-# merged_dataset['Problem_Category_Words'] = [cluster_representations[label] for label in kmeans.labels_]
-# # Clustering geographical locations
-# geographical_data = merged_dataset['Geographical_Location'].tolist()
-# tfidf_vectorizer_geography = TfidfVectorizer(max_features=3000)
-# tfidf_matrix_geography = tfidf_vectorizer_geography.fit_transform(geographical_data)
-# location_cluster_count = 33
-# kmeans_locations = KMeans(n_clusters=location_cluster_count)
-# kmeans_locations.fit(tfidf_matrix_geography)
-# terms_geography = tfidf_vectorizer_geography.get_feature_names_out()
-# ordered_centroids_geography = kmeans_locations.cluster_centers_.argsort()[:, ::-1]
-# cluster_representations_geography = {}
-# for i in range(kmeans_locations.n_clusters):
-#     cluster_representations_geography[i] = [terms_geography[ind] for ind in ordered_centroids_geography[i, :5]]
-# merged_dataset['Location_Category_Numeric'] = kmeans_locations.labels_
-# merged_dataset['Location_Category_Words'] = [cluster_representations_geography[label] for label in kmeans_locations.labels_]
-# # Create 2D matrices for problem descriptions and financial weights
-# matrix2Dfinances = [[[] for _ in range(location_cluster_count)] for _ in range(problem_cluster_count)]
-# matrix2Dproblems = [[[] for _ in range(location_cluster_count)] for _ in range(problem_cluster_count)]
-# for index, row in merged_dataset.iterrows():
-#     location_index = row['Location_Category_Numeric']
-#     problem_index = row['Problem_Category_Numeric']
-#     problem_description = row['Problem_Description']
-#     financial_wt = row['Financial_Weight']
-#     matrix2Dproblems[problem_index][location_index].append(problem_description)
-#     matrix2Dfinances[problem_index][location_index].append(financial_wt)
-# # Aggregating financial weights
-# aggregated_Financial_wts = {}
-# un_aggregated_Financial_wts = {}
-# for Financ_wt_index, Financ_wt_row in enumerate(matrix2Dfinances):
-#     aggregated_Financial_wts[Financ_wt_index] = {}
-#     un_aggregated_Financial_wts[Financ_wt_index] = {}
-#     for location_index, cell_finances in enumerate(Financ_wt_row):
-#         cell_sum = sum(cell_finances)
-#         aggregated_Financial_wts[Financ_wt_index][location_index] = cell_sum
-#         un_aggregated_Financial_wts[Financ_wt_index][location_index] = cell_finances
-# matrix2Dfinances_df = pd.DataFrame(aggregated_Financial_wts)
-# matrix2Dfinances_df.to_excel('matrix2Dfinances_HeatMap.xlsx', index=True)
-# unagregated_finances_df = pd.DataFrame(un_aggregated_Financial_wts)
-# unagregated_finances_df.to_excel('UNaggregated Financial Weights.xlsx', index=True)
-# # Create heatmaps
-# plt.figure(figsize=(15, 7))
-# sns.heatmap(matrix2Dfinances_df, annot=False, cmap='RdYlGn')
-# plt.title('Project Financial Weights')
-# plt.ylabel('Location Clusters')
-# plt.xlabel('Problem Clusters')
-# plt.savefig('Project Financial Weights_HeatMap_GreenHigh.png')
-# plt.show()
-# plt.figure(figsize=(14, 6))
-# sns.heatmap(matrix2Dfinances_df, annot=False, cmap='RdYlGn_r')
-# plt.title('Project Financial Weights')
-# plt.ylabel('Location Clusters')
-# plt.xlabel('Problem Clusters')
-# plt.savefig('Project Financial Weights_HeatMap_RedHigh.png')
-# plt.show()
-# # Summarizing problems using T5
-# model = T5ForConditionalGeneration.from_pretrained('t5-small')
-# tokenizer = T5Tokenizer.from_pretrained('t5-small')
-# def t5_summarize(text):
-#     input_text = "summarize: " + text
-#     inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
-#     summary_ids = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
-#     return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-# summarized_problems = [[t5_summarize(" ".join(cell)) for cell in row] for row in matrix2Dproblems]
-# # Save summarized problems
-# with open('summarized_problems.txt', 'w') as file:
-#     for problem_row in summarized_problems:
-#         file.write("\t".join(problem_row) + "\n")

 import gradio as gr
 import pandas as pd
 def data_pre_processing(file_responses):
     """Add allocation totals and per-problem financial token weights.

     The three "financial allocation" columns and the tax-payment column are
     coerced to numbers (unparseable or missing cells become 0). Then
     ``Total Allocation`` and ``Financial Token Weight for Problem 1..3`` are
     written, each weight being tax payment * allocation / total allocation.

     The survey headers span several lines, so they are located by comparing
     names with every run of whitespace collapsed to a single space. This
     keeps the lookup independent of how the header text is indented or
     wrapped, either in this source file or in the spreadsheet.

     Args:
         file_responses: DataFrame of survey responses; modified in place.

     Returns:
         The same DataFrame with the new columns added, or the error message
         as a ``str`` if processing fails (for example, a header is missing).
     """
     # Financial Weights are in per decas and NOT per cents
     allocation_headers = [
         (
             "Your financial allocation for Problem 1: "
             "Mention the percentage of your Tax Amount which you wish the Government would allocate "
             "through their annual budget, to implement a specific solution for your 1st problem."
         ),
         (
             "Your financial allocation for Problem 2: "
             "Mention the percentage of your Tax Amount which you wish the Government would allocate "
             "through their annual budget, to implement a solution specifically to your 2nd problem."
         ),
         (
             "Your financial allocation for Problem 3: "
             "Mention the percentage of your Tax Amount which you wish the Government would allocate "
             "through their annual budget, to implement a solution specifically to your 3rd problem."
         ),
     ]
     tax_header = (
         "How much was your latest Tax payment (in U$D) ? "
         "Please try to be as accurate as possible: "
         "Eg.: If your last tax amount was INR 25,785/-; then convert it in U$D "
         "and enter only the amount as: 310. "
         "If you have never paid tax, consider putting in a realistic donation amount "
         "which wish to contribute towards helping yourself obtain the desired relief."
     )
     def _squash(label):
         # Collapse newlines/indentation so only the wording of a header matters.
         return " ".join(str(label).split())
     try:
         # Map whitespace-normalized header -> actual column label (first one wins).
         actual_by_squashed = {}
         for actual in file_responses.columns:
             actual_by_squashed.setdefault(_squash(actual), actual)
         def _resolve(header):
             key = _squash(header)
             if key not in actual_by_squashed:
                 raise ValueError(f"Missing expected column: {header}")
             return actual_by_squashed[key]
         columns = [_resolve(header) for header in allocation_headers]
         tax_payment_col = _resolve(tax_header)
         # Convert columns to numeric and fill NaN values with 0
         for col in columns + [tax_payment_col]:
             file_responses[col] = pd.to_numeric(file_responses[col], errors='coerce').fillna(0)
         # Calculate the Total Allocation
         total = file_responses[columns].sum(axis=1)
         file_responses['Total Allocation'] = total
         # Mask zero totals so the division yields NaN (then 0) instead of +/-inf.
         safe_total = total.where(total != 0)
         # Calculate Financial Token Weights
         for i, col in enumerate(columns, start=1):
             file_responses[f'Financial Token Weight for Problem {i}'] = (
                 file_responses[tax_payment_col] * file_responses[col] / safe_total
             ).fillna(0)
         return file_responses
     except Exception as e:
         # Existing contract: report failures as text rather than raising.
         return str(e)
 def nlp_pipeline(original_df):
     """Run the processing pipeline on ``original_df`` and return its result.

     The pipeline currently has a single stage, ``data_pre_processing``, whose
     return value is passed straight back to the caller.
     """
     return data_pre_processing(original_df)
 def process_excel(file):
     try:
         # Read the Excel file
         df = pd.read_excel(file_path)
+        # Process the DataFrame
         result_df = nlp_pipeline(df)
+        return result_df  # Return the processed DataFrame
     except Exception as e:
         return str(e)  # Return the error message
 # Launch the interface only when this file is run as a script, not on import
 if __name__ == "__main__":
     interface.launch()