Spaces:

SantanuBanerjee
/

TaxDirection

Sleeping

App Files Files Community

SantanuBanerjee committed on Aug 8, 2024

Commit

5ad8a00

verified ·

1 Parent(s): b6f8dc2

Update app.py

Browse files

This is the best version that can be run. If it doesn't work, you can always fall back to the previous settings, which are the MVP.

Files changed (1) hide show

app.py +13 -14

app.py CHANGED Viewed

@@ -194,8 +194,8 @@ from collections import Counter
 def extract_problem_domains(df,
                             text_column='Processed_ProblemDescription_forDomainExtraction',
-                            cluster_range=(6, 8),
-                            top_words=7):
     consoleMessage_and_Print("Extracting Problem Domains...")
     # Sentence Transformers approach
@@ -276,8 +276,8 @@ def text_processing_for_location(text):
 def extract_location_clusters(df,
                               text_column1='Processed_LocationText_forClustering', # Extracted through NLP
                               text_column2='Geographical_Location', # User Input
-                              cluster_range=(3, 5),
-                              top_words=3):
     # Combine the two text columns
     text_column = "Combined_Location_Text"
     df[text_column] = df[text_column1] + ' ' + df[text_column2]
@@ -361,17 +361,17 @@ from transformers import GPTNeoForCausalLM, GPT2Tokenizer
 def generate_project_proposal(prompt): # Generate the proposal
     default_proposal = "Hyper-local Sustainability Projects would lead to Longevity of the self and Prosperity of the community. Therefore UNSDGs coupled with Longevity initiatives should be focused upon."
-    # model_Name = "EleutherAI/gpt-neo-2.7B"
-    # tempareCHUR = uniform(0.3,0.6)
-    model_Name = "EleutherAI/gpt-neo-1.3B"
-    tempareCHUR = uniform(0.5,0.8)
     consoleMessage_and_Print(f"Trying to access {model_Name} model. The Prompt is: \n{prompt}")
     model = GPTNeoForCausalLM.from_pretrained(model_Name)
     tokenizer = GPT2Tokenizer.from_pretrained(model_Name)
-    model_max_token_limit = 1500 #2048
     try:
         # input_ids = tokenizer.encode(prompt, return_tensors="pt")
@@ -457,9 +457,8 @@ def create_project_proposals(budget_cluster_df, problem_cluster_df, location_clu
                 random.shuffle(shuffled_descriptions)
                 # Prepare the prompt
-                # problems_summary = "; \n".join(problem_descriptions)  # Join all problem descriptions
-                # problems_summary = "; \n".join(problem_descriptions[:3])  # Limit to first 3 for brevity
-                problems_summary = "; \n".join(shuffled_descriptions[:7])
                 # prompt = f"Generate a solution oriented project proposal for the following:\n\nLocation: {location}\nProblem Domain: {problem_domain}\nProblems: {problems_summary}\n\nProject Proposal:"
@@ -767,9 +766,9 @@ def process_excel(file):
 example_files = []
-example_files.append('#TaxDirection (Responses)_BasicExample.xlsx')
 # example_files.append('#TaxDirection (Responses)_IntermediateExample.xlsx')
-# example_files.append('#TaxDirection (Responses)_UltimateExample.xlsx')
 import random

 def extract_problem_domains(df,
                             text_column='Processed_ProblemDescription_forDomainExtraction',
+                            cluster_range=(3, 17),
+                            top_words=10):
     consoleMessage_and_Print("Extracting Problem Domains...")
     # Sentence Transformers approach
 def extract_location_clusters(df,
                               text_column1='Processed_LocationText_forClustering', # Extracted through NLP
                               text_column2='Geographical_Location', # User Input
+                              cluster_range=(3, 17),
+                              top_words=10):
     # Combine the two text columns
     text_column = "Combined_Location_Text"
     df[text_column] = df[text_column1] + ' ' + df[text_column2]
 def generate_project_proposal(prompt): # Generate the proposal
     default_proposal = "Hyper-local Sustainability Projects would lead to Longevity of the self and Prosperity of the community. Therefore UNSDGs coupled with Longevity initiatives should be focused upon."
+    model_Name = "EleutherAI/gpt-neo-2.7B"
+    tempareCHUR = uniform(0.3,0.6)
+    # model_Name = "EleutherAI/gpt-neo-1.3B"
+    # tempareCHUR = uniform(0.5,0.8)
     consoleMessage_and_Print(f"Trying to access {model_Name} model. The Prompt is: \n{prompt}")
     model = GPTNeoForCausalLM.from_pretrained(model_Name)
     tokenizer = GPT2Tokenizer.from_pretrained(model_Name)
+    model_max_token_limit = 2048 #1500
     try:
         # input_ids = tokenizer.encode(prompt, return_tensors="pt")
                 random.shuffle(shuffled_descriptions)
                 # Prepare the prompt
+                # problems_summary = "; \n".join(shuffled_descriptions[:7]) # Limit to first 7 for brevity
+                problems_summary = "; \n".join(shuffled_descriptions) # Join all problem descriptions
                 # prompt = f"Generate a solution oriented project proposal for the following:\n\nLocation: {location}\nProblem Domain: {problem_domain}\nProblems: {problems_summary}\n\nProject Proposal:"
 example_files = []
+# example_files.append('#TaxDirection (Responses)_BasicExample.xlsx')
 # example_files.append('#TaxDirection (Responses)_IntermediateExample.xlsx')
+example_files.append('#TaxDirection (Responses)_UltimateExample.xlsx')
 import random