Spaces:

SantanuBanerjee
/

TaxDirection

Sleeping

App Files Files Community

SantanuBanerjee committed on Aug 7, 2024

Commit

db2900b

verified ·

1 Parent(s): 1534c20

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -30

app.py CHANGED Viewed

@@ -352,43 +352,56 @@ def create_cluster_dataframes(processed_df):
     return budget_cluster_df, problem_cluster_df
 from transformers import GPTNeoForCausalLM, GPT2Tokenizer
-def generate_project_proposal(prompt):
-    print("Trying to access gpt-neo-1.3B")
-    print("prompt: \t", prompt)
-    # Generate the proposal
-    model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
-    tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
     try:
         # input_ids = tokenizer.encode(prompt, return_tensors="pt")
         # Truncate the prompt to fit within the model's input limits
-        max_input_length = 1024  # Adjust as per your model's limit
-        input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=max_input_length)
         print("Input IDs shape:", input_ids.shape)
         output = model.generate(
             input_ids,
-            # max_length=300,
-            max_new_tokens=500,
             num_return_sequences=1,
             no_repeat_ngram_size=2,
             temperature=0.5,
             pad_token_id=tokenizer.eos_token_id  # Ensure padding with EOS token
             )
         print("Output shape:", output.shape)
-        proposal = tokenizer.decode(output[0], skip_special_tokens=True)
-        if "Project Proposal:" in proposal:
-            proposal = proposal.split("Project Proposal:", 1)[1].strip()
-        else:
-            proposal = proposal.strip()
         # print("Successfully accessed gpt-neo-1.3B and returning")
-        print("Generated Proposal: ", proposal)
         return proposal
     except Exception as e:
         print("Error generating proposal:", str(e))
@@ -404,8 +417,6 @@ import copy
 def create_project_proposals(budget_cluster_df, problem_cluster_df, location_clusters, problem_clusters):
     consoleMessage_and_Print("\n Starting function: create_project_proposals")
     proposals = {}
-    sanban_debug = False
     for loc in budget_cluster_df.index:
         consoleMessage_and_Print(f"\n loc: {loc}")
@@ -432,26 +443,20 @@ def create_project_proposals(budget_cluster_df, problem_cluster_df, location_clu
                 # Prepare the prompt
                 # problems_summary = "; \n".join(problem_descriptions)  # Join all problem descriptions
                 # problems_summary = "; \n".join(problem_descriptions[:3])  # Limit to first 3 for brevity
-                problems_summary = "; \n".join(shuffled_descriptions[:3])
                 # prompt = f"Generate a solution oriented project proposal for the following:\n\nLocation: {location}\nProblem Domain: {problem_domain}\nProblems: {problems_summary}\n\nProject Proposal:"
                 # prompt = f"Generate a solution-oriented project proposal for the following public problem (only output the proposal):\n\n Geographical/Digital Location: {location}\nProblem Category: {problem_domain}\nProblems: {problems_summary}\n\nProject Proposal:"
-                prompt = f"Generate a single solution-oriented project proposal bespoke to the following Location~Domain cluster of public problems:\n\n Geographical/Digital Location: {location}\nProblem Domain: {problem_domain}\nProblems: {problems_summary}\n\nProject Proposal: <only output this proposal>"
                 proposal = generate_project_proposal(prompt)
                 # Check if proposal is valid
                 if isinstance(proposal, str) and proposal.strip():  # Valid string that's not empty
                     proposals[(loc, prob)] = proposal
-                    sanban_debug = True
-                    break
             else:
                 print(f"Skipping empty problem descriptions for location: {location}, problem domain: {problem_domain}")
-        if sanban_debug:
-            break
     return proposals
@@ -746,8 +751,8 @@ def process_excel(file):
 example_files = []
-example_files.append('#TaxDirection (Responses)_BasicExample.xlsx')
-# example_files.append('#TaxDirection (Responses)_IntermediateExample.xlsx')
 # example_files.append('#TaxDirection (Responses)_UltimateExample.xlsx')
@@ -765,7 +770,7 @@ interface = gr.Interface(
     outputs=[
         gr.File(label="Download the processed Excel File containing the ** Project Proposals ** for each Location~Problem paired combination"),  # File download output
-        gr.Textbox(label="Console Messages", lines=10, interactive=False)  # Console messages output
         ],

     return budget_cluster_df, problem_cluster_df
 from transformers import GPTNeoForCausalLM, GPT2Tokenizer
+def generate_project_proposal(prompt): # Generate the proposal
+    # model_Name = "EleutherAI/gpt-neo-2.7B"
+    model_Name = "EleutherAI/gpt-neo-1.3B"
+    consoleMessage_and_Print(f"Trying to access {model_Name} model. The Prompt is: \n{prompt}")
+    model = GPTNeoForCausalLM.from_pretrained(model_Name)
+    tokenizer = GPT2Tokenizer.from_pretrained(model_Name)
+    model_max_token_limit = 2048
     try:
         # input_ids = tokenizer.encode(prompt, return_tensors="pt")
         # Truncate the prompt to fit within the model's input limits
+        # Adjust as per your model's limit
+        input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length = model_max_token_limit/2)
         print("Input IDs shape:", input_ids.shape)
+        # Generate the output
         output = model.generate(
             input_ids,
+            max_new_tokens = model_max_token_limit,
             num_return_sequences=1,
             no_repeat_ngram_size=2,
             temperature=0.5,
             pad_token_id=tokenizer.eos_token_id  # Ensure padding with EOS token
             )
         print("Output shape:", output.shape)
+        # Decode the output to text
+        full_returned_segment = tokenizer.decode(output[0], skip_special_tokens=True)
+        # Slice off the input part if the input length is known
+        input_length = input_ids.shape[1]
+        generated_part = tokenizer.decode(output[0][input_length:], skip_special_tokens=True)
+        proposal = generated_part.strip()
+        # if "Project Proposal:" in proposal:
+        #     proposal = proposal.split("Project Proposal:", 1)[1].strip()
         # print("Successfully accessed gpt-neo-1.3B and returning")
+        print("Generated Proposal: \n", proposal,"\n\n")
         return proposal
     except Exception as e:
         print("Error generating proposal:", str(e))
 def create_project_proposals(budget_cluster_df, problem_cluster_df, location_clusters, problem_clusters):
     consoleMessage_and_Print("\n Starting function: create_project_proposals")
     proposals = {}
     for loc in budget_cluster_df.index:
         consoleMessage_and_Print(f"\n loc: {loc}")
                 # Prepare the prompt
                 # problems_summary = "; \n".join(problem_descriptions)  # Join all problem descriptions
                 # problems_summary = "; \n".join(problem_descriptions[:3])  # Limit to first 3 for brevity
+                problems_summary = "; \n".join(shuffled_descriptions[:7])
                 # prompt = f"Generate a solution oriented project proposal for the following:\n\nLocation: {location}\nProblem Domain: {problem_domain}\nProblems: {problems_summary}\n\nProject Proposal:"
                 # prompt = f"Generate a solution-oriented project proposal for the following public problem (only output the proposal):\n\n Geographical/Digital Location: {location}\nProblem Category: {problem_domain}\nProblems: {problems_summary}\n\nProject Proposal:"
+                prompt = f"Generate a singular solution-oriented project proposal bespoke to the following Location~Domain cluster of public problems:\n\n Geographical/Digital Location: {location}\nProblem Domain: {problem_domain}\nProblems: {problems_summary}\n\nProject Proposal: \t"
                 proposal = generate_project_proposal(prompt)
                 # Check if proposal is valid
                 if isinstance(proposal, str) and proposal.strip():  # Valid string that's not empty
                     proposals[(loc, prob)] = proposal
             else:
                 print(f"Skipping empty problem descriptions for location: {location}, problem domain: {problem_domain}")
     return proposals
 example_files = []
+# example_files.append('#TaxDirection (Responses)_BasicExample.xlsx')
+example_files.append('#TaxDirection (Responses)_IntermediateExample.xlsx')
 # example_files.append('#TaxDirection (Responses)_UltimateExample.xlsx')
     outputs=[
         gr.File(label="Download the processed Excel File containing the ** Project Proposals ** for each Location~Problem paired combination"),  # File download output
+        gr.Textbox(label="Console Messages", lines=5, interactive=False)  # Console messages output
         ],