Update app.py

app.py CHANGED
@@ -341,6 +341,81 @@ def extract_location_clusters(df,
 
 
 
+
+
+
+def create_cluster_dataframes(processed_df):
+    # Create a dataframe for Financial Weights
+    budget_cluster_df = processed_df.pivot_table(
+        values='Financial_Weight',
+        index='Location_Cluster',
+        columns='Problem_Cluster',
+        aggfunc='sum',
+        fill_value=0)
+
+    # Create a dataframe for Problem Descriptions
+    problem_cluster_df = processed_df.groupby(['Location_Cluster', 'Problem_Cluster'])['Problem_Description'].apply(list).unstack()
+
+    return budget_cluster_df, problem_cluster_df
+
+from transformers import GPTNeoForCausalLM, GPT2Tokenizer
+def generate_project_proposal(problem_descriptions, location, problem_domain):
+    model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
+    tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
+
+    # Prepare the prompt
+    problems_summary = "; ".join(problem_descriptions[:3])  # Limit to first 3 for brevity
+    # problems_summary = "; ".join(problem_descriptions)
+    # prompt = f"Generate a project proposal for the following:\n\nLocation: {location}\nProblem Domain: {problem_domain}\nProblems: {problems_summary}\nBudget: ${financial_weight:.2f}\n\nProject Proposal:"
+    prompt = f"Generate a solution oriented project proposal for the following:\n\nLocation: {location}\nProblem Domain: {problem_domain}\nProblems: {problems_summary}\n\nProject Proposal:"
+
+    # Generate the proposal
+    input_ids = tokenizer.encode(prompt, return_tensors="pt")
+    output = model.generate(
+        input_ids,
+        max_length=300,
+        num_return_sequences=1,
+        no_repeat_ngram_size=2,
+        temperature=0.75)
+
+    proposal = tokenizer.decode(output[0], skip_special_tokens=True)
+    return proposal
+
+def create_project_proposals(budget_cluster_df, problem_cluster_df, location_clusters, problem_clusters):
+    proposals = {}
+    for loc in budget_cluster_df.index:
+        for prob in budget_cluster_df.columns:
+            location = ", ".join(location_clusters[loc])
+            problem_domain = ", ".join(problem_clusters[prob])
+            problem_descriptions = problem_cluster_df.loc[loc, prob]
+
+            if problem_descriptions:
+                proposal = generate_project_proposal(
+                    problem_descriptions,
+                    location,
+                    problem_domain)
+                proposals[(loc, prob)] = proposal
+
+    return proposals
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 def nlp_pipeline(original_df):
     console_messages.append("Starting NLP pipeline...")
 
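Note on the new `generate_project_proposal` helper: it instantiates the 1.3B GPT-Neo checkpoint on every call, and `temperature` only takes effect in `model.generate` when sampling is enabled, so with the settings above it is ignored. A minimal sketch of loading the model once and enabling sampling; the `_get_model` cache helper is hypothetical and not part of this diff:

```python
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

_MODEL_NAME = "EleutherAI/gpt-neo-1.3B"
_model, _tokenizer = None, None

def _get_model():
    # Hypothetical cache: load the checkpoint once and reuse it across calls.
    global _model, _tokenizer
    if _model is None:
        _model = GPTNeoForCausalLM.from_pretrained(_MODEL_NAME)
        _tokenizer = GPT2Tokenizer.from_pretrained(_MODEL_NAME)
    return _model, _tokenizer

def generate_project_proposal(problem_descriptions, location, problem_domain):
    model, tokenizer = _get_model()
    problems_summary = "; ".join(problem_descriptions[:3])  # first three problems only
    prompt = (
        "Generate a solution oriented project proposal for the following:\n\n"
        f"Location: {location}\nProblem Domain: {problem_domain}\n"
        f"Problems: {problems_summary}\n\nProject Proposal:"
    )
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(
        input_ids,
        max_length=300,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,       # temperature is ignored under greedy decoding
        temperature=0.75,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)
```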
@@ -380,12 +455,25 @@ def nlp_pipeline(original_df):
         console_messages.append(f"Error in extract_location_clusters: {str(e)}")
     console_messages.append("NLP pipeline for location extraction completed.")
 
+
+
 
-    console_messages.append("NLP pipeline completed.")
-    return processed_df
 
-
+    # Create cluster dataframes
+    budget_cluster_df, problem_cluster_df = create_cluster_dataframes(processed_df)
+
+    # Generate project proposals
+    location_clusters = dict(enumerate(processed_df['Location_Category_Words'].unique()))
+    problem_clusters = dict(enumerate(processed_df['Problem_Category_Words'].unique()))
+    project_proposals = create_project_proposals(budget_cluster_df, problem_cluster_df, location_clusters, problem_clusters)
+
+    console_messages.append("NLP pipeline completed.")
+    return processed_df, budget_cluster_df, problem_cluster_df, project_proposals
 
+
+
+
+
 
 
 
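Note on how `location_clusters` and `problem_clusters` are built here: `dict(enumerate(df[...].unique()))` only lines up with the ids used as `budget_cluster_df`'s index and columns if `Location_Cluster` and `Problem_Cluster` are 0-based consecutive integers assigned in the same order that `unique()` returns the category words. Also, `", ".join(location_clusters[loc])` in `create_project_proposals` assumes each value is an iterable of words; if the column holds a single string, the join interleaves commas between characters. A sketch of a lookup derived directly from the cluster columns, which keeps ids and words aligned regardless of ordering (column names are taken from the diff; the dtypes are an assumption):

```python
# Map each cluster id to the category words recorded on its own rows.
location_clusters = (
    processed_df.groupby('Location_Cluster')['Location_Category_Words'].first().to_dict()
)
problem_clusters = (
    processed_df.groupby('Problem_Cluster')['Problem_Category_Words'].first().to_dict()
)
```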
@@ -400,14 +488,17 @@ def process_excel(file):
     try:
         # Process the DataFrame
         console_messages.append("Processing the DataFrame...")
-
+        processed_df, budget_cluster_df, problem_cluster_df, project_proposals = nlp_pipeline(df)
 
-
-
-
+        output_filename = "OutPut_PPs.xlsx"
+        with pd.ExcelWriter(output_filename) as writer:
+            project_proposals.to_excel(writer, sheet_name='Project_Proposals', index=False)
+            budget_cluster_df.to_excel(writer, sheet_name='Financial_Weights')
+            problem_cluster_df.to_excel(writer, sheet_name='Problem_Descriptions')
+            processed_df.to_excel(writer, sheet_name='Input_Processed', index=False)
 
         console_messages.append("Processing completed. Ready for download.")
-        return
+        return output_filename, "\n".join(console_messages)  # Return the processed DataFrame as Excel file
 
     except Exception as e:
         # return str(e) # Return the error message
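Note on the export block: `create_project_proposals` returns `project_proposals` as a plain dict keyed by `(location, problem)` tuples, so `project_proposals.to_excel(...)` will raise `AttributeError`. A sketch that flattens the dict into a DataFrame before writing; sheet names follow the diff, while the `Proposal` column name is an assumption:

```python
import pandas as pd

# Flatten {(location_cluster, problem_cluster): proposal_text} into rows.
proposals_df = pd.DataFrame(
    [
        {"Location_Cluster": loc, "Problem_Cluster": prob, "Proposal": text}
        for (loc, prob), text in project_proposals.items()
    ]
)

with pd.ExcelWriter(output_filename) as writer:
    proposals_df.to_excel(writer, sheet_name='Project_Proposals', index=False)
    budget_cluster_df.to_excel(writer, sheet_name='Financial_Weights')
    problem_cluster_df.to_excel(writer, sheet_name='Problem_Descriptions')
    processed_df.to_excel(writer, sheet_name='Input_Processed', index=False)
```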
@@ -422,8 +513,8 @@ def process_excel(file):
 
 
 example_files = []
-
-example_files.append('#TaxDirection (Responses)_IntermediateExample.xlsx')
+example_files.append('#TaxDirection (Responses)_BasicExample.xlsx')
+# example_files.append('#TaxDirection (Responses)_IntermediateExample.xlsx')
 # example_files.append('#TaxDirection (Responses)_UltimateExample.xlsx')
 
 
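For reference, a toy run of the new `create_cluster_dataframes` helper; the input data below is made up, column names follow the diff. The `NaN` that `unstack()` produces for a (location, problem) pair with no responses is what `create_project_proposals` later receives as `problem_descriptions`, so a truthiness check alone will not skip it.

```python
import pandas as pd

# Made-up input with the columns create_cluster_dataframes expects.
processed_df = pd.DataFrame({
    'Location_Cluster':    [0, 0, 1, 1],
    'Problem_Cluster':     [0, 1, 0, 0],
    'Financial_Weight':    [100.0, 50.0, 75.0, 25.0],
    'Problem_Description': ['bad roads', 'no clinic', 'flooding', 'drainage'],
})

budget_cluster_df = processed_df.pivot_table(
    values='Financial_Weight', index='Location_Cluster',
    columns='Problem_Cluster', aggfunc='sum', fill_value=0)
# Problem_Cluster       0     1
# Location_Cluster
# 0                 100.0  50.0
# 1                 100.0   0.0

problem_cluster_df = (processed_df
    .groupby(['Location_Cluster', 'Problem_Cluster'])['Problem_Description']
    .apply(list).unstack())
# Problem_Cluster                      0            1
# Location_Cluster
# 0                          [bad roads]  [no clinic]
# 1                 [flooding, drainage]          NaN
```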