yulongchen committed · Commit 90fcd9f · 1 Parent(s): b2cd959

Add system

Files changed (2):
  1. app.py +2 -2
  2. system/pledge_tracking.py +64 -52
app.py CHANGED
@@ -118,7 +118,7 @@ def run_model():
     data = request.get_json()
     claim = data.get("claim", "no input")
     time_range_option = data.get("time_range", "month")
-    system_start_time = datetime.now().isoformat()
+    system_start_time = datetime.now()

     suggestion_meta = data.get("suggestion_meta")
     pledge_date = data.get("pledge_date", "")
@@ -168,7 +168,7 @@ def run_model():
     df.to_json(json_path, orient="records", indent=2)


-    system_end_time = datetime.now().isoformat()
+    system_end_time = datetime.now()

     runtime = system_end_time - system_start_time
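The app.py change is a behavioural fix rather than a cleanup: the route goes on to compute runtime = system_end_time - system_start_time, which only works when both values are datetime objects. With the old .isoformat() strings that subtraction raises a TypeError. A minimal standalone sketch of the difference (variable names are illustrative, not taken from app.py):

    from datetime import datetime

    # New behaviour: keep datetime objects, so subtraction yields a timedelta
    start = datetime.now()
    end = datetime.now()
    runtime = end - start                  # timedelta
    print(runtime.total_seconds())         # float seconds, easy to log or return as JSON

    # Old behaviour: ISO 8601 strings cannot be subtracted
    start_s = datetime.now().isoformat()
    end_s = datetime.now().isoformat()
    # end_s - start_s  ->  TypeError: unsupported operand type(s) for -: 'str' and 'str'

If the response still needs ISO-formatted timestamps, they can be produced at serialisation time (for example with system_start_time.isoformat() or runtime.total_seconds()) while the objects themselves are kept for the arithmetic.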
system/pledge_tracking.py CHANGED
@@ -119,70 +119,82 @@ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_
         f"    5. {questions[4]}")
     step_id+=1

-    augmented_tsv_file = run_augmented_searching(
-        qa_file=qa_file_path,
-        pledge_author=pledge_author,
-        pledge_date=pledge_date,
-        pipeline_base_dir=pipeline_base_dir,
-        start_date=start_date,
-        suggestion_meta=suggestion_meta,
-        end_date="",
-    )
-
-    with open(augmented_tsv_file, "r", encoding="utf-8") as f:
-        line_count = sum(1 for line in f)
-    if update_fn:
-        update_fn(step_id, f"{line_count} URLs are retrieved")
-    step_id+=1
+    try:
+        augmented_tsv_file = run_augmented_searching(
+            qa_file=qa_file_path,
+            pledge_author=pledge_author,
+            pledge_date=pledge_date,
+            pipeline_base_dir=pipeline_base_dir,
+            start_date=start_date,
+            suggestion_meta=suggestion_meta,
+            end_date="",
+        )
+
+        with open(augmented_tsv_file, "r", encoding="utf-8") as f:
+            line_count = sum(1 for line in f)
+        if update_fn:
+            update_fn(step_id, f"{line_count} URLs are retrieved")
+        step_id+=1
+    except Exception as e:
+        if update_fn:
+            update_fn(step_id, f"❌ run_augmented_searching failed: {e}")
+        raise

     augmented_data_store_dir = os.path.join(pipeline_base_dir, "augmented_data_store")
     os.makedirs(augmented_data_store_dir, exist_ok=True)
-    augmented_scraped_output_path = os.path.join(augmented_data_store_dir, "0.jsonl")
-    run_scraper(augmented_tsv_file, augmented_scraped_output_path)
-
-    with open(augmented_scraped_output_path, "r", encoding="utf-8") as f:
-        line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
-    if update_fn:
-        update_fn(step_id, f"{line_count} URL pages have been sucessefully scraped")
-    step_id+=1
+
+    try:
+        augmented_scraped_output_path = os.path.join(augmented_data_store_dir, "0.jsonl")
+        run_scraper(augmented_tsv_file, augmented_scraped_output_path)
+
+        with open(augmented_scraped_output_path, "r", encoding="utf-8") as f:
+            line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
+        if update_fn:
+            update_fn(step_id, f"{line_count} URL pages have been sucessefully scraped")
+        step_id+=1
+    except Exception as e:
+        if update_fn:
+            update_fn(step_id, f"❌ run_scraper failed: {e}")
+        raise

-    run_hero_reranking(pipeline_base_dir, suggestion_meta)
-
-    # Step 7: Preparing for GPT-4
-    # print("🧠 Step 7: Processing format ...")
-
-    meta_data_dir = process_manifesto_data_with_metadata(input_base_dir=pipeline_base_dir)
-
-    # Step 8: Event extraction using GPT-4
-    print("🧠 Extracting events ...")
-
-    all_info_path = os.path.join(pipeline_base_dir, "all_info_with_txt.json")
-    unique_urls = set()
-    with open(all_info_path, "r", encoding="utf-8") as f:
-        for line in f:
-            data = json.loads(line)
-            docs = data.get("evidence", [])
-            for doc in docs:
-                if "url" in doc:
-                    unique_urls.add(doc["url"])
-    if update_fn:
-        update_fn(step_id, f"{len(unique_urls)} documents are selected")
-    step_id+=1
+    try:
+        run_hero_reranking(pipeline_base_dir, suggestion_meta)
+
+        meta_data_dir = process_manifesto_data_with_metadata(input_base_dir=pipeline_base_dir)
+        all_info_path = os.path.join(pipeline_base_dir, "all_info_with_txt.json")
+        unique_urls = set()
+        with open(all_info_path, "r", encoding="utf-8") as f:
+            for line in f:
+                data = json.loads(line)
+                docs = data.get("evidence", [])
+                for doc in docs:
+                    if "url" in doc:
+                        unique_urls.add(doc["url"])
+        if update_fn:
+            update_fn(step_id, f"{len(unique_urls)} documents are selected")
+        step_id+=1
+    except Exception as e:
+        if update_fn:
+            update_fn(step_id, f"❌ run_hero_reranking failed: {e}")
+        raise

-    extracted_event_path = run_gpt4_event_extraction(data_dir=pipeline_base_dir, max_tokens=100000)
-
-    events_num = count_total_events(extracted_event_path)
-
-    if update_fn:
-        update_fn(step_id, f"{events_num} events are extracted from those documents.")
-    step_id+=1
+    try:
+        extracted_event_path = run_gpt4_event_extraction(data_dir=pipeline_base_dir, max_tokens=100000)
+
+        events_num = count_total_events(extracted_event_path)
+
+        if update_fn:
+            update_fn(step_id, f"{events_num} events are extracted from those documents.")
+        step_id+=1
+    except Exception as e:
+        if update_fn:
+            update_fn(step_id, f"❌ Event extraction failed: {e}")
+        raise


-    # Step 9: Sorting events and label usefulness
     print("📅 Sorting events temporally ...")


@@ -193,7 +205,7 @@ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_
         claim=claim,
         suggestion_meta=suggestion_meta
     )
-    # print(sorted_events)
+
     df = pd.DataFrame(sorted_events)
     sorted_event_path = f"{pipeline_base_dir}/sorted_events.xlsx"
     df.to_excel(sorted_event_path, index=False)
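The system/pledge_tracking.py hunk applies one pattern four times: each pipeline stage (augmented searching, scraping, reranking plus document selection, and GPT-4 event extraction) now runs inside its own try/except that pushes either a per-step count or a ❌ failure message through update_fn and then re-raises, so the caller still sees the original error. A rough sketch of that control flow factored into a helper; run_stage, report, and the dummy stages are illustrative names, not part of the repository:

    def run_stage(step_id, label, fn, update_fn=None):
        """Run one stage; report progress or failure via update_fn, re-raise on error."""
        try:
            message = fn()                 # each stage returns a human-readable status line
            if update_fn:
                update_fn(step_id, message)
            return step_id + 1             # the step counter advances only on success
        except Exception as e:
            if update_fn:
                update_fn(step_id, f"❌ {label} failed: {e}")
            raise                          # preserve the original traceback for the caller

    # Illustrative usage with dummy stages and a print-based progress callback:
    def report(step_id, message):
        print(f"[step {step_id}] {message}")

    step_id = 0
    step_id = run_stage(step_id, "run_augmented_searching",
                        lambda: "42 URLs are retrieved", report)
    step_id = run_stage(step_id, "run_scraper",
                        lambda: "37 URL pages have been successfully scraped", report)

In the commit itself the blocks stay inline, which keeps each stage's bookkeeping (line counts, unique URLs, event totals) next to the call that produces it; the sketch above only isolates the shared error-reporting flow.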