yulongchen committed · Commit 90fcd9f · 1 Parent(s): b2cd959

Add system

Files changed (2):
  1. app.py +2 -2
  2. system/pledge_tracking.py +64 -52
app.py CHANGED
@@ -118,7 +118,7 @@ def run_model():
     data = request.get_json()
     claim = data.get("claim", "no input")
     time_range_option = data.get("time_range", "month")
-    system_start_time = datetime.now().isoformat()
+    system_start_time = datetime.now()

     suggestion_meta = data.get("suggestion_meta")
     pledge_date = data.get("pledge_date", "")
@@ -168,7 +168,7 @@ def run_model():
     df.to_json(json_path, orient="records", indent=2)


-    system_end_time = datetime.now().isoformat()
+    system_end_time = datetime.now()

     runtime = system_end_time - system_start_time
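The app.py change is a behavioural fix rather than a cleanup: the route goes on to compute runtime = system_end_time - system_start_time, which only works when both values are datetime objects. With the old .isoformat() strings that subtraction raises a TypeError. A minimal standalone sketch of the difference (variable names are illustrative, not taken from app.py):

    from datetime import datetime

    # New behaviour: keep datetime objects, so subtraction yields a timedelta
    start = datetime.now()
    end = datetime.now()
    runtime = end - start                  # timedelta
    print(runtime.total_seconds())         # float seconds, easy to log or return as JSON

    # Old behaviour: ISO 8601 strings cannot be subtracted
    start_s = datetime.now().isoformat()
    end_s = datetime.now().isoformat()
    # end_s - start_s  ->  TypeError: unsupported operand type(s) for -: 'str' and 'str'

If the response still needs ISO-formatted timestamps, they can be produced at serialisation time (for example with system_start_time.isoformat() or runtime.total_seconds()) while the objects themselves are kept for the arithmetic.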
system/pledge_tracking.py CHANGED
@@ -119,70 +119,82 @@ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_
         f"    5. {questions[4]}")
     step_id+=1

-    augmented_tsv_file = run_augmented_searching(
-        qa_file=qa_file_path,
-        pledge_author=pledge_author,
-        pledge_date=pledge_date,
-        pipeline_base_dir=pipeline_base_dir,
-        start_date=start_date,
-        suggestion_meta=suggestion_meta,
-        end_date="",
-    )
-
-    with open(augmented_tsv_file, "r", encoding="utf-8") as f:
-        line_count = sum(1 for line in f)
-    if update_fn:
-        update_fn(step_id, f"{line_count} URLs are retrieved")
-    step_id+=1
+    try:
+        augmented_tsv_file = run_augmented_searching(
+            qa_file=qa_file_path,
+            pledge_author=pledge_author,
+            pledge_date=pledge_date,
+            pipeline_base_dir=pipeline_base_dir,
+            start_date=start_date,
+            suggestion_meta=suggestion_meta,
+            end_date="",
+        )
+
+        with open(augmented_tsv_file, "r", encoding="utf-8") as f:
+            line_count = sum(1 for line in f)
+        if update_fn:
+            update_fn(step_id, f"{line_count} URLs are retrieved")
+        step_id+=1
+    except Exception as e:
+        if update_fn:
+            update_fn(step_id, f"❌ run_augmented_searching failed: {e}")
+        raise

     augmented_data_store_dir = os.path.join(pipeline_base_dir, "augmented_data_store")
     os.makedirs(augmented_data_store_dir, exist_ok=True)
-    augmented_scraped_output_path = os.path.join(augmented_data_store_dir, "0.jsonl")
-    run_scraper(augmented_tsv_file, augmented_scraped_output_path)
-
-    with open(augmented_scraped_output_path, "r", encoding="utf-8") as f:
-        line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
-    if update_fn:
-        update_fn(step_id, f"{line_count} URL pages have been sucessefully scraped")
-    step_id+=1
+
+    try:
+        augmented_scraped_output_path = os.path.join(augmented_data_store_dir, "0.jsonl")
+        run_scraper(augmented_tsv_file, augmented_scraped_output_path)
+
+        with open(augmented_scraped_output_path, "r", encoding="utf-8") as f:
+            line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
+        if update_fn:
+            update_fn(step_id, f"{line_count} URL pages have been sucessefully scraped")
+        step_id+=1
+    except Exception as e:
+        if update_fn:
+            update_fn(step_id, f"❌ run_scraper failed: {e}")
+        raise

-    run_hero_reranking(pipeline_base_dir, suggestion_meta)
-
-    # Step 7: Preparing for GPT-4
-    # print("🧠 Step 7: Processing format ...")
-
-    meta_data_dir = process_manifesto_data_with_metadata(input_base_dir=pipeline_base_dir)
-
-    # Step 8: Event extraction using GPT-4
-    print("🧠 Extracting events ...")
-
-    all_info_path = os.path.join(pipeline_base_dir, "all_info_with_txt.json")
-    unique_urls = set()
-    with open(all_info_path, "r", encoding="utf-8") as f:
-        for line in f:
-            data = json.loads(line)
-            docs = data.get("evidence", [])
-            for doc in docs:
-                if "url" in doc:
-                    unique_urls.add(doc["url"])
-    if update_fn:
-        update_fn(step_id, f"{len(unique_urls)} documents are selected")
-    step_id+=1
+    try:
+        run_hero_reranking(pipeline_base_dir, suggestion_meta)
+
+        meta_data_dir = process_manifesto_data_with_metadata(input_base_dir=pipeline_base_dir)
+        all_info_path = os.path.join(pipeline_base_dir, "all_info_with_txt.json")
+        unique_urls = set()
+        with open(all_info_path, "r", encoding="utf-8") as f:
+            for line in f:
+                data = json.loads(line)
+                docs = data.get("evidence", [])
+                for doc in docs:
+                    if "url" in doc:
+                        unique_urls.add(doc["url"])
+        if update_fn:
+            update_fn(step_id, f"{len(unique_urls)} documents are selected")
+        step_id+=1
+    except Exception as e:
+        if update_fn:
+            update_fn(step_id, f"❌ run_hero_reranking failed: {e}")
+        raise

-    extracted_event_path = run_gpt4_event_extraction(data_dir=pipeline_base_dir, max_tokens=100000)
-
-    events_num = count_total_events(extracted_event_path)
-
-    if update_fn:
-        update_fn(step_id, f"{events_num} events are extracted from those documents.")
-    step_id+=1
+    try:
+        extracted_event_path = run_gpt4_event_extraction(data_dir=pipeline_base_dir, max_tokens=100000)
+
+        events_num = count_total_events(extracted_event_path)
+
+        if update_fn:
+            update_fn(step_id, f"{events_num} events are extracted from those documents.")
+        step_id+=1
+    except Exception as e:
+        if update_fn:
+            update_fn(step_id, f"❌ Event extraction failed: {e}")
+        raise


-    # Step 9: Sorting events and label usefulness
     print("📅 Sorting events temporally ...")


@@ -193,7 +205,7 @@ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_
         claim=claim,
         suggestion_meta=suggestion_meta
     )
-    # print(sorted_events)
+
     df = pd.DataFrame(sorted_events)
     sorted_event_path = f"{pipeline_base_dir}/sorted_events.xlsx"
     df.to_excel(sorted_event_path, index=False)
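The system/pledge_tracking.py hunk applies one pattern four times: each pipeline stage (augmented searching, scraping, reranking plus document selection, and GPT-4 event extraction) now runs inside its own try/except that pushes either a per-step count or a ❌ failure message through update_fn and then re-raises, so the caller still sees the original error. A rough sketch of that control flow factored into a helper; run_stage, report, and the dummy stages are illustrative names, not part of the repository:

    def run_stage(step_id, label, fn, update_fn=None):
        """Run one stage; report progress or failure via update_fn, re-raise on error."""
        try:
            message = fn()                 # each stage returns a human-readable status line
            if update_fn:
                update_fn(step_id, message)
            return step_id + 1             # the step counter advances only on success
        except Exception as e:
            if update_fn:
                update_fn(step_id, f"❌ {label} failed: {e}")
            raise                          # preserve the original traceback for the caller

    # Illustrative usage with dummy stages and a print-based progress callback:
    def report(step_id, message):
        print(f"[step {step_id}] {message}")

    step_id = 0
    step_id = run_stage(step_id, "run_augmented_searching",
                        lambda: "42 URLs are retrieved", report)
    step_id = run_stage(step_id, "run_scraper",
                        lambda: "37 URL pages have been successfully scraped", report)

In the commit itself the blocks stay inline, which keeps each stage's bookkeeping (line counts, unique URLs, event totals) next to the call that produces it; the sketch above only isolates the shared error-reporting flow.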