Commit · 90fcd9f
1 Parent(s): b2cd959
Add system

Files changed:
- app.py +2 -2
- system/pledge_tracking.py +64 -52
app.py
CHANGED
@@ -118,7 +118,7 @@ def run_model():
     data = request.get_json()
     claim = data.get("claim", "no input")
     time_range_option = data.get("time_range", "month")
-    system_start_time = datetime.now()
+    system_start_time = datetime.now()
 
     suggestion_meta = data.get("suggestion_meta")
     pledge_date = data.get("pledge_date", "")
@@ -168,7 +168,7 @@ def run_model():
     df.to_json(json_path, orient="records", indent=2)
 
 
-    system_end_time = datetime.now()
+    system_end_time = datetime.now()
 
     runtime = system_end_time - system_start_time
 
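For context, both app.py hunks only touch the wall-clock timing around the pipeline run: system_start_time is captured right after the request payload is parsed and system_end_time right after the results are written, with runtime as their difference. A minimal sketch of that pattern (the report_runtime wrapper and the seconds conversion below are illustrative, not code from the repo):

    from datetime import datetime

    def report_runtime(do_work):
        # Mirror system_start_time / system_end_time in run_model():
        # capture wall-clock time before and after the expensive work.
        system_start_time = datetime.now()
        do_work()                                        # e.g. run the pledge-tracking pipeline
        system_end_time = datetime.now()
        runtime = system_end_time - system_start_time    # a datetime.timedelta
        return runtime.total_seconds()                   # elapsed seconds as a float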
system/pledge_tracking.py
CHANGED
@@ -119,70 +119,82 @@ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_
                   f" 5. {questions[4]}")
     step_id+=1
 
+    try:
+        augmented_tsv_file = run_augmented_searching(
+            qa_file=qa_file_path,
+            pledge_author=pledge_author,
+            pledge_date=pledge_date,
+            pipeline_base_dir=pipeline_base_dir,
+            start_date=start_date,
+            suggestion_meta=suggestion_meta,
+            end_date="",
+        )
 
-    augmented_tsv_file = run_augmented_searching(
-        qa_file=qa_file_path,
-        pledge_author=pledge_author,
-        pledge_date=pledge_date,
-        pipeline_base_dir=pipeline_base_dir,
-        start_date=start_date,
-        suggestion_meta=suggestion_meta,
-        end_date="",
-    )
-
-
+
 
-
-
-
-
-
+        with open(augmented_tsv_file, "r", encoding="utf-8") as f:
+            line_count = sum(1 for line in f)
+        if update_fn:
+            update_fn(step_id, f"{line_count} URLs are retrieved")
+        step_id+=1
+    except Exception as e:
+        if update_fn:
+            update_fn(step_id, f"❌ run_augmented_searching failed: {e}")
+        raise
 
     augmented_data_store_dir = os.path.join(pipeline_base_dir, "augmented_data_store")
     os.makedirs(augmented_data_store_dir, exist_ok=True)
-    augmented_scraped_output_path = os.path.join(augmented_data_store_dir, "0.jsonl")
-    run_scraper(augmented_tsv_file, augmented_scraped_output_path)
-
-    with open(augmented_scraped_output_path, "r", encoding="utf-8") as f:
-        line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
-    if update_fn:
-        update_fn(step_id, f"{line_count} URL pages have been sucessefully scraped")
-    step_id+=1
-
-
-    run_hero_reranking(pipeline_base_dir, suggestion_meta)
 
-
-
+    try:
+        augmented_scraped_output_path = os.path.join(augmented_data_store_dir, "0.jsonl")
+        run_scraper(augmented_tsv_file, augmented_scraped_output_path)
 
-
-
-
-
+        with open(augmented_scraped_output_path, "r", encoding="utf-8") as f:
+            line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
+        if update_fn:
+            update_fn(step_id, f"{line_count} URL pages have been sucessefully scraped")
+        step_id+=1
+    except Exception as e:
+        if update_fn:
+            update_fn(step_id, f"❌ run_scraper failed: {e}")
+        raise
+
 
-
-
-
-
-
-
-
-
-
-
-
+    try:
+        run_hero_reranking(pipeline_base_dir, suggestion_meta)
+
+        meta_data_dir = process_manifesto_data_with_metadata(input_base_dir=pipeline_base_dir)
+        all_info_path = os.path.join(pipeline_base_dir, "all_info_with_txt.json")
+        unique_urls = set()
+        with open(all_info_path, "r", encoding="utf-8") as f:
+            for line in f:
+                data = json.loads(line)
+                docs = data.get("evidence", [])
+                for doc in docs:
+                    if "url" in doc:
+                        unique_urls.add(doc["url"])
+        if update_fn:
+            update_fn(step_id, f"{len(unique_urls)} documents are selected")
+        step_id+=1
+    except Exception as e:
+        if update_fn:
+            update_fn(step_id, f"❌ run_hero_reranking failed: {e}")
+        raise
 
-
+    try:
+        extracted_event_path = run_gpt4_event_extraction(data_dir=pipeline_base_dir, max_tokens=100000)
 
-
+        events_num = count_total_events(extracted_event_path)
 
-
-
-
+        if update_fn:
+            update_fn(step_id, f"{events_num} events are extracted from those documents.")
+        step_id+=1
+    except Exception as e:
+        if update_fn:
+            update_fn(step_id, f"❌ Event extraction failed: {e}")
+        raise
 
 
-    # Step 9: Sorting events and label usefulness
     print("📅 Sorting events temporally ...")
 
 
@@ -193,7 +205,7 @@ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_
         claim=claim,
         suggestion_meta=suggestion_meta
     )
-
+
     df = pd.DataFrame(sorted_events)
     sorted_event_path = f"{pipeline_base_dir}/sorted_events.xlsx"
     df.to_excel(sorted_event_path, index=False)
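The pattern this commit adds throughout run_pipeline is uniform: each stage (searching, scraping, reranking, event extraction) is wrapped in try/except, reports progress through the optional update_fn(step_id, message) callback, increments step_id on success, and reports the error before re-raising on failure. A minimal sketch of a compatible callback (make_update_fn and progress_log are illustrative assumptions; the real caller in app.py is not shown in this commit):

    # Collects one (step_id, message) entry per progress update from run_pipeline.
    progress_log = []

    def make_update_fn(log):
        def update_fn(step_id, message):
            # Record and echo each progress message exactly as the pipeline reports it.
            log.append((step_id, message))
            print(f"[step {step_id}] {message}")
        return update_fn

    # Passing update_fn=make_update_fn(progress_log) into run_pipeline would yield
    # entries such as (3, "12 URLs are retrieved") on success, or a
    # "❌ run_scraper failed: ..." message right before the exception propagates.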