zhenyundeng committed
Commit 8532c4b · 1 Parent(s): dd80156

update app.py

Files changed (1)
  1. app.py +435 -6
app.py CHANGED
@@ -15,6 +15,8 @@ import gradio as gr
 import os
 import torch
 import json
+import tqdm
+from time import sleep
 import numpy as np
 import requests
 from rank_bm25 import BM25Okapi
@@ -26,6 +28,9 @@ from transformers import BloomTokenizerFast, BloomForCausalLM, BertTokenizer, Be
 from transformers import RobertaTokenizer, RobertaForSequenceClassification
 import pytorch_lightning as pl
 
+from urllib.parse import urlparse
+from html2lines import url2lines
+from googleapiclient.discovery import build
 from averitec.models.DualEncoderModule import DualEncoderModule
 from averitec.models.SequenceClassificationModule import SequenceClassificationModule
 from averitec.models.JustificationGenerationModule import JustificationGenerationModule
@@ -43,6 +48,82 @@ import spacy
 os.system("python -m spacy download en_core_web_sm")
 nlp = spacy.load("en_core_web_sm")
 
+# ---------------------------------------------------------------------------
+train_examples = json.load(open('averitec/data/train.json', 'r'))
+
+def claim2prompts(example):
+    claim = example["claim"]
+    # claim_str = "Claim: " + claim + "||Evidence: "
+    claim_str = "Evidence: "
+
+    for question in example["questions"]:
+        q_text = question["question"].strip()
+        if len(q_text) == 0:
+            continue
+
+        if not q_text[-1] == "?":
+            q_text += "?"
+
+        answer_strings = []
+
+        for a in question["answers"]:
+            if a["answer_type"] in ["Extractive", "Abstractive"]:
+                answer_strings.append(a["answer"])
+            if a["answer_type"] == "Boolean":
+                answer_strings.append(a["answer"] + ", because " + a["boolean_explanation"].lower().strip())
+
+        for a_text in answer_strings:
+            if not a_text[-1] in [".", "!", ":", "?"]:
+                a_text += "."
+
+            # prompt_lookup_str = claim + " " + a_text
+            prompt_lookup_str = a_text
+            this_q_claim_str = claim_str + " " + a_text.strip() + "||Question answered: " + q_text
+            yield (prompt_lookup_str, this_q_claim_str.replace("\n", " ").replace("||", "\n"))
+
+
+def generate_reference_corpus(reference_file):
+    all_data_corpus = []
+    tokenized_corpus = []
+
+    for train_example in train_examples:
+        train_claim = train_example["claim"]
+
+        speaker = train_example["speaker"].strip() if train_example["speaker"] is not None and len(
+            train_example["speaker"]) > 1 else "they"
+
+        questions = [q["question"] for q in train_example["questions"]]
+
+        claim_dict_builder = {}
+        claim_dict_builder["claim"] = train_claim
+        claim_dict_builder["speaker"] = speaker
+        claim_dict_builder["questions"] = questions
+
+        tokenized_corpus.append(nltk.word_tokenize(claim_dict_builder["claim"]))
+        all_data_corpus.append(claim_dict_builder)
+
+    return tokenized_corpus, all_data_corpus
+
+
+def generate_step2_reference_corpus(reference_file):
+    prompt_corpus = []
+    tokenized_corpus = []
+
+    for example in train_examples:
+        for lookup_str, prompt in claim2prompts(example):
+            entry = nltk.word_tokenize(lookup_str)
+            tokenized_corpus.append(entry)
+            prompt_corpus.append(prompt)
+
+    return tokenized_corpus, prompt_corpus
+
+reference_file = "averitec/data/train.json"
+tokenized_corpus0, all_data_corpus0 = generate_reference_corpus(reference_file)
+qg_bm25 = BM25Okapi(tokenized_corpus0)
+
+tokenized_corpus1, prompt_corpus1 = generate_step2_reference_corpus(reference_file)
+prompt_bm25 = BM25Okapi(tokenized_corpus1)
+
 # ---------------------------------------------------------------------------------------------------------------------
 # ---------------------------------------------------------------------------
 # load .env
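
The two BM25 indexes built in this hunk back the retrieval steps added further down: qg_bm25 ranks training claims against a test claim, and prompt_bm25 ranks evidence-style prompts. A minimal sketch of how they are queried, assuming nltk and numpy are already imported elsewhere in app.py:

# Sketch only: score a test claim against the question-generation corpus.
test_claim = "England won the Euro 2024."
scores = qg_bm25.get_scores(nltk.word_tokenize(test_claim))
top_n = np.argsort(scores)[::-1][:10]  # indices of the 10 most similar training claims
similar_claims = [all_data_corpus0[i] for i in top_n]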
@@ -76,20 +157,27 @@ LABEL = [
 ]
 
 if torch.cuda.is_available():
+    # question generation
+    qg_tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-1b1")
+    qg_model = BloomForCausalLM.from_pretrained("bigscience/bloom-1b1", torch_dtype=torch.bfloat16).to('cuda')
+
+    # rerank
+    rerank_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+    rerank_bert_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2, problem_type="single_label_classification")  # Must specify single_label for some reason
+    best_checkpoint = "averitec/pretrained_models/bert_dual_encoder.ckpt"
+    rerank_trained_model = DualEncoderModule.load_from_checkpoint(best_checkpoint, tokenizer=rerank_tokenizer, model=rerank_bert_model)
+
     # Veracity
-    # device = "cuda:0" if torch.cuda.is_available() else "cpu"
     veracity_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
     bert_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4, problem_type="single_label_classification")
     veracity_checkpoint_path = os.getcwd() + "/averitec/pretrained_models/bert_veracity.ckpt"
     veracity_model = SequenceClassificationModule.load_from_checkpoint(veracity_checkpoint_path, tokenizer=veracity_tokenizer, model=bert_model)
-    # veracity_model = SequenceClassificationModule.load_from_checkpoint(veracity_checkpoint_path, tokenizer=veracity_tokenizer, model=bert_model).to(device)
 
     # Justification
     justification_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large', add_prefix_space=True)
     bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")
     best_checkpoint = os.getcwd() + '/averitec/pretrained_models/bart_justifications_verdict-epoch=13-val_loss=2.03-val_meteor=0.28.ckpt'
     justification_model = JustificationGenerationModule.load_from_checkpoint(best_checkpoint, tokenizer=justification_tokenizer, model=bart_model)
-    # justification_model = JustificationGenerationModule.load_from_checkpoint(best_checkpoint, tokenizer=justification_tokenizer, model=bart_model).to(device)
 # ---------------------------------------------------------------------------
 
 # ----------------------------------------------------------------------------
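
A minimal sketch of a single veracity prediction with the models loaded above. The claim/evidence pairing, the mapping of the argmax index through LABEL, and the HuggingFace-style .logits on the Lightning module are assumptions based on the surrounding code, not confirmed by this diff:

# Sketch only: classify one (claim, evidence) pair with the loaded veracity model.
claim = "England won the Euro 2024."
evidence = "Spain beat England 2-1 in the Euro 2024 final."
enc = veracity_tokenizer(claim, evidence, truncation=True, return_tensors="pt")
logits = veracity_model(enc["input_ids"], attention_mask=enc["attention_mask"]).logits  # assumed forward signature
verdict = LABEL[int(torch.argmax(logits, dim=-1))]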
@@ -100,10 +188,351 @@ class Docs:
 
 
 # ------------------------------ Googleretriever -----------------------------
-def Googleretriever():
-
-
-    return 0
+def doc2prompt(doc):
+    prompt_parts = "Outrageously, " + doc["speaker"] + " claimed that \"" + doc[
+        "claim"].strip() + "\". Criticism includes questions like: "
+    questions = [q.strip() for q in doc["questions"]]
+    return prompt_parts + " ".join(questions)
+
+
+def docs2prompt(top_docs):
+    return "\n\n".join([doc2prompt(d) for d in top_docs])
+
+@spaces.GPU
+def prompt_question_generation(test_claim, speaker="they", topk=10):
+    # --------------------------------------------------
+    # test claim
+    s = qg_bm25.get_scores(nltk.word_tokenize(test_claim))
+    top_n = np.argsort(s)[::-1][:topk]
+    docs = [all_data_corpus0[i] for i in top_n]
+    # --------------------------------------------------
+
+    prompt = docs2prompt(docs) + "\n\n" + "Outrageously, " + speaker + " claimed that \"" + test_claim.strip() + \
+             "\". Criticism includes questions like: "
+    sentences = [prompt]
+
+    inputs = qg_tokenizer(sentences, padding=True, return_tensors="pt").to(qg_model.device)
+    outputs = qg_model.generate(inputs["input_ids"], max_length=2000, num_beams=2, no_repeat_ngram_size=2, early_stopping=True)
+
+    tgt_text = qg_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
+    in_len = len(sentences[0])
+    questions_str = tgt_text[in_len:].split("\n")[0]
+
+    qs = questions_str.split("?")
+    qs = [q.strip() + "?" for q in qs if q.strip() and len(q.strip()) < 300]
+
+    #
+    generate_question = [{"question": q, "answers": []} for q in qs]
+
+    return generate_question
+
+
+def check_claim_date(check_date):
+    try:
+        year, month, date = check_date.split("-")
+    except:
+        month, date, year = "01", "01", "2022"
+
+    if len(year) == 2 and int(year) <= 30:
+        year = "20" + year
+    elif len(year) == 2:
+        year = "19" + year
+    elif len(year) == 1:
+        year = "200" + year
+
+    if len(month) == 1:
+        month = "0" + month
+
+    if len(date) == 1:
+        date = "0" + date
+
+    sort_date = year + month + date
+
+    return sort_date
+
+
+def string_to_search_query(text, author):
+    parts = word_tokenize(text.strip())
+    tags = pos_tag(parts)
+
+    keep_tags = ["CD", "JJ", "NN", "VB"]
+
+    if author is not None:
+        search_string = author.split()
+    else:
+        search_string = []
+
+    for token, tag in zip(parts, tags):
+        for keep_tag in keep_tags:
+            if tag[1].startswith(keep_tag):
+                search_string.append(token)
+
+    search_string = " ".join(search_string)
+    return search_string
+
+
+def get_google_search_results(api_key, search_engine_id, google_search, sort_date, search_string, page=0):
+    search_results = []
+    for i in range(1):
+        try:
+            search_results += google_search(
+                search_string,
+                api_key,
+                search_engine_id,
+                num=3,  # num=10,
+                start=0 + 10 * page,
+                sort="date:r:19000101:" + sort_date,
+                dateRestrict=None,
+                gl="US"
+            )
+            break
+        except:
+            sleep(1)
+
+    return search_results
+
+
+def google_search(search_term, api_key, cse_id, **kwargs):
+    service = build("customsearch", "v1", developerKey=api_key)
+    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
+
+    if "items" in res:
+        return res['items']
+    else:
+        return []
+
+
+def get_domain_name(url):
+    if '://' not in url:
+        url = 'http://' + url
+
+    domain = urlparse(url).netloc
+
+    if domain.startswith("www."):
+        return domain[4:]
+    else:
+        return domain
+
+
+def get_text_from_link(url_link):
+    page_lines = url2lines(url_link)
+
+    return "\n".join([url_link] + page_lines)
+
+
+def averitec_search(claim, generate_question, speaker="they", check_date="2024-07-01", n_pages=1):  # n_pages=3
+    # default config
+    api_key = os.environ["GOOGLE_API_KEY"]
+    search_engine_id = os.environ["GOOGLE_SEARCH_ENGINE_ID"]
+
+    blacklist = [
+        "jstor.org",  # Blacklisted because their pdfs are not labelled as such, and clog up the download
+        "facebook.com",  # Blacklisted because only post titles can be scraped, but the scraper doesn't know this,
+        "ftp.cs.princeton.edu",  # Blacklisted because it hosts many large NLP corpora that keep showing up
+        "nlp.cs.princeton.edu",
+        "huggingface.co"
+    ]
+
+    blacklist_files = [  # Blacklisted some NLP nonsense that crashes my machine with OOM errors
+        "/glove.",
+        "ftp://ftp.cs.princeton.edu/pub/cs226/autocomplete/words-333333.txt",
+        "https://web.mit.edu/adamrose/Public/googlelist",
+    ]
+
+    # save to folder
+    store_folder = "averitec/data/store/retrieved_docs"
+    #
+    index = 0
+    questions = [q["question"] for q in generate_question][:3]
+    # questions = [q["question"] for q in generate_question] # ori
+
+    # check the date of the claim
+    current_date = datetime.now().strftime("%Y-%m-%d")
+    sort_date = check_claim_date(current_date)  # check_date="2022-01-01"
+
+    #
+    search_strings = []
+    search_types = []
+
+    search_string_2 = string_to_search_query(claim, None)
+    search_strings += [search_string_2, claim, ]
+    search_types += ["claim", "claim-noformat", ]
+
+    search_strings += questions
+    search_types += ["question" for _ in questions]
+
+    # start to search
+    search_results = []
+    visited = {}
+    store_counter = 0
+    worker_stack = list(range(10))
+
+    retrieve_evidence = []
+
+    for this_search_string, this_search_type in zip(search_strings, search_types):
+        for page_num in range(n_pages):
+            search_results = get_google_search_results(api_key, search_engine_id, google_search, sort_date,
+                                                       this_search_string, page=page_num)
+
+            for result in search_results:
+                link = str(result["link"])
+                domain = get_domain_name(link)
+
+                if domain in blacklist:
+                    continue
+                broken = False
+                for b_file in blacklist_files:
+                    if b_file in link:
+                        broken = True
+                if broken:
+                    continue
+                if link.endswith(".pdf") or link.endswith(".doc"):
+                    continue
+
+                store_file_path = ""
+
+                if link in visited:
+                    web_text = visited[link]
+                else:
+                    web_text = get_text_from_link(link)
+                    visited[link] = web_text
+
+                line = [str(index), claim, link, str(page_num), this_search_string, this_search_type, web_text]
+                retrieve_evidence.append(line)
+
+    return retrieve_evidence
+
+
+@spaces.GPU
+def decorate_with_questions(claim, retrieve_evidence, top_k=3):  # top_k=5, 10, 100
+    #
+    tokenized_corpus = []
+    all_data_corpus = []
+
+    for retri_evi in tqdm.tqdm(retrieve_evidence):
+        # store_file = retri_evi[-1]
+        # with open(store_file, 'r') as f:
+        web_text = retri_evi[-1]
+        lines_in_web = web_text.split("\n")
+
+        first = True
+        for line in lines_in_web:
+            # for line in f:
+            line = line.strip()
+
+            if first:
+                first = False
+                location_url = line
+                continue
+
+            if len(line) > 3:
+                entry = nltk.word_tokenize(line)
+                if (location_url, line) not in all_data_corpus:
+                    tokenized_corpus.append(entry)
+                    all_data_corpus.append((location_url, line))
+
+    if len(tokenized_corpus) == 0:
+        print("")
+
+    bm25 = BM25Okapi(tokenized_corpus)
+    s = bm25.get_scores(nltk.word_tokenize(claim))
+    top_n = np.argsort(s)[::-1][:top_k]
+    docs = [all_data_corpus[i] for i in top_n]
+
+    generate_qa_pairs = []
+    # Then, generate questions for those top 50:
+    for doc in tqdm.tqdm(docs):
+        # prompt_lookup_str = example["claim"] + " " + doc[1]
+        prompt_lookup_str = doc[1]
+
+        prompt_s = prompt_bm25.get_scores(nltk.word_tokenize(prompt_lookup_str))
+        prompt_n = 10
+        prompt_top_n = np.argsort(prompt_s)[::-1][:prompt_n]
+        prompt_docs = [prompt_corpus1[i] for i in prompt_top_n]
+
+        claim_prompt = "Evidence: " + doc[1].replace("\n", " ") + "\nQuestion answered: "
+        prompt = "\n\n".join(prompt_docs + [claim_prompt])
+        sentences = [prompt]
+
+        inputs = qg_tokenizer(sentences, padding=True, return_tensors="pt").to(qg_model.device)
+        outputs = qg_model.generate(inputs["input_ids"], max_length=5000, num_beams=2, no_repeat_ngram_size=2, early_stopping=True)
+
+        tgt_text = qg_tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:], skip_special_tokens=True)[0]
+        # We are not allowed to generate more than 250 characters:
+        tgt_text = tgt_text[:250]
+
+        qa_pair = [tgt_text.strip().split("?")[0].replace("\n", " ") + "?", doc[1].replace("\n", " "), doc[0]]
+        generate_qa_pairs.append(qa_pair)
+
+    return generate_qa_pairs
+
+
+def triple_to_string(x):
+    return " </s> ".join([item.strip() for item in x])
+
+
+@spaces.GPU
+def rerank_questions(claim, bm25_qas, topk=3):
+    #
+    strs_to_score = []
+    values = []
+
+    for question, answer, source in bm25_qas:
+        str_to_score = triple_to_string([claim, question, answer])
+
+        strs_to_score.append(str_to_score)
+        values.append([question, answer, source])
+
+    if len(bm25_qas) > 0:
+        encoded_dict = rerank_tokenizer(strs_to_score, max_length=512, padding="longest", truncation=True, return_tensors="pt").to(rerank_trained_model.device)
+
+        input_ids = encoded_dict['input_ids']
+        attention_masks = encoded_dict['attention_mask']
+
+        scores = torch.softmax(rerank_trained_model(input_ids, attention_mask=attention_masks).logits, axis=-1)[:, 1]
+
+        top_n = torch.argsort(scores, descending=True)[:topk]
+        pass_through = [{"question": values[i][0], "answers": values[i][1], "source_url": values[i][2]} for i in top_n]
+    else:
+        pass_through = []
+
+    top3_qa_pairs = pass_through
+
+    return top3_qa_pairs
+
+
+@spaces.GPU
+def Googleretriever(query):
+    # ----- Generate QA pairs using AVeriTeC
+    # step 1: generate questions for the query/claim using Bloom
+    generate_question = prompt_question_generation(query)
+    # step 2: retrieve evidence for the generated questions using Google API
+    retrieve_evidence = averitec_search(query, generate_question)
+    # step 3: generate QA pairs for each retrieved document
+    bm25_qa_pairs = decorate_with_questions(query, retrieve_evidence)
+    # step 4: rerank QA pairs
+    top3_qa_pairs = rerank_questions(query, bm25_qa_pairs)
+
+    # Add score to metadata
+    results = []
+    for i, qa in enumerate(top3_qa_pairs):
+        metadata = dict()
+
+        metadata['name'] = qa['question']
+        metadata['url'] = qa['source_url']
+        metadata['cached_source_url'] = qa['source_url']
+        metadata['short_name'] = "Evidence {}".format(i + 1)
+        metadata['page_number'] = ""
+        metadata['title'] = qa['question']
+        metadata['evidence'] = qa['answers']
+        metadata['query'] = qa['question']
+        metadata['answer'] = qa['answers']
+        metadata['page_content'] = "<b>Question</b>: " + qa['question'] + "<br>" + "<b>Answer</b>: " + qa['answers']
+        page_content = f"""{metadata['page_content']}"""
+
+        results.append(Docs(metadata, page_content))
+
+    return results
 
 # ------------------------------ Googleretriever -----------------------------
 
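Taken together, the functions above form a four-step pipeline: question generation, Google search, QA-pair generation, and reranking. A minimal usage sketch, assuming GOOGLE_API_KEY and GOOGLE_SEARCH_ENGINE_ID are set in the environment and that Docs exposes metadata and page_content attributes:

# Sketch only: run the retrieval pipeline end to end for one claim.
evidence_docs = Googleretriever("England won the Euro 2024.")
for doc in evidence_docs:
    print(doc.metadata["short_name"], doc.metadata["url"])
    print(doc.page_content)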
@@ -455,7 +884,7 @@ if __name__ == "__main__":
 # if __name__ == "__main__":
 #     item = {
 #         "claim": "England won the Euro 2024.",
-#         "source": "Wikipedia",
+#         "source": "Google", # Google, Wikipedia
 #     }
 #
 #     results = fact_checking(item)
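
A runnable form of the commented-out smoke test above; fact_checking is defined outside this diff, so its exact return shape is an assumption:

# Sketch only: mirror of the commented-out test, using the new "Google" source.
item = {
    "claim": "England won the Euro 2024.",
    "source": "Google",  # "Google" or "Wikipedia"
}
results = fact_checking(item)
print(results)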