zhenyundeng committed
Commit 55ca411 · Parent(s): 8c5fc49
Files changed (1):
  1. app.py +125 -99
app.py CHANGED
@@ -75,6 +75,80 @@ nlp = spacy.load("en_core_web_sm")
  # all_samples_dict = json.load(open('averitec/data/all_samples.json', 'r'))
  train_examples = json.load(open('averitec/data/train.json', 'r'))

+ def claim2prompts(example):
+     claim = example["claim"]
+
+     # claim_str = "Claim: " + claim + "||Evidence: "
+     claim_str = "Evidence: "
+
+     for question in example["questions"]:
+         q_text = question["question"].strip()
+         if len(q_text) == 0:
+             continue
+
+         if not q_text[-1] == "?":
+             q_text += "?"
+
+         answer_strings = []
+
+         for a in question["answers"]:
+             if a["answer_type"] in ["Extractive", "Abstractive"]:
+                 answer_strings.append(a["answer"])
+             if a["answer_type"] == "Boolean":
+                 answer_strings.append(a["answer"] + ", because " + a["boolean_explanation"].lower().strip())
+
+         for a_text in answer_strings:
+             if not a_text[-1] in [".", "!", ":", "?"]:
+                 a_text += "."
+
+             # prompt_lookup_str = claim + " " + a_text
+             prompt_lookup_str = a_text
+             this_q_claim_str = claim_str + " " + a_text.strip() + "||Question answered: " + q_text
+             yield (prompt_lookup_str, this_q_claim_str.replace("\n", " ").replace("||", "\n"))
+
+
+ def generate_reference_corpus(reference_file):
+     all_data_corpus = []
+     tokenized_corpus = []
+
+     for train_example in train_examples:
+         train_claim = train_example["claim"]
+
+         speaker = train_example["speaker"].strip() if train_example["speaker"] is not None and len(
+             train_example["speaker"]) > 1 else "they"
+
+         questions = [q["question"] for q in train_example["questions"]]
+
+         claim_dict_builder = {}
+         claim_dict_builder["claim"] = train_claim
+         claim_dict_builder["speaker"] = speaker
+         claim_dict_builder["questions"] = questions
+
+         tokenized_corpus.append(nltk.word_tokenize(claim_dict_builder["claim"]))
+         all_data_corpus.append(claim_dict_builder)
+
+     return tokenized_corpus, all_data_corpus
+
+
+ def generate_step2_reference_corpus(reference_file):
+     prompt_corpus = []
+     tokenized_corpus = []
+
+     for example in train_examples:
+         for lookup_str, prompt in claim2prompts(example):
+             entry = nltk.word_tokenize(lookup_str)
+             tokenized_corpus.append(entry)
+             prompt_corpus.append(prompt)
+
+     return tokenized_corpus, prompt_corpus
+
+
+ reference_file = "averitec/data/train.json"
+ tokenized_corpus0, all_data_corpus0 = generate_reference_corpus(reference_file)
+ qg_bm25 = BM25Okapi(tokenized_corpus0)
+
+ tokenized_corpus1, prompt_corpus1 = generate_step2_reference_corpus(reference_file)
+ prompt_bm25 = BM25Okapi(tokenized_corpus1)
+
+
  # print(train_examples[0]['claim'])
  # ---------------------------------------------------------------------------
  # ---------- Load pretrained models ----------
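This hunk hoists the two corpus builders to module scope and constructs both BM25 indices (qg_bm25, prompt_bm25) once at import time, instead of re-tokenizing train.json inside every GPU-decorated request. A minimal sketch of the precompute-once pattern, assuming the same rank_bm25, nltk, and numpy imports app.py already uses (the helper name here is illustrative, not from the commit):

    import json
    import nltk
    import numpy as np
    from rank_bm25 import BM25Okapi

    train_examples = json.load(open("averitec/data/train.json", "r"))

    # Build once at startup: tokenize every training claim and index it.
    tokenized = [nltk.word_tokenize(ex["claim"]) for ex in train_examples]
    bm25 = BM25Okapi(tokenized)

    def topk_claims(query, k=10):
        # Score the query against the whole corpus, then take the k best.
        scores = bm25.get_scores(nltk.word_tokenize(query))
        return [train_examples[i] for i in np.argsort(scores)[::-1][:k]]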
@@ -98,8 +172,8 @@ if torch.cuda.is_available():
  # device = "cuda:0" if torch.cuda.is_available() else "cpu"

  # question generation
- qg_tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-7b1")
- qg_model = BloomForCausalLM.from_pretrained("bigscience/bloom-7b1", torch_dtype=torch.bfloat16).to('cuda')
+ qg_tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-1b1")
+ qg_model = BloomForCausalLM.from_pretrained("bigscience/bloom-1b1", torch_dtype=torch.bfloat16).to('cuda')
  # qg_model = BloomForCausalLM.from_pretrained("bigscience/bloom-7b1", torch_dtype=torch.bfloat16).to(device)
  # qg_tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-7b1")
  # qg_model = BloomForCausalLM.from_pretrained("bigscience/bloom-7b1", torch_dtype=torch.bfloat16).to(device)
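The question-generation model drops from bloom-7b1 to bloom-1b1, roughly a 7x reduction in parameters (about 14 GB of bfloat16 weights down to about 2 GB), which is what makes the always-loaded model practical on a single GPU. A hedged sketch of an equivalent load that avoids hard-coding 'cuda' (an alternative, not part of the commit):

    import torch
    from transformers import BloomForCausalLM, BloomTokenizerFast

    # Fall back to CPU when no GPU is visible.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    qg_tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-1b1")
    qg_model = BloomForCausalLM.from_pretrained(
        "bigscience/bloom-1b1", torch_dtype=torch.bfloat16
    ).to(device)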
@@ -359,30 +433,30 @@ def QAprediction(claim, evidence, sources):


  # ----------GoogleAPIretriever---------
- def generate_reference_corpus(reference_file):
-     # with open(reference_file) as f:
-     #     train_examples = json.load(f)
-
-     all_data_corpus = []
-     tokenized_corpus = []
-
-     for train_example in train_examples:
-         train_claim = train_example["claim"]
-
-         speaker = train_example["speaker"].strip() if train_example["speaker"] is not None and len(
-             train_example["speaker"]) > 1 else "they"
-
-         questions = [q["question"] for q in train_example["questions"]]
-
-         claim_dict_builder = {}
-         claim_dict_builder["claim"] = train_claim
-         claim_dict_builder["speaker"] = speaker
-         claim_dict_builder["questions"] = questions
-
-         tokenized_corpus.append(nltk.word_tokenize(claim_dict_builder["claim"]))
-         all_data_corpus.append(claim_dict_builder)
-
-     return tokenized_corpus, all_data_corpus
+ # def generate_reference_corpus(reference_file):
+ #     # with open(reference_file) as f:
+ #     #     train_examples = json.load(f)
+ #
+ #     all_data_corpus = []
+ #     tokenized_corpus = []
+ #
+ #     for train_example in train_examples:
+ #         train_claim = train_example["claim"]
+ #
+ #         speaker = train_example["speaker"].strip() if train_example["speaker"] is not None and len(
+ #             train_example["speaker"]) > 1 else "they"
+ #
+ #         questions = [q["question"] for q in train_example["questions"]]
+ #
+ #         claim_dict_builder = {}
+ #         claim_dict_builder["claim"] = train_claim
+ #         claim_dict_builder["speaker"] = speaker
+ #         claim_dict_builder["questions"] = questions
+ #
+ #         tokenized_corpus.append(nltk.word_tokenize(claim_dict_builder["claim"]))
+ #         all_data_corpus.append(claim_dict_builder)
+ #
+ #     return tokenized_corpus, all_data_corpus


  def doc2prompt(doc):
@@ -399,22 +473,15 @@ def docs2prompt(top_docs):
  @spaces.GPU
  def prompt_question_generation(test_claim, speaker="they", topk=10):
      #
-     reference_file = "averitec/data/train.json"
-     tokenized_corpus, all_data_corpus = generate_reference_corpus(reference_file)
-     bm25 = BM25Okapi(tokenized_corpus)
-
-     # Define the bloom model:
-     accelerator = Accelerator()
-     # accel_device = accelerator.device
-     # device = "cuda:0" if torch.cuda.is_available() else "cpu"
-     # tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-7b1")
-     # model = BloomForCausalLM.from_pretrained("bigscience/bloom-7b1", torch_dtype=torch.bfloat16).to(device)
+     # reference_file = "averitec/data/train.json"
+     # tokenized_corpus, all_data_corpus = generate_reference_corpus(reference_file)
+     # bm25 = BM25Okapi(tokenized_corpus)

      # --------------------------------------------------
      # test claim
-     s = bm25.get_scores(nltk.word_tokenize(test_claim))
+     s = qg_bm25.get_scores(nltk.word_tokenize(test_claim))
      top_n = np.argsort(s)[::-1][:topk]
-     docs = [all_data_corpus[i] for i in top_n]
+     docs = [all_data_corpus0[i] for i in top_n]
      # --------------------------------------------------

      prompt = docs2prompt(docs) + "\n\n" + "Outrageously, " + speaker + " claimed that \"" + test_claim.strip() + \
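With the index prebuilt, prompt_question_generation now only tokenizes the test claim, scores it against the module-level qg_bm25, and gathers the top-k training claims from all_data_corpus0; the per-call Accelerator() setup was dead weight and is gone. One possible follow-up (a sketch, not in this commit) would bundle each index with the corpus it was built from, so the paired globals cannot drift apart:

    class BM25Index:
        """Keep a BM25 index and its source documents together."""

        def __init__(self, docs, key):
            self.docs = docs
            self.bm25 = BM25Okapi([nltk.word_tokenize(key(d)) for d in docs])

        def topk(self, query, k=10):
            scores = self.bm25.get_scores(nltk.word_tokenize(query))
            return [self.docs[i] for i in np.argsort(scores)[::-1][:k]]

    # e.g. qg_index = BM25Index(all_data_corpus0, key=lambda d: d["claim"])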
@@ -640,71 +707,30 @@ def averitec_search(claim, generate_question, speaker="they", check_date="2024-0
      return retrieve_evidence


- def claim2prompts(example):
-     claim = example["claim"]
-
-     # claim_str = "Claim: " + claim + "||Evidence: "
-     claim_str = "Evidence: "
-
-     for question in example["questions"]:
-         q_text = question["question"].strip()
-         if len(q_text) == 0:
-             continue
-
-         if not q_text[-1] == "?":
-             q_text += "?"
-
-         answer_strings = []
-
-         for a in question["answers"]:
-             if a["answer_type"] in ["Extractive", "Abstractive"]:
-                 answer_strings.append(a["answer"])
-             if a["answer_type"] == "Boolean":
-                 answer_strings.append(a["answer"] + ", because " + a["boolean_explanation"].lower().strip())
-
-         for a_text in answer_strings:
-             if not a_text[-1] in [".", "!", ":", "?"]:
-                 a_text += "."
-
-             # prompt_lookup_str = claim + " " + a_text
-             prompt_lookup_str = a_text
-             this_q_claim_str = claim_str + " " + a_text.strip() + "||Question answered: " + q_text
-             yield (prompt_lookup_str, this_q_claim_str.replace("\n", " ").replace("||", "\n"))
-
-
- def generate_step2_reference_corpus(reference_file):
-     # with open(reference_file) as f:
-     #     train_examples = json.load(f)
-
-     prompt_corpus = []
-     tokenized_corpus = []
-
-     for example in train_examples:
-         for lookup_str, prompt in claim2prompts(example):
-             entry = nltk.word_tokenize(lookup_str)
-             tokenized_corpus.append(entry)
-             prompt_corpus.append(prompt)
-
-     return tokenized_corpus, prompt_corpus
+
+
+
+ # def generate_step2_reference_corpus(reference_file):
+ #     # with open(reference_file) as f:
+ #     #     train_examples = json.load(f)
+ #
+ #     prompt_corpus = []
+ #     tokenized_corpus = []
+ #
+ #     for example in train_examples:
+ #         for lookup_str, prompt in claim2prompts(example):
+ #             entry = nltk.word_tokenize(lookup_str)
+ #             tokenized_corpus.append(entry)
+ #             prompt_corpus.append(prompt)
+ #
+ #     return tokenized_corpus, prompt_corpus

  @spaces.GPU
  def decorate_with_questions(claim, retrieve_evidence, top_k=3): # top_k=5, 10, 100
      #
-     reference_file = "averitec/data/train.json"
-     tokenized_corpus, prompt_corpus = generate_step2_reference_corpus(reference_file)
-     prompt_bm25 = BM25Okapi(tokenized_corpus)
-
-     # Define the bloom model:
-     # accelerator = Accelerator()
-     # accel_device = accelerator.device
-     # device = "cuda:0" if torch.cuda.is_available() else "cpu"
-     # tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-7b1")
-     # model = BloomForCausalLM.from_pretrained(
-     #     "bigscience/bloom-7b1",
-     #     device_map="auto",
-     #     torch_dtype=torch.bfloat16,
-     #     offload_folder="./offload"
-     # )
+     # reference_file = "averitec/data/train.json"
+     # tokenized_corpus, prompt_corpus = generate_step2_reference_corpus(reference_file)
+     # prompt_bm25 = BM25Okapi(tokenized_corpus)

      #
      tokenized_corpus = []
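claim2prompts and generate_step2_reference_corpus are unchanged by this commit, only hoisted to the top of the file. For reference, what claim2prompts yields, traced by hand on a toy example (illustrative data, not from train.json):

    example = {
        "claim": "The earth is flat.",
        "questions": [{
            "question": "What shape is the earth",
            "answers": [{"answer_type": "Boolean", "answer": "No",
                         "boolean_explanation": "Satellite imagery shows a sphere."}],
        }],
    }
    for lookup, prompt in claim2prompts(example):
        print(repr(lookup))  # 'No, because satellite imagery shows a sphere.'
        print(prompt)
        # Evidence:  No, because satellite imagery shows a sphere.
        # Question answered: What shape is the earth?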
@@ -749,7 +775,7 @@ def decorate_with_questions(claim, retrieve_evidence, top_k=3): # top_k=5, 10,
          prompt_s = prompt_bm25.get_scores(nltk.word_tokenize(prompt_lookup_str))
          prompt_n = 10
          prompt_top_n = np.argsort(prompt_s)[::-1][:prompt_n]
-         prompt_docs = [prompt_corpus[i] for i in prompt_top_n]
+         prompt_docs = [prompt_corpus1[i] for i in prompt_top_n]

          claim_prompt = "Evidence: " + doc[1].replace("\n", " ") + "\nQuestion answered: "
          prompt = "\n\n".join(prompt_docs + [claim_prompt])
@@ -757,8 +783,8 @@ def decorate_with_questions(claim, retrieve_evidence, top_k=3): # top_k=5, 10,

      inputs = qg_tokenizer(sentences, padding=True, return_tensors="pt").to(qg_model.device)
      # inputs = qg_tokenizer(sentences, padding=True, return_tensors="pt").to(device)
-     outputs = qg_model.generate(inputs["input_ids"], max_length=5000, num_beams=2, no_repeat_ngram_size=2, early_stopping=True)
-
+     outputs = qg_model.generate(inputs["input_ids"], max_length=2000, num_beams=2, no_repeat_ngram_size=2, early_stopping=True)
+     # max_length=5000
      tgt_text = qg_tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:], skip_special_tokens=True)[0]
      # We are not allowed to generate more than 250 characters:
      tgt_text = tgt_text[:250]
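Note that generate()'s max_length bounds prompt plus continuation together, so if the tokenized 10-shot prompt approaches 2000 tokens there is little or no room left to generate; and since only the first 250 characters of the continuation are kept anyway, bounding the new tokens directly would be tighter. A sketch using max_new_tokens (an alternative, not what the commit does):

    outputs = qg_model.generate(
        inputs["input_ids"],
        max_new_tokens=128,      # bound only the generated continuation
        num_beams=2,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )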
 