Spaces · Build error
zhenyundeng committed · Commit 55ca411 · Parent: 8c5fc49
Commit message: "udpate"
app.py
CHANGED
@@ -75,6 +75,80 @@ nlp = spacy.load("en_core_web_sm")
 # all_samples_dict = json.load(open('averitec/data/all_samples.json', 'r'))
 train_examples = json.load(open('averitec/data/train.json', 'r'))
 
+def claim2prompts(example):
+    claim = example["claim"]
+
+    # claim_str = "Claim: " + claim + "||Evidence: "
+    claim_str = "Evidence: "
+
+    for question in example["questions"]:
+        q_text = question["question"].strip()
+        if len(q_text) == 0:
+            continue
+
+        if not q_text[-1] == "?":
+            q_text += "?"
+
+        answer_strings = []
+
+        for a in question["answers"]:
+            if a["answer_type"] in ["Extractive", "Abstractive"]:
+                answer_strings.append(a["answer"])
+            if a["answer_type"] == "Boolean":
+                answer_strings.append(a["answer"] + ", because " + a["boolean_explanation"].lower().strip())
+
+        for a_text in answer_strings:
+            if not a_text[-1] in [".", "!", ":", "?"]:
+                a_text += "."
+
+            # prompt_lookup_str = claim + " " + a_text
+            prompt_lookup_str = a_text
+            this_q_claim_str = claim_str + " " + a_text.strip() + "||Question answered: " + q_text
+            yield (prompt_lookup_str, this_q_claim_str.replace("\n", " ").replace("||", "\n"))
+
+
+def generate_reference_corpus(reference_file):
+    all_data_corpus = []
+    tokenized_corpus = []
+
+    for train_example in train_examples:
+        train_claim = train_example["claim"]
+
+        speaker = train_example["speaker"].strip() if train_example["speaker"] is not None and len(
+            train_example["speaker"]) > 1 else "they"
+
+        questions = [q["question"] for q in train_example["questions"]]
+
+        claim_dict_builder = {}
+        claim_dict_builder["claim"] = train_claim
+        claim_dict_builder["speaker"] = speaker
+        claim_dict_builder["questions"] = questions
+
+        tokenized_corpus.append(nltk.word_tokenize(claim_dict_builder["claim"]))
+        all_data_corpus.append(claim_dict_builder)
+
+    return tokenized_corpus, all_data_corpus
+
+def generate_step2_reference_corpus(reference_file):
+    prompt_corpus = []
+    tokenized_corpus = []
+
+    for example in train_examples:
+        for lookup_str, prompt in claim2prompts(example):
+            entry = nltk.word_tokenize(lookup_str)
+            tokenized_corpus.append(entry)
+            prompt_corpus.append(prompt)
+
+    return tokenized_corpus, prompt_corpus
+
+reference_file = "averitec/data/train.json"
+tokenized_corpus0, all_data_corpus0 = generate_reference_corpus(reference_file)
+qg_bm25 = BM25Okapi(tokenized_corpus0)
+
+tokenized_corpus1, prompt_corpus1 = generate_step2_reference_corpus(reference_file)
+prompt_bm25 = BM25Okapi(tokenized_corpus1)
+
+
 # print(train_examples[0]['claim'])
 # ---------------------------------------------------------------------------
 # ---------- Load pretrained models ----------
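Note: this hunk moves claim2prompts and the two corpus builders to module scope, so both BM25 indices are built once at import time instead of on every request. A minimal sketch of how such an index behaves, assuming the rank_bm25 and nltk packages already imported by app.py (the toy claims below are invented for illustration):

import nltk
import numpy as np
from rank_bm25 import BM25Okapi

# Assumes nltk's "punkt" tokenizer data is installed: nltk.download("punkt")
toy_claims = [
    "The unemployment rate fell to 3.5 percent in 2019.",
    "Vaccines cause more deaths than the diseases they prevent.",
    "The city banned plastic bags in 2021.",
]

# One token list per claim, exactly how generate_reference_corpus builds its corpus.
tokenized = [nltk.word_tokenize(c) for c in toy_claims]
bm25 = BM25Okapi(tokenized)

# Score a query and keep the best matches, mirroring the
# qg_bm25.get_scores(...) / np.argsort(...)[::-1][:topk] pattern used below.
scores = bm25.get_scores(nltk.word_tokenize("Did unemployment drop below 4 percent?"))
top_k = np.argsort(scores)[::-1][:2]
print([toy_claims[i] for i in top_k])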
@@ -98,8 +172,8 @@ if torch.cuda.is_available():
 # device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
 # question generation
-qg_tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-
-qg_model = BloomForCausalLM.from_pretrained("bigscience/bloom-
+qg_tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-1b1")
+qg_model = BloomForCausalLM.from_pretrained("bigscience/bloom-1b1", torch_dtype=torch.bfloat16).to('cuda')
 # qg_model = BloomForCausalLM.from_pretrained("bigscience/bloom-7b1", torch_dtype=torch.bfloat16).to(device)
 # qg_tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-7b1")
 # qg_model = BloomForCausalLM.from_pretrained("bigscience/bloom-7b1", torch_dtype=torch.bfloat16).to(device)
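Note: the question-generation model is now pinned to bigscience/bloom-1b1 in bfloat16 and moved to the GPU unconditionally at import time. A hedged sketch of an equivalent load with a CPU fallback (the fallback is an assumption of this note, not part of the commit):

import torch
from transformers import BloomForCausalLM, BloomTokenizerFast

# Same checkpoint the commit pins; fall back to CPU/float32 when no GPU is
# available, where an unconditional .to('cuda') would raise.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.bfloat16 if device == "cuda" else torch.float32

qg_tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-1b1")
qg_model = BloomForCausalLM.from_pretrained("bigscience/bloom-1b1", torch_dtype=dtype).to(device)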
@@ -359,30 +433,30 @@ def QAprediction(claim, evidence, sources):
 
 
 # ----------GoogleAPIretriever---------
-def generate_reference_corpus(reference_file):
-    # with open(reference_file) as f:
-    #     train_examples = json.load(f)
-
-    all_data_corpus = []
-    tokenized_corpus = []
-
-    for train_example in train_examples:
-        train_claim = train_example["claim"]
-
-        speaker = train_example["speaker"].strip() if train_example["speaker"] is not None and len(
-            train_example["speaker"]) > 1 else "they"
-
-        questions = [q["question"] for q in train_example["questions"]]
-
-        claim_dict_builder = {}
-        claim_dict_builder["claim"] = train_claim
-        claim_dict_builder["speaker"] = speaker
-        claim_dict_builder["questions"] = questions
-
-        tokenized_corpus.append(nltk.word_tokenize(claim_dict_builder["claim"]))
-        all_data_corpus.append(claim_dict_builder)
-
-    return tokenized_corpus, all_data_corpus
+# def generate_reference_corpus(reference_file):
+#     # with open(reference_file) as f:
+#     #     train_examples = json.load(f)
+#
+#     all_data_corpus = []
+#     tokenized_corpus = []
+#
+#     for train_example in train_examples:
+#         train_claim = train_example["claim"]
+#
+#         speaker = train_example["speaker"].strip() if train_example["speaker"] is not None and len(
+#             train_example["speaker"]) > 1 else "they"
+#
+#         questions = [q["question"] for q in train_example["questions"]]
+#
+#         claim_dict_builder = {}
+#         claim_dict_builder["claim"] = train_claim
+#         claim_dict_builder["speaker"] = speaker
+#         claim_dict_builder["questions"] = questions
+#
+#         tokenized_corpus.append(nltk.word_tokenize(claim_dict_builder["claim"]))
+#         all_data_corpus.append(claim_dict_builder)
+#
+#     return tokenized_corpus, all_data_corpus
 
 
 def doc2prompt(doc):
@@ -399,22 +473,15 @@ def docs2prompt(top_docs):
 @spaces.GPU
 def prompt_question_generation(test_claim, speaker="they", topk=10):
     #
-    reference_file = "averitec/data/train.json"
-    tokenized_corpus, all_data_corpus = generate_reference_corpus(reference_file)
-    bm25 = BM25Okapi(tokenized_corpus)
-
-    # Define the bloom model:
-    accelerator = Accelerator()
-    # accel_device = accelerator.device
-    # device = "cuda:0" if torch.cuda.is_available() else "cpu"
-    # tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-7b1")
-    # model = BloomForCausalLM.from_pretrained("bigscience/bloom-7b1", torch_dtype=torch.bfloat16).to(device)
+    # reference_file = "averitec/data/train.json"
+    # tokenized_corpus, all_data_corpus = generate_reference_corpus(reference_file)
+    # bm25 = BM25Okapi(tokenized_corpus)
 
     # --------------------------------------------------
     # test claim
-    s = bm25.get_scores(nltk.word_tokenize(test_claim))
+    s = qg_bm25.get_scores(nltk.word_tokenize(test_claim))
     top_n = np.argsort(s)[::-1][:topk]
-    docs = [all_data_corpus[i] for i in top_n]
+    docs = [all_data_corpus0[i] for i in top_n]
     # --------------------------------------------------
 
     prompt = docs2prompt(docs) + "\n\n" + "Outrageously, " + speaker + " claimed that \"" + test_claim.strip() + \
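Note: with the index at module scope, prompt_question_generation reduces to scoring the test claim against qg_bm25 and passing the top-k training examples to docs2prompt. The retrieval step, factored into a standalone helper for clarity (retrieve_few_shot_examples is a name invented for this sketch):

import nltk
import numpy as np

def retrieve_few_shot_examples(test_claim, bm25_index, data_corpus, topk=10):
    # Rank every training claim by BM25 score against the test claim and
    # return the top-k example dicts, as the hunk above now does inline.
    scores = bm25_index.get_scores(nltk.word_tokenize(test_claim))
    top_n = np.argsort(scores)[::-1][:topk]
    return [data_corpus[i] for i in top_n]

# Equivalent to the new retrieval lines above:
# docs = retrieve_few_shot_examples(test_claim, qg_bm25, all_data_corpus0)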
@@ -640,71 +707,30 @@ def averitec_search(claim, generate_question, speaker="they", check_date="2024-0
     return retrieve_evidence
 
 
-def claim2prompts(example):
-    claim = example["claim"]
-
-    # claim_str = "Claim: " + claim + "||Evidence: "
-    claim_str = "Evidence: "
-
-    for question in example["questions"]:
-        q_text = question["question"].strip()
-        if len(q_text) == 0:
-            continue
-
-        if not q_text[-1] == "?":
-            q_text += "?"
-
-        answer_strings = []
-
-        for a in question["answers"]:
-            if a["answer_type"] in ["Extractive", "Abstractive"]:
-                answer_strings.append(a["answer"])
-            if a["answer_type"] == "Boolean":
-                answer_strings.append(a["answer"] + ", because " + a["boolean_explanation"].lower().strip())
-
-        for a_text in answer_strings:
-            if not a_text[-1] in [".", "!", ":", "?"]:
-                a_text += "."
-
-            # prompt_lookup_str = claim + " " + a_text
-            prompt_lookup_str = a_text
-            this_q_claim_str = claim_str + " " + a_text.strip() + "||Question answered: " + q_text
-            yield (prompt_lookup_str, this_q_claim_str.replace("\n", " ").replace("||", "\n"))
-
-
-def generate_step2_reference_corpus(reference_file):
-    # with open(reference_file) as f:
-    #     train_examples = json.load(f)
-
-    prompt_corpus = []
-    tokenized_corpus = []
-
-    for example in train_examples:
-        for lookup_str, prompt in claim2prompts(example):
-            entry = nltk.word_tokenize(lookup_str)
-            tokenized_corpus.append(entry)
-            prompt_corpus.append(prompt)
-
-    return tokenized_corpus, prompt_corpus
+
+
+
+# def generate_step2_reference_corpus(reference_file):
+#     # with open(reference_file) as f:
+#     #     train_examples = json.load(f)
+#
+#     prompt_corpus = []
+#     tokenized_corpus = []
+#
+#     for example in train_examples:
+#         for lookup_str, prompt in claim2prompts(example):
+#             entry = nltk.word_tokenize(lookup_str)
+#             tokenized_corpus.append(entry)
+#             prompt_corpus.append(prompt)
+#
+#     return tokenized_corpus, prompt_corpus
 
 @spaces.GPU
 def decorate_with_questions(claim, retrieve_evidence, top_k=3):  # top_k=5, 10, 100
     #
-    reference_file = "averitec/data/train.json"
-    tokenized_corpus, prompt_corpus = generate_step2_reference_corpus(reference_file)
-    prompt_bm25 = BM25Okapi(tokenized_corpus)
-
-    # Define the bloom model:
-    # accelerator = Accelerator()
-    # accel_device = accelerator.device
-    # device = "cuda:0" if torch.cuda.is_available() else "cpu"
-    # tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-7b1")
-    # model = BloomForCausalLM.from_pretrained(
-    #     "bigscience/bloom-7b1",
-    #     device_map="auto",
-    #     torch_dtype=torch.bfloat16,
-    #     offload_folder="./offload"
-    # )
+    # reference_file = "averitec/data/train.json"
+    # tokenized_corpus, prompt_corpus = generate_step2_reference_corpus(reference_file)
+    # prompt_bm25 = BM25Okapi(tokenized_corpus)
 
     #
     tokenized_corpus = []
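Note: claim2prompts (now defined near the top of the file) is a generator that yields one (lookup string, prompt line) pair per usable answer. A toy invocation on an invented example dict in the AVeriTeC training shape:

example = {
    "claim": "The minimum wage doubled in 2020.",
    "questions": [
        {
            "question": "Did the minimum wage change in 2020",
            "answers": [{"answer_type": "Extractive", "answer": "It rose by 5 percent"}],
        }
    ],
}

for lookup_str, prompt in claim2prompts(example):
    # lookup_str == "It rose by 5 percent."  (answer text used for the BM25 lookup)
    # prompt     == "Evidence:  It rose by 5 percent.\nQuestion answered: Did the minimum wage change in 2020?"
    print(repr(lookup_str), repr(prompt))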
@@ -749,7 +775,7 @@ def decorate_with_questions(claim, retrieve_evidence, top_k=3):  # top_k=5, 10,
     prompt_s = prompt_bm25.get_scores(nltk.word_tokenize(prompt_lookup_str))
     prompt_n = 10
     prompt_top_n = np.argsort(prompt_s)[::-1][:prompt_n]
-    prompt_docs = [prompt_corpus[i] for i in prompt_top_n]
+    prompt_docs = [prompt_corpus1[i] for i in prompt_top_n]
 
     claim_prompt = "Evidence: " + doc[1].replace("\n", " ") + "\nQuestion answered: "
     prompt = "\n\n".join(prompt_docs + [claim_prompt])
@@ -757,8 +783,8 @@ def decorate_with_questions(claim, retrieve_evidence, top_k=3):  # top_k=5, 10,
 
     inputs = qg_tokenizer(sentences, padding=True, return_tensors="pt").to(qg_model.device)
     # inputs = qg_tokenizer(sentences, padding=True, return_tensors="pt").to(device)
-    outputs = qg_model.generate(inputs["input_ids"], max_length=
-
+    outputs = qg_model.generate(inputs["input_ids"], max_length=2000, num_beams=2, no_repeat_ngram_size=2, early_stopping=True)
+    # max_length=5000
     tgt_text = qg_tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:], skip_special_tokens=True)[0]
     # We are not allowed to generate more than 250 characters:
     tgt_text = tgt_text[:250]
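Note: because Bloom is decoder-only, generate() returns the prompt tokens followed by the continuation, which is why the code slices outputs at inputs["input_ids"].shape[-1] before decoding. A self-contained sketch of that pattern with the same generation settings the commit introduces (the prompt text is illustrative):

from transformers import BloomForCausalLM, BloomTokenizerFast

tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-1b1")
model = BloomForCausalLM.from_pretrained("bigscience/bloom-1b1")

prompt = "Evidence: The river flooded twice last year.\nQuestion answered:"
inputs = tokenizer([prompt], padding=True, return_tensors="pt").to(model.device)

# Beam search with the commit's settings; the output still contains the prompt.
outputs = model.generate(inputs["input_ids"], max_length=2000, num_beams=2,
                         no_repeat_ngram_size=2, early_stopping=True)

# Drop the prompt tokens and decode only the new continuation.
prompt_len = inputs["input_ids"].shape[-1]
tgt_text = tokenizer.batch_decode(outputs[:, prompt_len:], skip_special_tokens=True)[0]
print(tgt_text[:250])  # the app keeps at most 250 characters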