zhenyundeng committed
Commit 55ca411 · Parent(s): 8c5fc49
Files changed (1):
  1. app.py +125 -99
app.py CHANGED
@@ -75,6 +75,80 @@ nlp = spacy.load("en_core_web_sm")
  # all_samples_dict = json.load(open('averitec/data/all_samples.json', 'r'))
  train_examples = json.load(open('averitec/data/train.json', 'r'))

+ def claim2prompts(example):
+     claim = example["claim"]
+
+     # claim_str = "Claim: " + claim + "||Evidence: "
+     claim_str = "Evidence: "
+
+     for question in example["questions"]:
+         q_text = question["question"].strip()
+         if len(q_text) == 0:
+             continue
+
+         if not q_text[-1] == "?":
+             q_text += "?"
+
+         answer_strings = []
+
+         for a in question["answers"]:
+             if a["answer_type"] in ["Extractive", "Abstractive"]:
+                 answer_strings.append(a["answer"])
+             if a["answer_type"] == "Boolean":
+                 answer_strings.append(a["answer"] + ", because " + a["boolean_explanation"].lower().strip())
+
+         for a_text in answer_strings:
+             if not a_text[-1] in [".", "!", ":", "?"]:
+                 a_text += "."
+
+             # prompt_lookup_str = claim + " " + a_text
+             prompt_lookup_str = a_text
+             this_q_claim_str = claim_str + " " + a_text.strip() + "||Question answered: " + q_text
+             yield (prompt_lookup_str, this_q_claim_str.replace("\n", " ").replace("||", "\n"))
+
+
+ def generate_reference_corpus(reference_file):
+     all_data_corpus = []
+     tokenized_corpus = []
+
+     for train_example in train_examples:
+         train_claim = train_example["claim"]
+
+         speaker = train_example["speaker"].strip() if train_example["speaker"] is not None and len(
+             train_example["speaker"]) > 1 else "they"
+
+         questions = [q["question"] for q in train_example["questions"]]
+
+         claim_dict_builder = {}
+         claim_dict_builder["claim"] = train_claim
+         claim_dict_builder["speaker"] = speaker
+         claim_dict_builder["questions"] = questions
+
+         tokenized_corpus.append(nltk.word_tokenize(claim_dict_builder["claim"]))
+         all_data_corpus.append(claim_dict_builder)
+
+     return tokenized_corpus, all_data_corpus
+
+
+ def generate_step2_reference_corpus(reference_file):
+     prompt_corpus = []
+     tokenized_corpus = []
+
+     for example in train_examples:
+         for lookup_str, prompt in claim2prompts(example):
+             entry = nltk.word_tokenize(lookup_str)
+             tokenized_corpus.append(entry)
+             prompt_corpus.append(prompt)
+
+     return tokenized_corpus, prompt_corpus
+
+
+ reference_file = "averitec/data/train.json"
+ tokenized_corpus0, all_data_corpus0 = generate_reference_corpus(reference_file)
+ qg_bm25 = BM25Okapi(tokenized_corpus0)
+
+ tokenized_corpus1, prompt_corpus1 = generate_step2_reference_corpus(reference_file)
+ prompt_bm25 = BM25Okapi(tokenized_corpus1)
+
+
  # print(train_examples[0]['claim'])
  # ---------------------------------------------------------------------------
  # ---------- Load pretrained models ----------
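This hunk hoists the two corpus builders to module scope and constructs both BM25 indices (qg_bm25, prompt_bm25) once at import time, instead of re-tokenizing train.json inside every GPU-decorated request. A minimal sketch of the precompute-once pattern, assuming the same rank_bm25, nltk, and numpy imports app.py already uses (the helper name here is illustrative, not from the commit):

    import json
    import nltk
    import numpy as np
    from rank_bm25 import BM25Okapi

    train_examples = json.load(open("averitec/data/train.json", "r"))

    # Build once at startup: tokenize every training claim and index it.
    tokenized = [nltk.word_tokenize(ex["claim"]) for ex in train_examples]
    bm25 = BM25Okapi(tokenized)

    def topk_claims(query, k=10):
        # Score the query against the whole corpus, then take the k best.
        scores = bm25.get_scores(nltk.word_tokenize(query))
        return [train_examples[i] for i in np.argsort(scores)[::-1][:k]]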
@@ -98,8 +172,8 @@ if torch.cuda.is_available():
  # device = "cuda:0" if torch.cuda.is_available() else "cpu"

  # question generation
- qg_tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-7b1")
- qg_model = BloomForCausalLM.from_pretrained("bigscience/bloom-7b1", torch_dtype=torch.bfloat16).to('cuda')
+ qg_tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-1b1")
+ qg_model = BloomForCausalLM.from_pretrained("bigscience/bloom-1b1", torch_dtype=torch.bfloat16).to('cuda')
  # qg_model = BloomForCausalLM.from_pretrained("bigscience/bloom-7b1", torch_dtype=torch.bfloat16).to(device)
  # qg_tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-7b1")
  # qg_model = BloomForCausalLM.from_pretrained("bigscience/bloom-7b1", torch_dtype=torch.bfloat16).to(device)
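The question-generation model drops from bloom-7b1 to bloom-1b1, roughly a 7x reduction in parameters (about 14 GB of bfloat16 weights down to about 2 GB), which is what makes the always-loaded model practical on a single GPU. A hedged sketch of an equivalent load that avoids hard-coding 'cuda' (an alternative, not part of the commit):

    import torch
    from transformers import BloomForCausalLM, BloomTokenizerFast

    # Fall back to CPU when no GPU is visible.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    qg_tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-1b1")
    qg_model = BloomForCausalLM.from_pretrained(
        "bigscience/bloom-1b1", torch_dtype=torch.bfloat16
    ).to(device)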
@@ -359,30 +433,30 @@ def QAprediction(claim, evidence, sources):


  # ----------GoogleAPIretriever---------
- def generate_reference_corpus(reference_file):
-     # with open(reference_file) as f:
-     #     train_examples = json.load(f)
-
-     all_data_corpus = []
-     tokenized_corpus = []
-
-     for train_example in train_examples:
-         train_claim = train_example["claim"]
-
-         speaker = train_example["speaker"].strip() if train_example["speaker"] is not None and len(
-             train_example["speaker"]) > 1 else "they"
-
-         questions = [q["question"] for q in train_example["questions"]]
-
-         claim_dict_builder = {}
-         claim_dict_builder["claim"] = train_claim
-         claim_dict_builder["speaker"] = speaker
-         claim_dict_builder["questions"] = questions
-
-         tokenized_corpus.append(nltk.word_tokenize(claim_dict_builder["claim"]))
-         all_data_corpus.append(claim_dict_builder)
-
-     return tokenized_corpus, all_data_corpus
+ # def generate_reference_corpus(reference_file):
+ #     # with open(reference_file) as f:
+ #     #     train_examples = json.load(f)
+ #
+ #     all_data_corpus = []
+ #     tokenized_corpus = []
+ #
+ #     for train_example in train_examples:
+ #         train_claim = train_example["claim"]
+ #
+ #         speaker = train_example["speaker"].strip() if train_example["speaker"] is not None and len(
+ #             train_example["speaker"]) > 1 else "they"
+ #
+ #         questions = [q["question"] for q in train_example["questions"]]
+ #
+ #         claim_dict_builder = {}
+ #         claim_dict_builder["claim"] = train_claim
+ #         claim_dict_builder["speaker"] = speaker
+ #         claim_dict_builder["questions"] = questions
+ #
+ #         tokenized_corpus.append(nltk.word_tokenize(claim_dict_builder["claim"]))
+ #         all_data_corpus.append(claim_dict_builder)
+ #
+ #     return tokenized_corpus, all_data_corpus


  def doc2prompt(doc):
@@ -399,22 +473,15 @@ def docs2prompt(top_docs):
  @spaces.GPU
  def prompt_question_generation(test_claim, speaker="they", topk=10):
      #
-     reference_file = "averitec/data/train.json"
-     tokenized_corpus, all_data_corpus = generate_reference_corpus(reference_file)
-     bm25 = BM25Okapi(tokenized_corpus)
-
-     # Define the bloom model:
-     accelerator = Accelerator()
-     # accel_device = accelerator.device
-     # device = "cuda:0" if torch.cuda.is_available() else "cpu"
-     # tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-7b1")
-     # model = BloomForCausalLM.from_pretrained("bigscience/bloom-7b1", torch_dtype=torch.bfloat16).to(device)
+     # reference_file = "averitec/data/train.json"
+     # tokenized_corpus, all_data_corpus = generate_reference_corpus(reference_file)
+     # bm25 = BM25Okapi(tokenized_corpus)

      # --------------------------------------------------
      # test claim
-     s = bm25.get_scores(nltk.word_tokenize(test_claim))
+     s = qg_bm25.get_scores(nltk.word_tokenize(test_claim))
      top_n = np.argsort(s)[::-1][:topk]
-     docs = [all_data_corpus[i] for i in top_n]
+     docs = [all_data_corpus0[i] for i in top_n]
      # --------------------------------------------------

      prompt = docs2prompt(docs) + "\n\n" + "Outrageously, " + speaker + " claimed that \"" + test_claim.strip() + \
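With the index prebuilt, prompt_question_generation now only tokenizes the test claim, scores it against the module-level qg_bm25, and gathers the top-k training claims from all_data_corpus0; the per-call Accelerator() setup was dead weight and is gone. One possible follow-up (a sketch, not in this commit) would bundle each index with the corpus it was built from, so the paired globals cannot drift apart:

    class BM25Index:
        """Keep a BM25 index and its source documents together."""

        def __init__(self, docs, key):
            self.docs = docs
            self.bm25 = BM25Okapi([nltk.word_tokenize(key(d)) for d in docs])

        def topk(self, query, k=10):
            scores = self.bm25.get_scores(nltk.word_tokenize(query))
            return [self.docs[i] for i in np.argsort(scores)[::-1][:k]]

    # e.g. qg_index = BM25Index(all_data_corpus0, key=lambda d: d["claim"])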
@@ -640,71 +707,30 @@ def averitec_search(claim, generate_question, speaker="they", check_date="2024-0
      return retrieve_evidence


- def claim2prompts(example):
-     claim = example["claim"]
-
-     # claim_str = "Claim: " + claim + "||Evidence: "
-     claim_str = "Evidence: "
-
-     for question in example["questions"]:
-         q_text = question["question"].strip()
-         if len(q_text) == 0:
-             continue
-
-         if not q_text[-1] == "?":
-             q_text += "?"
-
-         answer_strings = []
-
-         for a in question["answers"]:
-             if a["answer_type"] in ["Extractive", "Abstractive"]:
-                 answer_strings.append(a["answer"])
-             if a["answer_type"] == "Boolean":
-                 answer_strings.append(a["answer"] + ", because " + a["boolean_explanation"].lower().strip())
-
-         for a_text in answer_strings:
-             if not a_text[-1] in [".", "!", ":", "?"]:
-                 a_text += "."
-
-             # prompt_lookup_str = claim + " " + a_text
-             prompt_lookup_str = a_text
-             this_q_claim_str = claim_str + " " + a_text.strip() + "||Question answered: " + q_text
-             yield (prompt_lookup_str, this_q_claim_str.replace("\n", " ").replace("||", "\n"))
-
-
- def generate_step2_reference_corpus(reference_file):
-     # with open(reference_file) as f:
-     #     train_examples = json.load(f)
-
-     prompt_corpus = []
-     tokenized_corpus = []
-
-     for example in train_examples:
-         for lookup_str, prompt in claim2prompts(example):
-             entry = nltk.word_tokenize(lookup_str)
-             tokenized_corpus.append(entry)
-             prompt_corpus.append(prompt)
-
-     return tokenized_corpus, prompt_corpus
+
+
+
+ # def generate_step2_reference_corpus(reference_file):
+ #     # with open(reference_file) as f:
+ #     #     train_examples = json.load(f)
+ #
+ #     prompt_corpus = []
+ #     tokenized_corpus = []
+ #
+ #     for example in train_examples:
+ #         for lookup_str, prompt in claim2prompts(example):
+ #             entry = nltk.word_tokenize(lookup_str)
+ #             tokenized_corpus.append(entry)
+ #             prompt_corpus.append(prompt)
+ #
+ #     return tokenized_corpus, prompt_corpus

  @spaces.GPU
  def decorate_with_questions(claim, retrieve_evidence, top_k=3): # top_k=5, 10, 100
      #
-     reference_file = "averitec/data/train.json"
-     tokenized_corpus, prompt_corpus = generate_step2_reference_corpus(reference_file)
-     prompt_bm25 = BM25Okapi(tokenized_corpus)
-
-     # Define the bloom model:
-     # accelerator = Accelerator()
-     # accel_device = accelerator.device
-     # device = "cuda:0" if torch.cuda.is_available() else "cpu"
-     # tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-7b1")
-     # model = BloomForCausalLM.from_pretrained(
-     #     "bigscience/bloom-7b1",
-     #     device_map="auto",
-     #     torch_dtype=torch.bfloat16,
-     #     offload_folder="./offload"
-     # )
+     # reference_file = "averitec/data/train.json"
+     # tokenized_corpus, prompt_corpus = generate_step2_reference_corpus(reference_file)
+     # prompt_bm25 = BM25Okapi(tokenized_corpus)

      #
      tokenized_corpus = []
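claim2prompts and generate_step2_reference_corpus are unchanged by this commit, only hoisted to the top of the file. For reference, what claim2prompts yields, traced by hand on a toy example (illustrative data, not from train.json):

    example = {
        "claim": "The earth is flat.",
        "questions": [{
            "question": "What shape is the earth",
            "answers": [{"answer_type": "Boolean", "answer": "No",
                         "boolean_explanation": "Satellite imagery shows a sphere."}],
        }],
    }
    for lookup, prompt in claim2prompts(example):
        print(repr(lookup))  # 'No, because satellite imagery shows a sphere.'
        print(prompt)
        # Evidence:  No, because satellite imagery shows a sphere.
        # Question answered: What shape is the earth?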
@@ -749,7 +775,7 @@ def decorate_with_questions(claim, retrieve_evidence, top_k=3): # top_k=5, 10,
          prompt_s = prompt_bm25.get_scores(nltk.word_tokenize(prompt_lookup_str))
          prompt_n = 10
          prompt_top_n = np.argsort(prompt_s)[::-1][:prompt_n]
-         prompt_docs = [prompt_corpus[i] for i in prompt_top_n]
+         prompt_docs = [prompt_corpus1[i] for i in prompt_top_n]

          claim_prompt = "Evidence: " + doc[1].replace("\n", " ") + "\nQuestion answered: "
          prompt = "\n\n".join(prompt_docs + [claim_prompt])
@@ -757,8 +783,8 @@ def decorate_with_questions(claim, retrieve_evidence, top_k=3): # top_k=5, 10,

      inputs = qg_tokenizer(sentences, padding=True, return_tensors="pt").to(qg_model.device)
      # inputs = qg_tokenizer(sentences, padding=True, return_tensors="pt").to(device)
-     outputs = qg_model.generate(inputs["input_ids"], max_length=5000, num_beams=2, no_repeat_ngram_size=2, early_stopping=True)
-
+     outputs = qg_model.generate(inputs["input_ids"], max_length=2000, num_beams=2, no_repeat_ngram_size=2, early_stopping=True)
+     # max_length=5000
      tgt_text = qg_tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:], skip_special_tokens=True)[0]
      # We are not allowed to generate more than 250 characters:
      tgt_text = tgt_text[:250]
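Note that generate()'s max_length bounds prompt plus continuation together, so if the tokenized 10-shot prompt approaches 2000 tokens there is little or no room left to generate; and since only the first 250 characters of the continuation are kept anyway, bounding the new tokens directly would be tighter. A sketch using max_new_tokens (an alternative, not what the commit does):

    outputs = qg_model.generate(
        inputs["input_ids"],
        max_new_tokens=128,      # bound only the generated continuation
        num_beams=2,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )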
 