Jingxiang Mo committed on
Commit 77e7345 · 1 Parent(s): 4071dd4

Lint and code optimization

Files changed (3)
  1. .vscode/settings.json +5 -0
  2. __pycache__/app.cpython-39.pyc +0 -0
  3. app.py +74 -73
.vscode/settings.json ADDED
@@ -0,0 +1,5 @@
+{
+    "python.linting.pylintEnabled": true,
+    "python.linting.enabled": true,
+    "python.formatting.provider": "yapf"
+}
__pycache__/app.cpython-39.pyc CHANGED
Binary files a/__pycache__/app.cpython-39.pyc and b/__pycache__/app.cpython-39.pyc differ
 
app.py CHANGED
@@ -3,16 +3,18 @@ import gradio as gr
 import numpy as np
 import wikipediaapi as wk
 import wikipedia
+import openai
 from transformers import (
     TokenClassificationPipeline,
     AutoModelForTokenClassification,
     AutoTokenizer,
     BertForQuestionAnswering,
-    BertTokenizer
+    BertTokenizer,
 )
 from transformers.pipelines import AggregationStrategy
 import torch

+
 # =====[ DEFINE PIPELINE ]===== #
 class KeyphraseExtractionPipeline(TokenClassificationPipeline):
     def __init__(self, model, *args, **kwargs):
@@ -20,7 +22,7 @@ class KeyphraseExtractionPipeline(TokenClassificationPipeline):
             model=AutoModelForTokenClassification.from_pretrained(model),
             tokenizer=AutoTokenizer.from_pretrained(model),
             *args,
-            **kwargs
+            **kwargs,
         )

     def postprocess(self, model_outputs):
@@ -30,89 +32,109 @@ class KeyphraseExtractionPipeline(TokenClassificationPipeline):
         )
         return np.unique([result.get("word").strip() for result in results])

+
 # =====[ LOAD PIPELINE ]===== #
 keyPhraseExtractionModel = "ml6team/keyphrase-extraction-kbir-inspec"
 extractor = KeyphraseExtractionPipeline(model=keyPhraseExtractionModel)
-model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
-tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
+model = BertForQuestionAnswering.from_pretrained(
+    "bert-large-uncased-whole-word-masking-finetuned-squad"
+)
+tokenizer = BertTokenizer.from_pretrained(
+    "bert-large-uncased-whole-word-masking-finetuned-squad"
+)
+

 def keyphrases_extraction(text: str) -> str:
     keyphrases = extractor(text)
     return keyphrases

+
 def wikipedia_search(input: str) -> str:
     input = input.replace("\n", " ")
     keyphrases = keyphrases_extraction(input)

-    wiki = wk.Wikipedia('en')
-
-    try :
+    wiki = wk.Wikipedia("en")
+
+    try:
         if len(keyphrases) == 0:
             return "Can you add more details to your question?"
-
+
         query_suggestion = wikipedia.suggest(keyphrases[0])
-        if(query_suggestion != None):
+        if query_suggestion != None:
             results = wikipedia.search(query_suggestion)
         else:
             results = wikipedia.search(keyphrases[0])

         index = 0
         page = wiki.page(results[index])
-        while not ('.' in page.summary) or not page.exists():
+        while not ("." in page.summary) or not page.exists():
             index += 1
             if index == len(results):
                 raise Exception
             page = wiki.page(results[index])
         return page.summary
-
+
     except:
         return "I cannot answer this question"
-
-def answer_question(question):

+
+def answer_question(question):
     context = wikipedia_search(question)
-    if (context == "I cannot answer this question") or (context == "Can you add more details to your question?"):
+    if (context == "I cannot answer this question") or (
+        context == "Can you add more details to your question?"
+    ):
         return context

-    # ======== Tokenize ========
+    # Tokenize
     # Apply the tokenizer to the input text, treating them as a text-pair.
-
     input_ids = tokenizer.encode(question, context)
-    question_ids = input_ids[:input_ids.index(tokenizer.sep_token_id)+1]
+    question_ids = input_ids[: input_ids.index(tokenizer.sep_token_id) + 1]

     # Report how long the input sequence is. if longer than 512 tokens divide it multiple sequences
     length_of_group = 512 - len(question_ids)
-    input_ids_without_question = input_ids[input_ids.index(tokenizer.sep_token_id)+1:]
-    print(f"Query has {len(input_ids)} tokens, divided in {len(input_ids_without_question)//length_of_group + 1}.\n")
+    input_ids_without_question = input_ids[
+        input_ids.index(tokenizer.sep_token_id) + 1 :
+    ]
+    print(
+        f"Query has {len(input_ids)} tokens, divided in {len(input_ids_without_question)//length_of_group + 1}.\n"
+    )

     input_ids_split = []
-    for group in range(len(input_ids_without_question)//length_of_group + 1):
-        input_ids_split.append(question_ids + input_ids_without_question[length_of_group*group:length_of_group*(group+1)-1])
-    input_ids_split.append(question_ids + input_ids_without_question[length_of_group*(len(input_ids_without_question)//length_of_group + 1):len(input_ids_without_question)-1])
-
+    for group in range(len(input_ids_without_question) // length_of_group + 1):
+        input_ids_split.append(
+            question_ids
+            + input_ids_without_question[
+                length_of_group * group : length_of_group * (group + 1) - 1
+            ]
+        )
+    input_ids_split.append(
+        question_ids
+        + input_ids_without_question[
+            length_of_group
+            * (len(input_ids_without_question) // length_of_group + 1) : len(
+                input_ids_without_question
+            )
+            - 1
+        ]
+    )
+
     scores = []
     for input in input_ids_split:
-        # ======== Set Segment IDs ========
-        # Search the input_ids for the first instance of the `[SEP]` token.
+        # set Segment IDs
+        # Search the input_ids for the first instance of the `[SEP]` token.
         sep_index = input.index(tokenizer.sep_token_id)
-
-        # The number of segment A tokens includes the [SEP] token istelf.
         num_seg_a = sep_index + 1
-
-        # The remainder are segment B.
-        num_seg_b = len(input) - num_seg_a
-
-        # Construct the list of 0s and 1s.
-        segment_ids = [0]*num_seg_a + [1]*num_seg_b
-
-        # There should be a segment_id for every input token.
+        segment_ids = [0] * num_seg_a + [1] * (len(input) - num_seg_a)
         assert len(segment_ids) == len(input)

-        # ======== Evaluate ========
-        # Run our example through the model.
-        outputs = model(torch.tensor([input]), # The tokens representing our input text.
-                        token_type_ids=torch.tensor([segment_ids]), # The segment IDs to differentiate question from answer_text
-                        return_dict=True)
+        # evaulate the model
+        outputs = model(
+            torch.tensor([input]),  # The tokens representing our input text.
+            token_type_ids=torch.tensor(
+                [segment_ids]
+            ),  # The segment IDs to differentiate question from answer_text
+            return_dict=True,
+        )

         start_scores = outputs.start_logits
         end_scores = outputs.end_logits
@@ -123,53 +145,32 @@ def answer_question(question):
         print(max_start_score)
         print(max_end_score)

-        # ======== Reconstruct Answer ========
-        # Find the tokens with the highest `start` and `end` scores.
-
-        answer_start = torch.argmax(start_scores)
-        answer_end = torch.argmax(end_scores)
-
-
-        # Get the string versions of the input tokens.
         tokens = tokenizer.convert_ids_to_tokens(input_ids)
+        answer = tokens[torch.argmax(start_scores)]

-        # Start with the first token.
-        answer = tokens[answer_start]
-
-        # Select the remaining answer tokens and join them with whitespace.
-        for i in range(answer_start + 1, answer_end + 1):
-
-            # If it's a subword token, then recombine it with the previous token.
-            if tokens[i][0:2] == '##':
+        # reconstruct answer from the tokens
+        for i in range(torch.argmax(start_scores) + 1, torch.argmax(end_scores) + 1):
+            if tokens[i][0:2] == "##":
                 answer += tokens[i][2:]
-
-            # Otherwise, add a space then the token.
             else:
-                answer += ' ' + tokens[i]
-
+                answer += " " + tokens[i]
         scores.append((max_start_score, max_end_score, answer))

     # Compare scores for answers found and each paragraph and pick the most relevant.
+    return max(scores, key=lambda x: x[0] + x[1])[2]

-    final_answer = max(scores, key=lambda x: x[0] + x[1])[2]
-
-    return final_answer

 # =====[ DEFINE INTERFACE ]===== #'
 title = "Azza Knowledge Agent"
-examples = [
-    ["Where is the Eiffel Tower?"],
-    ["What is the population of France?"]
-]
+examples = [["Where is the Eiffel Tower?"], ["What is the population of France?"]]
 demo = gr.Interface(
-    title = title,
-
+    title=title,
     fn=answer_question,
-    inputs = "text",
-    outputs = "text",
+    inputs="text",
+    outputs="text",
     examples=examples,
     allow_flagging="never",
-    )
+)

 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
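
Addendum for readers following the refactored answer_question: the sketch below condenses the flow the new app.py implements, namely encoding the question and retrieved Wikipedia context as a text pair, splitting the context into windows that fit BERT's 512-token limit, building segment IDs, scoring each window with start/end logits, and stitching WordPiece subwords back into an answer span. It is a minimal illustration under stated assumptions, not the committed code: the answer helper, the window arithmetic, and the torch.no_grad() wrapper are choices made for this example, and the diff's printed diagnostics and exact slice bounds are deliberately simplified.

# Minimal sketch (not the committed code): extractive QA with BERT over a
# context that may exceed the model's 512-token window.
import torch
from transformers import BertForQuestionAnswering, BertTokenizer

MODEL_NAME = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForQuestionAnswering.from_pretrained(MODEL_NAME)

def answer(question: str, context: str) -> str:
    ids = tokenizer.encode(question, context)
    sep = ids.index(tokenizer.sep_token_id)
    question_ids = ids[: sep + 1]        # [CLS] question [SEP]
    context_ids = ids[sep + 1 :]         # context tokens ... [SEP]
    window = 512 - len(question_ids)     # room left for context in each chunk

    candidates = []
    for start in range(0, len(context_ids), window):
        chunk = question_ids + context_ids[start : start + window]
        # Segment 0 covers the question (incl. its [SEP]), segment 1 the context.
        segment_ids = [0] * len(question_ids) + [1] * (len(chunk) - len(question_ids))
        with torch.no_grad():
            out = model(
                torch.tensor([chunk]),
                token_type_ids=torch.tensor([segment_ids]),
                return_dict=True,
            )
        s = int(out.start_logits[0].argmax())
        e = int(out.end_logits[0].argmax())
        # Rebuild the span, gluing "##" WordPiece continuations to the previous token.
        span = ""
        for tok in tokenizer.convert_ids_to_tokens(chunk)[s : e + 1]:
            span += tok[2:] if tok.startswith("##") else " " + tok
        score = float(out.start_logits[0, s] + out.end_logits[0, e])
        candidates.append((score, span.strip()))

    # Keep the best-scoring span across all windows.
    return max(candidates)[1]

Feeding this the output of wikipedia_search from the diff would approximate the end-to-end behaviour of answer_question, minus the logging.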