File size: 6,121 Bytes
c1811af
9831428
41cb046
a381bc0
80e614a
77e7345
9831428
 
 
 
064fc00
77e7345
9831428
 
064fc00
d4e5967
3d68848
77e7345
9522bb7
9831428
 
 
 
 
 
77e7345
9831428
 
 
 
 
 
 
 
9522bb7
77e7345
d4e5967
 
 
9522bb7
41cb046
 
77e7345
 
 
 
 
 
 
9831428
d4e5967
 
a381bc0
d4e5967
 
 
 
 
 
77e7345
a381bc0
d4e5967
80e614a
77e7345
 
 
80e614a
 
77e7345
80e614a
77e7345
80e614a
 
 
22eefa0
80e614a
 
77e7345
80e614a
 
22eefa0
80e614a
 
77e7345
4970856
 
41cb046
77e7345
d4e5967
 
 
 
 
 
 
 
 
 
41cb046
77e7345
 
 
41cb046
 
d4e5967
41cb046
77e7345
41cb046
288a5de
77eba15
77e7345
 
 
 
 
 
41cb046
288a5de
77e7345
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288a5de
 
77e7345
 
288a5de
 
77e7345
288a5de
41cb046
77e7345
 
 
 
 
 
 
 
41cb046
288a5de
 
 
 
 
 
 
 
 
77e7345
288a5de
77e7345
41cb046
77e7345
 
288a5de
 
77e7345
288a5de
 
 
d4e5967
 
 
 
 
 
 
 
41cb046
77eba15
9522bb7
4071dd4
77e7345
4970856
77e7345
41cb046
77e7345
 
e537f35
4071dd4
77e7345
c1811af
4970856
77e7345
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import os
import gradio as gr
import numpy as np
import wikipediaapi as wk
import wikipedia
import openai
from transformers import (
    TokenClassificationPipeline,
    AutoModelForTokenClassification,
    AutoTokenizer,
    BertForQuestionAnswering,
    BertTokenizer,
)
from transformers.pipelines import AggregationStrategy
import torch
from dotenv import load_dotenv


# =====[ DEFINE PIPELINE ]===== #
class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    def __init__(self, model, *args, **kwargs):
        super().__init__(
            model=AutoModelForTokenClassification.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model),
            *args,
            **kwargs,
        )

    def postprocess(self, model_outputs):
        results = super().postprocess(
            model_outputs=model_outputs,
            aggregation_strategy=AggregationStrategy.SIMPLE,
        )
        return np.unique([result.get("word").strip() for result in results])


load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# =====[ LOAD PIPELINE ]===== #
keyPhraseExtractionModel = "ml6team/keyphrase-extraction-kbir-inspec"
extractor = KeyphraseExtractionPipeline(model=keyPhraseExtractionModel)
model = BertForQuestionAnswering.from_pretrained(
    "bert-large-uncased-whole-word-masking-finetuned-squad"
)
tokenizer = BertTokenizer.from_pretrained(
    "bert-large-uncased-whole-word-masking-finetuned-squad"
)


def wikipedia_search(input: str) -> str:
    """Perform a Wikipedia search using keyphrases.

    Args:
        input (str): The input text.

    Returns:
        str: The summary of the Wikipedia page.
    """

    input = input.replace("\n", " ")
    keyphrases = extractor(input)

    wiki = wk.Wikipedia("en")

    try:
        if len(keyphrases) == 0:
            return "Can you add more details to your question?"

        query_suggestion = wikipedia.suggest(keyphrases[0])
        if query_suggestion != None:
            results = wikipedia.search(query_suggestion)
        else:
            results = wikipedia.search(keyphrases[0])

        index = 0
        page = wiki.page(results[index])
        while not ("." in page.summary) or not page.exists():
            index += 1
            if index == len(results):
                raise Exception
            page = wiki.page(results[index])
        return page.summary

    except:
        return "I cannot answer this question"


def answer_question(question: str) -> str:
    """Answer the question using the context from the Wikipedia search.

    Args:
        question (str): The input question.

    Returns:
        str: The answer to the question.
    """

    context = wikipedia_search(question)
    if (context == "I cannot answer this question") or (
        context == "Can you add more details to your question?"
    ):
        return context

    # Tokenize and split input
    input_ids = tokenizer.encode(question, context)
    question_ids = input_ids[: input_ids.index(tokenizer.sep_token_id) + 1]

    # Report how long the input sequence is. if longer than 512 tokens divide it multiple sequences
    length_of_group = 512 - len(question_ids)
    input_ids_without_question = input_ids[
        input_ids.index(tokenizer.sep_token_id) + 1 :
    ]
    print(
        f"Query has {len(input_ids)} tokens, divided in {len(input_ids_without_question)//length_of_group + 1}.\n"
    )

    input_ids_split = []
    for group in range(len(input_ids_without_question) // length_of_group + 1):
        input_ids_split.append(
            question_ids
            + input_ids_without_question[
                length_of_group * group : length_of_group * (group + 1) - 1
            ]
        )
    input_ids_split.append(
        question_ids
        + input_ids_without_question[
            length_of_group
            * (len(input_ids_without_question) // length_of_group + 1) : len(
                input_ids_without_question
            )
            - 1
        ]
    )

    scores = []
    for input in input_ids_split:
        # set Segment IDs
        # Search the input_ids for the first instance of the `[SEP]` token.
        sep_index = input.index(tokenizer.sep_token_id)
        num_seg_a = sep_index + 1
        segment_ids = [0] * num_seg_a + [1] * (len(input) - num_seg_a)
        assert len(segment_ids) == len(input)

        # evaulate the model
        outputs = model(
            torch.tensor([input]),  # The tokens representing our input text.
            token_type_ids=torch.tensor(
                [segment_ids]
            ),  # The segment IDs to differentiate question from answer_text
            return_dict=True,
        )

        start_scores = outputs.start_logits
        end_scores = outputs.end_logits

        max_start_score = torch.max(start_scores)
        max_end_score = torch.max(end_scores)

        print(max_start_score)
        print(max_end_score)

        #  reconstruct answer from the tokens
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        answer = tokens[torch.argmax(start_scores)]

        for i in range(torch.argmax(start_scores) + 1, torch.argmax(end_scores) + 1):
            if tokens[i][0:2] == "##":
                answer += tokens[i][2:]
            else:
                answer += " " + tokens[i]
        scores.append((max_start_score, max_end_score, answer))

    # Compare scores for answers found and each paragraph and pick the most relevant.
    answer = max(scores, key=lambda x: x[0] + x[1])[2]

    response = openai.Completion.create(
        model="text-davinci-003",
        prompt="Answer the question " + question + "using this answer: " + answer,
        max_tokens=3000,
    )
    return response.choices[0].text.replace("\n\n", " ")


# =====[ DEFINE INTERFACE ]===== #'
title = "Azza Knowledge Agent"
examples = [["Where is the Eiffel Tower?"], ["What is the population of France?"]]
demo = gr.Interface(
    title=title,
    fn=answer_question,
    inputs="text",
    outputs="text",
    examples=examples,
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch()