Spaces: Runtime error

Jingxiang Mo committed · Commit 77e7345 · 1 Parent(s): 4071dd4

Lint and code optimization

Browse files
- .vscode/settings.json            +5 -0
- __pycache__/app.cpython-39.pyc   +0 -0
- app.py                           +74 -73
.vscode/settings.json  ADDED
@@ -0,0 +1,5 @@
+{
+    "python.linting.pylintEnabled": true,
+    "python.linting.enabled": true,
+    "python.formatting.provider": "yapf"
+}
__pycache__/app.cpython-39.pyc  CHANGED
Binary files a/__pycache__/app.cpython-39.pyc and b/__pycache__/app.cpython-39.pyc differ
app.py  CHANGED
@@ -3,16 +3,18 @@ import gradio as gr
 import numpy as np
 import wikipediaapi as wk
 import wikipedia
+import openai
 from transformers import (
     TokenClassificationPipeline,
     AutoModelForTokenClassification,
     AutoTokenizer,
     BertForQuestionAnswering,
-    BertTokenizer
+    BertTokenizer,
 )
 from transformers.pipelines import AggregationStrategy
 import torch

+
 # =====[ DEFINE PIPELINE ]===== #
 class KeyphraseExtractionPipeline(TokenClassificationPipeline):
     def __init__(self, model, *args, **kwargs):
@@ -20,7 +22,7 @@ class KeyphraseExtractionPipeline(TokenClassificationPipeline):
             model=AutoModelForTokenClassification.from_pretrained(model),
             tokenizer=AutoTokenizer.from_pretrained(model),
             *args,
-            **kwargs
+            **kwargs,
         )

     def postprocess(self, model_outputs):
@@ -30,89 +32,109 @@ class KeyphraseExtractionPipeline(TokenClassificationPipeline):
         )
         return np.unique([result.get("word").strip() for result in results])

+
 # =====[ LOAD PIPELINE ]===== #
 keyPhraseExtractionModel = "ml6team/keyphrase-extraction-kbir-inspec"
 extractor = KeyphraseExtractionPipeline(model=keyPhraseExtractionModel)
+model = BertForQuestionAnswering.from_pretrained(
+    "bert-large-uncased-whole-word-masking-finetuned-squad"
+)
+tokenizer = BertTokenizer.from_pretrained(
+    "bert-large-uncased-whole-word-masking-finetuned-squad"
+)
+

 def keyphrases_extraction(text: str) -> str:
     keyphrases = extractor(text)
     return keyphrases

+
 def wikipedia_search(input: str) -> str:
     input = input.replace("\n", " ")
     keyphrases = keyphrases_extraction(input)

+    wiki = wk.Wikipedia("en")
+
+    try:
         if len(keyphrases) == 0:
             return "Can you add more details to your question?"
+
         query_suggestion = wikipedia.suggest(keyphrases[0])
+        if query_suggestion != None:
             results = wikipedia.search(query_suggestion)
         else:
             results = wikipedia.search(keyphrases[0])

         index = 0
         page = wiki.page(results[index])
+        while not ("." in page.summary) or not page.exists():
             index += 1
             if index == len(results):
                 raise Exception
             page = wiki.page(results[index])
         return page.summary
+
     except:
         return "I cannot answer this question"

+
+def answer_question(question):
     context = wikipedia_search(question)
+    if (context == "I cannot answer this question") or (
+        context == "Can you add more details to your question?"
+    ):
         return context

+    # Tokenize
     # Apply the tokenizer to the input text, treating them as a text-pair.
     input_ids = tokenizer.encode(question, context)
+    question_ids = input_ids[: input_ids.index(tokenizer.sep_token_id) + 1]

     # Report how long the input sequence is. if longer than 512 tokens divide it multiple sequences
     length_of_group = 512 - len(question_ids)
+    input_ids_without_question = input_ids[
+        input_ids.index(tokenizer.sep_token_id) + 1 :
+    ]
+    print(
+        f"Query has {len(input_ids)} tokens, divided in {len(input_ids_without_question)//length_of_group + 1}.\n"
+    )

     input_ids_split = []
+    for group in range(len(input_ids_without_question) // length_of_group + 1):
+        input_ids_split.append(
+            question_ids
+            + input_ids_without_question[
+                length_of_group * group : length_of_group * (group + 1) - 1
+            ]
+        )
+    input_ids_split.append(
+        question_ids
+        + input_ids_without_question[
+            length_of_group
+            * (len(input_ids_without_question) // length_of_group + 1) : len(
+                input_ids_without_question
+            )
+            - 1
+        ]
+    )
+
     scores = []
     for input in input_ids_split:
+        # set Segment IDs
+        # Search the input_ids for the first instance of the `[SEP]` token.
         sep_index = input.index(tokenizer.sep_token_id)
         num_seg_a = sep_index + 1
+        segment_ids = [0] * num_seg_a + [1] * (len(input) - num_seg_a)
         assert len(segment_ids) == len(input)

+        # evaulate the model
+        outputs = model(
+            torch.tensor([input]),  # The tokens representing our input text.
+            token_type_ids=torch.tensor(
+                [segment_ids]
+            ),  # The segment IDs to differentiate question from answer_text
+            return_dict=True,
+        )

         start_scores = outputs.start_logits
         end_scores = outputs.end_logits
@@ -123,53 +145,32 @@ def answer_question(question):
         print(max_start_score)
         print(max_end_score)

+        # reconstruct answer from the tokens
         tokens = tokenizer.convert_ids_to_tokens(input_ids)
+        answer = tokens[torch.argmax(start_scores)]

+        for i in range(torch.argmax(start_scores) + 1, torch.argmax(end_scores) + 1):
+            if tokens[i][0:2] == "##":
                 answer += tokens[i][2:]
             else:
+                answer += " " + tokens[i]
         scores.append((max_start_score, max_end_score, answer))

     # Compare scores for answers found and each paragraph and pick the most relevant.
+    return max(scores, key=lambda x: x[0] + x[1])[2]


 # =====[ DEFINE INTERFACE ]===== #'
 title = "Azza Knowledge Agent"
+examples = [["Where is the Eiffel Tower?"], ["What is the population of France?"]]
 demo = gr.Interface(
+    title=title,
     fn=answer_question,
+    inputs="text",
+    outputs="text",
     examples=examples,
     allow_flagging="never",
+)

 if __name__ == "__main__":
+    demo.launch()
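
The least obvious part of the refactored answer_question is how it respects BERT's 512-token limit: the question tokens (up to and including the first [SEP]) are kept as a prefix, and the remaining context tokens are sliced into groups sized to the leftover budget. The snippet below is a minimal, self-contained sketch of that idea, not code from the commit; split_into_chunks is a hypothetical helper name and the token ids are dummies.

# Minimal sketch (not from the commit): split encoded (question, context) ids so
# each chunk repeats the question prefix and stays within BERT's 512-token limit.
# `split_into_chunks` is a hypothetical helper used only for illustration.
def split_into_chunks(question_ids, context_ids, max_len=512):
    budget = max_len - len(question_ids)  # room left for context tokens per chunk
    chunks = []
    for start in range(0, len(context_ids), budget):
        # Each chunk = question prefix + the next slice of context tokens.
        chunks.append(question_ids + context_ids[start:start + budget])
    return chunks

# Dummy ids: 101/102 stand in for [CLS]/[SEP]; 1200 fake context token ids.
question_ids = [101, 7592, 2003, 102]
context_ids = list(range(1000, 2200))
chunks = split_into_chunks(question_ids, context_ids)
assert all(len(c) <= 512 for c in chunks)
print([len(c) for c in chunks])  # -> [512, 512, 188]

The committed version additionally trims one token per group and appends a final tail slice; the sketch only shows the core prefix-plus-slice pattern that keeps every sequence fed to the model at or under 512 tokens.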