Jingxiang Mo committed on
Commit 77e7345 · 1 Parent(s): 4071dd4

Lint and code optimization

Files changed (3)
  1. .vscode/settings.json +5 -0
  2. __pycache__/app.cpython-39.pyc +0 -0
  3. app.py +74 -73
.vscode/settings.json ADDED
@@ -0,0 +1,5 @@
+{
+    "python.linting.pylintEnabled": true,
+    "python.linting.enabled": true,
+    "python.formatting.provider": "yapf"
+}
__pycache__/app.cpython-39.pyc CHANGED
Binary files a/__pycache__/app.cpython-39.pyc and b/__pycache__/app.cpython-39.pyc differ
 
app.py CHANGED
@@ -3,16 +3,18 @@ import gradio as gr
 import numpy as np
 import wikipediaapi as wk
 import wikipedia
+import openai
 from transformers import (
     TokenClassificationPipeline,
     AutoModelForTokenClassification,
     AutoTokenizer,
     BertForQuestionAnswering,
-    BertTokenizer
+    BertTokenizer,
 )
 from transformers.pipelines import AggregationStrategy
 import torch

+
 # =====[ DEFINE PIPELINE ]===== #
 class KeyphraseExtractionPipeline(TokenClassificationPipeline):
     def __init__(self, model, *args, **kwargs):
@@ -20,7 +22,7 @@ class KeyphraseExtractionPipeline(TokenClassificationPipeline):
             model=AutoModelForTokenClassification.from_pretrained(model),
             tokenizer=AutoTokenizer.from_pretrained(model),
             *args,
-            **kwargs
+            **kwargs,
         )

     def postprocess(self, model_outputs):
@@ -30,89 +32,109 @@ class KeyphraseExtractionPipeline(TokenClassificationPipeline):
         )
         return np.unique([result.get("word").strip() for result in results])

+
 # =====[ LOAD PIPELINE ]===== #
 keyPhraseExtractionModel = "ml6team/keyphrase-extraction-kbir-inspec"
 extractor = KeyphraseExtractionPipeline(model=keyPhraseExtractionModel)
-model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
-tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
+model = BertForQuestionAnswering.from_pretrained(
+    "bert-large-uncased-whole-word-masking-finetuned-squad"
+)
+tokenizer = BertTokenizer.from_pretrained(
+    "bert-large-uncased-whole-word-masking-finetuned-squad"
+)
+

 def keyphrases_extraction(text: str) -> str:
     keyphrases = extractor(text)
     return keyphrases

+
 def wikipedia_search(input: str) -> str:
     input = input.replace("\n", " ")
     keyphrases = keyphrases_extraction(input)

-    wiki = wk.Wikipedia('en')
-
-    try :
+    wiki = wk.Wikipedia("en")
+
+    try:
         if len(keyphrases) == 0:
             return "Can you add more details to your question?"
-
+
         query_suggestion = wikipedia.suggest(keyphrases[0])
-        if(query_suggestion != None):
+        if query_suggestion != None:
             results = wikipedia.search(query_suggestion)
         else:
             results = wikipedia.search(keyphrases[0])

         index = 0
         page = wiki.page(results[index])
-        while not ('.' in page.summary) or not page.exists():
+        while not ("." in page.summary) or not page.exists():
             index += 1
             if index == len(results):
                 raise Exception
             page = wiki.page(results[index])
         return page.summary
-
+
     except:
         return "I cannot answer this question"
-
-def answer_question(question):

+
+def answer_question(question):
     context = wikipedia_search(question)
-    if (context == "I cannot answer this question") or (context == "Can you add more details to your question?"):
+    if (context == "I cannot answer this question") or (
+        context == "Can you add more details to your question?"
+    ):
         return context

-    # ======== Tokenize ========
+    # Tokenize
     # Apply the tokenizer to the input text, treating them as a text-pair.
-
     input_ids = tokenizer.encode(question, context)
-    question_ids = input_ids[:input_ids.index(tokenizer.sep_token_id)+1]
+    question_ids = input_ids[: input_ids.index(tokenizer.sep_token_id) + 1]

     # Report how long the input sequence is. if longer than 512 tokens divide it multiple sequences
     length_of_group = 512 - len(question_ids)
-    input_ids_without_question = input_ids[input_ids.index(tokenizer.sep_token_id)+1:]
-    print(f"Query has {len(input_ids)} tokens, divided in {len(input_ids_without_question)//length_of_group + 1}.\n")
+    input_ids_without_question = input_ids[
+        input_ids.index(tokenizer.sep_token_id) + 1 :
+    ]
+    print(
+        f"Query has {len(input_ids)} tokens, divided in {len(input_ids_without_question)//length_of_group + 1}.\n"
+    )

     input_ids_split = []
-    for group in range(len(input_ids_without_question)//length_of_group + 1):
-        input_ids_split.append(question_ids + input_ids_without_question[length_of_group*group:length_of_group*(group+1)-1])
-    input_ids_split.append(question_ids + input_ids_without_question[length_of_group*(len(input_ids_without_question)//length_of_group + 1):len(input_ids_without_question)-1])
-
+    for group in range(len(input_ids_without_question) // length_of_group + 1):
+        input_ids_split.append(
+            question_ids
+            + input_ids_without_question[
+                length_of_group * group : length_of_group * (group + 1) - 1
+            ]
+        )
+    input_ids_split.append(
+        question_ids
+        + input_ids_without_question[
+            length_of_group
+            * (len(input_ids_without_question) // length_of_group + 1) : len(
+                input_ids_without_question
+            )
+            - 1
+        ]
+    )
+
     scores = []
     for input in input_ids_split:
-        # ======== Set Segment IDs ========
-        # Search the input_ids for the first instance of the `[SEP]` token.
+        # set Segment IDs
+        # Search the input_ids for the first instance of the `[SEP]` token.
         sep_index = input.index(tokenizer.sep_token_id)
-
-        # The number of segment A tokens includes the [SEP] token istelf.
         num_seg_a = sep_index + 1
-
-        # The remainder are segment B.
-        num_seg_b = len(input) - num_seg_a
-
-        # Construct the list of 0s and 1s.
-        segment_ids = [0]*num_seg_a + [1]*num_seg_b
-
-        # There should be a segment_id for every input token.
+        segment_ids = [0] * num_seg_a + [1] * (len(input) - num_seg_a)
         assert len(segment_ids) == len(input)

-        # ======== Evaluate ========
-        # Run our example through the model.
-        outputs = model(torch.tensor([input]), # The tokens representing our input text.
-                        token_type_ids=torch.tensor([segment_ids]), # The segment IDs to differentiate question from answer_text
-                        return_dict=True)
+        # evaulate the model
+        outputs = model(
+            torch.tensor([input]),  # The tokens representing our input text.
+            token_type_ids=torch.tensor(
+                [segment_ids]
+            ),  # The segment IDs to differentiate question from answer_text
+            return_dict=True,
+        )

         start_scores = outputs.start_logits
         end_scores = outputs.end_logits
@@ -123,53 +145,32 @@ def answer_question(question):
         print(max_start_score)
         print(max_end_score)

-        # ======== Reconstruct Answer ========
-        # Find the tokens with the highest `start` and `end` scores.
-
-        answer_start = torch.argmax(start_scores)
-        answer_end = torch.argmax(end_scores)
-
-
-        # Get the string versions of the input tokens.
         tokens = tokenizer.convert_ids_to_tokens(input_ids)
+        answer = tokens[torch.argmax(start_scores)]

-        # Start with the first token.
-        answer = tokens[answer_start]
-
-        # Select the remaining answer tokens and join them with whitespace.
-        for i in range(answer_start + 1, answer_end + 1):
-
-            # If it's a subword token, then recombine it with the previous token.
-            if tokens[i][0:2] == '##':
+        # reconstruct answer from the tokens
+        for i in range(torch.argmax(start_scores) + 1, torch.argmax(end_scores) + 1):
+            if tokens[i][0:2] == "##":
                 answer += tokens[i][2:]
-
-            # Otherwise, add a space then the token.
             else:
-                answer += ' ' + tokens[i]
-
+                answer += " " + tokens[i]
         scores.append((max_start_score, max_end_score, answer))

     # Compare scores for answers found and each paragraph and pick the most relevant.
+    return max(scores, key=lambda x: x[0] + x[1])[2]

-    final_answer = max(scores, key=lambda x: x[0] + x[1])[2]
-
-    return final_answer

 # =====[ DEFINE INTERFACE ]===== #'
 title = "Azza Knowledge Agent"
-examples = [
-    ["Where is the Eiffel Tower?"],
-    ["What is the population of France?"]
-]
+examples = [["Where is the Eiffel Tower?"], ["What is the population of France?"]]
 demo = gr.Interface(
-    title = title,
-
+    title=title,
     fn=answer_question,
-    inputs = "text",
-    outputs = "text",
+    inputs="text",
+    outputs="text",
     examples=examples,
     allow_flagging="never",
-    )
+)

 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
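
Addendum for readers following the refactored answer_question: the sketch below condenses the flow the new app.py implements, namely encoding the question and retrieved Wikipedia context as a text pair, splitting the context into windows that fit BERT's 512-token limit, building segment IDs, scoring each window with start/end logits, and stitching WordPiece subwords back into an answer span. It is a minimal illustration under stated assumptions, not the committed code: the answer helper, the window arithmetic, and the torch.no_grad() wrapper are choices made for this example, and the diff's printed diagnostics and exact slice bounds are deliberately simplified.

# Minimal sketch (not the committed code): extractive QA with BERT over a
# context that may exceed the model's 512-token window.
import torch
from transformers import BertForQuestionAnswering, BertTokenizer

MODEL_NAME = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForQuestionAnswering.from_pretrained(MODEL_NAME)

def answer(question: str, context: str) -> str:
    ids = tokenizer.encode(question, context)
    sep = ids.index(tokenizer.sep_token_id)
    question_ids = ids[: sep + 1]        # [CLS] question [SEP]
    context_ids = ids[sep + 1 :]         # context tokens ... [SEP]
    window = 512 - len(question_ids)     # room left for context in each chunk

    candidates = []
    for start in range(0, len(context_ids), window):
        chunk = question_ids + context_ids[start : start + window]
        # Segment 0 covers the question (incl. its [SEP]), segment 1 the context.
        segment_ids = [0] * len(question_ids) + [1] * (len(chunk) - len(question_ids))
        with torch.no_grad():
            out = model(
                torch.tensor([chunk]),
                token_type_ids=torch.tensor([segment_ids]),
                return_dict=True,
            )
        s = int(out.start_logits[0].argmax())
        e = int(out.end_logits[0].argmax())
        # Rebuild the span, gluing "##" WordPiece continuations to the previous token.
        span = ""
        for tok in tokenizer.convert_ids_to_tokens(chunk)[s : e + 1]:
            span += tok[2:] if tok.startswith("##") else " " + tok
        score = float(out.start_logits[0, s] + out.end_logits[0, e])
        candidates.append((score, span.strip()))

    # Keep the best-scoring span across all windows.
    return max(candidates)[1]

Feeding this the output of wikipedia_search from the diff would approximate the end-to-end behaviour of answer_question, minus the logging.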