awacke1 committed on
Commit 6875ee8 · 1 Parent(s): 0123c84

Update app.py

Files changed (1):
  1. app.py +18 -37
app.py CHANGED
@@ -13,8 +13,8 @@ try:
 except:
     spacy.cli.download("en_core_web_sm")
     nlp = spacy.load("en_core_web_sm")
-
 wh_words = ['what', 'who', 'how', 'when', 'which']
+
 def get_concepts(text):
     text = text.lower()
     doc = nlp(text)
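For context, the try/except above is the usual spaCy bootstrap: load en_core_web_sm, and download it on first run if the load fails. A minimal standalone sketch (narrowing the bare except to OSError, which is what spacy.load raises for a missing model):

    import spacy

    try:
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        # Model not installed yet: fetch it once, then load.
        spacy.cli.download("en_core_web_sm")
        nlp = spacy.load("en_core_web_sm")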
@@ -38,14 +38,12 @@ def get_passages(text, k=100):
             passage = sen.text
             passage_len = len(sen)
             continue
-
         elif i==(len(sents)-1):
             passage+=" "+sen.text
             passages.append(passage)
             passage = ""
             passage_len = 0
             continue
-
         passage+=" "+sen.text
     return passages
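The blank-line trims above sit inside get_passages, which packs consecutive spaCy sentences into passages of roughly k tokens. The opening of the loop is outside this hunk, so the budget check below is an assumption about the unshown branch; the rest mirrors the fragments visible in the diff:

    def get_passages(text, k=100):
        doc = nlp(text)
        sents = list(doc.sents)
        passages, passage, passage_len = [], "", 0
        for i, sen in enumerate(sents):
            if passage_len + len(sen) > k:
                # Token budget exceeded: close the current passage (assumed branch).
                passages.append(passage)
                passage = sen.text
                passage_len = len(sen)
                continue
            elif i == (len(sents) - 1):
                # Last sentence: flush whatever is buffered.
                passage += " " + sen.text
                passages.append(passage)
                passage = ""
                passage_len = 0
                continue
            passage += " " + sen.text
            passage_len += len(sen)
        return passages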
@@ -58,10 +56,8 @@ def get_dicts_for_dpr(concepts, n_results=20, k=100):
         try:
             html_page = wikipedia.page(title = wiki, auto_suggest = False)
         except DisambiguationError:
-            continue
-
+            continue
         htmlResults=html_page.content
-
         passages = get_passages(htmlResults, k=k)
         for passage in passages:
             i_dicts = {}
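Catching DisambiguationError and continuing lets the crawl skip ambiguous titles (e.g. "Mercury") instead of aborting. The same pattern in isolation, using the wikipedia package's documented exception:

    import wikipedia
    from wikipedia.exceptions import DisambiguationError

    def fetch_contents(titles):
        contents = {}
        for title in titles:
            try:
                page = wikipedia.page(title=title, auto_suggest=False)
            except DisambiguationError:
                continue  # ambiguous title: skip it and move on
            contents[title] = page.content
        return contents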
@@ -90,11 +86,13 @@ def extracted_passage_embeddings(processed_passages, max_length=156):
         max_length=max_length,
         return_token_type_ids=True
     )
-    passage_embeddings = passage_encoder.predict([np.array(passage_inputs['input_ids']),
-                                                  np.array(passage_inputs['attention_mask']),
-                                                  np.array(passage_inputs['token_type_ids'])],
-                                                 batch_size=64,
-                                                 verbose=1)
+    passage_embeddings = passage_encoder.predict(
+        [np.array(passage_inputs['input_ids']),
+         np.array(passage_inputs['attention_mask']),
+         np.array(passage_inputs['token_type_ids'])],
+        batch_size=64,
+        verbose=1
+    )
     return passage_embeddings
 
 def extracted_query_embeddings(queries, max_length=64):
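Both predict calls reshaped in this commit feed input_ids, attention_mask and token_type_ids from a Hugging Face tokenizer into a Keras-style DPR encoder. The app's passage_encoder is not defined in this diff, so here is a sketch with the stock transformers DPR context encoder instead (the checkpoint name is an assumption, and note these standard checkpoints emit 768-dim vectors while the FAISS index below is built for 128):

    import numpy as np
    from transformers import DPRContextEncoderTokenizer, TFDPRContextEncoder

    tokenizer = DPRContextEncoderTokenizer.from_pretrained(
        "facebook/dpr-ctx_encoder-single-nq-base")
    encoder = TFDPRContextEncoder.from_pretrained(
        "facebook/dpr-ctx_encoder-single-nq-base")

    def embed_passages(passages, max_length=156):
        inputs = tokenizer(passages, padding=True, truncation=True,
                           max_length=max_length, return_tensors="tf")
        # pooler_output holds one dense vector per passage.
        return encoder(inputs).pooler_output.numpy()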
@@ -106,70 +104,56 @@ def extracted_query_embeddings(queries, max_length=64):
         max_length=max_length,
         return_token_type_ids=True
     )
-    query_embeddings = query_encoder.predict([np.array(query_inputs['input_ids']),
-                                              np.array(query_inputs['attention_mask']),
-                                              np.array(query_inputs['token_type_ids'])],
-                                             batch_size=1,
-                                             verbose=1)
+    query_embeddings = query_encoder.predict(
+        [np.array(query_inputs['input_ids']),
+         np.array(query_inputs['attention_mask']),
+         np.array(query_inputs['token_type_ids'])],
+        batch_size=1,
+        verbose=1
+    )
     return query_embeddings
 
-#Wikipedia API:
+# Wikipedia API:
 
 def get_pagetext(page):
     s=str(page).replace("/t","")
-
     return s
 
 def get_wiki_summary(search):
     wiki_wiki = wikipediaapi.Wikipedia('en')
     page = wiki_wiki.page(search)
-
     isExist = page.exists()
     if not isExist:
         return isExist, "Not found", "Not found", "Not found", "Not found"
-
     pageurl = page.fullurl
     pagetitle = page.title
     pagesummary = page.summary[0:60]
     pagetext = get_pagetext(page.text)
-
     backlinks = page.backlinks
     linklist = ""
     for link in backlinks.items():
         pui = link[0]
         linklist += pui + " , "
         a=1
-
     categories = page.categories
     categorylist = ""
     for category in categories.items():
         pui = category[0]
         categorylist += pui + " , "
         a=1
-
     links = page.links
     linklist2 = ""
     for link in links.items():
         pui = link[0]
         linklist2 += pui + " , "
         a=1
-
     sections = page.sections
-
-
     ex_dic = {
         'Entity' : ["URL","Title","Summary", "Text", "Backlinks", "Links", "Categories"],
        'Value': [pageurl, pagetitle, pagesummary, pagetext, linklist,linklist2, categorylist ]
     }
-
-    #columns = [pageurl,pagetitle]
-    #index = [pagesummary,pagetext]
-    #df = pd.DataFrame(page, columns=columns, index=index)
-    #df = pd.DataFrame(ex_dic, columns=columns, index=index)
     df = pd.DataFrame(ex_dic)
-
     return df
-
 
 def search(question):
     concepts = get_concepts(question)
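get_wiki_summary above flattens page metadata into an Entity/Value table. A condensed equivalent that joins the dict keys directly instead of the manual accumulator loops (the one-argument Wikipedia('en') constructor matches the code here; recent wikipedia-api releases also expect a user_agent):

    import pandas as pd
    import wikipediaapi

    def wiki_table(search):
        page = wikipediaapi.Wikipedia('en').page(search)
        if not page.exists():
            return pd.DataFrame({'Entity': ['Status'], 'Value': ['Not found']})
        join = lambda d: " , ".join(d.keys())  # same " , " separator as the loops above
        return pd.DataFrame({
            'Entity': ["URL", "Title", "Summary", "Text",
                       "Backlinks", "Links", "Categories"],
            'Value': [page.fullurl, page.title, page.summary[:60],
                      str(page.text).replace("/t", ""),
                      join(page.backlinks), join(page.links),
                      join(page.categories)],
        })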
@@ -184,11 +168,9 @@ def search(question):
     query_embeddings = extracted_query_embeddings([question])
     faiss_index = faiss.IndexFlatL2(128)
     faiss_index.add(passage_embeddings.pooler_output)
-    # prob, index = faiss_index.search(query_embeddings.pooler_output, k=1000)
     prob, index = faiss_index.search(query_embeddings.pooler_output, k=lendicts)
     return pd.DataFrame([dicts[i] for i in index[0]])
 
-
 # AI UI SOTA - radio blocks with UI formatting, and event driven UI
 with gr.Blocks() as demo: # Block documentation on event listeners, start here: https://gradio.app/blocks_and_event_listeners/
     gr.Markdown("<h1><center>🍰 Ultimate Wikipedia AI 🎨</center></h1>")
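faiss.IndexFlatL2 is an exact brute-force index: add() stores the passage vectors and search() returns (distances, indices), with the indices mapped back into dicts on the last line of search(). A self-contained toy example with random vectors (sizes are illustrative; 128 matches the dimension used above):

    import faiss
    import numpy as np

    d, n = 128, 1000
    passage_vecs = np.random.rand(n, d).astype("float32")
    query_vec = np.random.rand(1, d).astype("float32")

    index = faiss.IndexFlatL2(d)              # exact L2 search, no training step
    index.add(passage_vecs)                   # store all passage embeddings
    dist, idx = index.search(query_vec, 10)   # 10 nearest passages for the query
    print(idx[0])                             # row numbers into the passage list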
@@ -205,5 +187,4 @@ with gr.Blocks() as demo: # Block documentation on event listeners, start he
     inp.submit(fn=get_wiki_summary, inputs=inp, outputs=out_DF)
     b3.click(fn=search, inputs=inp, outputs=out)
     b4.click(fn=get_wiki_summary, inputs=inp, outputs=out_DF )
-    demo.launch(debug=True, show_error=True)
-
+demo.launch(debug=True, show_error=True)
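The last hunk appears to dedent demo.launch so it runs once the Blocks context has closed. A minimal runnable sketch of the same event wiring (component names here are illustrative, not the app's):

    import gradio as gr

    def echo(question):
        return f"You asked: {question}"

    with gr.Blocks() as demo:
        gr.Markdown("<h1><center>Demo</center></h1>")
        inp = gr.Textbox(label="Question")
        out = gr.Textbox(label="Answer")
        btn = gr.Button("Search")
        btn.click(fn=echo, inputs=inp, outputs=out)   # click event
        inp.submit(fn=echo, inputs=inp, outputs=out)  # Enter-key event

    demo.launch(debug=True, show_error=True)          # launched after the with-block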