Spaces:

awacke1
/

WikipediaUltimateAISearch

Running

App Files Community

awacke1 committed on Feb 19, 2023

Commit

58955da

1 Parent(s): 5340ad4

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -20

app.py CHANGED Viewed

@@ -13,8 +13,8 @@ try:
 except:
   spacy.cli.download("en_core_web_sm")
   nlp = spacy.load("en_core_web_sm")
-wh_words = ['what', 'who', 'how', 'when', 'which']
 def get_concepts(text):
   text = text.lower()
   doc = nlp(text)
@@ -38,12 +38,14 @@ def get_passages(text, k=100):
             passage = sen.text
             passage_len = len(sen)
             continue
         elif i==(len(sents)-1):
             passage+=" "+sen.text
             passages.append(passage)
             passage = ""
             passage_len = 0
             continue
         passage+=" "+sen.text
     return passages
@@ -56,8 +58,10 @@ def get_dicts_for_dpr(concepts, n_results=20, k=100):
         try:
           html_page = wikipedia.page(title = wiki, auto_suggest = False)
         except DisambiguationError:
-          continue
         htmlResults=html_page.content
         passages = get_passages(htmlResults, k=k)
         for passage in passages:
           i_dicts = {}
@@ -86,13 +90,11 @@ def extracted_passage_embeddings(processed_passages, max_length=156):
                     max_length=max_length,
                     return_token_type_ids=True
                 )
-    passage_embeddings = passage_encoder.predict(
-                    [np.array(passage_inputs['input_ids']),
-                    np.array(passage_inputs['attention_mask']),
-                    np.array(passage_inputs['token_type_ids'])],
-                    batch_size=64,
-                    verbose=1
-                )
     return passage_embeddings
 def extracted_query_embeddings(queries, max_length=64):
@@ -104,57 +106,65 @@ def extracted_query_embeddings(queries, max_length=64):
                     max_length=max_length,
                     return_token_type_ids=True
                 )
-    query_embeddings = query_encoder.predict(
-                    [np.array(query_inputs['input_ids']),
-                    np.array(query_inputs['attention_mask']),
-                    np.array(query_inputs['token_type_ids'])],
-                    batch_size=1,
-                    verbose=1
-                )
     return query_embeddings
-# Wikipedia API:
 def get_pagetext(page):
   """Return ``str(page)`` with every occurrence of the literal text "/t" removed."""
   # NOTE(review): "/t" is a forward slash followed by 't', not the tab escape "\t" -
   # confirm which one was intended; as written, tab characters are left untouched.
   s=str(page).replace("/t","")
   return s
 def get_wiki_summary(search):
     """Look up one English Wikipedia page and tabulate its main attributes.

     Args:
         search: value passed to ``wiki_wiki.page(...)`` to select the page.

     Returns:
         If ``page.exists()`` is falsy: a 5-tuple made of that result followed
         by four "Not found" strings.
         Otherwise: the result of ``pd.DataFrame`` over two columns, 'Entity'
         and 'Value', with seven rows (URL, Title, Summary, Text, Backlinks,
         Links, Categories).

     NOTE(review): the two return shapes differ (tuple vs. DataFrame); confirm
     that callers handle both.
     """
     wiki_wiki = wikipediaapi.Wikipedia('en')
     page = wiki_wiki.page(search)
     isExist = page.exists()
     if not isExist:
         return isExist, "Not found", "Not found", "Not found", "Not found"
     pageurl = page.fullurl
     pagetitle = page.title
     # Only the first 60 characters of the summary are kept.
     pagesummary = page.summary[0:60]
     pagetext = get_pagetext(page.text)
     backlinks = page.backlinks
     # Concatenate the first element of each backlinks item, each followed by " ,  "
     # (so a non-empty result ends with a trailing separator).
     linklist = ""
     for link in backlinks.items():
       pui = link[0]
       linklist += pui + " ,  "
       a=1  # assigned but never read in this function
     categories = page.categories
     # Same concatenation scheme for the categories.
     categorylist = ""
     for category in categories.items():
       pui = category[0]
       categorylist += pui + " ,  "
       a=1  # assigned but never read in this function
     links = page.links
     # Same concatenation scheme for the page's links.
     linklist2 = ""
     for link in links.items():
       pui = link[0]
       linklist2 += pui + " ,  "
       a=1  # assigned but never read in this function
     # `sections` is read from the page but not used afterwards in this function.
     sections = page.sections
     # 'Entity' labels and 'Value' entries are paired by position.
     ex_dic = {
       'Entity' : ["URL","Title","Summary", "Text", "Backlinks", "Links", "Categories"],
       'Value': [pageurl, pagetitle, pagesummary, pagetext, linklist,linklist2, categorylist ]
     }
     df = pd.DataFrame(ex_dic)
-    return df
 def search(question):
   concepts = get_concepts(question)
   print("concepts: ",concepts)
@@ -171,12 +181,12 @@ def search(question):
   prob, index = faiss_index.search(query_embeddings.pooler_output, k=lendicts)
   return pd.DataFrame([dicts[i] for i in index[0]])
-# AI UI SOTA - radio blocks with UI formatting, and event driven UI
 with gr.Blocks() as demo:     # Block documentation on event listeners, start here:  https://gradio.app/blocks_and_event_listeners/
   gr.Markdown("<h1><center>🍰 Ultimate Wikipedia AI 🎨</center></h1>")
   gr.Markdown("""<div align="center">Search and Find Anything Then Use in AI!  <a href="https://www.mediawiki.org/wiki/API:Main_page">MediaWiki - API for Wikipedia</a>.  <a href="https://paperswithcode.com/datasets?q=wikipedia&v=lst&o=newest">Papers,Code,Datasets for SOTA w/ Wikipedia</a>""")
   with gr.Row(): # inputs and buttons
-    inp = gr.Textbox(lines=1, default="cognitive architecture", label="Question/Prompt (lower case)")
   with gr.Row(): # inputs and buttons
     b3 = gr.Button("Search AI Summaries")
     b4 = gr.Button("Search Web Live")

 except:
   spacy.cli.download("en_core_web_sm")
   nlp = spacy.load("en_core_web_sm")
+wh_words = ['what', 'who', 'how', 'when', 'which']
 def get_concepts(text):
   text = text.lower()
   doc = nlp(text)
             passage = sen.text
             passage_len = len(sen)
             continue
         elif i==(len(sents)-1):
             passage+=" "+sen.text
             passages.append(passage)
             passage = ""
             passage_len = 0
             continue
         passage+=" "+sen.text
     return passages
         try:
           html_page = wikipedia.page(title = wiki, auto_suggest = False)
         except DisambiguationError:
+          continue
         htmlResults=html_page.content
         passages = get_passages(htmlResults, k=k)
         for passage in passages:
           i_dicts = {}
                     max_length=max_length,
                     return_token_type_ids=True
                 )
+    passage_embeddings = passage_encoder.predict([np.array(passage_inputs['input_ids']),
+                                                np.array(passage_inputs['attention_mask']),
+                                                np.array(passage_inputs['token_type_ids'])],
+                                                batch_size=64,
+                                                verbose=1)
     return passage_embeddings
 def extracted_query_embeddings(queries, max_length=64):
                     max_length=max_length,
                     return_token_type_ids=True
                 )
+    query_embeddings = query_encoder.predict([np.array(query_inputs['input_ids']),
+                                                np.array(query_inputs['attention_mask']),
+                                                np.array(query_inputs['token_type_ids'])],
+                                                batch_size=1,
+                                                verbose=1)
     return query_embeddings
+#Wikipedia API:
 def get_pagetext(page):
   """Return ``str(page)`` with every occurrence of the literal text "/t" removed."""
   # NOTE(review): "/t" is a forward slash followed by 't', not the tab escape "\t" -
   # confirm which one was intended; as written, tab characters are left untouched.
   s=str(page).replace("/t","")
   return s
 def get_wiki_summary(search):
     """Look up one English Wikipedia page and tabulate its main attributes.

     Args:
         search: value passed to ``wiki_wiki.page(...)`` to select the page.

     Returns:
         If ``page.exists()`` is falsy: a 5-tuple made of that result followed
         by four "Not found" strings.
         Otherwise: the result of ``pd.DataFrame`` over two columns, 'Entity'
         and 'Value', with seven rows (URL, Title, Summary, Text, Backlinks,
         Links, Categories).

     NOTE(review): the two return shapes differ (tuple vs. DataFrame); confirm
     that callers handle both.
     """
     wiki_wiki = wikipediaapi.Wikipedia('en')
     page = wiki_wiki.page(search)
     isExist = page.exists()
     if not isExist:
         return isExist, "Not found", "Not found", "Not found", "Not found"
     pageurl = page.fullurl
     pagetitle = page.title
     # Only the first 60 characters of the summary are kept.
     pagesummary = page.summary[0:60]
     pagetext = get_pagetext(page.text)
     backlinks = page.backlinks
     # Concatenate the first element of each backlinks item, each followed by " ,  "
     # (so a non-empty result ends with a trailing separator).
     linklist = ""
     for link in backlinks.items():
       pui = link[0]
       linklist += pui + " ,  "
       a=1  # assigned but never read in this function
     categories = page.categories
     # Same concatenation scheme for the categories.
     categorylist = ""
     for category in categories.items():
       pui = category[0]
       categorylist += pui + " ,  "
       a=1  # assigned but never read in this function
     links = page.links
     # Same concatenation scheme for the page's links.
     linklist2 = ""
     for link in links.items():
       pui = link[0]
       linklist2 += pui + " ,  "
       a=1  # assigned but never read in this function
     # `sections` is read from the page but not used afterwards in this function.
     sections = page.sections
     # 'Entity' labels and 'Value' entries are paired by position.
     ex_dic = {
       'Entity' : ["URL","Title","Summary", "Text", "Backlinks", "Links", "Categories"],
       'Value': [pageurl, pagetitle, pagesummary, pagetext, linklist,linklist2, categorylist ]
     }
     df = pd.DataFrame(ex_dic)
+    return df
 def search(question):
   concepts = get_concepts(question)
   print("concepts: ",concepts)
   prob, index = faiss_index.search(query_embeddings.pooler_output, k=lendicts)
   return pd.DataFrame([dicts[i] for i in index[0]])
+# AI UI SOTA - Gradio blocks with UI formatting, and event driven UI
 with gr.Blocks() as demo:     # Block documentation on event listeners, start here:  https://gradio.app/blocks_and_event_listeners/
   gr.Markdown("<h1><center>🍰 Ultimate Wikipedia AI 🎨</center></h1>")
   gr.Markdown("""<div align="center">Search and Find Anything Then Use in AI!  <a href="https://www.mediawiki.org/wiki/API:Main_page">MediaWiki - API for Wikipedia</a>.  <a href="https://paperswithcode.com/datasets?q=wikipedia&v=lst&o=newest">Papers,Code,Datasets for SOTA w/ Wikipedia</a>""")
   with gr.Row(): # inputs and buttons
+    inp = gr.Textbox(lines=1, default="Syd Mead", label="Question")
   with gr.Row(): # inputs and buttons
     b3 = gr.Button("Search AI Summaries")
     b4 = gr.Button("Search Web Live")