Spaces:

abokbot
/

wikipedia-search-engine

Sleeping

App Files Files Community

abokbot committed on Jun 4, 2023

Commit

1e57a2c

1 Parent(s): c1b147b

Create app.py

Browse files

Files changed (1) hide show

app.py +61 -0

app.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import streamlit as st
+from sentence_transformers import SentenceTransformer, CrossEncoder, util
+import torch
+from huggingface_hub import hf_hub_download
+# Hub repo id of the precomputed embedding; load_model() passes the same id
+# to hf_hub_download. NOTE(review): this name is not referenced anywhere in
+# the visible code -- confirm whether load_model() should use it.
+embedding_path = "abokbot/wikipedia-embedding"
+st.header("Wikipedia Search Engine app")
+# Status line written to the page; the returned element is kept in
+# st_model_load (not used further in the visible code).
+st_model_load = st.text('Loading wikipedia embedding...')
+@st.cache_resource
+def load_model():
+    """Download the precomputed Wikipedia embedding from the Hub and load it.
+
+    Decorated with ``st.cache_resource`` so the download and deserialization
+    are not repeated on every Streamlit rerun.
+
+    Returns:
+        The object deserialized by ``torch.load`` from
+        ``simple_wikipedia_embedding.pt``.
+    """
+    print("Loading embedding...")
+    # hf_hub_download stores the file in the local Hub cache and returns its
+    # path; load from that path instead of a guessed relative location.
+    embedding_file = hf_hub_download(
+        repo_id="abokbot/wikipedia-embedding",
+        filename="simple_wikipedia_embedding.pt",
+    )
+    # Keep torch's default device placement when CUDA is present; otherwise
+    # remap to CPU so tensors saved from a GPU can still be deserialized.
+    # NOTE(review): torch.load unpickles the file -- only load trusted repos.
+    map_location = None if torch.cuda.is_available() else "cpu"
+    wikipedia_embedding = torch.load(embedding_file, map_location=map_location)
+    print("Embedding loaded!")
+    return wikipedia_embedding
+# --- Search pipeline (disabled) ---------------------------------------------
+# Kept as `#` comments rather than inside a bare triple-quoted string:
+# Streamlit "magic" writes any standalone literal at module level to the page,
+# so a string used as a block comment would be rendered in the app.
+# NOTE(review): `corpus_embeddings` and `dataset` are not defined in the
+# visible code, and `.cuda()` assumes a GPU -- resolve both before enabling.
+#
+# # We use the Bi-Encoder to encode all passages, so that we can use it with semantic search
+# # cf https://www.sbert.net/docs/pretrained-models/msmarco-v3.html
+# bi_encoder = SentenceTransformer('msmarco-MiniLM-L-6-v3')
+# bi_encoder.max_seq_length = 256     # Truncate long passages to 256 tokens
+# top_k = 32                          # Number of passages we want to retrieve with the bi-encoder
+# # The bi-encoder will retrieve top_k documents. We use a cross-encoder to re-rank the results list to improve the quality
+# cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-2-v2')
+# def search(query):
+#     print("Input question:", query)
+#     ##### Semantic Search #####
+#     # Encode the query using the bi-encoder and find potentially relevant passages
+#     question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
+#     question_embedding = question_embedding.cuda()
+#     hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
+#     hits = hits[0]  # Get the hits for the first query
+#     ##### Re-Ranking #####
+#     # Now, score all retrieved passages with the cross_encoder
+#     cross_inp = [[query, dataset["text"][hit['corpus_id']]] for hit in hits]
+#     cross_scores = cross_encoder.predict(cross_inp)
+#     # Sort results by the cross-encoder scores
+#     for idx in range(len(cross_scores)):
+#         hits[idx]['cross-score'] = cross_scores[idx]
+#     # Output of top-3 hits from re-ranker
+#     print("\n-------------------------\n")
+#     print("Top-3 Cross-Encoder Re-ranker hits")
+#     hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
+#     for hit in hits[0:3]:
+#         print("score: ",  round(hit['cross-score'], 3),"\n",
+#               "title: ", dataset["title"][hit['corpus_id']], "\n",
+#               "substract: ", dataset["text"][hit['corpus_id']].replace("\n", " "), "\n",
+#               "link: ", dataset["url"][hit['corpus_id']],"\n")