# Streamlit demo app (Hugging Face Space): vector search over PIB press releases.
import pickle

import faiss
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer

from vector_engine.utils import vector_search
def read_data(pibdata="pib2022_23_cleaned_abs.csv"):
    """Load the PIB press-release dataset from a CSV file.

    Args:
        pibdata: Path to the CSV file; defaults to the cleaned 2022-23
            abstracts shipped alongside the app.

    Returns:
        A ``pandas.DataFrame`` holding the file's rows.
    """
    frame = pd.read_csv(pibdata)
    return frame
def load_bert_model(name="pushpdeep/sbertmsmarco-en_to_indic_ur-murilv1"):
    """Instantiate the sentence-embedding model used for query encoding.

    Args:
        name: Hugging Face model id or local path understood by
            ``SentenceTransformer``.

    Returns:
        A ready-to-use ``SentenceTransformer`` instance.
    """
    return SentenceTransformer(name)
def load_faiss_index(path_to_faiss="models/faiss_index_ip.pickle"):
    """Load and deserialize the Faiss index from a pickled file.

    NOTE(review): ``pickle.load`` executes arbitrary code if the file is
    untrusted — this assumes the pickle ships with the app itself.

    Args:
        path_to_faiss: Path to the pickled, serialized Faiss index.

    Returns:
        The deserialized Faiss index object.
    """
    with open(path_to_faiss, "rb") as handle:
        serialized = pickle.load(handle)
    return faiss.deserialize_index(serialized)
def main():
    """Streamlit entry point: load resources, run a vector search, render hits."""
    # Load data and models once at startup
    data = read_data()
    model = load_bert_model()
    faiss_index = load_faiss_index()

    st.title("Vector-based search with Sentence Transformers and Faiss")

    # User search
    user_input = st.text_area("Search box", "हिंद महासागर")

    # Filters
    st.sidebar.markdown("**Filters**")
    # filter_year = st.sidebar.slider("Publication year", 2010, 2021, (2010, 2021), 1)
    # filter_citations = st.sidebar.slider("Citations", 0, 250, 0)
    num_results = st.sidebar.slider("Number of search results", 10, 50, 10)

    # Fetch results
    if user_input:
        # Nearest-neighbour distances (D) and row ids (I) from the index
        D, I = vector_search([user_input], model, faiss_index, num_results)
        # Build the membership set ONCE — the original rebuilt set(frame.rid)
        # on every loop iteration, making the loop accidentally O(n * m).
        valid_ids = set(data.rid)
        # Render each individual result
        for id_ in I.flatten().tolist():
            if id_ not in valid_ids:
                # Index returned an id not present in the dataframe; skip it.
                continue
            hit = data[data.rid == id_]
            st.write(
                f"""
            **Language**: {hit.iloc[0].language}
            **Article**: {hit.iloc[0].abstract} https://pib.gov.in/PressReleasePage.aspx?PRID={hit.iloc[0].rid}
            """
            )
# Run the Streamlit app when executed as a script.
if __name__ == "__main__":
    main()