import json
import os
import time
import uuid

import streamlit as st
from dotenv import load_dotenv

import benchmark
from retrieval_pipeline import get_retriever, get_compression_retriever


def get_result(query, compression_retriever):
    """Run the query through the compression retriever and measure latency in seconds."""
    t0 = time.time()
    retrieved_chunks = compression_retriever.get_relevant_documents(query)
    latency = time.time() - t0
    return retrieved_chunks, latency


st.set_page_config(
    layout="wide",
    page_title="Retrieval Demo",
)


def setup():
    """Load environment variables and build the reranking (compression) retriever."""
    load_dotenv()
    ELASTICSEARCH_URL = os.getenv("ELASTICSEARCH_URL")
    retriever = get_retriever(index="masa.ai", elasticsearch_url=ELASTICSEARCH_URL)
    compression_retriever = get_compression_retriever(retriever)
    return compression_retriever


def main():
    st.title("Part 3: Search")

    # st.sidebar.write("According to the Model Size 👇")
    # menu = ["Nano", "Small", "Medium", "Large"]
    # choice = st.sidebar.selectbox("Choose", menu)

    st.sidebar.info("""
    **Model Options:**
    - **Nano**: ~4 MB, blazing-fast model with competitive ranking precision.
    - **Small**: ~34 MB, slightly slower, with the best ranking precision.
    - **Medium**: ~110 MB, slower model with the best zero-shot ranking precision.
    - **Large**: ~150 MB, slower model with competitive ranking precision for 100+ languages.
    """)

    with st.spinner("Setting up..."):
        compression_retriever = setup()

    with st.expander("Tech Stack Used"):
        st.markdown("""
        **Flash Rank**: Ultra-lite & super-fast Python library for search & retrieval re-ranking.

        - **Ultra-lite**: No heavy dependencies. Runs on CPU with a tiny ~4 MB reranking model.
        - **Super-fast**: Speed depends on the number of tokens in the passages and query, plus model depth.
        - **Cost-efficient**: Ideal for serverless deployments with low memory and time budgets.
        - **Based on state-of-the-art cross-encoders**: Includes models such as ms-marco-TinyBERT-L-2-v2 (default), ms-marco-MiniLM-L-12-v2, rank-T5-flan, and ms-marco-MultiBERT-L-12.
        - **Sleek models for efficiency**: Designed for minimal overhead in user-facing scenarios.

        _Flash Rank is tailored for scenarios requiring efficient and effective reranking, balancing performance with resource usage._
        """)

    with st.form(key="input_form"):
        query_input = st.text_area("Query Input")
        # context_input = st.text_area("Context Input")
        submit_button = st.form_submit_button(label="Retrieve")
        if submit_button:
            st.session_state.submitted = True

    if "submitted" in st.session_state:
        with st.spinner("Processing..."):
            result, latency = get_result(query_input, compression_retriever)
        st.subheader("Please find the retrieved documents below 👇")
        # get_result returns seconds; convert to milliseconds for display.
        st.write("Latency:", round(latency * 1000, 2), "ms")
        # Retrieved Document objects are not JSON-serializable directly,
        # so expose their content and metadata as plain dicts.
        st.json([{"page_content": doc.page_content, "metadata": doc.metadata} for doc in result])


if __name__ == "__main__":
    main()
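# The app above imports get_retriever and get_compression_retriever from
# retrieval_pipeline, which is not shown in this part. The sketch below is a
# hypothetical illustration of what that module could look like, assuming
# LangChain's ElasticsearchStore retriever wrapped in a
# ContextualCompressionRetriever with the FlashrankRerank compressor. The
# embedding model, k, top_n, and reranker model name are placeholder choices,
# not the project's actual configuration.

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import ElasticsearchStore


def get_retriever(index, elasticsearch_url, k=20):
    """Build a plain vector-store retriever over the given Elasticsearch index."""
    # Placeholder embedding model; swap in whatever the index was built with.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    store = ElasticsearchStore(
        index_name=index,
        embedding=embeddings,
        es_url=elasticsearch_url,
    )
    return store.as_retriever(search_kwargs={"k": k})


def get_compression_retriever(retriever, model="ms-marco-TinyBERT-L-2-v2", top_n=5):
    """Wrap a base retriever with FlashRank reranking (the ~4 MB 'Nano' model by default)."""
    compressor = FlashrankRerank(model=model, top_n=top_n)
    return ContextualCompressionRetriever(
        base_compressor=compressor,
        base_retriever=retriever,
    )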