File size: 4,096 Bytes
a37b18d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import streamlit as st
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import QueryFusionRetriever

# Page config: browser-tab title, tab icon and wide layout.
st.set_page_config(
    page_title="Freud Works Search",
    # Books emoji. The literal had been stored as a mis-decoded byte sequence
    # (mojibake), which is not a valid emoji for the tab icon.
    page_icon="📚",
    layout="wide"
)

# Page heading followed by a short description of the two search signals
# that the hybrid mode combines.
st.title("Freud Works Hybrid Search")
st.markdown("""
This demo allows you to search through Freud's complete works using a hybrid approach combining:
- BM25 (keyword-based search)
- Vector search (semantic similarity)
""")

@st.cache_resource
def load_indices():
    """Load the persisted index and build the three retrievers used by the app.

    The function is wrapped in ``st.cache_resource``. It registers the
    HuggingFace embedding model on the global ``Settings`` object, then reads
    the index from the ``freud_index`` directory, and finally builds a vector
    retriever, a BM25 retriever and a fusion retriever over both. Every
    retriever starts with ``similarity_top_k=10``.

    Returns:
        A 4-tuple ``(index, vector_retriever, bm25_retriever,
        hybrid_retriever)``.
    """
    # Register the embedding model globally before the index is loaded.
    Settings.embed_model = HuggingFaceEmbedding(
        model_name="multi-qa-MiniLM-L6-cos-v1"
    )

    # Restore the index persisted under "freud_index".
    index = load_index_from_storage(
        storage_context=StorageContext.from_defaults(persist_dir="freud_index")
    )

    # One embedding-based retriever and one BM25 retriever over the same index.
    dense = index.as_retriever(similarity_top_k=10)
    sparse = BM25Retriever.from_defaults(index, similarity_top_k=10)

    # Fuse both result lists using the "reciprocal_rerank" mode.
    fused = QueryFusionRetriever(
        [dense, sparse],
        similarity_top_k=10,
        num_queries=1,  # set this to 1 to disable query generation
        mode="reciprocal_rerank",
        use_async=True,
        verbose=True,
    )

    return index, dense, sparse, fused

# Obtain the index and the three retrievers from load_indices()
index, vector_retriever, bm25_retriever, hybrid_retriever = load_indices()

# Free-text query box
search_query = st.text_input(
    "Enter your search query:",
    placeholder="e.g. Oedipus complex",
)

# Slider for the number of results (1 to 20, default 10)
top_k = st.slider(
    "Number of results to return:",
    min_value=1,
    max_value=20,
    value=10,
)

# Push the chosen result count onto every retriever
for _retriever in (vector_retriever, bm25_retriever, hybrid_retriever):
    _retriever.similarity_top_k = top_k

# Horizontal radio buttons selecting which retriever answers the query
search_type = st.radio(
    "Select search method:",
    ["Hybrid", "Vector", "BM25"],
    horizontal=True,
    help="""
    - **BM25**: Keyword-based search that works best for exact matches and specific terms. Similar to traditional search engines.
    - **Vector**: Semantic search that understands the meaning of your query, even if it uses different words than the source text.
    - **Hybrid**: Combines both approaches for better overall results, balancing exact matches with semantic understanding.
    """
)

# Run the search only once the user has typed something.
if search_query:
    with st.spinner('Searching...'):
        # Dispatch on the selected mode; any value other than "Hybrid" or
        # "Vector" is handled by the BM25 retriever.
        if search_type == "Hybrid":
            nodes = hybrid_retriever.retrieve(search_query)
        elif search_type == "Vector":
            nodes = vector_retriever.retrieve(search_query)
        else:  # BM25
            nodes = bm25_retriever.retrieve(search_query)

        # Display results
        st.subheader("Search Results")

        # Tell the user explicitly when nothing matched instead of showing
        # only a bare header.
        if not nodes:
            st.info("No results found.")

        for i, node in enumerate(nodes, 1):
            text = node.text

            # Preview: the first 200 characters, with an ellipsis when the
            # text was actually truncated.
            preview = text[:200] + "..." if len(text) > 200 else text

            # Format the score to 3 decimal places. hasattr() alone does not
            # protect against a score that is present but None, which would
            # make the ":.3f" format raise TypeError, so test for None.
            raw_score = getattr(node, "score", None)
            score = f"{raw_score:.3f}" if raw_score is not None else "N/A"

            # Collapsed expander: rank, score and preview form the label;
            # the full text and any metadata are shown when it is opened.
            with st.expander(f"Result {i} (score: {score})\n\n{preview}", expanded=False):
                st.markdown(text)
                if node.metadata:
                    st.markdown("---")
                    st.markdown("**Source:**")
                    st.json(node.metadata)

# Sidebar: an "About" header and a short summary of the three search modes
with st.sidebar:
    about_text = """
    This demo searches through Freud's complete works using:
    
    - **BM25**: Traditional keyword-based search
    - **Vector Search**: Semantic similarity using embeddings
    - **Hybrid**: Combines both approaches
    """
    st.header("About")
    st.markdown(about_text)