from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import numpy as np
from data_ret import search_relevant_data  # Assuming this function fetches the data from some source
import streamlit as st


# Load the Sentence Transformer model for similarity search
def load_similarity_model():
    st.write("Loading similarity model...")  # Show status on Streamlit
    retriever_model = SentenceTransformer("all-mpnet-base-v2")
    st.write("Similarity model loaded.")
    return retriever_model


# Create embeddings for the retrieved documents
def create_embeddings(documents, model):
    if not documents:
        st.write("No documents provided for embedding.")
        return np.array([])  # Return an empty array if no documents

    st.write(f"Creating embeddings for {len(documents)} documents...")  # Show progress
    embeddings = []

    # Track progress of the embedding creation using Streamlit's progress bar
    progress_bar = st.progress(0)
    step = 1 / len(documents)  # Keeps the progress bar value within [0.0, 1.0]

    # Include 'text' in the document text along with 'question' and 'answer'
    document_texts = [
        doc['question'] + " " + doc['answer'] + " " + doc.get('text', '')
        for doc in documents
    ]

    for i, doc_text in enumerate(document_texts):
        embedding = model.encode(doc_text)
        embeddings.append(embedding)
        # Use (i + 1) so the bar reaches 1.0 on the last document; clamp to
        # guard against float rounding pushing the value above 1.0
        progress_bar.progress(min(1.0, (i + 1) * step))

    embeddings = np.array(embeddings)
    st.write(f"Embeddings created with shape: {embeddings.shape}")
    return embeddings


# Retrieve indices of the documents most similar to the question embedding
def retrieve_documents(question_embedding, document_embeddings, top_k=5):
    if document_embeddings.size == 0:
        st.write("No document embeddings available for retrieval.")
        return []

    st.write("Calculating similarities between question and documents...")
    similarities = np.array([
        1 - cosine(question_embedding, doc_embedding)
        for doc_embedding in document_embeddings
    ])

    # Get indices of the top K similarities, sorted in descending order (highest first)
    top_indices = similarities.argsort()[-top_k:][::-1]
    return top_indices


# Main function: build the context from the most relevant documents for a given topic and question
def get_relevant_context(question, topic):
    try:
        st.write("Searching for relevant documents based on the topic...")
        relevant_documents = search_relevant_data(topic)  # Use the dynamic topic as the search query
        st.write(f"Found {len(relevant_documents)} relevant documents.")

        if not relevant_documents:
            return "No relevant documents found."

        retriever_model = load_similarity_model()  # Load the similarity model

        # Create document embeddings and show progress
        document_embeddings = create_embeddings(relevant_documents, retriever_model)
        if document_embeddings.size == 0:
            return "No embeddings created for relevant documents."

        st.write("Generating question embedding and retrieving relevant documents...")
        question_embedding = retriever_model.encode(question)
        relevant_doc_indices = retrieve_documents(question_embedding, document_embeddings)

        if len(relevant_doc_indices) == 0:
            return "No relevant documents found after embedding."

        # Extract context from the top relevant documents
        contexts = []
        for idx in relevant_doc_indices:
            doc = relevant_documents[idx]
            context = doc.get('answer', '') + " " + doc.get('text', '')
            if context.strip():
                contexts.append(context)

        if not contexts:
            return "No valid contexts available for answering."
        # Return the combined context for question answering
        return " ".join(contexts)

    except Exception as e:
        st.write(f"Error processing question: {str(e)}")
        return f"Error: {str(e)}"
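

# --- Usage sketch (not part of the original module) ---
# A minimal example of how get_relevant_context might be wired into a
# Streamlit page, run via `streamlit run <this_file>.py`. The widget labels
# and page title below are illustrative assumptions, not the original app.
if __name__ == "__main__":
    st.title("Topic-based Document Retrieval")  # hypothetical page title
    topic = st.text_input("Topic to search")
    question = st.text_input("Your question")
    if st.button("Retrieve context") and topic and question:
        context = get_relevant_context(question, topic)
        st.subheader("Retrieved context")
        st.write(context)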