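"""Streamlit app: Blog Retrieval and Question Answering.

Fetches a blog post from a user-supplied URL, chunks and indexes it in a
Chroma vectorstore with SentenceTransformer embeddings, and answers
questions about it with a Groq-hosted LLM via a LangChain RAG chain.

Run with: streamlit run <this_file>.py
"""
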
import streamlit as st
from sentence_transformers import SentenceTransformer
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
import bs4
import torch
from langchain_groq import ChatGroq

# App title
st.title("Blog Retrieval and Question Answering")

# Prompt the user to enter their Langchain API key
api_key_langchain = st.text_input("Enter your LANGCHAIN_API_KEY", type="password")

# Prompt the user to enter their Groq API key
api_key_groq = st.text_input("Enter your GROQ_API_KEY", type="password")

# Check that both API keys have been provided
if not api_key_langchain or not api_key_groq:
    st.write("Please enter both API keys to use this app.")
else:
    st.write("Both API keys are set.")

    # Initialize the chat model with the provided Groq API key
    llm = ChatGroq(model="llama3-8b-8192", groq_api_key=api_key_groq)
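    # "llama3-8b-8192" refers to Meta's Llama 3 8B model as served by the
    # Groq API; any other Groq-hosted chat model name could be used here.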

    # Define the embedding class
    class SentenceTransformerEmbedding:
        def __init__(self, model_name):
            self.model = SentenceTransformer(model_name)
        
        def embed_documents(self, texts):
            embeddings = self.model.encode(texts, convert_to_tensor=True)
            if isinstance(embeddings, torch.Tensor):
                return embeddings.cpu().detach().numpy().tolist()  # Convert tensor to list
            return embeddings
        
        def embed_query(self, query):
            embedding = self.model.encode([query], convert_to_tensor=True)
            if isinstance(embedding, torch.Tensor):
                return embedding.cpu().detach().numpy().tolist()[0]  # Convert tensor to list
            return embedding[0]
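
    # Chroma's LangChain integration only requires an object exposing
    # embed_documents() and embed_query(), so this small wrapper is enough.
    # A drop-in alternative (not used here, shown as a sketch) would be:
    #
    # from langchain_huggingface import HuggingFaceEmbeddings
    # embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")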

    # Initialize the embedding class
    embedding_model = SentenceTransformerEmbedding('all-MiniLM-L6-v2')

    # Streamlit UI for blog URL input
    blog_url = st.text_input("Enter the URL of the blog to retrieve:")

    # Load, chunk, and index the contents of the blog
    def load_data(url):
        try:
            # Parse the whole page. To restrict parsing to specific elements,
            # pass e.g. bs_kwargs=dict(parse_only=bs4.SoupStrainer(
            #     class_=("post-content", "post-title", "post-header"))).
            loader = WebBaseLoader(web_paths=(url,))
            docs = loader.load()

            # Debugging output
            st.write(f"Loaded {len(docs)} documents from the URL.")

            if not docs:
                st.error("No documents were loaded. Please check the URL or content.")
                return None

            # Preview the first document's content to verify it loaded correctly:
            # st.write(f"First document content preview: {docs[0].page_content[:500]}")

            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
            splits = text_splitter.split_documents(docs)

            # Debugging output
            st.write(f"Created {len(splits)} document splits.")

            if not splits:
                st.error("No document splits were created. Please check the document content.")
                return None

            # Preview the first split's content to verify the splitting:
            # st.write(f"First split content preview: {splits[0].page_content[:500]}")

            vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)

            # Debugging output
            # st.write(f"Vectorstore created with {len(splits)} documents.")

            if vectorstore is None:
                st.error("Failed to create the vectorstore.")
                return None

            return vectorstore
        except Exception as e:
            st.error(f"An error occurred while loading the blog: {e}")
            return None
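
    # Note: on every Streamlit rerun this re-downloads and re-embeds the page.
    # One possible improvement (a sketch, not part of the original app) is to
    # cache the vectorstore with Streamlit's resource cache:
    #
    # @st.cache_resource
    # def load_data_cached(url):
    #     return load_data(url)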

    # Load the data if a URL is provided
    if blog_url:
        vectorstore = load_data(blog_url)
        if vectorstore:
            # Streamlit UI for question input
            question = st.text_input("Enter your question:")

            if question:
                retriever = vectorstore.as_retriever()
                prompt = hub.pull("rlm/rag-prompt", api_key=api_key_langchain)
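                # rlm/rag-prompt is a community RAG prompt template on the
                # LangChain Hub; it expects "context" and "question" inputs.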

                def format_docs(docs):
                    return "\n\n".join(doc.page_content for doc in docs)

                rag_chain = (
                    {"context": retriever | format_docs, "question": RunnablePassthrough()}
                    | prompt
                    | llm
                    | StrOutputParser()
                )
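
                # How the chain runs: the question is passed through unchanged,
                # while the retriever fetches relevant chunks and format_docs
                # joins them into one context string; the prompt template then
                # fills {context} and {question}, the LLM generates an answer,
                # and StrOutputParser extracts it as plain text.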

                # Run the chain on the user's question
                try:
                    result = rag_chain.invoke(question)
                    st.write("Answer:", result)
                except Exception as e:
                    st.error(f"An error occurred while generating the answer: {e}")
        else:
            st.write("Failed to load the blog content. Please check the URL and try again.")