Spaces:

AreesaAshfaq
/

BlogRetrievalQA

Sleeping

App Files Files Community

BlogRetrievalQA / app.py

AreesaAshfaq

Update app.py

56defad verified 10 months ago

raw

history blame contribute delete

6.1 kB

	import streamlit as st
	from sentence_transformers import SentenceTransformer
	from langchain import hub
	from langchain_chroma import Chroma
	from langchain_community.document_loaders import WebBaseLoader
	from langchain_core.output_parsers import StrOutputParser
	from langchain_core.runnables import RunnablePassthrough
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	import bs4
	import torch
	from langchain_groq import ChatGroq

	# APP Title
	st.title("Blog Retrieval and Question Answering")

	# Collect both API keys through password-type inputs.
	api_key_langchain = st.text_input("Enter your LANGCHAIN_API_KEY", type="password")
	api_key_Groq = st.text_input("Enter your Groq_API_KEY", type="password")

	# Nothing below can work without both keys, so end this script run here
	# instead of going on to build an LLM client with an empty key.
	if not api_key_langchain or not api_key_Groq:
	    st.write("Please enter both API keys to access this APP.")
	    st.stop()

	st.write("Both API keys are set.")

	# Initialize the LLM with the provided Groq API key
	llm = ChatGroq(model="llama3-8b-8192", groq_api_key=api_key_Groq)

	# Define the embedding class
	class SentenceTransformerEmbedding:
	    """Expose a SentenceTransformer model via ``embed_documents``/``embed_query``.

	    Both methods return plain Python lists of floats regardless of whether
	    the underlying ``encode`` call hands back a torch tensor, a NumPy array
	    or an ordinary list.
	    """

	    def __init__(self, model_name):
	        """Load the SentenceTransformer model identified by ``model_name``."""
	        self.model = SentenceTransformer(model_name)

	    @staticmethod
	    def _to_list(vectors):
	        """Convert the output of ``encode`` into nested Python lists."""
	        if isinstance(vectors, torch.Tensor):
	            # Detach and move to CPU first so tensors from any device convert.
	            return vectors.detach().cpu().tolist()
	        if hasattr(vectors, "tolist"):
	            # Array-like results (e.g. NumPy arrays) provide their own tolist().
	            return vectors.tolist()
	        return vectors

	    def embed_documents(self, texts):
	        """Return one embedding (a list of floats) per entry of ``texts``."""
	        return self._to_list(self.model.encode(texts, convert_to_tensor=True))

	    def embed_query(self, query):
	        """Return the embedding of a single ``query`` string as a list of floats."""
	        # encode() works on batches, so wrap the query and unwrap the one row.
	        return self._to_list(self.model.encode([query], convert_to_tensor=True))[0]

	# Initialize the embedding class
	@st.cache_resource(show_spinner=False)
	def _get_embedding_model(model_name):
	    """Build the embedding wrapper for ``model_name`` once and reuse it.

	    Streamlit re-executes this script on every widget interaction; caching
	    keeps the model from being constructed again on each rerun.
	    """
	    return SentenceTransformerEmbedding(model_name)


	embedding_model = _get_embedding_model('all-MiniLM-L6-v2')

	# Streamlit UI for blog URL input
	blog_url = st.text_input("Enter the URL of the blog to retrieve:")

	# Load, chunk, and index the contents of the blog
	class _BlogLoadError(Exception):
	    """Raised when a blog yields no usable content; the message is shown to the user."""


	@st.cache_resource(show_spinner=False)
	def _build_vectorstore(url):
	    """Download ``url``, split it into chunks and index them in a new Chroma store.

	    Successful results are cached per URL so that script reruns do not
	    download and embed the same page again. Exceptions propagate to the
	    caller and are not cached.

	    Raises:
	        _BlogLoadError: if the loader returns no documents or the splitter
	            produces no chunks.
	    """
	    import uuid

	    loader = WebBaseLoader(
	        web_paths=(url,),
	        bs_kwargs=dict(parse_only=bs4.SoupStrainer()),
	    )
	    docs = loader.load()
	    if not docs:
	        raise _BlogLoadError("No documents were loaded. Please check the URL or content.")

	    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
	    splits = text_splitter.split_documents(docs)
	    if not splits:
	        raise _BlogLoadError("No document splits were created. Please check the document content.")

	    # A fresh collection per build keeps chunks of different blogs (or of an
	    # earlier build) from ending up in the same default collection.
	    return Chroma.from_documents(
	        documents=splits,
	        embedding=embedding_model,
	        collection_name=f"blog-{uuid.uuid4().hex}",
	    )


	def load_data(url):
	    """Return a vector store built from the blog at ``url``, or ``None`` on failure.

	    Any failure is reported through ``st.error`` rather than raised, so the
	    caller only has to check for ``None``.
	    """
	    try:
	        return _build_vectorstore(url)
	    except _BlogLoadError as e:
	        # These messages are already user-facing; show them as they are.
	        st.error(str(e))
	        return None
	    except Exception as e:
	        st.error(f"An error occurred while loading the blog: {e}")
	        return None

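	# Earlier variant of load_data, kept for reference: it parsed only elements
	# with the "post-content", "post-title" or "post-header" classes.
	# def load_data(url):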
	# try:
	# loader = WebBaseLoader(
	# web_paths=(url,),
	# bs_kwargs=dict(
	# parse_only=bs4.SoupStrainer(
	# class_=("post-content", "post-title", "post-header")
	# )
	# ),
	# )
	# docs = loader.load()
	# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
	# splits = text_splitter.split_documents(docs)
	# vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)
	# return vectorstore
	# except Exception as e:
	# st.error(f"An error occurred while loading the blog: {e}")
	# return None

	# Load the data if a URL is provided
	if blog_url:
	    vectorstore = load_data(blog_url)
	    if vectorstore is not None:
	        # Streamlit UI for question input
	        question = st.text_input("Enter your question:")

	        if question:

	            def format_docs(docs):
	                """Join the retrieved chunks into one context string."""
	                return "\n\n".join(doc.page_content for doc in docs)

	            # Pulling the prompt and running the chain both call remote
	            # services, so every such step sits inside the try and a failure
	            # is shown as an error message instead of a traceback.
	            try:
	                retriever = vectorstore.as_retriever()
	                prompt = hub.pull("rlm/rag-prompt", api_key=api_key_langchain)

	                rag_chain = (
	                    {"context": retriever | format_docs, "question": RunnablePassthrough()}
	                    | prompt
	                    | llm
	                    | StrOutputParser()
	                )

	                result = rag_chain.invoke(question)
	                st.write("Answer:", result)
	            except Exception as e:
	                st.error(f"An error occurred while generating the answer: {e}")
	    else:
	        st.write("Failed to load the blog content. Please check the URL and try again.")