import streamlit as st
from sentence_transformers import SentenceTransformer
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
import bs4
import torch
from langchain_groq import ChatGroq
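# Streamlit RAG app: load a blog post from a URL, chunk and index it in a
# Chroma vectorstore with SentenceTransformer embeddings, then answer
# questions about it with a Groq-hosted Llama 3 model.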
# App title
st.title("Blog Retrieval and Question Answering")
# Prompt the user to enter their LangChain API key
api_key_langchain = st.text_input("Enter your LANGCHAIN_API_KEY", type="password")
# Prompt the user to enter their Groq API key
api_key_Groq = st.text_input("Enter your Groq_API_KEY", type="password")
# Check if both API keys have been provided
if not api_key_langchain or not api_key_Groq:
    st.write("Please enter both API keys to access this app.")
else:
    st.write("Both API keys are set.")

    # Initialize the LLM with the provided Groq API key
    llm = ChatGroq(model="llama3-8b-8192", groq_api_key=api_key_Groq)
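    # Note: "llama3-8b-8192" is Groq's model id for Llama 3 8B (the 8192 is the
    # context window); any other chat model Groq serves should work here too.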
    # Define the embedding class: a thin wrapper exposing the
    # embed_documents/embed_query interface that langchain_chroma expects
    class SentenceTransformerEmbedding:
        def __init__(self, model_name):
            self.model = SentenceTransformer(model_name)

        def embed_documents(self, texts):
            embeddings = self.model.encode(texts, convert_to_tensor=True)
            if isinstance(embeddings, torch.Tensor):
                return embeddings.cpu().detach().numpy().tolist()  # Convert tensor to list of lists
            return embeddings

        def embed_query(self, query):
            embedding = self.model.encode([query], convert_to_tensor=True)
            if isinstance(embedding, torch.Tensor):
                return embedding.cpu().detach().numpy().tolist()[0]  # Convert tensor to a flat list
            return embedding[0]
    # Initialize the embedding model
    embedding_model = SentenceTransformerEmbedding('all-MiniLM-L6-v2')
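    # Quick sanity check of the wrapper (illustrative only, not run by the app):
    #   vec = embedding_model.embed_query("hello world")
    #   assert isinstance(vec, list) and len(vec) == 384  # all-MiniLM-L6-v2 -> 384-dim vectors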
    # Streamlit UI for blog URL input
    blog_url = st.text_input("Enter the URL of the blog to retrieve:")
    # Load, chunk, and index the contents of the blog
    def load_data(url):
        try:
            loader = WebBaseLoader(
                web_paths=(url,),
                bs_kwargs=dict(
                    # An empty SoupStrainer keeps the whole page; pass class_
                    # filters to narrow parsing (see the commented variant below)
                    parse_only=bs4.SoupStrainer()
                ),
            )
            docs = loader.load()
            # Debugging output
            st.write(f"Loaded {len(docs)} documents from the URL.")
            if not docs:
                st.error("No documents were loaded. Please check the URL or content.")
                return None
            # Check the first document's content to ensure it's loaded correctly
            #st.write(f"First document content preview: {docs[0].page_content[:500]}")  # Show the first 500 characters of the first document
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
            splits = text_splitter.split_documents(docs)
            # Debugging output
            st.write(f"Created {len(splits)} document splits.")
            if not splits:
                st.error("No document splits were created. Please check the document content.")
                return None
            # Check the first split's content to ensure it's split correctly
            #st.write(f"First split content preview: {splits[0].page_content[:500]}")  # Show the first 500 characters of the first split
            vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)
            # Debugging output
            #st.write(f"Vectorstore created with {len(splits)} documents.")
            if vectorstore is None:
                st.error("Failed to create the vectorstore.")
                return None
            return vectorstore
        except Exception as e:
            st.error(f"An error occurred while loading the blog: {e}")
            return None
    # Alternative, stricter loader that parses only elements with the given CSS classes:
    # def load_data(url):
    #     try:
    #         loader = WebBaseLoader(
    #             web_paths=(url,),
    #             bs_kwargs=dict(
    #                 parse_only=bs4.SoupStrainer(
    #                     class_=("post-content", "post-title", "post-header")
    #                 )
    #             ),
    #         )
    #         docs = loader.load()
    #         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    #         splits = text_splitter.split_documents(docs)
    #         vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)
    #         return vectorstore
    #     except Exception as e:
    #         st.error(f"An error occurred while loading the blog: {e}")
    #         return None
    # Load the data if a URL is provided
    if blog_url:
        vectorstore = load_data(blog_url)
        if vectorstore:
            # Streamlit UI for question input
            question = st.text_input("Enter your question:")
            if question:
                retriever = vectorstore.as_retriever()
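                # as_retriever() defaults to similarity search over the indexed
                # chunks (top 4 results by default in LangChain vectorstores)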
                prompt = hub.pull("rlm/rag-prompt", api_key=api_key_langchain)

                def format_docs(docs):
                    return "\n\n".join(doc.page_content for doc in docs)
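                # e.g. format_docs([Document(page_content="a"), Document(page_content="b")])
                # returns "a\n\nb" -- a single context string for the prompt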
                rag_chain = (
                    {"context": retriever | format_docs, "question": RunnablePassthrough()}
                    | prompt
                    | llm
                    | StrOutputParser()
                )
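                # Data flow: the question passes through unchanged while the
                # retriever fetches and formats matching chunks; both fill the
                # rlm/rag-prompt template, whose output goes to the Groq LLM
                # and is parsed back to a plain string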
                # Run the chain and display the answer
                try:
                    result = rag_chain.invoke(question)
                    st.write("Answer:", result)
                except Exception as e:
                    st.error(f"An error occurred while generating the answer: {e}")
        else:
            st.write("Failed to load the blog content. Please check the URL and try again.")