import streamlit as st
from sentence_transformers import SentenceTransformer
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
import bs4
import torch
from langchain_groq import ChatGroq
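
# Dependency note: these imports assume the matching packages are installed
# (streamlit, sentence-transformers, langchain, langchain-chroma, langchain-community,
# langchain-text-splitters, langchain-groq, beautifulsoup4, torch).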

# App title
st.title("Blog Retrieval and Question Answering")

# Prompt the user to enter their LangChain API key
api_key_langchain = st.text_input("Enter your LANGCHAIN_API_KEY", type="password")

# Prompt the user to enter their Groq API key
api_key_Groq = st.text_input("Enter your GROQ_API_KEY", type="password")
# Check if both API keys have been provided
if not api_key_langchain or not api_key_Groq:
    st.write("Please enter both API keys to access this app.")
else:
    st.write("Both API keys are set.")

    # Initialize the LLM with the provided Groq API key
    llm = ChatGroq(model="llama3-8b-8192", groq_api_key=api_key_Groq)

    # Define the embedding class
    class SentenceTransformerEmbedding:
        def __init__(self, model_name):
            self.model = SentenceTransformer(model_name)

        def embed_documents(self, texts):
            embeddings = self.model.encode(texts, convert_to_tensor=True)
            if isinstance(embeddings, torch.Tensor):
                return embeddings.cpu().detach().numpy().tolist()  # Convert tensor to list
            return embeddings

        def embed_query(self, query):
            embedding = self.model.encode([query], convert_to_tensor=True)
            if isinstance(embedding, torch.Tensor):
                return embedding.cpu().detach().numpy().tolist()[0]  # Convert tensor to list
            return embedding[0]

    # Initialize the embedding class
    embedding_model = SentenceTransformerEmbedding('all-MiniLM-L6-v2')
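
    # Note: Chroma only calls embed_documents() and embed_query() on this object at
    # run time, so the plain wrapper above is sufficient here; subclassing
    # langchain_core.embeddings.Embeddings would be a stricter alternative.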

    # Streamlit UI for blog URL input
    blog_url = st.text_input("Enter the URL of the blog to retrieve:")

    # Load, chunk, and index the contents of the blog
    def load_data(url):
        try:
            loader = WebBaseLoader(
                web_paths=(url,),
                bs_kwargs=dict(
                    parse_only=bs4.SoupStrainer(
                        class_=("post-content", "post-title", "post-header")
                    )
                ),
            )
            docs = loader.load()
            if not docs:
                st.error("No documents were loaded. Please check the URL and try again.")
                return None
            st.write(f"Loaded {len(docs)} documents.")

            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
            splits = text_splitter.split_documents(docs)
            if not splits:
                st.error("No document splits were created. Please check the document content.")
                return None
            st.write(f"Created {len(splits)} document splits.")

            vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)
            if vectorstore is None:
                st.error("Failed to create the vectorstore.")
                return None
            return vectorstore
        except Exception as e:
            st.error(f"An error occurred while loading the blog: {e}")
            return None
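
    # Note: without a persist_directory, Chroma keeps this collection in memory only,
    # so the blog is re-fetched and re-indexed on every Streamlit rerun; wrapping
    # load_data in st.cache_resource is one way this could be avoided.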

    # Load the data if a URL is provided
    if blog_url:
        vectorstore = load_data(blog_url)
        if vectorstore:
            # Streamlit UI for question input
            question = st.text_input("Enter your question:")
            if question:
                retriever = vectorstore.as_retriever()
                prompt = hub.pull("rlm/rag-prompt", api_key=api_key_langchain)

                def format_docs(docs):
                    return "\n\n".join(doc.page_content for doc in docs)

                rag_chain = (
                    {"context": retriever | format_docs, "question": RunnablePassthrough()}
                    | prompt
                    | llm
                    | StrOutputParser()
                )
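
                # The chain passes the question to the retriever, joins the returned
                # documents into the "context" slot of the rag-prompt, sends the filled
                # prompt to the Groq LLM, and parses the response into a plain string.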

                # Run the chain and display the answer
                try:
                    result = rag_chain.invoke(question)
                    st.write("Answer:", result)
                except Exception as e:
                    st.error(f"An error occurred while generating the answer: {e}")
        else:
            st.write("Failed to load the blog content. Please check the URL and try again.")