File size: 8,097 Bytes
4a118a7 479c15b 4e001cd 479c15b 0953464 479c15b 4a118a7 80d5167 4a118a7 80d5167 4a118a7 80d5167 df535b0 80d5167 4a118a7 80d5167 4a118a7 80d5167 4a118a7 80d5167 bb7bc72 0f09491 4a118a7 0953464 ab5521c 0953464 ab5521c 0953464 b7471e0 0953464 ab5521c c7d5b43 80d5167 4a118a7 ab5521c 709bbfd 479c15b 709bbfd 479c15b 709bbfd 479c15b 709bbfd 479c15b 709bbfd 479c15b 709bbfd c7d5b43 709bbfd 479c15b 709bbfd c7d5b43 709bbfd ab5521c 709bbfd b39d435 709bbfd 479c15b 709bbfd 479c15b 709bbfd 4a118a7 80d5167 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 |
import streamlit as st
import re
import os
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import bs4
import torch
from transformers import pipeline
# Embedding adapter handed to the vector store
class SentenceTransformerEmbedding:
    """Expose a SentenceTransformer model via ``embed_documents`` / ``embed_query``.

    Both methods always return plain Python lists of floats, whatever
    container type the underlying ``encode`` call hands back.
    """

    def __init__(self, model_name):
        """Load the sentence-transformers model named ``model_name``."""
        self.model = SentenceTransformer(model_name)

    @staticmethod
    def _as_lists(embeddings):
        """Convert a tensor, array, or sequence of rows into nested Python lists."""
        if isinstance(embeddings, torch.Tensor):
            # Detach and move to CPU first so the conversion works for any tensor.
            return embeddings.detach().cpu().numpy().tolist()
        if hasattr(embeddings, "tolist"):
            # numpy arrays (and similar) already know how to become lists.
            return embeddings.tolist()
        # Generic sequence of rows: convert each row individually.
        return [
            row.tolist() if hasattr(row, "tolist") else list(row)
            for row in embeddings
        ]

    def embed_documents(self, texts):
        """Return one embedding (a list of floats) per entry in ``texts``."""
        return self._as_lists(self.model.encode(texts, convert_to_tensor=True))

    def embed_query(self, query):
        """Return the embedding of the single string ``query`` as a list of floats."""
        # Encode as a one-element batch and unwrap the only row.
        return self._as_lists(self.model.encode([query], convert_to_tensor=True))[0]
# Page title plus the gradient backgrounds for the sidebar and main area
st.title("🤖 Chatbot with URL-based Document Retrieval")

# CSS injected for the sidebar container
sidebar_bg_style = (
    "\n<style>\n"
    '[data-testid="stSidebar"] {\n'
    "background: linear-gradient(135deg, #ffafbd, #ffc3a0, #2193b0, #6dd5ed);\n"
    "}\n"
    "</style>\n"
)

# CSS injected for the main content container
main_bg_style = (
    "\n<style>\n"
    ".main .block-container {\n"
    "background: linear-gradient(135deg, #ff9a9e, #fad0c4, #fbc2eb, #a18cd1);\n"
    "padding: 2rem;\n"
    "}\n"
    ".css-18e3th9 {\n"
    "background: linear-gradient(135deg, #ff9a9e, #fad0c4, #fbc2eb, #a18cd1);\n"
    "}\n"
    "</style>\n"
)

# Raw HTML is required for <style> tags to reach the page; sidebar first, then main.
for _css in (sidebar_bg_style, main_bg_style):
    st.markdown(_css, unsafe_allow_html=True)
# Sidebar: settings header and the blog-post URL field
st.sidebar.title("Settings")

url_input = st.sidebar.text_input(
    "Enter Blog Post URL",
    placeholder="e.g., https://example.com/blog",
    help="Paste the full URL of the blog post you want to retrieve data from",
)

# Show coloured feedback as soon as something has been typed
if url_input:
    _url_ok = re.match(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+", url_input) is not None
    _url_feedback = (
        '<p style="color:green; font-weight:bold;">URL is correctly entered</p>'
        if _url_ok
        else '<p style="color:red; font-weight:bold;">Invalid URL, please enter a valid one</p>'
    )
    st.sidebar.markdown(_url_feedback, unsafe_allow_html=True)
# Sidebar: choose between the built-in placeholder keys and user-typed keys
use_preprovided_keys = st.sidebar.checkbox("Use pre-provided API keys")

if use_preprovided_keys:
    # Placeholder values; replace with the actual pre-provided keys
    api_key_1, api_key_2 = (
        "your-preprovided-langchain-api-key",
        "your-preprovided-groq-api-key",
    )
    st.sidebar.markdown(
        '<p style="color:blue; font-weight:bold;">Using pre-provided API keys</p>',
        unsafe_allow_html=True,
    )
else:
    # Masked inputs for the two user-supplied keys
    api_key_1 = st.sidebar.text_input(
        "Enter LangChain API Key",
        type="password",
        placeholder="Enter your LangChain API Key",
        help="Please enter a valid LangChain API key here",
    )
    api_key_2 = st.sidebar.text_input(
        "Enter Groq API Key",
        type="password",
        placeholder="Enter your Groq API Key",
        help="Please enter your Groq API key here",
    )

# On submit, export the keys to the process environment or ask for the missing ones
if st.sidebar.button("Submit API Keys"):
    _keys_missing = not use_preprovided_keys and not (api_key_1 and api_key_2)
    if _keys_missing:
        st.sidebar.markdown(
            '<p style="color:red; font-weight:bold;">Please fill in both API keys or select the option to use pre-provided keys</p>',
            unsafe_allow_html=True,
        )
    else:
        os.environ.update(LANGCHAIN_API_KEY=api_key_1, GROQ_API_KEY=api_key_2)
        st.sidebar.markdown(
            '<p style="color:green; font-weight:bold;">API keys are set</p>',
            unsafe_allow_html=True,
        )
# Scrolling author credit with a LinkedIn link
_credit_banner = (
    "\n"
    '<marquee behavior="scroll" direction="left" scrollamount="10">\n'
    "<p style='font-size:24px; color:#FF5733; font-weight:bold;'>\n"
    'Created by: <a href="https://www.linkedin.com/in/datascientisthameshraj/" target="_blank" style="color:#1E90FF; text-decoration:none;">Engr. Hamesh Raj</a>\n'
    "</p>\n"
    "</marquee>\n"
)
st.markdown(_credit_banner, unsafe_allow_html=True)

# Styled heading for the chatbot
st.markdown(
    '<h1 style="color:#4CAF50; font-weight:bold;">🤖 Chatbot with URL-based Document Retrieval</h1>',
    unsafe_allow_html=True,
)

# Free-text question box
query = st.text_input(
    "Ask a question based on the blog post",
    placeholder="Type your question here...",
    help="Enter a question related to the content of the blog post",
)

# Create the per-session chat history the first time the script runs
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []
# Summarization-backed stand-in for a language model
class CustomLanguageModel:
    """Produce a response string by summarizing the retrieved context.

    The model does not answer the prompt itself: ``generate`` echoes the
    prompt and appends a summary of the supplied context.
    """

    def __init__(self):
        """Load the summarization pipeline (``facebook/bart-large-cnn``)."""
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")  # Replace with desired model

    def generate(self, prompt, context):
        """Return the formatted response for ``prompt`` given ``context``.

        Args:
            prompt: The user's question; echoed verbatim in the response.
            context: Text to summarize.

        Returns:
            A string of the form
            ``Generated response: '<prompt>'. Summary: '<summary>'.``
        """
        summary = self.summarize_context(context)
        return f"Generated response: '{prompt}'. Summary: '{summary}'."

    def summarize_context(self, context):
        """Return the summary text for ``context``.

        Blank or empty context yields an empty summary without invoking the
        model. Otherwise the pipeline is called with ``truncation=True`` so
        that inputs longer than the model accepts are truncated rather than
        rejected.
        """
        if not context or not context.strip():
            # Nothing to summarize; skip the model call entirely.
            return ""
        summarized = self.summarizer(
            context,
            max_length=200,
            min_length=100,
            do_sample=False,
            truncation=True,
        )
        return summarized[0]['summary_text']
# Callable that shapes the inputs handed to the language model
class RAGPrompt:
    """Callable that narrows an input mapping down to the prompt fields."""

    # Keys copied from the input, in output order
    _FIELDS = ("question", "context")

    def __call__(self, data):
        """Return a new dict holding only ``question`` and ``context`` from ``data``.

        Raises:
            KeyError: If either key is missing from ``data``.
        """
        return {field: data[field] for field in self._FIELDS}
# Submit button for chat: load the page, retrieve relevant chunks, summarize them
if st.button("Submit Query"):
    if not query:
        st.warning("Please enter a query before submitting!")
    elif not url_input or not re.match(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+", url_input):
        # Same pattern the sidebar uses for its feedback, so the warning
        # really does mean "valid" and not merely "non-empty".
        st.warning("Please enter a valid URL in the sidebar.")
    else:
        try:
            # Load the page at the user-supplied URL
            loader = WebBaseLoader(
                web_paths=(url_input,),
                bs_kwargs=dict(
                    parse_only=bs4.SoupStrainer()  # Adjust based on the user's URL structure
                ),
            )
            docs = loader.load()

            # Split into overlapping chunks for embedding
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=300)
            splits = text_splitter.split_documents(docs)

            if not splits:
                # Avoid handing the vector store an empty batch.
                st.warning("No text could be extracted from that URL.")
            else:
                # NOTE(review): the embedding model, vector store and summarizer
                # are rebuilt on every submit; caching them may be worthwhile.
                # NOTE(review): presumably repeated in-memory Chroma collections
                # share state within one process; confirm that earlier pages do
                # not leak into later queries.
                embedding_model = SentenceTransformerEmbedding('all-MiniLM-L6-v2')
                vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)
                retriever = vectorstore.as_retriever()

                # `invoke` is the supported retriever entry point; the older
                # `get_relevant_documents` is deprecated.
                retrieved_docs = retriever.invoke(query)

                # Join the retrieved chunks into a single context string
                context = "\n\n".join(doc.page_content for doc in retrieved_docs)

                custom_llm = CustomLanguageModel()
                prompt = RAGPrompt()
                # Apply the prompt directly to the data (no chaining using `|`)
                prompt_data = prompt({"question": query, "context": context})
                result = custom_llm.generate(prompt_data["question"], prompt_data["context"])

                # Store query and response in session for chat history
                st.session_state['chat_history'].append((query, result))
        except Exception as e:
            # Top-level UI boundary: surface any failure to the user.
            st.error(f"An error occurred: {e}")
# Display chat history: one "User" line and one "Bot" line per stored exchange
for q, r in st.session_state['chat_history']:
    st.write(f"**User:** {q}")
    st.write(f"**Bot:** {r}")