import streamlit as st
import re
import os
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import bs4
import torch

# Define the embedding class
class SentenceTransformerEmbedding:
    def __init__(self, model_name):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        embeddings = self.model.encode(texts, convert_to_tensor=True)
        if isinstance(embeddings, torch.Tensor):
            return embeddings.cpu().detach().numpy().tolist()  # Convert tensor to list
        return embeddings

    def embed_query(self, query):
        embedding = self.model.encode([query], convert_to_tensor=True)
        if isinstance(embedding, torch.Tensor):
            return embedding.cpu().detach().numpy().tolist()[0]  # Convert tensor to list
        return embedding[0]
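
# Streamlit reruns this script on every interaction, so the SentenceTransformer
# weights above would be reloaded for each query. A minimal caching sketch
# (assumes Streamlit >= 1.18 for st.cache_resource; the helper name is
# illustrative and is not wired into the code below):
@st.cache_resource
def get_embedding_model(model_name: str) -> SentenceTransformerEmbedding:
    # One shared instance per model name, reused across reruns
    return SentenceTransformerEmbedding(model_name)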

# Streamlit UI setup
st.title("🤖 Chatbot with URL-based Document Retrieval")

# Sidebar Style with Multicolored Background
sidebar_bg_style = """
    <style>
        [data-testid="stSidebar"] {
            background: linear-gradient(135deg, #ffafbd, #ffc3a0, #2193b0, #6dd5ed);
        }
    </style>
"""
st.markdown(sidebar_bg_style, unsafe_allow_html=True)

# Main Content Style with Multicolored Background
main_bg_style = """
    <style>
        .main .block-container {
            background: linear-gradient(135deg, #ff9a9e, #fad0c4, #fbc2eb, #a18cd1);
            padding: 2rem;
        }
        .css-18e3th9 {
            background: linear-gradient(135deg, #ff9a9e, #fad0c4, #fbc2eb, #a18cd1);
        }
    </style>
"""
st.markdown(main_bg_style, unsafe_allow_html=True)

# Sidebar: Input for URL and API keys
st.sidebar.title("Settings")

# Input field for entering URL dynamically with placeholder and help text
url_input = st.sidebar.text_input("Enter Blog Post URL", placeholder="e.g., https://example.com/blog", help="Paste the full URL of the blog post you want to retrieve data from")

# Validate the URL and show a success message when correct
if url_input:
    if re.match(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+", url_input):
        st.sidebar.markdown('<p style="color:green; font-weight:bold;">URL is correctly entered</p>', unsafe_allow_html=True)
    else:
        st.sidebar.markdown('<p style="color:red; font-weight:bold;">Invalid URL, please enter a valid one</p>', unsafe_allow_html=True)
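
# The regex above is only a loose shape check. A stricter standard-library
# sketch (illustrative; not wired into the validation above) that requires
# an http(s) scheme and a dotted hostname:
def looks_like_http_url(url: str) -> bool:
    from urllib.parse import urlparse
    parsed = urlparse(url)
    # Rejects dotless hosts such as "localhost"; loosen this if those are needed
    return parsed.scheme in ("http", "https") and "." in parsed.netloc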

# Option to use pre-provided API keys
use_preprovided_keys = st.sidebar.checkbox("Use pre-provided API keys")

# Input fields for API keys with placeholders and helper text
if not use_preprovided_keys:
    api_key_1 = st.sidebar.text_input("Enter LangChain API Key", type="password", placeholder="Enter your LangChain API Key", help="Please enter a valid LangChain API key here")
    api_key_2 = st.sidebar.text_input("Enter Groq API Key", type="password", placeholder="Enter your Groq API Key", help="Please enter your Groq API key here")
else:
    api_key_1 = "your-preprovided-langchain-api-key"  # Replace with your actual pre-provided key
    api_key_2 = "your-preprovided-groq-api-key"  # Replace with your actual pre-provided key
    st.sidebar.markdown('<p style="color:blue; font-weight:bold;">Using pre-provided API keys</p>', unsafe_allow_html=True)

# Submit button for API keys with a success/warning message
if st.sidebar.button("Submit API Keys"):
    if use_preprovided_keys or (api_key_1 and api_key_2):
        os.environ["LANGCHAIN_API_KEY"] = api_key_1
        os.environ["GROQ_API_KEY"] = api_key_2
        st.sidebar.markdown('<p style="color:green; font-weight:bold;">API keys are set</p>', unsafe_allow_html=True)
    else:
        st.sidebar.markdown('<p style="color:red; font-weight:bold;">Please fill in both API keys or select the option to use pre-provided keys</p>', unsafe_allow_html=True)

# Marquee effect with bold, stylish text and a LinkedIn link
st.markdown("""
    <marquee behavior="scroll" direction="left" scrollamount="10">
        <p style='font-size:24px; color:#FF5733; font-weight:bold;'>
            Created by: <a href="https://www.linkedin.com/in/datascientisthameshraj/" target="_blank" style="color:#1E90FF; text-decoration:none;">Engr. Hamesh Raj</a>
        </p>
    </marquee>
    """, unsafe_allow_html=True)

# Title of the chatbot
st.markdown('<h1 style="color:#4CAF50; font-weight:bold;">🤖 Chatbot with URL-based Document Retrieval</h1>', unsafe_allow_html=True)

# Chat query input field with placeholder and help text
query = st.text_input("Ask a question based on the blog post", placeholder="Type your question here...", help="Enter a question related to the content of the blog post")

# Placeholder to display responses
if 'chat_history' not in st.session_state:
    st.session_state['chat_history'] = []

# CustomLanguageModel: a placeholder LLM that simply echoes its inputs.
# Replace generate() with a real model call to produce actual answers.
class CustomLanguageModel:
    def generate(self, prompt, context):
        return f"Generated response based on prompt: '{prompt}' and context: '{context}'."
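
# Since the sidebar already collects a Groq API key, the echo stub above can
# be swapped for a Groq-backed model. A minimal sketch (assumes
# `pip install langchain-groq`; the model name is an example, not a fixed choice):
try:
    from langchain_groq import ChatGroq

    class GroqLanguageModel:
        def __init__(self, model_name="llama3-8b-8192"):
            self.llm = ChatGroq(model=model_name)  # Reads GROQ_API_KEY from the environment

        def generate(self, prompt, context):
            # Same generate() interface as CustomLanguageModel, so it is a drop-in swap
            message = (
                "Answer the question using only this context:\n"
                f"{context}\n\nQuestion: {prompt}"
            )
            return self.llm.invoke(message).content
except ImportError:
    pass  # langchain-groq not installed; the echo stub remains in use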

# Define a callable class for RAGPrompt
class RAGPrompt:
    def __call__(self, data):
        return {"question": data["question"], "context": data["context"]}
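
# RAGPrompt above just forwards the {"question", "context"} dict unchanged.
# If a real prompt string is preferred, langchain_core's template class
# (already a dependency of this app) could take its place in the chain.
# A sketch, not wired in below:
from langchain_core.prompts import ChatPromptTemplate

RAG_TEMPLATE = ChatPromptTemplate.from_template(
    "Answer the question using only this context:\n{context}\n\nQuestion: {question}"
)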

# Submit button for chat
if st.button("Submit Query"):
    if not query:
        st.warning("Please enter a query before submitting!")
    elif not url_input:
        st.warning("Please enter a valid URL in the sidebar.")
    else:
        # Load the blog from the user-input URL
        loader = WebBaseLoader(
            web_paths=(url_input,),
            bs_kwargs=dict(
                # An empty SoupStrainer keeps the whole page; narrow it to
                # specific tags or classes for cleaner text on known layouts
                parse_only=bs4.SoupStrainer()
            ),
        )
        docs = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        splits = text_splitter.split_documents(docs)

        # Initialize the embedding model
        embedding_model = SentenceTransformerEmbedding('all-MiniLM-L6-v2')

        # Initialize Chroma with the embedding class
        vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)
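
        # Note: the index is rebuilt from scratch on every submitted query.
        # One option (a sketch, not wired in) is to cache the store per URL:
        # if st.session_state.get("indexed_url") != url_input:
        #     st.session_state["indexed_url"] = url_input
        #     st.session_state["vectorstore"] = vectorstore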

        # Retrieve and generate using the relevant snippets of the blog
        # (similarity search returns k=4 documents by default; tune with
        # search_kwargs={"k": n} if needed)
        retriever = vectorstore.as_retriever()

        # Retrieve documents relevant to the query (invoke() supersedes the
        # deprecated get_relevant_documents() in recent LangChain releases)
        retrieved_docs = retriever.invoke(query)

        # Format the retrieved documents
        def format_docs(docs):
            return "\n\n".join(doc.page_content for doc in docs)

        context = format_docs(retrieved_docs)

        # Initialize the language model
        custom_llm = CustomLanguageModel()

        # Build the RAG chain: prompt -> LLM -> string parser. Wrapping the
        # first step in RunnableLambda lets the plain callables compose with `|`.
        prompt = RAGPrompt()

        rag_chain = (
            RunnableLambda(prompt)  # Shape the {"question", "context"} input
            | (lambda data: custom_llm.generate(data["question"], data["context"]))  # Call the LLM
            | StrOutputParser()  # Normalize the output to a string
        )

        # Generate the answer using the user's query
        result = rag_chain.invoke({"question": query, "context": context})
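
        # For reference, idiomatic LCEL wires the retriever into the chain
        # itself instead of formatting the context by hand. A sketch that is
        # equivalent in effect to the manual steps above (built but not invoked):
        lcel_reference_chain = (
            {"context": retriever | RunnableLambda(format_docs), "question": RunnablePassthrough()}
            | RunnableLambda(prompt)
            | (lambda data: custom_llm.generate(data["question"], data["context"]))
            | StrOutputParser()
        )
        # lcel_reference_chain.invoke(query) would yield the same kind of answer.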

        # Store query and response in session for chat history
        st.session_state['chat_history'].append((query, result))

# Display chat history
for q, r in st.session_state['chat_history']:
    st.write(f"**User:** {q}")
    st.write(f"**Bot:** {r}")