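"""Streamlit app: Blog Retrieval and Question Answering.

Fetches a blog post from a user-supplied URL, chunks and indexes it in a
Chroma vectorstore with SentenceTransformer embeddings, and answers
questions about it with a Groq-hosted LLM via a LangChain RAG chain.

Run with: streamlit run <this_file>.py
"""
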
import streamlit as st
from sentence_transformers import SentenceTransformer
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
import bs4
import torch
from langchain_groq import ChatGroq

# App title
st.title("Blog Retrieval and Question Answering")

# Prompt the user to enter their Langchain API key
api_key_langchain = st.text_input("Enter your LANGCHAIN_API_KEY", type="password")

# Prompt the user to enter their Groq API key
api_key_groq = st.text_input("Enter your GROQ_API_KEY", type="password")

# Check that both API keys have been provided
if not api_key_langchain or not api_key_groq:
    st.write("Please enter both API keys to use this app.")
else:
    st.write("Both API keys are set.")

    # Initialize the chat model with the provided Groq API key
    llm = ChatGroq(model="llama3-8b-8192", groq_api_key=api_key_groq)
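    # "llama3-8b-8192" refers to Meta's Llama 3 8B model as served by the
    # Groq API; any other Groq-hosted chat model name could be used here.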

    # Define the embedding class
    class SentenceTransformerEmbedding:
        def __init__(self, model_name):
            self.model = SentenceTransformer(model_name)
        
        def embed_documents(self, texts):
            embeddings = self.model.encode(texts, convert_to_tensor=True)
            if isinstance(embeddings, torch.Tensor):
                return embeddings.cpu().detach().numpy().tolist()  # Convert tensor to list
            return embeddings
        
        def embed_query(self, query):
            embedding = self.model.encode([query], convert_to_tensor=True)
            if isinstance(embedding, torch.Tensor):
                return embedding.cpu().detach().numpy().tolist()[0]  # Convert tensor to list
            return embedding[0]
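
    # Chroma's LangChain integration only requires an object exposing
    # embed_documents() and embed_query(), so this small wrapper is enough.
    # A drop-in alternative (not used here, shown as a sketch) would be:
    #
    # from langchain_huggingface import HuggingFaceEmbeddings
    # embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")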

    # Initialize the embedding class
    embedding_model = SentenceTransformerEmbedding('all-MiniLM-L6-v2')

    # Streamlit UI for blog URL input
    blog_url = st.text_input("Enter the URL of the blog to retrieve:")

    # Load, chunk, and index the contents of the blog
    def load_data(url):
        try:
            # Parse the whole page. To restrict parsing to specific elements,
            # pass e.g. bs_kwargs=dict(parse_only=bs4.SoupStrainer(
            #     class_=("post-content", "post-title", "post-header"))).
            loader = WebBaseLoader(web_paths=(url,))
            docs = loader.load()

            # Debugging output
            st.write(f"Loaded {len(docs)} documents from the URL.")

            if not docs:
                st.error("No documents were loaded. Please check the URL or content.")
                return None

            # Preview the first document's content to verify it loaded correctly:
            # st.write(f"First document content preview: {docs[0].page_content[:500]}")

            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
            splits = text_splitter.split_documents(docs)

            # Debugging output
            st.write(f"Created {len(splits)} document splits.")

            if not splits:
                st.error("No document splits were created. Please check the document content.")
                return None

            # Preview the first split's content to verify the splitting:
            # st.write(f"First split content preview: {splits[0].page_content[:500]}")

            vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)

            # Debugging output
            # st.write(f"Vectorstore created with {len(splits)} documents.")

            if vectorstore is None:
                st.error("Failed to create the vectorstore.")
                return None

            return vectorstore
        except Exception as e:
            st.error(f"An error occurred while loading the blog: {e}")
            return None
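
    # Note: on every Streamlit rerun this re-downloads and re-embeds the page.
    # One possible improvement (a sketch, not part of the original app) is to
    # cache the vectorstore with Streamlit's resource cache:
    #
    # @st.cache_resource
    # def load_data_cached(url):
    #     return load_data(url)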

    # Load the data if a URL is provided
    if blog_url:
        vectorstore = load_data(blog_url)
        if vectorstore:
            # Streamlit UI for question input
            question = st.text_input("Enter your question:")

            if question:
                retriever = vectorstore.as_retriever()
                prompt = hub.pull("rlm/rag-prompt", api_key=api_key_langchain)
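                # rlm/rag-prompt is a community RAG prompt template on the
                # LangChain Hub; it expects "context" and "question" inputs.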

                def format_docs(docs):
                    return "\n\n".join(doc.page_content for doc in docs)

                rag_chain = (
                    {"context": retriever | format_docs, "question": RunnablePassthrough()}
                    | prompt
                    | llm
                    | StrOutputParser()
                )
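
                # How the chain runs: the question is passed through unchanged,
                # while the retriever fetches relevant chunks and format_docs
                # joins them into one context string; the prompt template then
                # fills {context} and {question}, the LLM generates an answer,
                # and StrOutputParser extracts it as plain text.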

                # Run the chain on the user's question
                try:
                    result = rag_chain.invoke(question)
                    st.write("Answer:", result)
                except Exception as e:
                    st.error(f"An error occurred while generating the answer: {e}")
        else:
            st.write("Failed to load the blog content. Please check the URL and try again.")