Update app.py
app.py
CHANGED
@@ -53,24 +53,73 @@ else:
 
 # Load, chunk, and index the contents of the blog
 def load_data(url):
-    try:
-        loader = WebBaseLoader(
-            web_paths=(url,),
-            bs_kwargs=dict(
-                parse_only=bs4.SoupStrainer(
-                    class_=("post-content", "post-title", "post-header")
-                )
-            ),
-        )
-        docs = loader.load()
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-        splits = text_splitter.split_documents(docs)
-        vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)
-        return vectorstore
-    except Exception as e:
+    try:
+        loader = WebBaseLoader(
+            web_paths=(url,),
+            bs_kwargs=dict(
+                parse_only=bs4.SoupStrainer(
+                    class_=("post-content", "post-title", "post-header")
+                )
+            ),
+        )
+        docs = loader.load()
+
+        # Debugging output
+        st.write(f"Loaded {len(docs)} documents from the URL.")
+
+        if not docs:
+            st.error("No documents were loaded. Please check the URL or content.")
+            return None
+
+        # Check the first document's content to ensure it's loaded correctly
+        st.write(f"First document content preview: {docs[0].page_content[:500]}")  # Show the first 500 characters of the first document
+
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+        splits = text_splitter.split_documents(docs)
+
+        # Debugging output
+        st.write(f"Created {len(splits)} document splits.")
+
+        if not splits:
+            st.error("No document splits were created. Please check the document content.")
+            return None
+
+        # Check the first split's content to ensure it's split correctly
+        st.write(f"First split content preview: {splits[0].page_content[:500]}")  # Show the first 500 characters of the first split
+
+        vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)
+
+        # Debugging output
+        st.write(f"Vectorstore created with {len(splits)} documents.")
+
+        if vectorstore is None:
+            st.error("Failed to create the vectorstore.")
+            return None
+
+        return vectorstore
+    except Exception as e:
         st.error(f"An error occurred while loading the blog: {e}")
         return None
 
+# def load_data(url):
+#     try:
+#         loader = WebBaseLoader(
+#             web_paths=(url,),
+#             bs_kwargs=dict(
+#                 parse_only=bs4.SoupStrainer(
+#                     class_=("post-content", "post-title", "post-header")
+#                 )
+#             ),
+#         )
+#         docs = loader.load()
+#         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+#         splits = text_splitter.split_documents(docs)
+#         vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)
+#         return vectorstore
+#     except Exception as e:
+#         st.error(f"An error occurred while loading the blog: {e}")
+#         return None
+
 # Load the data if a URL is provided
 if blog_url:
     vectorstore = load_data(blog_url)
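
Note: the hunk above shows only load_data itself; the imports, the embedding_model, and the blog_url input it relies on are defined elsewhere in app.py and are not part of this diff. The sketch below is a minimal, self-contained way to exercise the updated function with a condensed body; the HuggingFaceEmbeddings model name and the st.text_input label are assumptions, not taken from the actual file.

# Minimal sketch of the assumed app.py context around this hunk.
import bs4
import streamlit as st
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Assumed embedding model; app.py defines its own embedding_model earlier in the file.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Assumed input widget; app.py defines its own blog_url earlier in the file.
blog_url = st.text_input("Blog URL")

# Condensed version of the new load_data: load the page, report progress,
# split it into chunks, and index the chunks in a Chroma vectorstore.
def load_data(url):
    try:
        loader = WebBaseLoader(
            web_paths=(url,),
            bs_kwargs=dict(
                parse_only=bs4.SoupStrainer(
                    class_=("post-content", "post-title", "post-header")
                )
            ),
        )
        docs = loader.load()
        st.write(f"Loaded {len(docs)} documents from the URL.")
        if not docs:
            st.error("No documents were loaded. Please check the URL or content.")
            return None
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        splits = text_splitter.split_documents(docs)
        st.write(f"Created {len(splits)} document splits.")
        if not splits:
            st.error("No document splits were created. Please check the document content.")
            return None
        return Chroma.from_documents(documents=splits, embedding=embedding_model)
    except Exception as e:
        st.error(f"An error occurred while loading the blog: {e}")
        return None

# Load the data if a URL is provided
if blog_url:
    vectorstore = load_data(blog_url)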