AreesaAshfaq committed on
Commit
62815d7
·
verified ·
1 Parent(s): 5e084ab

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -15
app.py CHANGED
@@ -53,24 +53,73 @@ else:
53
 
54
  # Load, chunk, and index the contents of the blog
55
  def load_data(url):
56
- try:
57
- loader = WebBaseLoader(
58
- web_paths=(url,),
59
- bs_kwargs=dict(
60
- parse_only=bs4.SoupStrainer(
61
- class_=("post-content", "post-title", "post-header")
62
- )
63
- ),
64
- )
65
- docs = loader.load()
66
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
67
- splits = text_splitter.split_documents(docs)
68
- vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)
69
- return vectorstore
70
- except Exception as e:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  st.error(f"An error occurred while loading the blog: {e}")
72
  return None
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  # Load the data if a URL is provided
75
  if blog_url:
76
  vectorstore = load_data(blog_url)
 
53
 
54
  # Load, chunk, and index the contents of the blog
55
  def load_data(url):
56
+ try:
57
+ loader = WebBaseLoader(
58
+ web_paths=(url,),
59
+ bs_kwargs=dict(
60
+ parse_only=bs4.SoupStrainer(
61
+ class_=("post-content", "post-title", "post-header")
62
+ )
63
+ ),
64
+ )
65
+ docs = loader.load()
66
+
67
+ # Debugging output
68
+ st.write(f"Loaded {len(docs)} documents from the URL.")
69
+
70
+ if not docs:
71
+ st.error("No documents were loaded. Please check the URL or content.")
72
+ return None
73
+
74
+ # Check the first document's content to ensure it's loaded correctly
75
+ st.write(f"First document content preview: {docs[0].page_content[:500]}") # Show the first 500 characters of the first document
76
+
77
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
78
+ splits = text_splitter.split_documents(docs)
79
+
80
+ # Debugging output
81
+ st.write(f"Created {len(splits)} document splits.")
82
+
83
+ if not splits:
84
+ st.error("No document splits were created. Please check the document content.")
85
+ return None
86
+
87
+ # Check the first split's content to ensure it's split correctly
88
+ st.write(f"First split content preview: {splits[0].page_content[:500]}") # Show the first 500 characters of the first split
89
+
90
+ vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)
91
+
92
+ # Debugging output
93
+ st.write(f"Vectorstore created with {len(splits)} documents.")
94
+
95
+ if vectorstore is None:
96
+ st.error("Failed to create the vectorstore.")
97
+ return None
98
+
99
+ return vectorstore
100
+ except Exception as e:
101
  st.error(f"An error occurred while loading the blog: {e}")
102
  return None
103
 
104
+ # def load_data(url):
105
+ # try:
106
+ # loader = WebBaseLoader(
107
+ # web_paths=(url,),
108
+ # bs_kwargs=dict(
109
+ # parse_only=bs4.SoupStrainer(
110
+ # class_=("post-content", "post-title", "post-header")
111
+ # )
112
+ # ),
113
+ # )
114
+ # docs = loader.load()
115
+ # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
116
+ # splits = text_splitter.split_documents(docs)
117
+ # vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)
118
+ # return vectorstore
119
+ # except Exception as e:
120
+ # st.error(f"An error occurred while loading the blog: {e}")
121
+ # return None
122
+
123
  # Load the data if a URL is provided
124
  if blog_url:
125
  vectorstore = load_data(blog_url)