svishal2001 committed on
Commit
52d4691
·
1 Parent(s): aa1de59

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +3 -12
utils.py CHANGED
@@ -117,17 +117,7 @@ def create_chunk_documents():
117
 
118
  source_chunks = splitter.split_documents(sources)
119
 
120
- for chunk in source_chunks:
121
- print("Size of chunk: " + str(len(chunk.page_content) + len(chunk.metadata)))
122
- if chunk.page_content is None or chunk.page_content == '':
123
- print("removing chunk: "+ chunk.page_content)
124
- source_chunks.remove(chunk)
125
- elif len(chunk.page_content) >=1000:
126
- print("splitting document")
127
- source_chunks.extend(splitter.split_documents([chunk]))
128
- # print("Chunks: " + str(len(source_chunks)) + "and type " + str(type(source_chunks)))
129
- return source_chunks
130
-
131
 
132
  def fetch_data_for_embeddings(url, book_file, book_url):
133
  sources = get_website_data(url)
@@ -176,7 +166,8 @@ def get_links(index_url, paths):
176
  for path in paths:
177
  url = urljoin(index_url, path)
178
  parsed_url = urlparse(url)
179
- if parsed_url.scheme in ["http", "https"] and "squarespace" not in parsed_url.netloc:
 
180
  links.append(url)
181
  return links
182
 
 
117
 
118
  source_chunks = splitter.split_documents(sources)
119
 
120
+
 
 
 
 
 
 
 
 
 
 
121
 
122
  def fetch_data_for_embeddings(url, book_file, book_url):
123
  sources = get_website_data(url)
 
166
  for path in paths:
167
  url = urljoin(index_url, path)
168
  parsed_url = urlparse(url)
169
+ if parsed_url.scheme in ["http", "https"] and "shreyasachdev" in parsed_url.netloc:
170
+  
171
  links.append(url)
172
  return links
173