Spaces:
Runtime error
Runtime error
Commit
·
52d4691
1
Parent(s):
aa1de59
Update utils.py
Browse files
utils.py
CHANGED
@@ -117,17 +117,7 @@ def create_chunk_documents():
|
|
117 |
|
118 |
source_chunks = splitter.split_documents(sources)
|
119 |
|
120 |
-
|
121 |
-
print("Size of chunk: " + str(len(chunk.page_content) + len(chunk.metadata)))
|
122 |
-
if chunk.page_content is None or chunk.page_content == '':
|
123 |
-
print("removing chunk: "+ chunk.page_content)
|
124 |
-
source_chunks.remove(chunk)
|
125 |
-
elif len(chunk.page_content) >=1000:
|
126 |
-
print("splitting document")
|
127 |
-
source_chunks.extend(splitter.split_documents([chunk]))
|
128 |
-
# print("Chunks: " + str(len(source_chunks)) + "and type " + str(type(source_chunks)))
|
129 |
-
return source_chunks
|
130 |
-
|
131 |
|
132 |
def fetch_data_for_embeddings(url, book_file, book_url):
|
133 |
sources = get_website_data(url)
|
@@ -176,7 +166,8 @@ def get_links(index_url, paths):
|
|
176 |
for path in paths:
|
177 |
url = urljoin(index_url, path)
|
178 |
parsed_url = urlparse(url)
|
179 |
-
if parsed_url.scheme in ["http", "https"] and "
|
|
|
180 |
links.append(url)
|
181 |
return links
|
182 |
|
|
|
117 |
|
118 |
source_chunks = splitter.split_documents(sources)
|
119 |
|
120 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
|
122 |
def fetch_data_for_embeddings(url, book_file, book_url):
|
123 |
sources = get_website_data(url)
|
|
|
166 |
for path in paths:
|
167 |
url = urljoin(index_url, path)
|
168 |
parsed_url = urlparse(url)
|
169 |
+
if parsed_url.scheme in ["http", "https"] and "shreyasachdev" in parsed_url.netloc:
|
170 |
+
|
171 |
links.append(url)
|
172 |
return links
|
173 |
|