Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -17,11 +17,13 @@ vector_store = None
|
|
17 |
retriever = None
|
18 |
|
19 |
def extract_text_from_pdf(filepath):
|
20 |
-
|
|
|
|
|
21 |
pages = []
|
22 |
for doc in loader.lazy_load():
|
23 |
pages.append(doc)
|
24 |
-
return "\n".join([page.
|
25 |
|
26 |
def extract_tables_from_pdf(filepath):
|
27 |
tables = camelot.read_pdf(filepath, pages='1-end')
|
|
|
17 |
retriever = None
|
18 |
|
19 |
def extract_text_from_pdf(filepath):
|
20 |
+
chunk_size = 1000 # Example chunk size
|
21 |
+
overlap = 100 # Example overlap
|
22 |
+
loader = UnstructuredLoader([filepath], chunk_size=chunk_size, overlap=overlap)
|
23 |
pages = []
|
24 |
for doc in loader.lazy_load():
|
25 |
pages.append(doc)
|
26 |
+
return "\n".join([page.contents for page in pages])
|
27 |
|
28 |
def extract_tables_from_pdf(filepath):
|
29 |
tables = camelot.read_pdf(filepath, pages='1-end')
|