raj999 committed on
Commit
f0e9961
·
verified ·
1 Parent(s): 4afb017

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -2
app.py CHANGED
@@ -17,11 +17,13 @@ vector_store = None
17
  retriever = None
18
 
19
  def extract_text_from_pdf(filepath):
20
- loader = UnstructuredLoader([filepath])
 
 
21
  pages = []
22
  for doc in loader.lazy_load():
23
  pages.append(doc)
24
- return "\n".join([page.content for page in pages])
25
 
26
  def extract_tables_from_pdf(filepath):
27
  tables = camelot.read_pdf(filepath, pages='1-end')
 
17
  retriever = None
18
 
19
  def extract_text_from_pdf(filepath):
20
+ chunk_size = 1000 # Example chunk size
21
+ overlap = 100 # Example overlap
22
+ loader = UnstructuredLoader([filepath], chunk_size=chunk_size, overlap=overlap)
23
  pages = []
24
  for doc in loader.lazy_load():
25
  pages.append(doc)
26
+ return "\n".join([page.contents for page in pages])
27
 
28
  def extract_tables_from_pdf(filepath):
29
  tables = camelot.read_pdf(filepath, pages='1-end')