akshayp committed on
Commit
403cbcd
·
1 Parent(s): 20d179d

text split per page

Browse files
Files changed (1) hide show
  1. app.py +6 -2
app.py CHANGED
@@ -1,5 +1,6 @@
1
  from fastapi import FastAPI, Query, HTTPException
2
  from extractous import Extractor, TesseractOcrConfig
 
3
 
4
  app = FastAPI()
5
 
@@ -8,6 +9,9 @@ def accepts_pdf_link(link: str = Query(..., description="The URL to pdf file")):
8
  if not link.startswith(("http://", "https://")):
9
  raise HTTPException(status_code=400, detail="Invalid URL format")
10
  extractor = Extractor().set_ocr_config(TesseractOcrConfig())
11
- extractor = extractor.set_xml_output(False)
12
  content, metadata = extractor.extract_url_to_string(link)
13
- return {"received_link": link, "content": content}
 
 
 
 
1
  from fastapi import FastAPI, Query, HTTPException
2
  from extractous import Extractor, TesseractOcrConfig
3
+ from bs4 import BeautifulSoup
4
 
5
  app = FastAPI()
6
 
 
9
  if not link.startswith(("http://", "https://")):
10
  raise HTTPException(status_code=400, detail="Invalid URL format")
11
  extractor = Extractor().set_ocr_config(TesseractOcrConfig())
12
+ extractor = extractor.set_xml_output(True)
13
  content, metadata = extractor.extract_url_to_string(link)
14
+ soup = BeautifulSoup(content, 'html.parser')
15
+ pages = soup.find_all('div', class_='page')
16
+ pages_text = [p.get_text() for p in pages]
17
+ return {"received_link": link, "content": pages_text}