Spaces:
Sleeping
Sleeping
File size: 750 Bytes
eff991c 403cbcd eff991c 403cbcd eff991c 403cbcd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 |
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from extractous import Extractor, TesseractOcrConfig
from fastapi import FastAPI, HTTPException, Query
# Application instance on which this module's route handlers are registered.
app = FastAPI()
@app.get("/")
def accepts_pdf_link(link: str = Query(..., description="The URL to pdf file")):
    """Extract the text of each page of the document found at ``link``.

    The URL is handed to ``extractous`` (configured with Tesseract OCR and
    XML output); the resulting markup is parsed and the text of every
    ``<div class="page">`` element is collected.

    Args:
        link: Absolute ``http`` or ``https`` URL of the document.

    Returns:
        A dict with ``received_link`` (the URL exactly as given) and
        ``content`` (a list with the text of each page element found).

    Raises:
        HTTPException: 400 if ``link`` is not an http(s) URL with a host;
            502 if the document cannot be fetched or extracted.
    """
    # Parse rather than prefix-match: URL schemes are case-insensitive
    # (urlparse lower-cases them) and a bare "http://" has no host to fetch.
    try:
        parsed = urlparse(link)
    except ValueError as exc:  # e.g. a malformed IPv6 literal in the host
        raise HTTPException(
            status_code=400, detail="Invalid URL format"
        ) from exc
    if parsed.scheme not in ("http", "https") or not parsed.netloc:
        raise HTTPException(status_code=400, detail="Invalid URL format")

    # NOTE(review): the URL is caller-supplied and is fetched from this
    # server, so internal/private addresses are reachable through this
    # endpoint (SSRF). Consider an allow-list or blocking private ranges.
    extractor = (
        Extractor()
        .set_ocr_config(TesseractOcrConfig())
        .set_xml_output(True)
    )

    # Turn fetch/parse failures into an explicit gateway error instead of
    # letting them surface as an opaque 500.
    try:
        content, _metadata = extractor.extract_url_to_string(link)
    except Exception as exc:
        raise HTTPException(
            status_code=502,
            detail="Failed to fetch or extract the document",
        ) from exc

    soup = BeautifulSoup(content, "html.parser")
    pages_text = [
        page.get_text() for page in soup.find_all("div", class_="page")
    ]
    return {"received_link": link, "content": pages_text}
|