from bs4 import BeautifulSoup
from extractous import Extractor, TesseractOcrConfig
from fastapi import FastAPI, HTTPException, Query

app = FastAPI()


@app.get("/")
def accepts_pdf_link(link: str = Query(..., description="The URL to pdf file")):
    """Extract the text of the document at ``link``, split by page.

    The document is fetched and processed by an ``Extractor`` configured
    with a default ``TesseractOcrConfig`` and XML output enabled. The
    resulting markup is parsed and the text of every ``<div class="page">``
    element is collected.

    Args:
        link: URL of the document; must start with ``http://`` or
            ``https://``.

    Returns:
        A dict with ``received_link`` (the URL as given) and ``content``
        (a list with one text string per ``div.page`` element found).

    Raises:
        HTTPException: 400 if ``link`` does not start with an http(s)
            scheme; 502 if the extractor fails to fetch or process the
            document.
    """
    # NOTE(review): this endpoint fetches an arbitrary caller-supplied URL
    # from the server side; consider restricting allowed hosts if it is
    # exposed to untrusted clients.
    if not link.startswith(("http://", "https://")):
        raise HTTPException(status_code=400, detail="Invalid URL format")

    # XML output is requested so that the result carries markup that can be
    # split into per-page elements below.
    extractor = Extractor().set_ocr_config(TesseractOcrConfig())
    extractor = extractor.set_xml_output(True)

    try:
        content, _metadata = extractor.extract_url_to_string(link)
    except Exception as exc:
        # Request boundary: turn any fetch/extraction failure into a clean
        # HTTP error instead of an unhandled 500 with a traceback.
        raise HTTPException(
            status_code=502,
            detail="Failed to extract content from the provided URL",
        ) from exc

    # Presumably the extractor emits one <div class="page"> per document
    # page -- confirm against the extractor's XML output format.
    soup = BeautifulSoup(content, "html.parser")
    pages_text = [page.get_text() for page in soup.find_all("div", class_="page")]

    return {"received_link": link, "content": pages_text}