import asyncio
import io
import os
import shutil
import tempfile
from pathlib import Path

import aiohttp
from aiohttp import FormData
from fastapi import FastAPI, Form, UploadFile
from fastapi.responses import JSONResponse
from PyPDF2 import PdfReader, PdfWriter

app = FastAPI()


async def call_pdfscraper(session, file_contents, pdf_name, processTables):
    """POST a single PDF chunk to the scraper cloud function and return its JSON payload."""
    headers = {"Origin": "http://localhost:8080"}
    url = "https://us-central1-neuralgap-1.cloudfunctions.net/scraperPDFDocxTables_v3"

    data = FormData()
    data.add_field(
        "pdf",
        file_contents,
        filename=os.path.basename(pdf_name),
        content_type="application/pdf",
    )
    data.add_field("processTables", processTables)

    async with session.post(url, data=data, headers=headers) as resp:
        if resp.status != 200:
            # Treat a failed chunk as an empty scrape result rather than aborting the batch.
            return {}, pdf_name
        response = await resp.json()

    return response, pdf_name


async def execute_pdfscraper_async(file_path: str, processTables: str):
    """Scrape every chunk in `file_path` concurrently, preserving page order."""
    # Sort chunks by their numeric suffix ("stem_1.pdf", "stem_2.pdf", ...) so the
    # joined corpus follows page order; os.listdir returns entries in arbitrary order.
    chunk_paths = sorted(
        Path(file_path).iterdir(),
        key=lambda p: int(p.stem.rsplit("_", 1)[-1]),
    )
    chunk_byte_list = [(path.read_bytes(), path.name) for path in chunk_paths]

    async with aiohttp.ClientSession() as session:
        tasks = [
            call_pdfscraper(session, contents, name, processTables)
            for contents, name in chunk_byte_list
        ]
        # asyncio.gather() preserves task order, so responses line up with chunk order.
        responses = await asyncio.gather(*tasks)

    return [response for response, _pdf_name in responses]


def collect_pdfscraper_response(scrape_response_list):
    """Merge per-chunk responses into one corpus string and a numbered table dict."""
    content_list = []
    tables_dict = {}
    table_count = 1

    for response in scrape_response_list:
        content_list.append(response.get("corpus", ""))
        table_content = response.get("tables_raw", {})
        # tables_raw may be missing or non-dict for chunks without tables.
        if isinstance(table_content, dict):
            for table_key in table_content:
                tables_dict[str(table_count)] = table_content[table_key]
                table_count += 1

    content_str = "\n".join(content_list)
    return content_str, tables_dict


def split_pdf(file_contents, file_name, pages_per_chunk):
    """Split the uploaded PDF into chunks of `pages_per_chunk` pages; return the chunk dir."""
    reader = PdfReader(io.BytesIO(file_contents))
    total_pages = len(reader.pages)

    # A unique temp directory per request avoids collisions between concurrent uploads
    # (a fixed "chunks" directory would be shared, and rmtree'd, across requests).
    output_dir = Path(tempfile.mkdtemp(prefix=f"{Path(file_name).stem}_chunks_"))

    num_chunks = (total_pages + pages_per_chunk - 1) // pages_per_chunk  # ceiling division
    for i in range(num_chunks):
        writer = PdfWriter()
        start_page = i * pages_per_chunk
        end_page = min(start_page + pages_per_chunk, total_pages)
        for page_number in range(start_page, end_page):
            writer.add_page(reader.pages[page_number])

        output_path = output_dir / f"{Path(file_name).stem}_{i + 1}.pdf"
        with open(output_path, "wb") as output_pdf:
            writer.write(output_pdf)

    return str(output_dir)


@app.post("/process-pdf/")
async def process_pdf(
    pdf_file: UploadFile,
    pages_per_chunk: int = Form(2),
    processTables: str = Form("True"),
):
    file_contents = await pdf_file.read()
    chunks_dir = split_pdf(file_contents, pdf_file.filename, pages_per_chunk)
    try:
        scrape_response_list = await execute_pdfscraper_async(chunks_dir, processTables)
        content, tables_dict = collect_pdfscraper_response(scrape_response_list)
    finally:
        shutil.rmtree(chunks_dir)  # Clean up chunks even if scraping fails.

    return JSONResponse(content={"content": content, "tables": tables_dict})


# Starting point for running the FastAPI app
# if __name__ == "__main__":
#     import uvicorn
#     uvicorn.run(app, host="0.0.0.0", port=8000)
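
# Example client call -- a minimal sketch, assuming the app is served locally on
# port 8000 and a "sample.pdf" exists in the working directory (both are
# illustrative, not part of this file). Form field names must match the endpoint
# parameters above: "pdf_file", "pages_per_chunk", "processTables".
#
# import requests
#
# with open("sample.pdf", "rb") as fh:
#     resp = requests.post(
#         "http://localhost:8000/process-pdf/",
#         files={"pdf_file": ("sample.pdf", fh, "application/pdf")},
#         data={"pages_per_chunk": 2, "processTables": "True"},
#     )
# result = resp.json()
# print(result["content"])   # merged corpus text from all chunks
# print(result["tables"])    # dict of extracted tables, keyed "1", "2", ...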