import asyncio
import io
import logging
import os
import shutil
import uuid
from pathlib import Path

import aiohttp
from aiohttp import FormData
from fastapi import FastAPI, Form, UploadFile
from fastapi.responses import JSONResponse
from PyPDF2 import PdfReader, PdfWriter

app = FastAPI()


async def call_pdfscraper(session, file_contents, pdf_name, processTables):
    """Send one PDF chunk to the scraper service and return (response, name)."""
    headers = {"Origin": "http://localhost:8080"}
    url = "https://us-central1-neuralgap-1.cloudfunctions.net/scraperPDFDocxTables_v3"

    data = FormData()
    data.add_field(
        "pdf",
        file_contents,
        filename=os.path.basename(pdf_name),
        content_type="application/pdf",
    )
    data.add_field("processTables", processTables)

    async with session.post(url, data=data, headers=headers) as resp:
        if resp.status != 200:
            # Return an empty result so one failed chunk does not abort the batch.
            return {}, pdf_name
        response = await resp.json()

    return response, pdf_name


async def execute_pdfscraper_async(file_path: str, processTables: str):
    """Scrape every chunk in `file_path` concurrently and return the responses."""
    chunk_byte_list = [
        (Path(file_path, file).read_bytes(), file)
        for file in os.listdir(file_path)
    ]

    async with aiohttp.ClientSession() as session:
        tasks = [
            call_pdfscraper(session, contents, name, processTables)
            for contents, name in chunk_byte_list
        ]
        responses = await asyncio.gather(*tasks)

    return [response for response, _ in responses]


def collect_pdfscraper_response(scrape_response_list):
    """Merge per-chunk responses into one text corpus and a numbered table dict."""
    content_list = []
    tables_dict = {}
    table_count = 1

    for response in scrape_response_list:
        content_list.append(response.get("corpus", ""))
        table_content = response.get("tables_raw", {})
        # tables_raw may be missing or malformed; only collect it if it is a dict.
        if isinstance(table_content, dict):
            for table_key in table_content:
                tables_dict[str(table_count)] = table_content[table_key]
                table_count += 1

    return "\n".join(content_list), tables_dict


def split_pdf(file_contents, file_name, pages_per_chunk):
    """Split a PDF into chunks of `pages_per_chunk` pages; return the chunk dir."""
    reader = PdfReader(io.BytesIO(file_contents))
    total_pages = len(reader.pages)

    # Generate a unique directory for each request to avoid conflicts.
    output_dir = Path(file_name).parent / f"chunks_{uuid.uuid4()}"
    os.makedirs(output_dir, exist_ok=True)

    # Ceiling division: the last chunk may hold fewer than pages_per_chunk pages.
    num_chunks = (total_pages + pages_per_chunk - 1) // pages_per_chunk
    for i in range(num_chunks):
        writer = PdfWriter()
        start_page = i * pages_per_chunk
        end_page = min(start_page + pages_per_chunk, total_pages)
        for page_number in range(start_page, end_page):
            writer.add_page(reader.pages[page_number])

        output_path = output_dir / f"{Path(file_name).stem}_{i + 1}.pdf"
        with open(output_path, "wb") as output_pdf:
            writer.write(output_pdf)

    return str(output_dir)


@app.post("/process-pdf/")
async def process_pdf(
    pdf_file: UploadFile,
    pages_per_chunk: int = Form(2),
    processTables: str = Form("True"),
):
    # Read the uploaded PDF and split it into page chunks on disk.
    file_contents = await pdf_file.read()
    chunks_dir = split_pdf(file_contents, pdf_file.filename, pages_per_chunk)

    try:
        # Scrape all chunks concurrently, then merge the per-chunk results.
        scrape_response_list = await execute_pdfscraper_async(chunks_dir, processTables)
        content, tables_dict = collect_pdfscraper_response(scrape_response_list)
    finally:
        # Clean up the chunk directory even if scraping fails.
        if os.path.exists(chunks_dir):
            try:
                shutil.rmtree(chunks_dir)
            except OSError as e:
                logging.error(f"Error deleting directory {chunks_dir}: {e}")

    return JSONResponse(content={"content": content, "tables": tables_dict})
# To run this locally, uncomment the lines below:
# if __name__ == "__main__":
#     import uvicorn
#     uvicorn.run(app, host="0.0.0.0", port=8000)
#
# Or launch from the command line with: uvicorn main:app --workers 2
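
# Example client call (a minimal sketch; it assumes the app is served at
# http://localhost:8000 and that "sample.pdf" exists locally, both of which
# are placeholders rather than part of the service above). The form field
# names match the endpoint's parameters: pdf_file, pages_per_chunk,
# processTables.
#
# import requests
#
# with open("sample.pdf", "rb") as f:
#     resp = requests.post(
#         "http://localhost:8000/process-pdf/",
#         files={"pdf_file": ("sample.pdf", f, "application/pdf")},
#         data={"pages_per_chunk": 2, "processTables": "True"},
#     )
# result = resp.json()
# print(result["content"][:500])  # merged text corpus
# print(list(result["tables"]))   # table keys: "1", "2", ...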