# FastAPI service: splits an uploaded PDF into page chunks and scrapes each
# chunk through a remote cloud function, returning merged text and tables.
# Debug aid: dump the installed package list at startup so deployment logs
# record the exact environment.
# NOTE(review): `pip._internal` is an unsupported private API; prefer
# `importlib.metadata.distributions()` if this listing is still needed.
try:
    try:
        from pip._internal.operations import freeze
    except ImportError:  # pip < 10.0 exposed the module at pip.operations
        from pip.operations import freeze
    for pkg in freeze.freeze():
        print(pkg)
except Exception as exc:
    # A missing or incompatible pip must not prevent the service from booting.
    print(f"Could not list installed packages: {exc}")
import io
import asyncio
import time
import aiohttp
from PyPDF2 import PdfReader, PdfWriter
import os
from pathlib import Path
from aiohttp import FormData
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
# FastAPI application instance (served externally, e.g. by uvicorn — see the
# commented-out entry point at the bottom of the file).
app = FastAPI()
# Configure CORS so a browser front-end on any origin can call this API.
# NOTE(review): browsers reject `allow_origins=["*"]` combined with
# `allow_credentials=True` for credentialed requests — confirm whether
# credentials are actually needed here.
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
async def execute_pdfscraper_async(file_contents: bytes, file_name: str, pages_per_chunk: int):
    """Split the uploaded PDF into chunks and scrape all chunks concurrently.

    Args:
        file_contents: Raw bytes of the uploaded PDF.
        file_name: Original upload filename; determines the chunk directory.
        pages_per_chunk: Max pages per chunk sent to the scraper.

    Returns:
        List of per-chunk scraper JSON payloads (failed chunks are skipped).
    """
    split_pdf(file_contents, file_name, pages_per_chunk)
    async with aiohttp.ClientSession() as session:
        tasks = [
            call_pdfscraper(session, chunk_data, chunk_name)
            for chunk_data, chunk_name in load_chunks(file_name)
        ]
        responses = await asyncio.gather(*tasks)
    # call_pdfscraper yields (payload, pdf_name) on success but a bare {} on
    # HTTP failure; the original `response[0]` raised KeyError(0) on that {}.
    # Keep only real, non-empty payloads so one bad chunk doesn't 500 the
    # whole request.
    return [resp[0] for resp in responses if isinstance(resp, tuple) and resp[0]]
async def call_pdfscraper(session, file_contents, pdf_name):
    """POST one PDF chunk to the remote scraper cloud function.

    Args:
        session: Shared aiohttp.ClientSession.
        file_contents: Raw bytes of the chunk PDF.
        pdf_name: Chunk filename (sent as the multipart filename).

    Returns:
        Tuple of (parsed JSON payload, pdf_name); the payload is an empty
        dict when the scraper responds with a non-200 status.
    """
    headers = {"Origin": "http://localhost:8080"}
    url = "https://us-central1-neuralgap-1.cloudfunctions.net/scraperPDFDocxTables_v3"
    # Build the multipart form: the chunk itself plus the table-extraction flag.
    data = FormData()
    data.add_field(
        "pdf",
        file_contents,
        filename=os.path.basename(pdf_name),
        content_type="application/pdf",
    )
    data.add_field("processTables", "True")
    response = {}
    async with session.post(url, data=data, headers=headers) as resp:
        if resp.status == 200:
            response = await resp.json()
        else:
            print(f"Failed to get response: {resp.status}")
            # Fall through with an empty payload: returning a bare {} here
            # (as the original did) broke the (payload, name) contract and
            # crashed the gather() consumer.
    return response, pdf_name
def collect_pdfscraper_response(scrape_response_list):
    """Merge per-chunk scraper responses into one corpus and a table dict.

    Args:
        scrape_response_list: List of scraper JSON payloads (dicts). Failed
            chunks may appear as empty dicts.

    Returns:
        Tuple of (corpus string joined with newlines, dict mapping a running
        1-based index — as a string — to each raw table).
    """
    content_list = []
    tables_dict = {}
    table_count = 1
    for response in scrape_response_list:
        # .get with defaults: payloads from failed chunks lack these keys,
        # and the original indexing raised KeyError on them.
        content_list.append(response.get("corpus", ""))
        table_content = response.get("tables_raw")
        try:
            for table_key in table_content.keys():
                tables_dict[str(table_count)] = table_content[table_key]
                table_count += 1
        except AttributeError:
            # tables_raw may be None or a non-dict sentinel; no tables then.
            pass
    content_str = "\n".join(content_list)
    return content_str, tables_dict
def split_pdf(file_contents, file_name, pages_per_chunk):
    """Write *file_contents* out as sequential PDF chunk files.

    Chunks hold at most *pages_per_chunk* pages each and are written to a
    "chunks" directory next to *file_name*, named "<stem>_<n>.pdf" with n
    starting at 1.
    """
    reader = PdfReader(io.BytesIO(file_contents))
    page_total = len(reader.pages)
    chunk_dir = Path(file_name).parent / "chunks"
    os.makedirs(chunk_dir, exist_ok=True)
    stem = Path(file_name).stem
    # Step through the page range in chunk-sized strides; enumerate gives
    # the 1-based chunk number used in the file name.
    for chunk_no, first_page in enumerate(range(0, page_total, pages_per_chunk), start=1):
        writer = PdfWriter()
        last_page = min(first_page + pages_per_chunk, page_total)
        for page_no in range(first_page, last_page):
            writer.add_page(reader.pages[page_no])
        with open(chunk_dir / f"{stem}_{chunk_no}.pdf", "wb") as out_file:
            writer.write(out_file)
def load_chunks(file_name):
    """Read back the PDF chunks produced by split_pdf for *file_name*.

    Args:
        file_name: The original upload filename; chunks live in a "chunks"
            directory next to it, named "<stem>_<n>.pdf".

    Returns:
        List of (chunk bytes, chunk filename) tuples ordered by chunk index,
        so downstream text assembly preserves page order.
    """
    output_dir = Path(file_name).parent / "chunks"
    stem = Path(file_name).stem

    def _chunk_index(path):
        # Numeric sort on the _<n> suffix: os.listdir order is arbitrary and
        # lexicographic order would put "_10" before "_2".
        suffix = path.stem.rsplit("_", 1)[-1]
        return int(suffix) if suffix.isdigit() else 0

    # Glob on this file's stem so stale chunks left over from other uploads
    # in the shared "chunks" dir are not scraped by mistake.
    chunk_paths = sorted(output_dir.glob(f"{stem}_*.pdf"), key=_chunk_index)
    chunk_byte_list = []
    for path in chunk_paths:
        # `with` closes each handle promptly (the original leaked open files).
        with open(path, "rb") as chunk_file:
            chunk_byte_list.append((chunk_file.read(), path.name))
    return chunk_byte_list
@app.post("/process-pdf/")
async def process_pdf(file: UploadFile = File(...), pages_per_chunk: int = 2):
    """Endpoint: split an uploaded PDF, scrape every chunk remotely, and
    return the merged text, the extracted tables, and the wall-clock time
    spent scraping (in seconds)."""
    pdf_bytes = await file.read()
    started = time.time()
    chunk_responses = await execute_pdfscraper_async(
        pdf_bytes, file.filename, pages_per_chunk
    )
    content, tables = collect_pdfscraper_response(chunk_responses)
    elapsed = time.time() - started
    return JSONResponse(
        content={"content": content, "tables": tables, "time_taken": elapsed}
    )
# Start the FastAPI app
# if __name__ == "__main__":
# import uvicorn
# uvicorn.run(app,port=7000,workers=2)