# Print the installed packages at startup (pip moved `freeze` into
# `pip._internal` in pip 10).
try:
    from pip._internal.operations import freeze
except ImportError:  # pip < 10.0
    from pip.operations import freeze

for pkg in freeze.freeze():
    print(pkg)

import asyncio
import io
import os
import time
from pathlib import Path

import aiohttp
from aiohttp import FormData
from fastapi import FastAPI, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from PyPDF2 import PdfReader, PdfWriter

app = FastAPI()

# Configure CORS
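# Note: the CORS spec does not allow wildcard origins together with
# credentials, so browsers will ignore allow_credentials=True while
# allow_origins is ["*"]; pin explicit origins in production.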
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

async def execute_pdfscraper_async(file_contents: bytes, file_name: str, pages_per_chunk: int):
    """Split the PDF into chunks on disk, then scrape all chunks concurrently."""
    split_pdf(file_contents, file_name, pages_per_chunk)
    async with aiohttp.ClientSession() as session:
        tasks = [
            call_pdfscraper(session, chunk_data, chunk_name)
            for chunk_data, chunk_name in load_chunks(file_name)
        ]
        responses = await asyncio.gather(*tasks)

    # Each task returns (response, chunk_name); keep only the response bodies.
    return [response for response, _ in responses]

async def call_pdfscraper(session, file_contents, pdf_name):
    """POST one PDF chunk to the scraper endpoint; returns (json_body, pdf_name)."""
    headers = {"Origin": "http://localhost:8080"}
    url = "https://us-central1-neuralgap-1.cloudfunctions.net/scraperPDFDocxTables_v3"

    # Build the multipart form body the scraper expects.
    data = FormData()
    data.add_field(
        "pdf",
        file_contents,
        filename=os.path.basename(pdf_name),
        content_type="application/pdf",
    )
    data.add_field("processTables", "True")

    async with session.post(url, data=data, headers=headers) as resp:
        if resp.status == 200:
            response = await resp.json()
        else:
            print(f"Failed to get response: {resp.status}")
            response = {}  # keep the (body, name) shape so callers can unpack

    return response, pdf_name

def collect_pdfscraper_response(scrape_response_list):
    """Merge per-chunk responses into one corpus string and a numbered table dict."""
    content_list = []
    tables_dict = {}
    table_count = 1
    for response in scrape_response_list:
        # Failed chunks come back as {}, so fall back to empty values.
        content_list.append(response.get("corpus", ""))
        table_content = response.get("tables_raw")

        try:
            # Renumber tables sequentially across all chunks.
            for table_key in table_content.keys():
                tables_dict[str(table_count)] = table_content[table_key]
                table_count += 1
        except AttributeError:  # tables_raw missing or not a dict
            pass

    content_str = "\n".join(content_list)

    return content_str, tables_dict

def split_pdf(file_contents, file_name, pages_per_chunk):
    """Write the uploaded PDF into ./chunks as pieces of `pages_per_chunk` pages."""
    file_bytes = io.BytesIO(file_contents)
    reader = PdfReader(file_bytes)
    total_pages = len(reader.pages)

    output_dir = Path(file_name).parent / "chunks"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Ceiling division: the last chunk may contain fewer pages.
    num_chunks = (total_pages + pages_per_chunk - 1) // pages_per_chunk

    for i in range(num_chunks):
        writer = PdfWriter()
        start_page = i * pages_per_chunk
        end_page = min(start_page + pages_per_chunk, total_pages)

        for page_number in range(start_page, end_page):
            writer.add_page(reader.pages[page_number])

        chunk_file_name = f"{Path(file_name).stem}_{i + 1}.pdf"
        with open(output_dir / chunk_file_name, "wb") as output_pdf:
            writer.write(output_pdf)

def load_chunks(file_name):
    """Read this file's chunks back as (bytes, filename) pairs, in page order."""
    output_dir = Path(file_name).parent / "chunks"
    stem = Path(file_name).stem
    # The chunks directory is shared across requests, so keep only this file's
    # chunks and sort them numerically (os.listdir guarantees no ordering).
    chunk_list = sorted(
        (f for f in os.listdir(output_dir) if f.startswith(f"{stem}_")),
        key=lambda f: int(Path(f).stem.rsplit("_", 1)[1]),
    )
    return [((output_dir / file).read_bytes(), file) for file in chunk_list]

@app.post("/process-pdf/")
async def process_pdf(file: UploadFile = File(...), pages_per_chunk: int = 2):
    file_contents = await file.read()
    file_name = file.filename

    start_time = time.time()
    scrape_response_list = await execute_pdfscraper_async(file_contents, file_name, pages_per_chunk)
    content, tables = collect_pdfscraper_response(scrape_response_list)
    end_time = time.time()

    time_taken = end_time - start_time
    return JSONResponse(content={"content": content, "tables": tables, "time_taken": time_taken})

# Start the FastAPI app. uvicorn ignores `workers` when handed an app object;
# pass an import string instead (e.g. "main:app", assuming this module is main.py).
# if __name__ == "__main__":
#     import uvicorn
#     uvicorn.run("main:app", port=7000, workers=2)
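# A minimal client sketch for trying the endpoint (hypothetical file name and
# host; assumes the server is running locally on port 7000):
#
#   import requests
#   with open("report.pdf", "rb") as f:
#       resp = requests.post(
#           "http://localhost:7000/process-pdf/",
#           files={"file": ("report.pdf", f, "application/pdf")},
#           params={"pages_per_chunk": 2},
#       )
#   print(resp.json()["time_taken"])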