Arafath10 committed
Commit 1d9b15e · verified · 1 Parent(s): c69582b

Update main.py

Files changed (1)
  1. main.py +42 -60
main.py CHANGED
@@ -1,51 +1,19 @@
-try: from pip._internal.operations import freeze
-except ImportError: # pip < 10.0
-    from pip.operations import freeze
-
-pkgs = freeze.freeze()
-for pkg in pkgs: print(pkg)
-
 import io
 import asyncio
-import time
-import aiohttp
-from PyPDF2 import PdfReader, PdfWriter
 import os
 from pathlib import Path
-from aiohttp import FormData
-from fastapi import FastAPI, File, UploadFile
+import aiohttp
+from PyPDF2 import PdfReader, PdfWriter
+from fastapi import FastAPI, UploadFile, Form
 from fastapi.responses import JSONResponse
-from fastapi.middleware.cors import CORSMiddleware
+from aiohttp import FormData
+import shutil
 
 app = FastAPI()
 
-# Configure CORS
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-
-async def execute_pdfscraper_async(file_contents: bytes, file_name: str, pages_per_chunk: int):
-    split_pdf(file_contents, file_name, pages_per_chunk)
-    response_list = []
-    async with aiohttp.ClientSession() as session:
-        tasks = [
-            call_pdfscraper(session, chunk_data, chunk_name)
-            for chunk_data, chunk_name in load_chunks(file_name)
-        ]
-        responses = await asyncio.gather(*tasks)
-        for response in responses:
-            response_list.append(response[0])
-
-    return response_list
-
-async def call_pdfscraper(session, file_contents, pdf_name):
+async def call_pdfscraper(session, file_contents, pdf_name, processTables):
     headers = {"Origin": "http://localhost:8080"}
     url = "https://us-central1-neuralgap-1.cloudfunctions.net/scraperPDFDocxTables_v3"
-    # Create a FormData object
     data = FormData()
     data.add_field(
         "pdf",
@@ -53,24 +21,42 @@ async def call_pdfscraper(session, file_contents, pdf_name):
         filename=os.path.basename(pdf_name),
         content_type="application/pdf",
     )
-    data.add_field("processTables", "True")
+    data.add_field("processTables", processTables)
 
     async with session.post(url, data=data, headers=headers) as resp:
         if resp.status == 200:
             response = await resp.json()
         else:
-            print(f"Failed to get response: {resp.status}")
-            return {}
+            return {}, pdf_name
 
     return response, pdf_name
 
+
+async def execute_pdfscraper_async(file_path: str, processTables: str):
+    chunk_list = os.listdir(file_path)
+    chunk_byte_list = [
+        (open(f"{file_path}/{file}", "rb").read(), file) for file in chunk_list
+    ]
+    response_list = []
+    async with aiohttp.ClientSession() as session:
+        tasks = [
+            call_pdfscraper(session, file_all[0], file_all[1], processTables)
+            for file_all in chunk_byte_list
+        ]
+        responses = await asyncio.gather(*tasks)
+        for i, response in enumerate(responses):
+            response_list.append(response[0])
+
+    return response_list
+
+
 def collect_pdfscraper_response(scrape_response_list):
     content_list = []
     tables_dict = {}
     table_count = 1
     for response in scrape_response_list:
-        content = response["corpus"]
-        table_content = response["tables_raw"]
+        content = response.get("corpus", "")
+        table_content = response.get("tables_raw", {})
 
         content_list.append(content)
         try:
@@ -85,11 +71,11 @@ def collect_pdfscraper_response(scrape_response_list):
 
     return content_str, tables_dict
 
+
 def split_pdf(file_contents, file_name, pages_per_chunk):
    file_bytes = io.BytesIO(file_contents)
    reader = PdfReader(file_bytes)
    total_pages = len(reader.pages)
-
    output_dir = Path(file_name).parent / "chunks"
    os.makedirs(output_dir, exist_ok=True)
 
@@ -108,27 +94,23 @@ def split_pdf(file_contents, file_name, pages_per_chunk):
         with open(output_path, "wb") as output_pdf:
             writer.write(output_pdf)
 
-def load_chunks(file_name):
-    output_dir = Path(file_name).parent / "chunks"
-    chunk_list = os.listdir(output_dir)
-    chunk_byte_list = [
-        (open(f"{output_dir}/{file}", "rb").read(), file) for file in chunk_list
-    ]
-    return chunk_byte_list
+    return str(output_dir)
+
 
 @app.post("/process-pdf/")
-async def process_pdf(file: UploadFile = File(...), pages_per_chunk: int = 2):
-    file_contents = await file.read()
-    file_name = file.filename
+async def process_pdf(pdf_file: UploadFile, pages_per_chunk: int = Form(2), processTables: str = Form("True")):
+    file_contents = await pdf_file.read()
 
-    scrape_response_list = await execute_pdfscraper_async(file_contents, file_name, pages_per_chunk)
+    chunks_dir = split_pdf(file_contents, pdf_file.filename, pages_per_chunk)
+    scrape_response_list = await execute_pdfscraper_async(chunks_dir, processTables)
     content, table_string = collect_pdfscraper_response(scrape_response_list)
-
-    shutil.rmtree("chunks")
-    print("old chunks removed")
+
+    shutil.rmtree(chunks_dir)  # Clean up chunks after processing
+
     return JSONResponse(content={"content": content, "tables": table_string})
 
-# Start the FastAPI app
+
+# Starting point for running the FastAPI app
 # if __name__ == "__main__":
 #     import uvicorn
-#     uvicorn.run(app,port=7000,workers=2)
+#     uvicorn.run(app, host="0.0.0.0", port=8000)
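For reference, a minimal client sketch for exercising the updated endpoint. This is not part of the commit: it assumes the app is served locally on port 8000 (e.g. via the commented-out uvicorn.run line) and that sample.pdf is a placeholder input file; the multipart field names mirror the new pdf_file, pages_per_chunk, and processTables parameters.

# Hypothetical client sketch, not part of main.py.
# Assumes the FastAPI app is running at localhost:8000 and sample.pdf exists.
import requests

with open("sample.pdf", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/process-pdf/",
        files={"pdf_file": ("sample.pdf", f, "application/pdf")},
        data={"pages_per_chunk": "2", "processTables": "True"},
    )
resp.raise_for_status()
result = resp.json()
print(result["content"])  # concatenated text scraped from all chunks
print(result["tables"])   # table output collected across chunks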