Arafath10 committed on
Commit
4c263a7
·
verified ·
1 Parent(s): 2790e66

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +111 -33
main.py CHANGED
@@ -4,22 +4,22 @@ except ImportError: # pip < 10.0
4
 
5
  pkgs = freeze.freeze()
6
  for pkg in pkgs: print(pkg)
7
- import os
8
- from fastapi import FastAPI, HTTPException, File, UploadFile
9
- from fastapi.middleware.cors import CORSMiddleware
10
- from PyPDF2 import PdfReader
11
-
12
-
13
- import google.generativeai as genai
14
- import json
15
 
16
- secret = os.environ["key"]
17
- genai.configure(api_key=secret)
18
- model_vision = genai.GenerativeModel('gemini-pro-vision')
19
- model_text = genai.GenerativeModel('gemini-pro')
 
 
 
 
 
 
 
20
 
21
  app = FastAPI()
22
 
 
23
  app.add_middleware(
24
  CORSMiddleware,
25
  allow_origins=["*"],
@@ -28,30 +28,108 @@ app.add_middleware(
28
  allow_headers=["*"],
29
  )
30
 
31
- @app.post("/get_ocr_data/")
32
- async def get_data(pdf: UploadFile = File(...)):
33
- try:
34
- # Read PDF file using PyPDF2
35
- pdf_reader = PdfReader(pdf.file)
36
- text = ""
37
-
38
- # Extract text from each page
39
- for page in pdf_reader.pages:
40
- text += page.extract_text()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
- # call gemini to get required data extracted text
43
- prompt = f"""this is cv data : {text.strip()}
44
- i want only
 
45
 
46
- fisrtname,lastname,contact number,total years of experince,linkdn link,experinece,skils
 
 
 
 
 
 
47
 
48
- in json format only"""
49
-
50
- response = model_text.generate_content(prompt)
51
- data = json.loads(response.text.replace("```json","").replace("```",""))
52
- return {"data":data}
53
 
54
- except Exception as e:
55
- raise HTTPException(status_code=500, detail=f"Error processing PDF: {str(e)}")
 
 
56
 
 
 
57
 
 
 
 
 
 
4
 
5
  pkgs = freeze.freeze()
6
  for pkg in pkgs: print(pkg)
 
 
 
 
 
 
 
 
7
 
8
+ import io
9
+ import asyncio
10
+ import time
11
+ import aiohttp
12
+ from PyPDF2 import PdfReader, PdfWriter
13
+ import os
14
+ from pathlib import Path
15
+ from aiohttp import FormData
16
+ from fastapi import FastAPI, File, UploadFile
17
+ from fastapi.responses import JSONResponse
18
+ from fastapi.middleware.cors import CORSMiddleware
19
 
20
  app = FastAPI()
21
 
22
+ # Configure CORS
23
  app.add_middleware(
24
  CORSMiddleware,
25
  allow_origins=["*"],
 
28
  allow_headers=["*"],
29
  )
30
 
31
async def execute_pdfscraper_async(file_contents: bytes, file_name: str, pages_per_chunk: int):
    """Split a PDF into chunks and send every chunk to the scraper concurrently.

    Args:
        file_contents: Raw bytes of the uploaded PDF.
        file_name: Name of the upload; ``split_pdf`` and ``load_chunks`` both
            derive the chunk directory and the chunk file names from it.
        pages_per_chunk: Maximum number of pages written into each chunk.

    Returns:
        A list holding the scraper's JSON payload for every chunk whose
        request succeeded, in the order ``load_chunks`` returned the chunks.
        Chunks whose request failed are left out.
    """
    split_pdf(file_contents, file_name, pages_per_chunk)

    async with aiohttp.ClientSession() as session:
        tasks = [
            call_pdfscraper(session, chunk_data, chunk_name)
            for chunk_data, chunk_name in load_chunks(file_name)
        ]
        # gather() returns results in the same order as the awaitables passed in.
        responses = await asyncio.gather(*tasks)

    # call_pdfscraper returns an empty dict (not a ``(payload, name)`` tuple)
    # when the request fails; indexing that with [0] used to raise KeyError and
    # abort the whole upload. Falsy results are skipped instead.
    return [response[0] for response in responses if response]
44
+
45
async def call_pdfscraper(session, file_contents, pdf_name):
    """POST one PDF chunk to the scraper cloud function.

    Args:
        session: Client session used to issue the request (the caller in this
            file passes an ``aiohttp.ClientSession``).
        file_contents: Raw bytes of the PDF chunk.
        pdf_name: Chunk file name; only its base name is sent as the upload
            filename.

    Returns:
        ``(payload, pdf_name)`` where ``payload`` is the decoded JSON body when
        the service answers with HTTP 200. For any other status the failure is
        printed and an empty dict is returned instead of a tuple.
    """
    endpoint = "https://us-central1-neuralgap-1.cloudfunctions.net/scraperPDFDocxTables_v3"
    request_headers = {"Origin": "http://localhost:8080"}

    # Multipart form body: the PDF itself plus the table-processing flag.
    form = FormData()
    form.add_field(
        "pdf",
        file_contents,
        filename=os.path.basename(pdf_name),
        content_type="application/pdf",
    )
    form.add_field("processTables", "True")

    async with session.post(endpoint, data=form, headers=request_headers) as resp:
        if resp.status != 200:
            print(f"Failed to get response: {resp.status}")
            return {}
        payload = await resp.json()

    return payload, pdf_name
66
+
67
def collect_pdfscraper_response(scrape_response_list):
    """Merge per-chunk scraper payloads into one text corpus and one table map.

    Args:
        scrape_response_list: Iterable of payload dicts. Each payload may carry
            a ``"corpus"`` string and a ``"tables_raw"`` dict of tables.

    Returns:
        ``(content_str, tables_dict)``: the corpora joined with newlines in
        input order, and every table re-keyed with a running 1-based counter
        (as a string) across all payloads.
    """
    content_list = []
    tables_dict = {}
    table_count = 1

    for response in scrape_response_list:
        # Empty payloads (e.g. the placeholder for a failed request) carry
        # nothing to merge and must not abort the aggregation.
        if not response:
            continue

        content = response.get("corpus")
        # A missing or null corpus would otherwise break the final join.
        if isinstance(content, str):
            content_list.append(content)

        table_content = response.get("tables_raw")
        # Explicit type check instead of catching AttributeError: payloads
        # without tables (null or any non-dict value) are simply skipped.
        if isinstance(table_content, dict):
            for table in table_content.values():
                tables_dict[str(table_count)] = table
                table_count += 1

    return "\n".join(content_list), tables_dict
87
+
88
def split_pdf(file_contents, file_name, pages_per_chunk):
    """Write the PDF in ``file_contents`` to disk as fixed-size page chunks.

    Chunks are written to ``<parent of file_name>/chunks`` and are named
    ``<stem>_<n>.pdf`` with ``n`` starting at 1.

    Args:
        file_contents: Raw bytes of the source PDF.
        file_name: Name of the source file; supplies the output directory
            (its parent) and the chunk name prefix (its stem).
        pages_per_chunk: Maximum number of pages per chunk; must be >= 1.

    Raises:
        ValueError: If ``pages_per_chunk`` is smaller than 1.
    """
    # Validate first: 0 used to raise ZeroDivisionError and negative values
    # silently produced no chunks at all.
    if pages_per_chunk < 1:
        raise ValueError(f"pages_per_chunk must be at least 1, got {pages_per_chunk}")

    reader = PdfReader(io.BytesIO(file_contents))
    total_pages = len(reader.pages)

    stem = Path(file_name).stem
    output_dir = Path(file_name).parent / "chunks"
    os.makedirs(output_dir, exist_ok=True)

    # Remove chunks left over from an earlier split of a file with the same
    # stem; otherwise a shorter document would leave stale higher-numbered
    # chunks behind that are later picked up as if they belonged to it.
    prefix = f"{stem}_"
    suffix = ".pdf"
    for existing in list(output_dir.iterdir()):
        name = existing.name
        index = name[len(prefix):-len(suffix)]
        if (
            name.startswith(prefix)
            and name.endswith(suffix)
            and index.isascii()
            and index.isdigit()
            and existing.is_file()
        ):
            existing.unlink()

    for chunk_number, start_page in enumerate(
        range(0, total_pages, pages_per_chunk), start=1
    ):
        end_page = min(start_page + pages_per_chunk, total_pages)

        writer = PdfWriter()
        for page_number in range(start_page, end_page):
            writer.add_page(reader.pages[page_number])

        output_path = output_dir / f"{stem}_{chunk_number}.pdf"
        with open(output_path, "wb") as output_pdf:
            writer.write(output_pdf)
110
 
111
def load_chunks(file_name):
    """Read back the chunk files that belong to ``file_name``.

    Looks in ``<parent of file_name>/chunks`` for files named
    ``<stem>_<n>.pdf`` (the naming scheme ``split_pdf`` writes).

    Args:
        file_name: Name of the original file the chunks were split from.

    Returns:
        A list of ``(chunk_bytes, chunk_file_name)`` tuples ordered by the
        numeric chunk index ``n``.
    """
    output_dir = Path(file_name).parent / "chunks"
    prefix = f"{Path(file_name).stem}_"
    suffix = ".pdf"

    # The chunk directory is shared between uploads, so keep only this file's
    # chunks; os.listdir order is arbitrary, so collect the numeric index to
    # sort on (a plain name sort would put "_10" before "_2").
    indexed_names = []
    for name in os.listdir(output_dir):
        index = name[len(prefix):-len(suffix)]
        if (
            name.startswith(prefix)
            and name.endswith(suffix)
            and index.isascii()
            and index.isdigit()
        ):
            indexed_names.append((int(index), name))

    # read_bytes() opens and closes each file; the previous bare open().read()
    # left the handles for the garbage collector to close.
    return [
        ((output_dir / name).read_bytes(), name)
        for _, name in sorted(indexed_names)
    ]
118
 
119
@app.post("/process-pdf/")
async def process_pdf(file: UploadFile = File(...), pages_per_chunk: int = 2):
    """Scrape an uploaded PDF chunk by chunk and return the merged result.

    Args:
        file: The uploaded PDF.
        pages_per_chunk: Maximum number of pages per chunk sent to the
            scraper; must be >= 1.

    Returns:
        A JSON response with ``content`` (merged text), ``tables`` (merged
        tables) and ``time_taken`` (seconds spent scraping and merging), or a
        400 response with a ``detail`` message when ``pages_per_chunk`` is
        smaller than 1.
    """
    # Reject an unusable chunk size up front instead of failing later while
    # splitting the document.
    if pages_per_chunk < 1:
        return JSONResponse(
            status_code=400,
            content={"detail": "pages_per_chunk must be at least 1"},
        )

    file_contents = await file.read()

    # The filename is chosen by the client and is later used to build
    # filesystem paths, so drop any directory components and fall back to a
    # fixed name when none was supplied.
    file_name = Path(file.filename or "").name or "upload.pdf"

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments.
    start_time = time.perf_counter()
    scrape_response_list = await execute_pdfscraper_async(file_contents, file_name, pages_per_chunk)
    content, table_string = collect_pdfscraper_response(scrape_response_list)
    time_taken = time.perf_counter() - start_time

    return JSONResponse(content={"content": content, "tables": table_string, "time_taken": time_taken})
131
 
132
+ # Start the FastAPI app
133
+ # if __name__ == "__main__":
134
+ # import uvicorn
135
+ # uvicorn.run(app,port=7000,workers=2)