Mallisetty Siva Mahesh committed on
Commit
7238fce
·
2 Parent(s): caa039b a65c5ef

Added Google Vision OCR-based document-number extraction (CIN/LLPIN, MSME, Aadhaar) and extended document preprocessing

Browse files
Files changed (3) hide show
  1. app.py +121 -55
  2. requirements.txt +1 -0
  3. utils.py +124 -41
app.py CHANGED
@@ -1,10 +1,15 @@
1
  from fastapi import FastAPI, File, UploadFile, HTTPException
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from typing import Dict
4
- import os
5
  import shutil
6
  import torch
7
  import logging
 
 
 
 
 
 
8
  from s3_setup import s3_client
9
  import requests
10
  from fastapi import FastAPI, HTTPException, Request
@@ -129,6 +134,8 @@ process_dirs = {
129
  "pan_file": "processed_images/pan/",
130
  "cheque_file": "processed_images/cheque/",
131
  "gst_file": "processed_images/gst/",
 
 
132
  }
133
 
134
  # Ensure individual directories exist
@@ -143,7 +150,6 @@ for dir_path in process_dirs.values():
143
  logging.basicConfig(level=logging.INFO)
144
 
145
 
146
- # Perform Inference with optional S3 upload
147
  def perform_inference(file_paths: Dict[str, str], upload_to_s3: bool):
148
  model_dirs = {
149
  "pan_file": pan_model,
@@ -154,48 +160,84 @@ def perform_inference(file_paths: Dict[str, str], upload_to_s3: bool):
154
  inference_results = {}
155
 
156
  for doc_type, file_path in file_paths.items():
157
- if doc_type in model_dirs:
158
- print(f"Processing {doc_type} using model at {model_dirs[doc_type]}")
159
-
160
- processed_file_p = file_path.split("&&")[0]
161
- unprocessed_file_path = file_path.split("&&")[1]
162
- images_path = [processed_file_p]
163
- inference_batch = prepare_batch_for_inference(images_path)
164
-
165
- context = model_dirs[doc_type]
166
- processor = globals()[f"processor_{doc_type.split('_')[0]}"]
167
- name = doc_type.split("_")[0]
168
- attachemnt_num = {
169
- "pan_file": 2,
170
- "gst_file": 4,
171
- "msme_file": 5,
172
- "cin_llpin_file": 6,
173
- "cheque_file": 8,
174
- }[doc_type]
175
-
176
- if upload_to_s3:
177
- client = s3_client()
178
- bucket_name = "edgekycdocs"
179
- folder_name = f"{name}docs"
180
- file_name = unprocessed_file_path.split("/")[-1]
 
 
 
 
 
 
 
 
181
  response = client.upload_file(
182
  unprocessed_file_path, bucket_name, folder_name, file_name
183
  )
184
  print("The file has been uploaded to S3 bucket", response)
185
  attachment_url = response["url"]
186
- else:
 
 
187
  attachment_url = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
- result = handle(inference_batch, context, processor, name)
190
- result["attachment_url"] = attachment_url
191
- result["detect"] = True
192
 
193
- inference_results[f"attachment_{attachemnt_num}"] = result
194
- else:
195
- print(f"Model directory not found for {doc_type}. Skipping.")
 
 
 
 
 
 
 
 
 
196
 
197
  return inference_results
198
- except:
 
 
199
  return {"status": "error", "message": "Text extraction failed."}
200
 
201
 
@@ -234,21 +276,31 @@ async def aadhar_ocr(
234
  print("file_paths", file_paths)
235
 
236
  files = {}
237
- for key, value in file_paths.items():
238
- name = value.split("/")[-1].split(".")[0]
239
- id_type = key.split("_")[0]
240
- doc_type = value.split("/")[-1].split(".")[-1]
241
- f_path = value
242
 
243
- print("variables required", name, id_type, doc_type, f_path)
244
- preprocessing = doc_processing(name, id_type, doc_type, f_path)
245
- response = preprocessing.process()
246
 
247
- print("response after preprocessing", response)
 
 
 
 
 
 
248
 
249
- files[key] = response["output_p"] + "&&" + f_path
250
- # files["unprocessed_file_path"] = f_path
251
- print("response", response)
 
 
 
 
 
 
 
 
 
 
 
252
 
253
  # Perform inference
254
  result = perform_inference(files, upload_to_s3)
@@ -307,16 +359,30 @@ async def document_ocr_s3(request: Request):
307
  logging.info(f"Downloaded files: {list(file_paths.keys())}")
308
 
309
  files = {}
310
- for key, value in file_paths.items():
311
- name = value.split("/")[-1].split(".")[0]
312
- id_type = key.split("_")[0]
313
- doc_type = value.split("/")[-1].split(".")[-1]
314
- f_path = value
315
-
316
- preprocessing = doc_processing(name, id_type, doc_type, f_path)
317
- response = preprocessing.process()
318
 
319
- files[key] = response["output_p"] + "&&" + f_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
 
321
  result = perform_inference(files, upload_to_s3)
322
 
 
1
  from fastapi import FastAPI, File, UploadFile, HTTPException
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from typing import Dict
 
4
  import shutil
5
  import torch
6
  import logging
7
+ import os
8
+
9
+ # Set Google Application Credentials
10
+ os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = (
11
+ "titanium-scope-436311-t3-966373f5aa2f.json"
12
+ )
13
  from s3_setup import s3_client
14
  import requests
15
  from fastapi import FastAPI, HTTPException, Request
 
134
  "pan_file": "processed_images/pan/",
135
  "cheque_file": "processed_images/cheque/",
136
  "gst_file": "processed_images/gst/",
137
+ "msme_file": "processed_images/msme/",
138
+ "cin_llpin_file": "processed_images/cin_llpin/",
139
  }
140
 
141
  # Ensure individual directories exist
 
150
  logging.basicConfig(level=logging.INFO)
151
 
152
 
 
153
  def perform_inference(file_paths: Dict[str, str], upload_to_s3: bool):
154
  model_dirs = {
155
  "pan_file": pan_model,
 
160
  inference_results = {}
161
 
162
  for doc_type, file_path in file_paths.items():
163
+ processed_file_p = file_path.split("&&")[
164
+ 0
165
+ ] # Extracted document number or processed image
166
+ unprocessed_file_path = file_path.split("&&")[1] # Original file path
167
+
168
+ print(f"Processing {doc_type}: {processed_file_p}")
169
+
170
+ # Determine the attachment number based on the document type
171
+ attachment_num = {
172
+ "pan_file": 2,
173
+ "gst_file": 4,
174
+ "msme_file": 5,
175
+ "cin_llpin_file": 6,
176
+ "cheque_file": 8,
177
+ }.get(doc_type, None)
178
+
179
+ if attachment_num is None:
180
+ print(f"Skipping {doc_type}, not recognized.")
181
+ continue
182
+
183
+ # Upload file to S3 if required
184
+ if upload_to_s3:
185
+ client = s3_client()
186
+ bucket_name = "edgekycdocs"
187
+ if doc_type == "cin_llpin":
188
+ folder_name = f"{doc_type.replace('_', '')}docs"
189
+ else:
190
+ folder_name = f"{doc_type.split('_')[0]}docs"
191
+
192
+ file_name = unprocessed_file_path.split("/")[-1].replace(" ", "_")
193
+
194
+ try:
195
  response = client.upload_file(
196
  unprocessed_file_path, bucket_name, folder_name, file_name
197
  )
198
  print("The file has been uploaded to S3 bucket", response)
199
  attachment_url = response["url"]
200
+ print(f"File uploaded to S3: {attachment_url}")
201
+ except Exception as e:
202
+ print(f"Failed to upload {file_name} to S3: {e}")
203
  attachment_url = None
204
+ else:
205
+ attachment_url = None
206
+ # If it's an OCR-based extraction (CIN, MSME, LLPIN, PAN, Aadhaar), return the extracted number
207
+ if doc_type in ["msme_file", "cin_llpin_file", "aadhar_file"]:
208
+ result = {
209
+ "attachment_num": processed_file_p, # Extracted CIN, LLPIN, MSME, PAN, or Aadhaar number
210
+ "attachment_url": attachment_url,
211
+ "attachment_status": 200,
212
+ "detect": True,
213
+ }
214
+ else:
215
+ # If the document needs ML model inference (PAN, GST, Cheque)
216
+ if doc_type in model_dirs:
217
+ print(
218
+ f"Running ML inference for {doc_type} using {model_dirs[doc_type]}"
219
+ )
220
 
221
+ images_path = [processed_file_p]
222
+ inference_batch = prepare_batch_for_inference(images_path)
 
223
 
224
+ context = model_dirs[doc_type]
225
+ processor = globals()[f"processor_{doc_type.split('_')[0]}"]
226
+ name = doc_type.split("_")[0]
227
+
228
+ result = handle(inference_batch, context, processor, name)
229
+ result["attachment_url"] = attachment_url
230
+ result["detect"] = True
231
+ else:
232
+ print(f"No model found for {doc_type}, skipping inference.")
233
+ continue
234
+
235
+ inference_results[f"attachment_{attachment_num}"] = result
236
 
237
  return inference_results
238
+
239
+ except Exception as e:
240
+ print(f"Error in perform_inference: {e}")
241
  return {"status": "error", "message": "Text extraction failed."}
242
 
243
 
 
276
  print("file_paths", file_paths)
277
 
278
  files = {}
 
 
 
 
 
279
 
280
+ for key, f_path in file_paths.items():
 
 
281
 
282
+ name = os.path.splitext(os.path.basename(f_path))[0]
283
+ # Determine id_type: for cin_llpin_file, explicitly set id_type to "cin_llpin"
284
+ if key == "cin_llpin_file":
285
+ id_type = "cin_llpin"
286
+ else:
287
+ id_type = key.split("_")[0]
288
+ doc_type = os.path.splitext(f_path)[-1].lstrip(".")
289
 
290
+ if key in ["msme_file", "cin_llpin_file", "aadhar_file"]:
291
+ extracted_number = extract_document_number_from_file(f_path, id_type)
292
+ if not extracted_number:
293
+ logging.error(f"Failed to extract document number from {f_path}")
294
+ raise HTTPException(
295
+ status_code=400, detail=f"Invalid document format in {key}"
296
+ )
297
+ files[key] = extracted_number + "&&" + f_path
298
+ print("files", files[key])
299
+ else:
300
+ # For other files, use existing preprocessing.
301
+ preprocessing = doc_processing(name, id_type, doc_type, f_path)
302
+ response = preprocessing.process()
303
+ files[key] = response["output_p"] + "&&" + f_path
304
 
305
  # Perform inference
306
  result = perform_inference(files, upload_to_s3)
 
359
  logging.info(f"Downloaded files: {list(file_paths.keys())}")
360
 
361
  files = {}
 
 
 
 
 
 
 
 
362
 
363
+ for key, f_path in file_paths.items():
364
+ name = f_path.split("/")[-1].split(".")[0]
365
+ if key == "cin_llpin_file":
366
+ id_type = "cin_llpin"
367
+ else:
368
+ id_type = key.split("_")[0]
369
+ # id_type = key.split("_")[0]
370
+ doc_type = f_path.split("/")[-1].split(".")[-1]
371
+
372
+ # For MSME and CIN/LLPIN files, extract document number via OCR and regex
373
+ if key in ["msme_file", "cin_llpin_file", "aadhar_file"]:
374
+ extracted_number = extract_document_number_from_file(f_path, id_type)
375
+ if not extracted_number:
376
+ logging.error(f"Failed to extract document number from {f_path}")
377
+ raise HTTPException(
378
+ status_code=400, detail=f"Invalid document format in {key}"
379
+ )
380
+ files[key] = extracted_number + "&&" + f_path
381
+ else:
382
+ # For other documents, use the existing ML model preprocessing
383
+ preprocessing = doc_processing(name, id_type, doc_type, f_path)
384
+ response = preprocessing.process()
385
+ files[key] = response["output_p"] + "&&" + f_path
386
 
387
  result = perform_inference(files, upload_to_s3)
388
 
requirements.txt CHANGED
@@ -12,3 +12,4 @@ pillow
12
  boto3
13
 
14
  python-multipart
 
 
12
  boto3
13
 
14
  python-multipart
15
+
utils.py CHANGED
@@ -1,74 +1,79 @@
1
  import fitz
2
  from PIL import Image
 
 
 
 
 
 
 
 
 
 
3
 
4
  class doc_processing:
5
 
6
  def __init__(self, name, id_type, doc_type, f_path):
7
-
8
  self.name = name
9
  self.id_type = id_type
10
  self.doc_type = doc_type
11
  self.f_path = f_path
12
  # self.o_path = o_path
13
-
14
-
15
  def pdf_to_image_scale(self):
16
  pdf_document = fitz.open(self.f_path)
17
  if self.id_type == "gst":
18
  page_num = 2
19
  else:
20
  page_num = 0
21
-
22
  page = pdf_document.load_page(page_num)
23
  pix = page.get_pixmap() # Render page as a pixmap (image)
24
-
25
  # Convert pixmap to PIL Image
26
  image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
27
-
28
  original_width, original_height = image.size
29
-
30
- print("original_width",original_width)
31
- print("original_height",original_height)
32
 
 
 
33
 
34
  new_width = (1000 / original_width) * original_width
35
  new_height = (1000 / original_height) * original_height
36
-
37
- print("new_width",new_width)
38
- print("new_height",new_height)
39
- # new_width =
40
- # new_height =
41
  image.resize((int(new_width), int(new_height)), Image.Resampling.LANCZOS)
42
- output_path = "processed_images/{}/{}.jpeg".format(self.id_type,self.name)
43
  image.save(output_path)
44
- return {"success":200,"output_p":output_path}
45
-
46
 
47
  def scale_img(self):
48
-
49
 
50
- print("path of file",self.f_path)
51
  image = Image.open(self.f_path).convert("RGB")
52
  original_width, original_height = image.size
53
-
54
- print("original_width",original_width)
55
- print("original_height",original_height)
56
 
 
 
57
 
58
  new_width = (1000 / original_width) * original_width
59
  new_height = (1000 / original_height) * original_height
60
-
61
- print("new_width",new_width)
62
- print("new_height",new_height)
63
- # new_width =
64
- # new_height =
65
  image.resize((int(new_width), int(new_height)), Image.Resampling.LANCZOS)
66
- output_path = "processed_images/{}/{}.jpeg".format(self.id_type,self.name)
67
  image.save(output_path)
68
- return {"success":200,"output_p":output_path}
69
 
70
  def process(self):
71
- if self.doc_type == "pdf":
72
  response = self.pdf_to_image_scale()
73
  else:
74
  response = self.scale_img()
@@ -76,12 +81,95 @@ class doc_processing:
76
  return response
77
 
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
 
81
-
82
  # files = {
83
  # "aadhar_file": "/home/javmulla/model_one/test_images_aadhar/test_two.jpg",
84
- # "pan_file": "/home/javmulla/model_one/test_images_pan/6ea33087.jpeg",
85
  # "cheque_file": "/home/javmulla/model_one/test_images_cheque/0f81678a.jpeg",
86
  # "gst_file": "/home/javmulla/model_one/test_images_gst/0a52fbcb_page3_image_0.jpg"
87
  # }
@@ -89,7 +177,7 @@ class doc_processing:
89
 
90
  # files = {
91
  # "aadhar_file": "/home/javmulla/model_one/test_images_aadhar/test_two.jpg",
92
- # "pan_file": "/home/javmulla/model_one/test_images_pan/6ea33087.jpeg",
93
  # "cheque_file": "/home/javmulla/model_one/test_images_cheque/0f81678a.jpeg",
94
  # "gst_file": "test_Images_folder/gst/e.pdf"
95
  # }
@@ -102,11 +190,6 @@ class doc_processing:
102
  # preprocessing = doc_processing(name,id_type,doc_type,f_path)
103
  # response = preprocessing.process()
104
  # print("response",response)
105
-
106
-
107
-
108
-
109
-
110
- # id_type, doc_type, f_path
111
-
112
-
 
1
  import fitz
2
  from PIL import Image
3
+ import re
4
+ import io
5
+ import os
6
+ import logging
7
+ import shutil
8
+ from fastapi import FastAPI, UploadFile, File, HTTPException
9
+ from google.cloud import vision
10
+
11
+ # from pdf2image import convert_from_path
12
+
13
 
14
class doc_processing:
    """
    Preprocess a KYC document for downstream OCR/inference.

    PDFs are rendered to an image (GST certificates use page 3, everything
    else page 1), the image is rescaled to 1000x1000, and the result is saved
    as ``processed_images/<id_type>/<name>.jpeg``.
    """

    # Target side length (px) for the processed image.
    TARGET_SIZE = 1000

    def __init__(self, name, id_type, doc_type, f_path):
        self.name = name          # output file stem
        self.id_type = id_type    # e.g. "pan", "gst", "cin_llpin" — output subfolder
        self.doc_type = doc_type  # file extension, e.g. "pdf", "jpeg"
        self.f_path = f_path      # path to the raw uploaded file

    def _resize_and_save(self, image):
        """Resize *image* to TARGET_SIZE x TARGET_SIZE and save it as JPEG."""
        # BUG FIX: Image.resize returns a NEW image; the original code
        # discarded the result, so files were saved at their original size.
        # Also, the original computed (1000/w)*w, which can round to 999 in
        # floating point — the intent is simply a fixed 1000x1000 output.
        resized = image.resize(
            (self.TARGET_SIZE, self.TARGET_SIZE), Image.Resampling.LANCZOS
        )
        output_path = "processed_images/{}/{}.jpeg".format(self.id_type, self.name)
        resized.save(output_path)
        return {"success": 200, "output_p": output_path}

    def pdf_to_image_scale(self):
        """Render one page of the PDF to an RGB image and rescale it."""
        pdf_document = fitz.open(self.f_path)
        try:
            # GST certificates carry the relevant details on page 3 (index 2).
            page_num = 2 if self.id_type == "gst" else 0
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap()  # render page as a pixmap (image)
            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        finally:
            # BUG FIX: the original never closed the PDF, leaking the handle.
            pdf_document.close()
        return self._resize_and_save(image)

    def scale_img(self):
        """Load the image file as RGB and rescale it."""
        image = Image.open(self.f_path).convert("RGB")
        return self._resize_and_save(image)

    def process(self):
        """Dispatch on file type: PDFs are rendered first, images rescaled directly."""
        # Case-insensitive check generalizes the original "pdf"/"PDF" test.
        if self.doc_type.lower() == "pdf":
            return self.pdf_to_image_scale()
        return self.scale_img()
82
 
83
 
84
# NOTE(review): `vision` is already imported at the top of this module; this
# second import is redundant (but harmless).
from google.cloud import vision

# Module-level Vision client, created once at import time. Importing this
# module therefore requires valid Google credentials
# (GOOGLE_APPLICATION_CREDENTIALS) to be configured.
vision_client = vision.ImageAnnotatorClient()
87
+
88
+
89
def extract_document_number(ocr_text: str, id_type: str) -> str:
    """
    Search *ocr_text* for a valid document number for *id_type*.

    For id_type "cin_llpin", a CIN pattern is tried first, then LLPIN.
    Returns the matched number, or None when nothing matches.
    """
    patterns = {
        "cin": re.compile(r"([LUu]{1}[0-9]{5}[A-Za-z]{2}[0-9]{4}[A-Za-z]{3}[0-9]{6})"),
        "msme": re.compile(r"(UDYAM-[A-Z]{2}-\d{2}-\d{7})"),
        # NOTE(review): loose pattern — any "XXX-1234" token will match.
        "llpin": re.compile(r"([A-Z]{3}-[0-9]{4})"),
        # BUG FIX: PAN/Aadhaar were anchored with ^...$, which can never match
        # inside a multi-line OCR blob via .search(); use word boundaries.
        "pan": re.compile(r"\b[A-Z]{3}[PCHFTBALJGT][A-Z]\d{4}[A-Z]\b"),
        "aadhaar": re.compile(r"\b\d{12}\b"),
    }

    # BUG FIX: callers derive id_type "aadhar" from the "aadhar_file" key,
    # which did not match the "aadhaar" pattern key — Aadhaar extraction
    # always returned None. Normalize the alias here.
    id_type = {"aadhar": "aadhaar"}.get(id_type, id_type)

    if id_type == "cin_llpin":
        # Try CIN first
        match = patterns["cin"].search(ocr_text)
        if match:
            return match.group(0)
        # If CIN not found, try LLPIN
        match = patterns["llpin"].search(ocr_text)
        if match:
            return match.group(0)
    elif id_type in patterns:
        match = patterns[id_type].search(ocr_text)
        if match:
            return match.group(0)

    return None
117
+
118
+
119
def run_google_vision(file_content: bytes) -> str:
    """
    Run Google Vision OCR on raw image bytes and return the detected text.

    Returns "" when nothing is detected or when the API reports an error.
    """
    image = vision.Image(content=file_content)
    response = vision_client.text_detection(image=image)
    # The Vision API reports failures in response.error instead of raising;
    # the original code ignored this and silently returned "". Log it so
    # extraction failures are diagnosable.
    if response.error.message:
        logging.error(f"Google Vision OCR error: {response.error.message}")
        return ""
    texts = response.text_annotations
    if texts:
        # The first annotation aggregates the complete detected text.
        return texts[0].description
    return ""
130
+
131
+
132
def extract_text_from_file(file_path: str) -> str:
    """
    Read the file at *file_path* and return its OCR text via Google Vision.

    For PDFs, only the first page is rendered to a JPEG before OCR; returns
    "" when the PDF cannot be rendered.
    """
    if file_path.lower().endswith(".pdf"):
        try:
            # BUG FIX: the original never closed the PyMuPDF document,
            # leaking the file handle; the context manager closes it.
            with fitz.open(file_path) as pdf_document:
                page = pdf_document.load_page(0)  # first page only
                pix = page.get_pixmap()  # render page as an image
                # Convert pixmap to PIL Image
                image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            # Convert image to bytes for OCR
            img_byte_arr = io.BytesIO()
            image.save(img_byte_arr, format="JPEG")
            file_content = img_byte_arr.getvalue()

        except Exception as e:
            logging.error(f"Error converting PDF to image: {e}")
            return ""
    else:
        with open(file_path, "rb") as f:
            file_content = f.read()

    return run_google_vision(file_content)
160
+
161
+
162
def extract_document_number_from_file(file_path: str, id_type: str) -> str:
    """
    OCR the file at *file_path* and pull out the document number
    (CIN, MSME, or LLPIN) matching *id_type*.
    """
    ocr_output = extract_text_from_file(file_path)
    return extract_document_number(ocr_output, id_type)
168
 
169
 
 
170
  # files = {
171
  # "aadhar_file": "/home/javmulla/model_one/test_images_aadhar/test_two.jpg",
172
+ # "pan_file": "/home/javmulla/model_one/test_images_pan/6ea33087.jpeg",
173
  # "cheque_file": "/home/javmulla/model_one/test_images_cheque/0f81678a.jpeg",
174
  # "gst_file": "/home/javmulla/model_one/test_images_gst/0a52fbcb_page3_image_0.jpg"
175
  # }
 
177
 
178
  # files = {
179
  # "aadhar_file": "/home/javmulla/model_one/test_images_aadhar/test_two.jpg",
180
+ # "pan_file": "/home/javmulla/model_one/test_images_pan/6ea33087.jpeg",
181
  # "cheque_file": "/home/javmulla/model_one/test_images_cheque/0f81678a.jpeg",
182
  # "gst_file": "test_Images_folder/gst/e.pdf"
183
  # }
 
190
  # preprocessing = doc_processing(name,id_type,doc_type,f_path)
191
  # response = preprocessing.process()
192
  # print("response",response)
193
+
194
+
195
+ # id_type, doc_type, f_path