Mallisetty Siva Mahesh committed on
Commit
c04d620
·
1 Parent(s): caa039b

added msme and cin_llpin

Browse files
Files changed (2) hide show
  1. app.py +111 -55
  2. utils.py +122 -40
app.py CHANGED
@@ -11,7 +11,7 @@ from fastapi import FastAPI, HTTPException, Request
11
  from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
12
  from dotenv import load_dotenv
13
  import urllib.parse
14
- from utils import doc_processing
15
 
16
  # Load .env file
17
  load_dotenv()
@@ -143,59 +143,91 @@ for dir_path in process_dirs.values():
143
  logging.basicConfig(level=logging.INFO)
144
 
145
 
146
- # Perform Inference with optional S3 upload
147
  def perform_inference(file_paths: Dict[str, str], upload_to_s3: bool):
148
  model_dirs = {
149
  "pan_file": pan_model,
150
  "gst_file": gst_model,
151
  "cheque_file": cheque_model,
152
  }
 
153
  try:
154
  inference_results = {}
155
 
156
  for doc_type, file_path in file_paths.items():
157
- if doc_type in model_dirs:
158
- print(f"Processing {doc_type} using model at {model_dirs[doc_type]}")
159
-
160
- processed_file_p = file_path.split("&&")[0]
161
- unprocessed_file_path = file_path.split("&&")[1]
162
- images_path = [processed_file_p]
163
- inference_batch = prepare_batch_for_inference(images_path)
164
-
165
- context = model_dirs[doc_type]
166
- processor = globals()[f"processor_{doc_type.split('_')[0]}"]
167
- name = doc_type.split("_")[0]
168
- attachemnt_num = {
169
- "pan_file": 2,
170
- "gst_file": 4,
171
- "msme_file": 5,
172
- "cin_llpin_file": 6,
173
- "cheque_file": 8,
174
- }[doc_type]
175
-
176
- if upload_to_s3:
177
- client = s3_client()
178
- bucket_name = "edgekycdocs"
179
- folder_name = f"{name}docs"
180
- file_name = unprocessed_file_path.split("/")[-1]
 
 
 
 
181
  response = client.upload_file(
182
  unprocessed_file_path, bucket_name, folder_name, file_name
183
  )
184
  print("The file has been uploaded to S3 bucket", response)
185
  attachment_url = response["url"]
186
- else:
 
 
187
  attachment_url = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
- result = handle(inference_batch, context, processor, name)
190
- result["attachment_url"] = attachment_url
191
- result["detect"] = True
192
 
193
- inference_results[f"attachment_{attachemnt_num}"] = result
194
- else:
195
- print(f"Model directory not found for {doc_type}. Skipping.")
 
 
 
 
 
 
 
 
 
196
 
197
  return inference_results
198
- except:
 
 
199
  return {"status": "error", "message": "Text extraction failed."}
200
 
201
 
@@ -234,21 +266,31 @@ async def aadhar_ocr(
234
  print("file_paths", file_paths)
235
 
236
  files = {}
237
- for key, value in file_paths.items():
238
- name = value.split("/")[-1].split(".")[0]
239
- id_type = key.split("_")[0]
240
- doc_type = value.split("/")[-1].split(".")[-1]
241
- f_path = value
242
 
243
- print("variables required", name, id_type, doc_type, f_path)
244
- preprocessing = doc_processing(name, id_type, doc_type, f_path)
245
- response = preprocessing.process()
246
 
247
- print("response after preprocessing", response)
 
 
 
 
 
 
248
 
249
- files[key] = response["output_p"] + "&&" + f_path
250
- # files["unprocessed_file_path"] = f_path
251
- print("response", response)
 
 
 
 
 
 
 
 
 
 
 
252
 
253
  # Perform inference
254
  result = perform_inference(files, upload_to_s3)
@@ -307,16 +349,30 @@ async def document_ocr_s3(request: Request):
307
  logging.info(f"Downloaded files: {list(file_paths.keys())}")
308
 
309
  files = {}
310
- for key, value in file_paths.items():
311
- name = value.split("/")[-1].split(".")[0]
312
- id_type = key.split("_")[0]
313
- doc_type = value.split("/")[-1].split(".")[-1]
314
- f_path = value
315
-
316
- preprocessing = doc_processing(name, id_type, doc_type, f_path)
317
- response = preprocessing.process()
318
 
319
- files[key] = response["output_p"] + "&&" + f_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
 
321
  result = perform_inference(files, upload_to_s3)
322
 
 
11
  from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
12
  from dotenv import load_dotenv
13
  import urllib.parse
14
+ from utils import doc_processing, extract_document_number_from_file
15
 
16
  # Load .env file
17
  load_dotenv()
 
143
  logging.basicConfig(level=logging.INFO)
144
 
145
 
 
146
def perform_inference(file_paths: Dict[str, str], upload_to_s3: bool):
    """
    Run extraction for each preprocessed document and optionally upload the
    original file to S3.

    file_paths maps a document key (e.g. "pan_file") to a string of the form
    "<processed>&&<original-path>". For OCR-only documents (MSME, CIN/LLPIN,
    Aadhaar) the first part is the already-extracted document number; for
    model-backed documents (PAN, GST, cheque) it is the path of the
    preprocessed image fed to the LayoutLMv3 pipeline.

    Returns {"attachment_<n>": result, ...} on success, or
    {"status": "error", "message": ...} when anything raises.
    """
    model_dirs = {
        "pan_file": pan_model,
        "gst_file": gst_model,
        "cheque_file": cheque_model,
    }

    # Fixed attachment slot per document type (external contract with callers).
    # NOTE(review): "aadhar_file" is handled in the OCR branch below but has no
    # slot here, so Aadhaar documents are skipped as "not recognized" — confirm
    # whether that is intended.
    attachment_slots = {
        "pan_file": 2,
        "gst_file": 4,
        "msme_file": 5,
        "cin_llpin_file": 6,
        "cheque_file": 8,
    }

    try:
        inference_results = {}

        for doc_type, file_path in file_paths.items():
            # Split once instead of calling split("&&") twice as before.
            parts = file_path.split("&&")
            processed_file_p = parts[0]  # extracted number or processed image
            unprocessed_file_path = parts[1]  # original file path

            print(f"Processing {doc_type}: {processed_file_p}")

            attachment_num = attachment_slots.get(doc_type)
            if attachment_num is None:
                print(f"Skipping {doc_type}, not recognized.")
                continue

            # Upload the original file to S3 if requested; upload failures
            # degrade to a null URL rather than aborting the whole batch.
            attachment_url = None
            if upload_to_s3:
                client = s3_client()
                bucket_name = "edgekycdocs"
                folder_name = f"{doc_type.split('_')[0]}docs"
                file_name = unprocessed_file_path.split("/")[-1].replace(" ", "_")
                try:
                    response = client.upload_file(
                        unprocessed_file_path, bucket_name, folder_name, file_name
                    )
                    print("The file has been uploaded to S3 bucket", response)
                    attachment_url = response["url"]
                    print(f"File uploaded to S3: {attachment_url}")
                except Exception as e:
                    print(f"Failed to upload {file_name} to S3: {e}")
                    attachment_url = None

            if doc_type in ["msme_file", "cin_llpin_file", "aadhar_file"]:
                # OCR-based extraction: the document number was already
                # extracted upstream and travels in processed_file_p.
                result = {
                    "attachment_num": processed_file_p,
                    "attachment_url": attachment_url,
                    "attachment_status": 200,
                    "detect": True,
                }
            elif doc_type in model_dirs:
                # Documents that need ML model inference (PAN, GST, cheque).
                print(
                    f"Running ML inference for {doc_type} using {model_dirs[doc_type]}"
                )
                inference_batch = prepare_batch_for_inference([processed_file_p])
                context = model_dirs[doc_type]
                name = doc_type.split("_")[0]
                processor = globals()[f"processor_{name}"]

                result = handle(inference_batch, context, processor, name)
                result["attachment_url"] = attachment_url
                result["detect"] = True
            else:
                print(f"No model found for {doc_type}, skipping inference.")
                continue

            inference_results[f"attachment_{attachment_num}"] = result

        return inference_results

    except Exception:
        # logging.exception keeps the traceback the bare print used to drop.
        logging.exception("Error in perform_inference")
        return {"status": "error", "message": "Text extraction failed."}
232
 
233
 
 
266
  print("file_paths", file_paths)
267
 
268
  files = {}
 
 
 
 
 
269
 
270
+ for key, f_path in file_paths.items():
 
 
271
 
272
+ name = os.path.splitext(os.path.basename(f_path))[0]
273
+ # Determine id_type: for cin_llpin_file, explicitly set id_type to "cin_llpin"
274
+ if key == "cin_llpin_file":
275
+ id_type = "cin_llpin"
276
+ else:
277
+ id_type = key.split("_")[0]
278
+ doc_type = os.path.splitext(f_path)[-1].lstrip(".")
279
 
280
+ if key in ["msme_file", "cin_llpin_file", "aadhar_file"]:
281
+ extracted_number = extract_document_number_from_file(f_path, id_type)
282
+ if not extracted_number:
283
+ logging.error(f"Failed to extract document number from {f_path}")
284
+ raise HTTPException(
285
+ status_code=400, detail=f"Invalid document format in {key}"
286
+ )
287
+ files[key] = extracted_number + "&&" + f_path
288
+ print("files", files[key])
289
+ else:
290
+ # For other files, use existing preprocessing.
291
+ preprocessing = doc_processing(name, id_type, doc_type, f_path)
292
+ response = preprocessing.process()
293
+ files[key] = response["output_p"] + "&&" + f_path
294
 
295
  # Perform inference
296
  result = perform_inference(files, upload_to_s3)
 
349
  logging.info(f"Downloaded files: {list(file_paths.keys())}")
350
 
351
  files = {}
 
 
 
 
 
 
 
 
352
 
353
+ for key, f_path in file_paths.items():
354
+ name = f_path.split("/")[-1].split(".")[0]
355
+ if key == "cin_llpin_file":
356
+ id_type = "cin_llpin"
357
+ else:
358
+ id_type = key.split("_")[0]
359
+ # id_type = key.split("_")[0]
360
+ doc_type = f_path.split("/")[-1].split(".")[-1]
361
+
362
+ # For MSME and CIN/LLPIN files, extract document number via OCR and regex
363
+ if key in ["msme_file", "cin_llpin_file", "aadhar_file"]:
364
+ extracted_number = extract_document_number_from_file(f_path, id_type)
365
+ if not extracted_number:
366
+ logging.error(f"Failed to extract document number from {f_path}")
367
+ raise HTTPException(
368
+ status_code=400, detail=f"Invalid document format in {key}"
369
+ )
370
+ files[key] = extracted_number + "&&" + f_path
371
+ else:
372
+ # For other documents, use the existing ML model preprocessing
373
+ preprocessing = doc_processing(name, id_type, doc_type, f_path)
374
+ response = preprocessing.process()
375
+ files[key] = response["output_p"] + "&&" + f_path
376
 
377
  result = perform_inference(files, upload_to_s3)
378
 
utils.py CHANGED
@@ -1,71 +1,75 @@
1
  import fitz
2
  from PIL import Image
 
 
 
 
 
 
 
 
 
3
 
4
  class doc_processing:
5
 
6
  def __init__(self, name, id_type, doc_type, f_path):
7
-
8
  self.name = name
9
  self.id_type = id_type
10
  self.doc_type = doc_type
11
  self.f_path = f_path
12
  # self.o_path = o_path
13
-
14
-
15
  def pdf_to_image_scale(self):
16
  pdf_document = fitz.open(self.f_path)
17
  if self.id_type == "gst":
18
  page_num = 2
19
  else:
20
  page_num = 0
21
-
22
  page = pdf_document.load_page(page_num)
23
  pix = page.get_pixmap() # Render page as a pixmap (image)
24
-
25
  # Convert pixmap to PIL Image
26
  image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
27
-
28
  original_width, original_height = image.size
29
-
30
- print("original_width",original_width)
31
- print("original_height",original_height)
32
 
 
 
33
 
34
  new_width = (1000 / original_width) * original_width
35
  new_height = (1000 / original_height) * original_height
36
-
37
- print("new_width",new_width)
38
- print("new_height",new_height)
39
- # new_width =
40
- # new_height =
41
  image.resize((int(new_width), int(new_height)), Image.Resampling.LANCZOS)
42
- output_path = "processed_images/{}/{}.jpeg".format(self.id_type,self.name)
43
  image.save(output_path)
44
- return {"success":200,"output_p":output_path}
45
-
46
 
47
  def scale_img(self):
48
-
49
 
50
- print("path of file",self.f_path)
51
  image = Image.open(self.f_path).convert("RGB")
52
  original_width, original_height = image.size
53
-
54
- print("original_width",original_width)
55
- print("original_height",original_height)
56
 
 
 
57
 
58
  new_width = (1000 / original_width) * original_width
59
  new_height = (1000 / original_height) * original_height
60
-
61
- print("new_width",new_width)
62
- print("new_height",new_height)
63
- # new_width =
64
- # new_height =
65
  image.resize((int(new_width), int(new_height)), Image.Resampling.LANCZOS)
66
- output_path = "processed_images/{}/{}.jpeg".format(self.id_type,self.name)
67
  image.save(output_path)
68
- return {"success":200,"output_p":output_path}
69
 
70
  def process(self):
71
  if self.doc_type == "pdf":
@@ -76,12 +80,95 @@ class doc_processing:
76
  return response
77
 
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
 
81
-
82
  # files = {
83
  # "aadhar_file": "/home/javmulla/model_one/test_images_aadhar/test_two.jpg",
84
- # "pan_file": "/home/javmulla/model_one/test_images_pan/6ea33087.jpeg",
85
  # "cheque_file": "/home/javmulla/model_one/test_images_cheque/0f81678a.jpeg",
86
  # "gst_file": "/home/javmulla/model_one/test_images_gst/0a52fbcb_page3_image_0.jpg"
87
  # }
@@ -89,7 +176,7 @@ class doc_processing:
89
 
90
  # files = {
91
  # "aadhar_file": "/home/javmulla/model_one/test_images_aadhar/test_two.jpg",
92
- # "pan_file": "/home/javmulla/model_one/test_images_pan/6ea33087.jpeg",
93
  # "cheque_file": "/home/javmulla/model_one/test_images_cheque/0f81678a.jpeg",
94
  # "gst_file": "test_Images_folder/gst/e.pdf"
95
  # }
@@ -102,11 +189,6 @@ class doc_processing:
102
  # preprocessing = doc_processing(name,id_type,doc_type,f_path)
103
  # response = preprocessing.process()
104
  # print("response",response)
105
-
106
-
107
-
108
-
109
-
110
- # id_type, doc_type, f_path
111
-
112
-
 
1
  import fitz
2
  from PIL import Image
3
+ import re
4
+ import io
5
+ import os
6
+ import logging
7
+ import shutil
8
+ from fastapi import FastAPI, UploadFile, File, HTTPException
9
+ from google.cloud import vision
10
+ from pdf2image import convert_from_path
11
+
12
 
13
class doc_processing:
    """Preprocess KYC documents (PDF or image) into fixed-size JPEGs for inference.

    The processed file is written to processed_images/<id_type>/<name>.jpeg
    and the path is returned so callers can feed it to the model pipeline.
    """

    # Width/height the downstream models expect. The original code computed
    # (1000 / w) * w per axis, which always evaluates to 1000.
    TARGET_SIZE = (1000, 1000)

    def __init__(self, name, id_type, doc_type, f_path):
        """Store the document identity and location.

        name: output file stem; id_type: document category (e.g. "gst");
        doc_type: file extension (e.g. "pdf"); f_path: input file path.
        """
        self.name = name
        self.id_type = id_type
        self.doc_type = doc_type
        self.f_path = f_path

    def _scale_and_save(self, image):
        """Resize to TARGET_SIZE and save as JPEG; return the status dict."""
        # Image.resize returns a NEW image — the original discarded it and
        # saved the unresized source, so the output was never scaled.
        resized = image.resize(self.TARGET_SIZE, Image.Resampling.LANCZOS)
        output_path = "processed_images/{}/{}.jpeg".format(self.id_type, self.name)
        # Create the per-id_type directory on first use.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        resized.save(output_path)
        return {"success": 200, "output_p": output_path}

    def pdf_to_image_scale(self):
        """Rasterize one PDF page and write it as a scaled JPEG."""
        pdf_document = fitz.open(self.f_path)
        try:
            # GST certificates carry the relevant data on page 3 (index 2).
            page_num = 2 if self.id_type == "gst" else 0
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap()  # render the page as a pixmap (image)
            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        finally:
            # The original leaked the document handle.
            pdf_document.close()
        return self._scale_and_save(image)

    def scale_img(self):
        """Load an image file and write it back as a scaled JPEG."""
        image = Image.open(self.f_path).convert("RGB")
        return self._scale_and_save(image)

    def process(self):
        """Dispatch to the PDF or plain-image pipeline based on doc_type."""
        if self.doc_type == "pdf":
            response = self.pdf_to_image_scale()
        else:
            response = self.scale_img()
        return response
81
 
82
 
83
# Shared Google Vision OCR client used by run_google_vision(). The `vision`
# module is already imported at the top of this file, so the duplicate local
# import was dropped.
# NOTE(review): building the client at import time requires
# GOOGLE_APPLICATION_CREDENTIALS to be configured before this module loads.
vision_client = vision.ImageAnnotatorClient()
86
+
87
+
88
def extract_document_number(ocr_text: str, id_type: str) -> Optional[str]:
    """
    Search OCR text for a valid document number based on regex patterns.

    For id_type "cin_llpin", CIN is tried first, then LLPIN. Returns the
    first match, or None when no pattern matches.
    """
    # Callers derive the id_type from the upload key ("aadhar_file" ->
    # "aadhar"), but the pattern table used "aadhaar", so Aadhaar lookups
    # always missed. Normalize the spelling here.
    if id_type == "aadhar":
        id_type = "aadhaar"

    patterns = {
        "cin": re.compile(r"([LUu][0-9]{5}[A-Za-z]{2}[0-9]{4}[A-Za-z]{3}[0-9]{6})"),
        "msme": re.compile(r"(UDYAM-[A-Z]{2}-\d{2}-\d{7})"),
        "llpin": re.compile(r"([A-Z]{3}-[0-9]{4})"),
        # Word-bounded, not ^...$: the original anchors could only match when
        # the entire OCR text was exactly the number, which never happens for
        # a full-page OCR dump searched with .search().
        "pan": re.compile(r"\b[A-Z]{3}[PCHFTBALJGT][A-Z]\d{4}[A-Z]\b"),
        "aadhaar": re.compile(r"\b\d{12}\b"),
    }

    if id_type == "cin_llpin":
        # Try CIN first, then fall back to LLPIN.
        for key in ("cin", "llpin"):
            match = patterns[key].search(ocr_text)
            if match:
                return match.group(0)
    elif id_type in patterns:
        match = patterns[id_type].search(ocr_text)
        if match:
            return match.group(0)

    return None
116
+
117
+
118
def run_google_vision(file_content: bytes) -> str:
    """Run Google Vision OCR over raw image bytes and return the detected text.

    Returns "" when the API reports no text annotations.
    """
    img = vision.Image(content=file_content)
    annotations = vision_client.text_detection(image=img).text_annotations
    # The first annotation aggregates the complete detected text.
    return annotations[0].description if annotations else ""
129
+
130
+
131
+ def extract_text_from_file(file_path: str) -> str:
132
+ """
133
+ Reads the file from file_path. If it's a PDF, converts only the first page to an image,
134
+ then runs OCR using Google Vision.
135
+ """
136
+ if file_path.lower().endswith(".pdf"):
137
+ try:
138
+ # Open the PDF file using PyMuPDF (fitz)
139
+ pdf_document = fitz.open(file_path)
140
+ page = pdf_document.load_page(0) # Load the first page
141
+ pix = page.get_pixmap() # Render page as an image
142
+
143
+ # Convert pixmap to PIL Image
144
+ image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
145
+
146
+ # Convert image to bytes for OCR
147
+ img_byte_arr = io.BytesIO()
148
+ image.save(img_byte_arr, format="JPEG")
149
+ file_content = img_byte_arr.getvalue()
150
+
151
+ except Exception as e:
152
+ logging.error(f"Error converting PDF to image: {e}")
153
+ return ""
154
+ else:
155
+ with open(file_path, "rb") as f:
156
+ file_content = f.read()
157
+
158
+ return run_google_vision(file_content)
159
+
160
+
161
def extract_document_number_from_file(file_path: str, id_type: str) -> str:
    """Extract the document number (CIN, MSME, or LLPIN) from the file at file_path."""
    # OCR the file, then pattern-match the number for the given id_type.
    return extract_document_number(extract_text_from_file(file_path), id_type)
167
 
168
 
 
169
  # files = {
170
  # "aadhar_file": "/home/javmulla/model_one/test_images_aadhar/test_two.jpg",
171
+ # "pan_file": "/home/javmulla/model_one/test_images_pan/6ea33087.jpeg",
172
  # "cheque_file": "/home/javmulla/model_one/test_images_cheque/0f81678a.jpeg",
173
  # "gst_file": "/home/javmulla/model_one/test_images_gst/0a52fbcb_page3_image_0.jpg"
174
  # }
 
176
 
177
  # files = {
178
  # "aadhar_file": "/home/javmulla/model_one/test_images_aadhar/test_two.jpg",
179
+ # "pan_file": "/home/javmulla/model_one/test_images_pan/6ea33087.jpeg",
180
  # "cheque_file": "/home/javmulla/model_one/test_images_cheque/0f81678a.jpeg",
181
  # "gst_file": "test_Images_folder/gst/e.pdf"
182
  # }
 
189
  # preprocessing = doc_processing(name,id_type,doc_type,f_path)
190
  # response = preprocessing.process()
191
  # print("response",response)
192
+
193
+
194
+ # id_type, doc_type, f_path