Mallisetty Siva Mahesh committed on
Commit 6413971 · 1 Parent(s): 47d6c5f

added code for MSME CIN/LLPIN

Files changed (2):
  1. app.py +115 -389
  2. utils.py +122 -40
app.py CHANGED
@@ -1,335 +1,3 @@
-# from fastapi import FastAPI, File, UploadFile, HTTPException
-# from fastapi.middleware.cors import CORSMiddleware
-# from typing import Dict
-# import os
-# import shutil
-# import logging
-# from s3_setup import s3_client
-
-# import torch
-# from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
-
-# from dotenv import load_dotenv
-# import os
-
-# from utils import doc_processing
-
-# # Load .env file
-# load_dotenv()
-
-# # Access variables
-# dummy_key = os.getenv("dummy_key")
-# HUGGINGFACE_AUTH_TOKEN = dummy_key
-
-# # Hugging Face model and token
-# aadhar_model = "AuditEdge/doc_ocr_a"  # Replace with your fine-tuned model if applicable
-# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# print(f"Using device: {device}")
-
-# # Load the processor (tokenizer + image processor)
-# processor_aadhar = LayoutLMv3Processor.from_pretrained(
-#     aadhar_model,
-#     use_auth_token=HUGGINGFACE_AUTH_TOKEN
-# )
-# aadhar_model = LayoutLMv3ForTokenClassification.from_pretrained(
-#     aadhar_model,
-#     use_auth_token=HUGGINGFACE_AUTH_TOKEN
-# )
-
-# aadhar_model = aadhar_model.to(device)
-
-# # pan model
-# pan_model = "AuditEdge/doc_ocr_p"  # Replace with your fine-tuned model if applicable
-# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# print(f"Using device: {device}")
-
-# # Load the processor (tokenizer + image processor)
-# processor_pan = LayoutLMv3Processor.from_pretrained(
-#     pan_model,
-#     use_auth_token=HUGGINGFACE_AUTH_TOKEN
-# )
-# pan_model = LayoutLMv3ForTokenClassification.from_pretrained(
-#     pan_model,
-#     use_auth_token=HUGGINGFACE_AUTH_TOKEN
-# )
-# pan_model = pan_model.to(device)
-
-# # gst model
-# gst_model = "AuditEdge/doc_ocr_new_g"  # Replace with your fine-tuned model if applicable
-# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# print(f"Using device: {device}")
-
-# # Load the processor (tokenizer + image processor)
-# processor_gst = LayoutLMv3Processor.from_pretrained(
-#     gst_model,
-#     use_auth_token=HUGGINGFACE_AUTH_TOKEN
-# )
-# gst_model = LayoutLMv3ForTokenClassification.from_pretrained(
-#     gst_model,
-#     use_auth_token=HUGGINGFACE_AUTH_TOKEN
-# )
-# gst_model = gst_model.to(device)
-
-# # cheque model
-# cheque_model = "AuditEdge/doc_ocr_new_c"  # Replace with your fine-tuned model if applicable
-# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# print(f"Using device: {device}")
-
-# # Load the processor (tokenizer + image processor)
-# processor_cheque = LayoutLMv3Processor.from_pretrained(
-#     cheque_model,
-#     use_auth_token=HUGGINGFACE_AUTH_TOKEN
-# )
-# cheque_model = LayoutLMv3ForTokenClassification.from_pretrained(
-#     cheque_model,
-#     use_auth_token=HUGGINGFACE_AUTH_TOKEN
-# )
-# cheque_model = cheque_model.to(device)
-
-# # Verify model and processor are loaded
-# print("Model and processor loaded successfully!")
-# print(f"Model is on device: {next(aadhar_model.parameters()).device}")
-
-# # Import inference modules
-# from layoutlmv3FineTuning.Layoutlm_inference.ocr import prepare_batch_for_inference
-# from layoutlmv3FineTuning.Layoutlm_inference.inference_handler import handle
-
-# # Create FastAPI instance
-# app = FastAPI(debug=True)
-
-# # Enable CORS
-# app.add_middleware(
-#     CORSMiddleware,
-#     allow_origins=["*"],
-#     allow_credentials=True,
-#     allow_methods=["*"],
-#     allow_headers=["*"],
-# )
-
-# # Configure directories
-# UPLOAD_FOLDER = './uploads/'
-# processing_folder = "./processed_images"
-# os.makedirs(UPLOAD_FOLDER, exist_ok=True)  # Ensure the main upload folder exists
-# os.makedirs(processing_folder, exist_ok=True)
-
-# UPLOAD_DIRS = {
-#     "aadhar_file": "uploads/aadhar/",
-#     "pan_file": "uploads/pan/",
-#     "cheque_file": "uploads/cheque/",
-#     "gst_file": "uploads/gst/",
-# }
-
-# process_dirs = {
-#     "aadhar_file": "processed_images/aadhar/",
-#     "pan_file": "processed_images/pan/",
-#     "cheque_file": "processed_images/cheque/",
-#     "gst_file": "processed_images/gst/",
-# }
-
-# # Ensure individual directories exist
-# for dir_path in UPLOAD_DIRS.values():
-#     os.makedirs(dir_path, exist_ok=True)
-
-# for dir_path in process_dirs.values():
-#     os.makedirs(dir_path, exist_ok=True)
-
-# # Logger configuration
-# logging.basicConfig(level=logging.INFO)
-
-# # Perform Inference
-# def perform_inference(file_paths: Dict[str, str]):
-#     # Dictionary to map document types to their respective model directories
-#     model_dirs = {
-#         "aadhar_file": aadhar_model,
-#         "pan_file": pan_model,
-#         "cheque_file": cheque_model,
-#         "gst_file": gst_model,
-#     }
-#     try:
-#         # Dictionary to store results for each document type
-#         inference_results = {}
-
-#         # Loop through the file paths and perform inference
-#         for doc_type, file_path in file_paths.items():
-#             if doc_type in model_dirs:
-#                 print(f"Processing {doc_type} using model at {model_dirs[doc_type]}")
-
-#                 # Prepare batch for inference
-#                 processed_file_p = file_path.split("&&")[0]
-#                 unprocessed_file_path = file_path.split("&&")[1]
-
-#                 images_path = [processed_file_p]
-#                 inference_batch = prepare_batch_for_inference(images_path)
-
-#                 # Prepare context for the specific document type
-#                 # context = {"model_dir": model_dirs[doc_type]}
-#                 # initialize s3 client
-#                 client = s3_client()
-
-#                 local_file_path = unprocessed_file_path
-#                 bucket_name = "edgekycdocs"
-
-#                 file_name = unprocessed_file_path.split("/")[-1]
-
-#                 # context = aadhar_model
-#                 if doc_type == "aadhar_file":
-#                     context = aadhar_model
-#                     processor = processor_aadhar
-#                     name = "aadhar"
-#                     attachemnt_num = 3
-#                     folder_name = "aadhardocs"
-
-#                 if doc_type == "pan_file":
-#                     context = pan_model
-#                     processor = processor_pan
-#                     name = "pan"
-#                     attachemnt_num = 2
-#                     folder_name = "pandocs"
-
-#                 if doc_type == "gst_file":
-#                     context = gst_model
-#                     processor = processor_gst
-#                     name = "gst"
-#                     attachemnt_num = 4
-#                     folder_name = "gstdocs"
-
-#                 if doc_type == "cheque_file":
-#                     context = cheque_model
-#                     processor = processor_cheque
-#                     name = "cheque"
-#                     attachemnt_num = 8
-#                     folder_name = "bankchequedocs"
-
-#                 # upload the document to s3 bucket here
-#                 print("this is folder name", folder_name)
-#                 response = client.upload_file(local_file_path, bucket_name, folder_name, file_name)
-#                 print("The file has been uploaded to s3 bucket", response)
-
-#                 # Perform inference (replace `handle` with your actual function)
-#                 result = handle(inference_batch, context, processor, name)
-#                 # result["attachment_url": response["url"]]
-#                 result["attachment_url"] = response["url"]
-#                 result["detect"] = True
-
-#                 print("result required", result)
-
-#                 # if result[""]
-
-#                 # Store the result
-#                 inference_results["attachment_{}".format(attachemnt_num)] = result
-#             else:
-#                 print(f"Model directory not found for {doc_type}. Skipping.")
-#                 # print(Javed)
-
-#         return inference_results
-#     except:
-#         return {
-#             "status": "error",
-#             "message": "Text extraction failed."
-#         }
-
-# # Routes
-# @app.get("/")
-# def greet_json():
-#     return {"Hello": "World!"}
-
-# @app.post("/api/aadhar_ocr")
-# async def aadhar_ocr(
-#     aadhar_file: UploadFile = File(None),
-#     pan_file: UploadFile = File(None),
-#     cheque_file: UploadFile = File(None),
-#     gst_file: UploadFile = File(None),
-# ):
-#     # try:
-#     # Handle file uploads
-#     file_paths = {}
-#     for file_type, folder in UPLOAD_DIRS.items():
-#         file = locals()[file_type]  # Dynamically access the file arguments
-#         if file:
-#             # Save the file in the respective directory
-#             file_path = os.path.join(folder, file.filename)
-
-#             print("this is the filename", file.filename)
-#             with open(file_path, "wb") as buffer:
-#                 shutil.copyfileobj(file.file, buffer)
-#             file_paths[file_type] = file_path
-
-#     # Log received files
-#     logging.info(f"Received files: {list(file_paths.keys())}")
-#     print("file_paths", file_paths)
-
-#     files = {}
-#     for key, value in file_paths.items():
-#         name = value.split("/")[-1].split(".")[0]
-#         id_type = key.split("_")[0]
-#         doc_type = value.split("/")[-1].split(".")[-1]
-#         f_path = value
-
-#         print("variables required", name, id_type, doc_type, f_path)
-#         preprocessing = doc_processing(name, id_type, doc_type, f_path)
-#         response = preprocessing.process()
-
-#         print("response after preprocessing", response)
-
-#         files[key] = response["output_p"] + "&&" + f_path
-#         # files["unprocessed_file_path"] = f_path
-#         print("response", response)
-
-#     # Perform inference
-#     result = perform_inference(files)
-
-#     print("this is the result we got", result)
-#     if "status" in list(result.keys()):
-#         raise Exception("Custom error message")
-#     # if result["status"] == "error":
-
-#     return {"status": "success", "result": result}
-
-#     # except Exception as e:
-#     #     logging.error(f"Error processing files: {e}")
-#     #     # raise HTTPException(status_code=500, detail="Internal Server Error")
-#     #     return {
-#     #         "status": 400,
-#     #         "message": "Text extraction failed."
-#     #     }
-
 from fastapi import FastAPI, File, UploadFile, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from typing import Dict
@@ -343,7 +11,7 @@ from fastapi import FastAPI, HTTPException, Request
 from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
 from dotenv import load_dotenv
 import urllib.parse
-from utils import doc_processing
+from utils import doc_processing, extract_document_number_from_file
 
 # Load .env file
 load_dotenv()
@@ -475,59 +143,95 @@ for dir_path in process_dirs.values():
 logging.basicConfig(level=logging.INFO)
 
 
-# Perform Inference with optional S3 upload
 def perform_inference(file_paths: Dict[str, str], upload_to_s3: bool):
     model_dirs = {
         "pan_file": pan_model,
         "gst_file": gst_model,
        "cheque_file": cheque_model,
     }
+
     try:
         inference_results = {}
 
         for doc_type, file_path in file_paths.items():
-            if doc_type in model_dirs:
-                print(f"Processing {doc_type} using model at {model_dirs[doc_type]}")
-
-                processed_file_p = file_path.split("&&")[0]
-                unprocessed_file_path = file_path.split("&&")[1]
-                images_path = [processed_file_p]
-                inference_batch = prepare_batch_for_inference(images_path)
-
-                context = model_dirs[doc_type]
-                processor = globals()[f"processor_{doc_type.split('_')[0]}"]
-                name = doc_type.split("_")[0]
-                attachemnt_num = {
-                    "pan_file": 2,
-                    "gst_file": 4,
-                    "msme_file": 5,
-                    "cin_llpin_file": 6,
-                    "cheque_file": 8,
-                }[doc_type]
-
-                if upload_to_s3:
-                    client = s3_client()
-                    bucket_name = "edgekycdocs"
-                    folder_name = f"{name}docs"
-                    file_name = unprocessed_file_path.split("/")[-1]
+            processed_file_p = file_path.split("&&")[
+                0
+            ]  # Extracted document number or processed image
+            unprocessed_file_path = file_path.split("&&")[1]  # Original file path
+
+            print(f"Processing {doc_type}: {processed_file_p}")
+
+            # Determine the attachment number based on the document type
+            attachment_num = {
+                "pan_file": 2,
+                "gst_file": 4,
+                "msme_file": 5,
+                "cin_llpin_file": 6,
+                "cheque_file": 8,
+            }.get(doc_type, None)
+
+            if attachment_num is None:
+                print(f"Skipping {doc_type}, not recognized.")
+                continue
+
+            # Upload file to S3 if required
+            if upload_to_s3:
+                client = s3_client()
+                bucket_name = "edgekycdocs"
+                if doc_type == "cin_llpin":
+                    folder_name = f"{doc_type.replace('_', '')}docs"
+                else:
+                    folder_name = f"{doc_type.split('_')[0]}docs"
+
+                file_name = unprocessed_file_path.split("/")[-1].replace(" ", "_")
+
+                try:
                     response = client.upload_file(
                         unprocessed_file_path, bucket_name, folder_name, file_name
                     )
                     print("The file has been uploaded to S3 bucket", response)
                     attachment_url = response["url"]
-                else:
+                    print(f"File uploaded to S3: {attachment_url}")
+                except Exception as e:
+                    print(f"Failed to upload {file_name} to S3: {e}")
                     attachment_url = None
+            else:
+                attachment_url = None
 
-                result = handle(inference_batch, context, processor, name)
-                result["attachment_url"] = attachment_url
-                result["detect"] = True
-
-                inference_results[f"attachment_{attachemnt_num}"] = result
-            else:
-                print(f"Model directory not found for {doc_type}. Skipping.")
+            # If it's an OCR-based extraction (CIN, MSME, LLPIN, PAN, Aadhaar), return the extracted number
+            if doc_type in ["msme_file", "cin_llpin_file", "aadhar_file"]:
+                result = {
+                    "attachment_num": processed_file_p,  # Extracted CIN, LLPIN, MSME, PAN, or Aadhaar number
+                    "attachment_url": attachment_url,
+                    "attachment_status": 200,
+                    "detect": True,
+                }
+            else:
+                # If the document needs ML model inference (PAN, GST, Cheque)
+                if doc_type in model_dirs:
+                    print(
+                        f"Running ML inference for {doc_type} using {model_dirs[doc_type]}"
+                    )
+
+                    images_path = [processed_file_p]
+                    inference_batch = prepare_batch_for_inference(images_path)
+
+                    context = model_dirs[doc_type]
+                    processor = globals()[f"processor_{doc_type.split('_')[0]}"]
+                    name = doc_type.split("_")[0]
+
+                    result = handle(inference_batch, context, processor, name)
+                    result["attachment_url"] = attachment_url
+                    result["detect"] = True
+                else:
+                    print(f"No model found for {doc_type}, skipping inference.")
+                    continue
+
+            inference_results[f"attachment_{attachment_num}"] = result
 
         return inference_results
-    except:
+
+    except Exception as e:
+        print(f"Error in perform_inference: {e}")
         return {"status": "error", "message": "Text extraction failed."}
@@ -566,21 +270,31 @@ async def aadhar_ocr(
     print("file_paths", file_paths)
 
     files = {}
-    for key, value in file_paths.items():
-        name = value.split("/")[-1].split(".")[0]
-        id_type = key.split("_")[0]
-        doc_type = value.split("/")[-1].split(".")[-1]
-        f_path = value
 
-        print("variables required", name, id_type, doc_type, f_path)
-        preprocessing = doc_processing(name, id_type, doc_type, f_path)
-        response = preprocessing.process()
+    for key, f_path in file_paths.items():
 
-        print("response after preprocessing", response)
+        name = os.path.splitext(os.path.basename(f_path))[0]
+        # Determine id_type: for cin_llpin_file, explicitly set id_type to "cin_llpin"
+        if key == "cin_llpin_file":
+            id_type = "cin_llpin"
+        else:
+            id_type = key.split("_")[0]
+        doc_type = os.path.splitext(f_path)[-1].lstrip(".")
 
-        files[key] = response["output_p"] + "&&" + f_path
-        # files["unprocessed_file_path"] = f_path
-        print("response", response)
+        if key in ["msme_file", "cin_llpin_file", "aadhar_file"]:
+            extracted_number = extract_document_number_from_file(f_path, id_type)
+            if not extracted_number:
+                logging.error(f"Failed to extract document number from {f_path}")
+                raise HTTPException(
+                    status_code=400, detail=f"Invalid document format in {key}"
+                )
+            files[key] = extracted_number + "&&" + f_path
+            print("files", files[key])
+        else:
+            # For other files, use existing preprocessing.
+            preprocessing = doc_processing(name, id_type, doc_type, f_path)
+            response = preprocessing.process()
+            files[key] = response["output_p"] + "&&" + f_path
 
     # Perform inference
     result = perform_inference(files, upload_to_s3)
@@ -639,16 +353,30 @@ async def document_ocr_s3(request: Request):
     logging.info(f"Downloaded files: {list(file_paths.keys())}")
 
     files = {}
-    for key, value in file_paths.items():
-        name = value.split("/")[-1].split(".")[0]
-        id_type = key.split("_")[0]
-        doc_type = value.split("/")[-1].split(".")[-1]
-        f_path = value
-
-        preprocessing = doc_processing(name, id_type, doc_type, f_path)
-        response = preprocessing.process()
 
-        files[key] = response["output_p"] + "&&" + f_path
+    for key, f_path in file_paths.items():
+        name = f_path.split("/")[-1].split(".")[0]
+        if key == "cin_llpin_file":
+            id_type = "cin_llpin"
+        else:
+            id_type = key.split("_")[0]
+        # id_type = key.split("_")[0]
+        doc_type = f_path.split("/")[-1].split(".")[-1]
+
+        # For MSME and CIN/LLPIN files, extract document number via OCR and regex
+        if key in ["msme_file", "cin_llpin_file", "aadhar_file"]:
+            extracted_number = extract_document_number_from_file(f_path, id_type)
+            if not extracted_number:
+                logging.error(f"Failed to extract document number from {f_path}")
+                raise HTTPException(
+                    status_code=400, detail=f"Invalid document format in {key}"
+                )
+            files[key] = extracted_number + "&&" + f_path
+        else:
+            # For other documents, use the existing ML model preprocessing
+            preprocessing = doc_processing(name, id_type, doc_type, f_path)
+            response = preprocessing.process()
+            files[key] = response["output_p"] + "&&" + f_path
 
     result = perform_inference(files, upload_to_s3)
@@ -656,5 +384,3 @@ async def document_ocr_s3(request: Request):
         raise HTTPException(status_code=500, detail="Custom error message")
 
     return {"status": "success", "result": result}
-
-print("hello")
 
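For context on the refactor above: the route handlers and perform_inference now exchange a single packed string per document, "<extracted number or processed image path>&&<original file path>", and results are keyed by a fixed attachment-number map. The following standalone sketch is not part of the commit; the sample paths and identifiers are invented, and in the real code the number comes from extract_document_number_from_file and the URL from s3_client().upload_file.

# Sketch of the "&&" packing convention between the route handlers
# and perform_inference. All sample values below are made up.
ATTACHMENT_NUM = {
    "pan_file": 2,
    "gst_file": 4,
    "msme_file": 5,
    "cin_llpin_file": 6,
    "cheque_file": 8,
}

# Handler side: OCR-extracted documents pack the identifier itself;
# model-based documents pack the preprocessed image path.
files = {
    "msme_file": "UDYAM-AP-06-0012345" + "&&" + "uploads/msme/certificate.pdf",
    "cheque_file": "processed_images/cheque/sample.jpeg" + "&&" + "uploads/cheque/sample.jpeg",
}

# perform_inference side: unpack and branch on the document type.
for doc_type, packed in files.items():
    processed, original = packed.split("&&")
    num = ATTACHMENT_NUM.get(doc_type)
    if num is None:
        continue  # unrecognized document types are skipped
    if doc_type in ("msme_file", "cin_llpin_file", "aadhar_file"):
        # The identifier was already extracted by OCR; no model inference needed.
        result = {"attachment_num": processed, "attachment_status": 200, "detect": True}
    else:
        # PAN, GST, and cheque documents go through LayoutLMv3 inference instead.
        result = {"note": f"would run LayoutLMv3 inference on {processed}"}
    print(f"attachment_{num}:", result)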
utils.py CHANGED
@@ -1,71 +1,75 @@
 import fitz
 from PIL import Image
+import re
+import io
+import os
+import logging
+import shutil
+from fastapi import FastAPI, UploadFile, File, HTTPException
+from google.cloud import vision
+from pdf2image import convert_from_path
 
 
 class doc_processing:
 
     def __init__(self, name, id_type, doc_type, f_path):
 
         self.name = name
         self.id_type = id_type
         self.doc_type = doc_type
         self.f_path = f_path
         # self.o_path = o_path
 
     def pdf_to_image_scale(self):
         pdf_document = fitz.open(self.f_path)
         if self.id_type == "gst":
             page_num = 2
         else:
             page_num = 0
 
         page = pdf_document.load_page(page_num)
         pix = page.get_pixmap()  # Render page as a pixmap (image)
 
         # Convert pixmap to PIL Image
         image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
 
         original_width, original_height = image.size
 
-        print("original_width",original_width)
-        print("original_height",original_height)
+        print("original_width", original_width)
+        print("original_height", original_height)
 
         new_width = (1000 / original_width) * original_width
         new_height = (1000 / original_height) * original_height
 
-        print("new_width",new_width)
-        print("new_height",new_height)
+        print("new_width", new_width)
+        print("new_height", new_height)
         # new_width =
         # new_height =
         image.resize((int(new_width), int(new_height)), Image.Resampling.LANCZOS)
-        output_path = "processed_images/{}/{}.jpeg".format(self.id_type,self.name)
+        output_path = "processed_images/{}/{}.jpeg".format(self.id_type, self.name)
         image.save(output_path)
-        return {"success":200,"output_p":output_path}
+        return {"success": 200, "output_p": output_path}
 
     def scale_img(self):
 
-        print("path of file",self.f_path)
+        print("path of file", self.f_path)
         image = Image.open(self.f_path).convert("RGB")
         original_width, original_height = image.size
 
-        print("original_width",original_width)
-        print("original_height",original_height)
+        print("original_width", original_width)
+        print("original_height", original_height)
 
         new_width = (1000 / original_width) * original_width
         new_height = (1000 / original_height) * original_height
 
-        print("new_width",new_width)
-        print("new_height",new_height)
+        print("new_width", new_width)
+        print("new_height", new_height)
         # new_width =
         # new_height =
         image.resize((int(new_width), int(new_height)), Image.Resampling.LANCZOS)
-        output_path = "processed_images/{}/{}.jpeg".format(self.id_type,self.name)
+        output_path = "processed_images/{}/{}.jpeg".format(self.id_type, self.name)
         image.save(output_path)
-        return {"success":200,"output_p":output_path}
+        return {"success": 200, "output_p": output_path}
 
     def process(self):
         if self.doc_type == "pdf":
@@ -76,12 +80,95 @@ class doc_processing:
         return response
 
 
+from google.cloud import vision
+
+vision_client = vision.ImageAnnotatorClient()
+
+
+def extract_document_number(ocr_text: str, id_type: str) -> str:
+    """
+    Searches the OCR text for a valid document number based on regex patterns.
+    Checks for CIN, then MSME, and finally LLPIN.
+    """
+    patterns = {
+        "cin": re.compile(r"([LUu]{1}[0-9]{5}[A-Za-z]{2}[0-9]{4}[A-Za-z]{3}[0-9]{6})"),
+        "msme": re.compile(r"(UDYAM-[A-Z]{2}-\d{2}-\d{7})"),
+        "llpin": re.compile(r"([A-Z]{3}-[0-9]{4})"),
+        "pan": re.compile(r"^[A-Z]{3}[PCHFTBALJGT][A-Z][\d]{4}[A-Z]$"),
+        "aadhaar": re.compile(r"^\d{12}$"),
+    }
+
+    if id_type == "cin_llpin":
+        # Try CIN first
+        match = patterns["cin"].search(ocr_text)
+        if match:
+            return match.group(0)
+        # If CIN not found, try LLPIN
+        match = patterns["llpin"].search(ocr_text)
+        if match:
+            return match.group(0)
+    elif id_type in patterns:
+        match = patterns[id_type].search(ocr_text)
+        if match:
+            return match.group(0)
+
+    return None
+
+
+def run_google_vision(file_content: bytes) -> str:
+    """
+    Uses Google Vision OCR to extract text from binary file content.
+    """
+    image = vision.Image(content=file_content)
+    response = vision_client.text_detection(image=image)
+    texts = response.text_annotations
+    if texts:
+        # The first annotation contains the complete detected text
+        return texts[0].description
+    return ""
+
+
+def extract_text_from_file(file_path: str) -> str:
+    """
+    Reads the file from file_path. If it's a PDF, converts only the first page to an image,
+    then runs OCR using Google Vision.
+    """
+    if file_path.lower().endswith(".pdf"):
+        try:
+            # Open the PDF file using PyMuPDF (fitz)
+            pdf_document = fitz.open(file_path)
+            page = pdf_document.load_page(0)  # Load the first page
+            pix = page.get_pixmap()  # Render page as an image
+
+            # Convert pixmap to PIL Image
+            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+
+            # Convert image to bytes for OCR
+            img_byte_arr = io.BytesIO()
+            image.save(img_byte_arr, format="JPEG")
+            file_content = img_byte_arr.getvalue()
+
+        except Exception as e:
+            logging.error(f"Error converting PDF to image: {e}")
+            return ""
+    else:
+        with open(file_path, "rb") as f:
+            file_content = f.read()
+
+    return run_google_vision(file_content)
+
+
+def extract_document_number_from_file(file_path: str, id_type: str) -> str:
+    """
+    Extracts the document number (CIN, MSME, or LLPIN) from the file at file_path.
+    """
+    ocr_text = extract_text_from_file(file_path)
+    return extract_document_number(ocr_text, id_type)
 
 
 # files = {
 #     "aadhar_file": "/home/javmulla/model_one/test_images_aadhar/test_two.jpg",
-#     "pan_file": "/home/javmulla/model_one/test_images_pan/6ea33087.jpeg",
+#     "pan_file": "/home/javmulla/model_one/test_images_pan/6ea33087.jpeg",
 #     "cheque_file": "/home/javmulla/model_one/test_images_cheque/0f81678a.jpeg",
 #     "gst_file": "/home/javmulla/model_one/test_images_gst/0a52fbcb_page3_image_0.jpg"
 # }
@@ -89,7 +176,7 @@
 
 # files = {
 #     "aadhar_file": "/home/javmulla/model_one/test_images_aadhar/test_two.jpg",
-#     "pan_file": "/home/javmulla/model_one/test_images_pan/6ea33087.jpeg",
+#     "pan_file": "/home/javmulla/model_one/test_images_pan/6ea33087.jpeg",
 #     "cheque_file": "/home/javmulla/model_one/test_images_cheque/0f81678a.jpeg",
 #     "gst_file": "test_Images_folder/gst/e.pdf"
 # }
@@ -102,11 +189,6 @@
 # preprocessing = doc_processing(name,id_type,doc_type,f_path)
 # response = preprocessing.process()
 # print("response",response)
-
-
-
-
-
-# id_type, doc_type, f_path
-
-
+
+
+# id_type, doc_type, f_path
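The regex routing added in extract_document_number can be exercised without the Google Vision dependency. Below is a minimal check with invented identifiers in the documented formats; note that, as committed, the anchored pan and aadhaar patterns only match when the OCR text is exactly the bare identifier, while the other patterns match anywhere in the text.

import re

# Patterns copied from extract_document_number above; the identifiers are invented.
patterns = {
    "cin": re.compile(r"([LUu]{1}[0-9]{5}[A-Za-z]{2}[0-9]{4}[A-Za-z]{3}[0-9]{6})"),
    "msme": re.compile(r"(UDYAM-[A-Z]{2}-\d{2}-\d{7})"),
    "llpin": re.compile(r"([A-Z]{3}-[0-9]{4})"),
}

samples = {
    "cin": "Corporate Identity Number: U12345AB2020XYZ123456",
    "msme": "UDYAM REGISTRATION CERTIFICATE UDYAM-AP-06-0012345",
    "llpin": "LLPIN: AAB-1234",
}

for id_type, text in samples.items():
    match = patterns[id_type].search(text)
    print(id_type, "->", match.group(0) if match else None)

For id_type == "cin_llpin" the function tries the CIN pattern first and falls back to LLPIN, which is why a CIN certificate and an LLP certificate can share the single cin_llpin_file upload field.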