Mallisetty Siva Mahesh committed on
Commit 6413971 · 1 Parent(s): 47d6c5f

added code for MSME CIN/LLPIN

Files changed (2):
  1. app.py +115 -389
  2. utils.py +122 -40
app.py CHANGED
@@ -1,335 +1,3 @@
-# from fastapi import FastAPI, File, UploadFile, HTTPException
-# from fastapi.middleware.cors import CORSMiddleware
-# from typing import Dict
-# import os
-# import shutil
-# import logging
-# from s3_setup import s3_client
-
-# import torch
-# from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
-
-# from dotenv import load_dotenv
-# import os
-
-# from utils import doc_processing
-
-# # Load .env file
-# load_dotenv()
-
-# # Access variables
-# dummy_key = os.getenv("dummy_key")
-# HUGGINGFACE_AUTH_TOKEN = dummy_key
-
-# # Hugging Face model and token
-# aadhar_model = "AuditEdge/doc_ocr_a"  # Replace with your fine-tuned model if applicable
-# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# print(f"Using device: {device}")
-
-# # Load the processor (tokenizer + image processor)
-# processor_aadhar = LayoutLMv3Processor.from_pretrained(
-#     aadhar_model,
-#     use_auth_token=HUGGINGFACE_AUTH_TOKEN
-# )
-# aadhar_model = LayoutLMv3ForTokenClassification.from_pretrained(
-#     aadhar_model,
-#     use_auth_token=HUGGINGFACE_AUTH_TOKEN
-# )
-
-# aadhar_model = aadhar_model.to(device)
-
-# # pan model
-# pan_model = "AuditEdge/doc_ocr_p"  # Replace with your fine-tuned model if applicable
-# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# print(f"Using device: {device}")
-
-# # Load the processor (tokenizer + image processor)
-# processor_pan = LayoutLMv3Processor.from_pretrained(
-#     pan_model,
-#     use_auth_token=HUGGINGFACE_AUTH_TOKEN
-# )
-# pan_model = LayoutLMv3ForTokenClassification.from_pretrained(
-#     pan_model,
-#     use_auth_token=HUGGINGFACE_AUTH_TOKEN
-# )
-# pan_model = pan_model.to(device)
-
-# # gst model
-# gst_model = "AuditEdge/doc_ocr_new_g"  # Replace with your fine-tuned model if applicable
-# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# print(f"Using device: {device}")
-
-# # Load the processor (tokenizer + image processor)
-# processor_gst = LayoutLMv3Processor.from_pretrained(
-#     gst_model,
-#     use_auth_token=HUGGINGFACE_AUTH_TOKEN
-# )
-# gst_model = LayoutLMv3ForTokenClassification.from_pretrained(
-#     gst_model,
-#     use_auth_token=HUGGINGFACE_AUTH_TOKEN
-# )
-# gst_model = gst_model.to(device)
-
-# # cheque model
-# cheque_model = "AuditEdge/doc_ocr_new_c"  # Replace with your fine-tuned model if applicable
-# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# print(f"Using device: {device}")
-
-# # Load the processor (tokenizer + image processor)
-# processor_cheque = LayoutLMv3Processor.from_pretrained(
-#     cheque_model,
-#     use_auth_token=HUGGINGFACE_AUTH_TOKEN
-# )
-# cheque_model = LayoutLMv3ForTokenClassification.from_pretrained(
-#     cheque_model,
-#     use_auth_token=HUGGINGFACE_AUTH_TOKEN
-# )
-# cheque_model = cheque_model.to(device)
-
-# # Verify model and processor are loaded
-# print("Model and processor loaded successfully!")
-# print(f"Model is on device: {next(aadhar_model.parameters()).device}")
-
-# # Import inference modules
-# from layoutlmv3FineTuning.Layoutlm_inference.ocr import prepare_batch_for_inference
-# from layoutlmv3FineTuning.Layoutlm_inference.inference_handler import handle
-
-# # Create FastAPI instance
-# app = FastAPI(debug=True)
-
-# # Enable CORS
-# app.add_middleware(
-#     CORSMiddleware,
-#     allow_origins=["*"],
-#     allow_credentials=True,
-#     allow_methods=["*"],
-#     allow_headers=["*"],
-# )
-
-# # Configure directories
-# UPLOAD_FOLDER = './uploads/'
-# processing_folder = "./processed_images"
-# os.makedirs(UPLOAD_FOLDER, exist_ok=True)  # Ensure the main upload folder exists
-# os.makedirs(processing_folder, exist_ok=True)
-
-# UPLOAD_DIRS = {
-#     "aadhar_file": "uploads/aadhar/",
-#     "pan_file": "uploads/pan/",
-#     "cheque_file": "uploads/cheque/",
-#     "gst_file": "uploads/gst/",
-# }
-
-# process_dirs = {
-#     "aadhar_file": "processed_images/aadhar/",
-#     "pan_file": "processed_images/pan/",
-#     "cheque_file": "processed_images/cheque/",
-#     "gst_file": "processed_images/gst/",
-# }
-
-# # Ensure individual directories exist
-# for dir_path in UPLOAD_DIRS.values():
-#     os.makedirs(dir_path, exist_ok=True)
-
-# for dir_path in process_dirs.values():
-#     os.makedirs(dir_path, exist_ok=True)
-
-# # Logger configuration
-# logging.basicConfig(level=logging.INFO)
-
-# # Perform Inference
-# def perform_inference(file_paths: Dict[str, str]):
-#     # Dictionary to map document types to their respective model directories
-#     model_dirs = {
-#         "aadhar_file": aadhar_model,
-#         "pan_file": pan_model,
-#         "cheque_file": cheque_model,
-#         "gst_file": gst_model,
-#     }
-#     try:
-#         # Dictionary to store results for each document type
-#         inference_results = {}
-
-#         # Loop through the file paths and perform inference
-#         for doc_type, file_path in file_paths.items():
-#             if doc_type in model_dirs:
-#                 print(f"Processing {doc_type} using model at {model_dirs[doc_type]}")
-
-#                 # Prepare batch for inference
-#                 processed_file_p = file_path.split("&&")[0]
-#                 unprocessed_file_path = file_path.split("&&")[1]
-
-#                 images_path = [processed_file_p]
-#                 inference_batch = prepare_batch_for_inference(images_path)
-
-#                 # Prepare context for the specific document type
-#                 # context = {"model_dir": model_dirs[doc_type]}
-#                 # initialize s3 client
-#                 client = s3_client()
-
-#                 local_file_path = unprocessed_file_path
-#                 bucket_name = "edgekycdocs"
-
-#                 file_name = unprocessed_file_path.split("/")[-1]
-
-#                 # context = aadhar_model
-#                 if doc_type == "aadhar_file":
-#                     context = aadhar_model
-#                     processor = processor_aadhar
-#                     name = "aadhar"
-#                     attachemnt_num = 3
-#                     folder_name = "aadhardocs"
-
-#                 if doc_type == "pan_file":
-#                     context = pan_model
-#                     processor = processor_pan
-#                     name = "pan"
-#                     attachemnt_num = 2
-#                     folder_name = "pandocs"
-
-#                 if doc_type == "gst_file":
-#                     context = gst_model
-#                     processor = processor_gst
-#                     name = "gst"
-#                     attachemnt_num = 4
-#                     folder_name = "gstdocs"
-
-#                 if doc_type == "cheque_file":
-#                     context = cheque_model
-#                     processor = processor_cheque
-#                     name = "cheque"
-#                     attachemnt_num = 8
-#                     folder_name = "bankchequedocs"
-
-#                 # upload the document to s3 bucket here
-#                 print("this is folder name", folder_name)
-#                 response = client.upload_file(local_file_path, bucket_name, folder_name, file_name)
-#                 print("The file has been uploaded to s3 bucket", response)
-
-#                 # Perform inference (replace `handle` with your actual function)
-#                 result = handle(inference_batch, context, processor, name)
-#                 # result["attachment_url": response["url"]]
-#                 result["attachment_url"] = response["url"]
-#                 result["detect"] = True
-
-#                 print("result required", result)
-
-#                 # if result[""]
-
-#                 # Store the result
-#                 inference_results["attachment_{}".format(attachemnt_num)] = result
-#             else:
-#                 print(f"Model directory not found for {doc_type}. Skipping.")
-#                 # print(Javed)
-
-#         return inference_results
-#     except:
-#         return {
-#             "status": "error",
-#             "message": "Text extraction failed."
-#         }
-
-# # Routes
-# @app.get("/")
-# def greet_json():
-#     return {"Hello": "World!"}
-
-# @app.post("/api/aadhar_ocr")
-# async def aadhar_ocr(
-#     aadhar_file: UploadFile = File(None),
-#     pan_file: UploadFile = File(None),
-#     cheque_file: UploadFile = File(None),
-#     gst_file: UploadFile = File(None),
-# ):
-#     # try:
-#     # Handle file uploads
-#     file_paths = {}
-#     for file_type, folder in UPLOAD_DIRS.items():
-#         file = locals()[file_type]  # Dynamically access the file arguments
-#         if file:
-#             # Save the file in the respective directory
-#             file_path = os.path.join(folder, file.filename)
-
-#             print("this is the filename", file.filename)
-#             with open(file_path, "wb") as buffer:
-#                 shutil.copyfileobj(file.file, buffer)
-#             file_paths[file_type] = file_path
-
-#     # Log received files
-#     logging.info(f"Received files: {list(file_paths.keys())}")
-#     print("file_paths", file_paths)
-
-#     files = {}
-#     for key, value in file_paths.items():
-#         name = value.split("/")[-1].split(".")[0]
-#         id_type = key.split("_")[0]
-#         doc_type = value.split("/")[-1].split(".")[-1]
-#         f_path = value
-
-#         print("variables required", name, id_type, doc_type, f_path)
-#         preprocessing = doc_processing(name, id_type, doc_type, f_path)
-#         response = preprocessing.process()
-
-#         print("response after preprocessing", response)
-
-#         files[key] = response["output_p"] + "&&" + f_path
-#         # files["unprocessed_file_path"] = f_path
-#         print("response", response)
-
-#     # Perform inference
-#     result = perform_inference(files)
-
-#     print("this is the result we got", result)
-#     if "status" in list(result.keys()):
-#         raise Exception("Custom error message")
-#     # if result["status"] == "error":
-
-#     return {"status": "success", "result": result}
-
-#     # except Exception as e:
-#     #     logging.error(f"Error processing files: {e}")
-#     #     # raise HTTPException(status_code=500, detail="Internal Server Error")
-#     #     return {
-#     #         "status": 400,
-#     #         "message": "Text extraction failed."
-#     #     }
-
 from fastapi import FastAPI, File, UploadFile, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from typing import Dict
@@ -343,7 +11,7 @@ from fastapi import FastAPI, HTTPException, Request
 from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
 from dotenv import load_dotenv
 import urllib.parse
-from utils import doc_processing
+from utils import doc_processing, extract_document_number_from_file
 
 # Load .env file
 load_dotenv()
@@ -475,59 +143,95 @@ for dir_path in process_dirs.values():
 logging.basicConfig(level=logging.INFO)
 
 
-# Perform Inference with optional S3 upload
 def perform_inference(file_paths: Dict[str, str], upload_to_s3: bool):
     model_dirs = {
         "pan_file": pan_model,
         "gst_file": gst_model,
        "cheque_file": cheque_model,
     }
+
     try:
         inference_results = {}
 
         for doc_type, file_path in file_paths.items():
-            if doc_type in model_dirs:
-                print(f"Processing {doc_type} using model at {model_dirs[doc_type]}")
-
-                processed_file_p = file_path.split("&&")[0]
-                unprocessed_file_path = file_path.split("&&")[1]
-                images_path = [processed_file_p]
-                inference_batch = prepare_batch_for_inference(images_path)
-
-                context = model_dirs[doc_type]
-                processor = globals()[f"processor_{doc_type.split('_')[0]}"]
-                name = doc_type.split("_")[0]
-                attachemnt_num = {
-                    "pan_file": 2,
-                    "gst_file": 4,
-                    "msme_file": 5,
-                    "cin_llpin_file": 6,
-                    "cheque_file": 8,
-                }[doc_type]
-
-                if upload_to_s3:
-                    client = s3_client()
-                    bucket_name = "edgekycdocs"
-                    folder_name = f"{name}docs"
-                    file_name = unprocessed_file_path.split("/")[-1]
+            processed_file_p = file_path.split("&&")[
+                0
+            ]  # Extracted document number or processed image
+            unprocessed_file_path = file_path.split("&&")[1]  # Original file path
+
+            print(f"Processing {doc_type}: {processed_file_p}")
+
+            # Determine the attachment number based on the document type
+            attachment_num = {
+                "pan_file": 2,
+                "gst_file": 4,
+                "msme_file": 5,
+                "cin_llpin_file": 6,
+                "cheque_file": 8,
+            }.get(doc_type, None)
+
+            if attachment_num is None:
+                print(f"Skipping {doc_type}, not recognized.")
+                continue
+
+            # Upload file to S3 if required
+            if upload_to_s3:
+                client = s3_client()
+                bucket_name = "edgekycdocs"
+                if doc_type == "cin_llpin":
+                    folder_name = f"{doc_type.replace('_', '')}docs"
+                else:
+                    folder_name = f"{doc_type.split('_')[0]}docs"
+
+                file_name = unprocessed_file_path.split("/")[-1].replace(" ", "_")
+
+                try:
                     response = client.upload_file(
                         unprocessed_file_path, bucket_name, folder_name, file_name
                     )
                     print("The file has been uploaded to S3 bucket", response)
                     attachment_url = response["url"]
-                else:
+                    print(f"File uploaded to S3: {attachment_url}")
+                except Exception as e:
+                    print(f"Failed to upload {file_name} to S3: {e}")
                     attachment_url = None
+            else:
+                attachment_url = None
 
-                result = handle(inference_batch, context, processor, name)
-                result["attachment_url"] = attachment_url
-                result["detect"] = True
-
-                inference_results[f"attachment_{attachemnt_num}"] = result
-            else:
-                print(f"Model directory not found for {doc_type}. Skipping.")
+            # If it's an OCR-based extraction (CIN, MSME, LLPIN, PAN, Aadhaar), return the extracted number
+            if doc_type in ["msme_file", "cin_llpin_file", "aadhar_file"]:
+                result = {
+                    "attachment_num": processed_file_p,  # Extracted CIN, LLPIN, MSME, PAN, or Aadhaar number
+                    "attachment_url": attachment_url,
+                    "attachment_status": 200,
+                    "detect": True,
+                }
+            else:
+                # If the document needs ML model inference (PAN, GST, Cheque)
+                if doc_type in model_dirs:
+                    print(
+                        f"Running ML inference for {doc_type} using {model_dirs[doc_type]}"
+                    )
+
+                    images_path = [processed_file_p]
+                    inference_batch = prepare_batch_for_inference(images_path)
+
+                    context = model_dirs[doc_type]
+                    processor = globals()[f"processor_{doc_type.split('_')[0]}"]
+                    name = doc_type.split("_")[0]
+
+                    result = handle(inference_batch, context, processor, name)
+                    result["attachment_url"] = attachment_url
+                    result["detect"] = True
+                else:
+                    print(f"No model found for {doc_type}, skipping inference.")
+                    continue
+
+            inference_results[f"attachment_{attachment_num}"] = result
 
         return inference_results
-    except:
+
+    except Exception as e:
+        print(f"Error in perform_inference: {e}")
         return {"status": "error", "message": "Text extraction failed."}
@@ -566,21 +270,31 @@ async def aadhar_ocr(
     print("file_paths", file_paths)
 
     files = {}
-    for key, value in file_paths.items():
-        name = value.split("/")[-1].split(".")[0]
-        id_type = key.split("_")[0]
-        doc_type = value.split("/")[-1].split(".")[-1]
-        f_path = value
 
-        print("variables required", name, id_type, doc_type, f_path)
-        preprocessing = doc_processing(name, id_type, doc_type, f_path)
-        response = preprocessing.process()
+    for key, f_path in file_paths.items():
 
-        print("response after preprocessing", response)
+        name = os.path.splitext(os.path.basename(f_path))[0]
+        # Determine id_type: for cin_llpin_file, explicitly set id_type to "cin_llpin"
+        if key == "cin_llpin_file":
+            id_type = "cin_llpin"
+        else:
+            id_type = key.split("_")[0]
+        doc_type = os.path.splitext(f_path)[-1].lstrip(".")
 
-        files[key] = response["output_p"] + "&&" + f_path
-        # files["unprocessed_file_path"] = f_path
-        print("response", response)
+        if key in ["msme_file", "cin_llpin_file", "aadhar_file"]:
+            extracted_number = extract_document_number_from_file(f_path, id_type)
+            if not extracted_number:
+                logging.error(f"Failed to extract document number from {f_path}")
+                raise HTTPException(
+                    status_code=400, detail=f"Invalid document format in {key}"
+                )
+            files[key] = extracted_number + "&&" + f_path
+            print("files", files[key])
+        else:
+            # For other files, use existing preprocessing.
+            preprocessing = doc_processing(name, id_type, doc_type, f_path)
+            response = preprocessing.process()
+            files[key] = response["output_p"] + "&&" + f_path
 
     # Perform inference
     result = perform_inference(files, upload_to_s3)
@@ -639,16 +353,30 @@ async def document_ocr_s3(request: Request):
     logging.info(f"Downloaded files: {list(file_paths.keys())}")
 
     files = {}
-    for key, value in file_paths.items():
-        name = value.split("/")[-1].split(".")[0]
-        id_type = key.split("_")[0]
-        doc_type = value.split("/")[-1].split(".")[-1]
-        f_path = value
-
-        preprocessing = doc_processing(name, id_type, doc_type, f_path)
-        response = preprocessing.process()
 
-        files[key] = response["output_p"] + "&&" + f_path
+    for key, f_path in file_paths.items():
+        name = f_path.split("/")[-1].split(".")[0]
+        if key == "cin_llpin_file":
+            id_type = "cin_llpin"
+        else:
+            id_type = key.split("_")[0]
+        # id_type = key.split("_")[0]
+        doc_type = f_path.split("/")[-1].split(".")[-1]
+
+        # For MSME and CIN/LLPIN files, extract document number via OCR and regex
+        if key in ["msme_file", "cin_llpin_file", "aadhar_file"]:
+            extracted_number = extract_document_number_from_file(f_path, id_type)
+            if not extracted_number:
+                logging.error(f"Failed to extract document number from {f_path}")
+                raise HTTPException(
+                    status_code=400, detail=f"Invalid document format in {key}"
+                )
+            files[key] = extracted_number + "&&" + f_path
+        else:
+            # For other documents, use the existing ML model preprocessing
+            preprocessing = doc_processing(name, id_type, doc_type, f_path)
+            response = preprocessing.process()
+            files[key] = response["output_p"] + "&&" + f_path
 
     result = perform_inference(files, upload_to_s3)
@@ -656,5 +384,3 @@ async def document_ocr_s3(request: Request):
         raise HTTPException(status_code=500, detail="Custom error message")
 
     return {"status": "success", "result": result}
-
-print("hello")
 
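For context on the refactor above: the route handlers and perform_inference now exchange a single packed string per document, "<extracted number or processed image path>&&<original file path>", and results are keyed by a fixed attachment-number map. The following standalone sketch is not part of the commit; the sample paths and identifiers are invented, and in the real code the number comes from extract_document_number_from_file and the URL from s3_client().upload_file.

# Sketch of the "&&" packing convention between the route handlers
# and perform_inference. All sample values below are made up.
ATTACHMENT_NUM = {
    "pan_file": 2,
    "gst_file": 4,
    "msme_file": 5,
    "cin_llpin_file": 6,
    "cheque_file": 8,
}

# Handler side: OCR-extracted documents pack the identifier itself;
# model-based documents pack the preprocessed image path.
files = {
    "msme_file": "UDYAM-AP-06-0012345" + "&&" + "uploads/msme/certificate.pdf",
    "cheque_file": "processed_images/cheque/sample.jpeg" + "&&" + "uploads/cheque/sample.jpeg",
}

# perform_inference side: unpack and branch on the document type.
for doc_type, packed in files.items():
    processed, original = packed.split("&&")
    num = ATTACHMENT_NUM.get(doc_type)
    if num is None:
        continue  # unrecognized document types are skipped
    if doc_type in ("msme_file", "cin_llpin_file", "aadhar_file"):
        # The identifier was already extracted by OCR; no model inference needed.
        result = {"attachment_num": processed, "attachment_status": 200, "detect": True}
    else:
        # PAN, GST, and cheque documents go through LayoutLMv3 inference instead.
        result = {"note": f"would run LayoutLMv3 inference on {processed}"}
    print(f"attachment_{num}:", result)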
utils.py CHANGED
@@ -1,71 +1,75 @@
 import fitz
 from PIL import Image
+import re
+import io
+import os
+import logging
+import shutil
+from fastapi import FastAPI, UploadFile, File, HTTPException
+from google.cloud import vision
+from pdf2image import convert_from_path
 
 
 class doc_processing:
 
     def __init__(self, name, id_type, doc_type, f_path):
 
         self.name = name
         self.id_type = id_type
         self.doc_type = doc_type
         self.f_path = f_path
         # self.o_path = o_path
 
     def pdf_to_image_scale(self):
         pdf_document = fitz.open(self.f_path)
         if self.id_type == "gst":
             page_num = 2
         else:
             page_num = 0
 
         page = pdf_document.load_page(page_num)
         pix = page.get_pixmap()  # Render page as a pixmap (image)
 
         # Convert pixmap to PIL Image
         image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
 
         original_width, original_height = image.size
 
-        print("original_width",original_width)
-        print("original_height",original_height)
+        print("original_width", original_width)
+        print("original_height", original_height)
 
         new_width = (1000 / original_width) * original_width
         new_height = (1000 / original_height) * original_height
 
-        print("new_width",new_width)
-        print("new_height",new_height)
+        print("new_width", new_width)
+        print("new_height", new_height)
         # new_width =
         # new_height =
         image.resize((int(new_width), int(new_height)), Image.Resampling.LANCZOS)
-        output_path = "processed_images/{}/{}.jpeg".format(self.id_type,self.name)
+        output_path = "processed_images/{}/{}.jpeg".format(self.id_type, self.name)
         image.save(output_path)
-        return {"success":200,"output_p":output_path}
+        return {"success": 200, "output_p": output_path}
 
     def scale_img(self):
 
-        print("path of file",self.f_path)
+        print("path of file", self.f_path)
         image = Image.open(self.f_path).convert("RGB")
         original_width, original_height = image.size
 
-        print("original_width",original_width)
-        print("original_height",original_height)
+        print("original_width", original_width)
+        print("original_height", original_height)
 
         new_width = (1000 / original_width) * original_width
         new_height = (1000 / original_height) * original_height
 
-        print("new_width",new_width)
-        print("new_height",new_height)
+        print("new_width", new_width)
+        print("new_height", new_height)
         # new_width =
         # new_height =
         image.resize((int(new_width), int(new_height)), Image.Resampling.LANCZOS)
-        output_path = "processed_images/{}/{}.jpeg".format(self.id_type,self.name)
+        output_path = "processed_images/{}/{}.jpeg".format(self.id_type, self.name)
         image.save(output_path)
-        return {"success":200,"output_p":output_path}
+        return {"success": 200, "output_p": output_path}
 
     def process(self):
         if self.doc_type == "pdf":
@@ -76,12 +80,95 @@ class doc_processing:
         return response
 
 
+from google.cloud import vision
+
+vision_client = vision.ImageAnnotatorClient()
+
+
+def extract_document_number(ocr_text: str, id_type: str) -> str:
+    """
+    Searches the OCR text for a valid document number based on regex patterns.
+    Checks for CIN, then MSME, and finally LLPIN.
+    """
+    patterns = {
+        "cin": re.compile(r"([LUu]{1}[0-9]{5}[A-Za-z]{2}[0-9]{4}[A-Za-z]{3}[0-9]{6})"),
+        "msme": re.compile(r"(UDYAM-[A-Z]{2}-\d{2}-\d{7})"),
+        "llpin": re.compile(r"([A-Z]{3}-[0-9]{4})"),
+        "pan": re.compile(r"^[A-Z]{3}[PCHFTBALJGT][A-Z][\d]{4}[A-Z]$"),
+        "aadhaar": re.compile(r"^\d{12}$"),
+    }
+
+    if id_type == "cin_llpin":
+        # Try CIN first
+        match = patterns["cin"].search(ocr_text)
+        if match:
+            return match.group(0)
+        # If CIN not found, try LLPIN
+        match = patterns["llpin"].search(ocr_text)
+        if match:
+            return match.group(0)
+    elif id_type in patterns:
+        match = patterns[id_type].search(ocr_text)
+        if match:
+            return match.group(0)
+
+    return None
+
+
+def run_google_vision(file_content: bytes) -> str:
+    """
+    Uses Google Vision OCR to extract text from binary file content.
+    """
+    image = vision.Image(content=file_content)
+    response = vision_client.text_detection(image=image)
+    texts = response.text_annotations
+    if texts:
+        # The first annotation contains the complete detected text
+        return texts[0].description
+    return ""
+
+
+def extract_text_from_file(file_path: str) -> str:
+    """
+    Reads the file from file_path. If it's a PDF, converts only the first page to an image,
+    then runs OCR using Google Vision.
+    """
+    if file_path.lower().endswith(".pdf"):
+        try:
+            # Open the PDF file using PyMuPDF (fitz)
+            pdf_document = fitz.open(file_path)
+            page = pdf_document.load_page(0)  # Load the first page
+            pix = page.get_pixmap()  # Render page as an image
+
+            # Convert pixmap to PIL Image
+            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+
+            # Convert image to bytes for OCR
+            img_byte_arr = io.BytesIO()
+            image.save(img_byte_arr, format="JPEG")
+            file_content = img_byte_arr.getvalue()
+
+        except Exception as e:
+            logging.error(f"Error converting PDF to image: {e}")
+            return ""
+    else:
+        with open(file_path, "rb") as f:
+            file_content = f.read()
+
+    return run_google_vision(file_content)
+
+
+def extract_document_number_from_file(file_path: str, id_type: str) -> str:
+    """
+    Extracts the document number (CIN, MSME, or LLPIN) from the file at file_path.
+    """
+    ocr_text = extract_text_from_file(file_path)
+    return extract_document_number(ocr_text, id_type)
 
 
 # files = {
 #     "aadhar_file": "/home/javmulla/model_one/test_images_aadhar/test_two.jpg",
-#     "pan_file": "/home/javmulla/model_one/test_images_pan/6ea33087.jpeg",
+#     "pan_file": "/home/javmulla/model_one/test_images_pan/6ea33087.jpeg",
 #     "cheque_file": "/home/javmulla/model_one/test_images_cheque/0f81678a.jpeg",
 #     "gst_file": "/home/javmulla/model_one/test_images_gst/0a52fbcb_page3_image_0.jpg"
 # }
@@ -89,7 +176,7 @@
 
 # files = {
 #     "aadhar_file": "/home/javmulla/model_one/test_images_aadhar/test_two.jpg",
-#     "pan_file": "/home/javmulla/model_one/test_images_pan/6ea33087.jpeg",
+#     "pan_file": "/home/javmulla/model_one/test_images_pan/6ea33087.jpeg",
 #     "cheque_file": "/home/javmulla/model_one/test_images_cheque/0f81678a.jpeg",
 #     "gst_file": "test_Images_folder/gst/e.pdf"
 # }
@@ -102,11 +189,6 @@
 # preprocessing = doc_processing(name,id_type,doc_type,f_path)
 # response = preprocessing.process()
 # print("response",response)
-
-
-
-
-
-# id_type, doc_type, f_path
-
-
+
+
+# id_type, doc_type, f_path
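The regex routing added in extract_document_number can be exercised without the Google Vision dependency. Below is a minimal check with invented identifiers in the documented formats; note that, as committed, the anchored pan and aadhaar patterns only match when the OCR text is exactly the bare identifier, while the other patterns match anywhere in the text.

import re

# Patterns copied from extract_document_number above; the identifiers are invented.
patterns = {
    "cin": re.compile(r"([LUu]{1}[0-9]{5}[A-Za-z]{2}[0-9]{4}[A-Za-z]{3}[0-9]{6})"),
    "msme": re.compile(r"(UDYAM-[A-Z]{2}-\d{2}-\d{7})"),
    "llpin": re.compile(r"([A-Z]{3}-[0-9]{4})"),
}

samples = {
    "cin": "Corporate Identity Number: U12345AB2020XYZ123456",
    "msme": "UDYAM REGISTRATION CERTIFICATE UDYAM-AP-06-0012345",
    "llpin": "LLPIN: AAB-1234",
}

for id_type, text in samples.items():
    match = patterns[id_type].search(text)
    print(id_type, "->", match.group(0) if match else None)

For id_type == "cin_llpin" the function tries the CIN pattern first and falls back to LLPIN, which is why a CIN certificate and an LLP certificate can share the single cin_llpin_file upload field.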