Spaces:

pawanmau01
/

TestAPI

Sleeping

App Files Files Community

Pavan+2-at-244075126032 committed on Sep 17, 2024

Commit

75a02d0

1 Parent(s): cfed0df

Comment out all ocr services to test uidai

Browse files

Files changed (8) hide show

app/api.py +8 -8
app/routes/adhaarApi.py +60 -60
app/routes/panApi.py +64 -64
app/services/adhaarServices/adhaarDataExtractor.py +47 -47
app/services/adhaarServices/ocr.py +37 -37
app/services/panServices/panDataExtractor.py +80 -80
app/services/panServices/panOcr.py +53 -53
requirements.txt +6 -6

app/api.py CHANGED Viewed

@@ -3,20 +3,20 @@ from app.routes.uidaiServices import generate_captcha, validate_aadhaar
 from flask import Blueprint, request, jsonify
 import requests
 import os
-from .routes.adhaarApi import ocrAdhaar
-from .routes.panApi import ocrPan
 import uuid
 ocr_bp = Blueprint('ocr', __name__)
 session = requests.Session()
 mode = os.getenv("PROJECT_MODE")
-@ocr_bp.route('/ocrPan', methods=['POST'])
-def getResponse_Pan():
-    return ocrPan(mode, session)
-@ocr_bp.route('/ocrAdhaar', methods=['POST'])
-def getResponse_Adhaar():
-    return ocrAdhaar(mode, session)
 # Route to generate captcha
 @ocr_bp.route('/generateCaptcha', methods=['POST'])

 from flask import Blueprint, request, jsonify
 import requests
 import os
+# from .routes.adhaarApi import ocrAdhaar
+# from .routes.panApi import ocrPan
 import uuid
 ocr_bp = Blueprint('ocr', __name__)
 session = requests.Session()
 mode = os.getenv("PROJECT_MODE")
+# @ocr_bp.route('/ocrPan', methods=['POST'])
+# def getResponse_Pan():
+#     return ocrPan(mode, session)
+# @ocr_bp.route('/ocrAdhaar', methods=['POST'])
+# def getResponse_Adhaar():
+#     return ocrAdhaar(mode, session)
 # Route to generate captcha
 @ocr_bp.route('/generateCaptcha', methods=['POST'])

app/routes/adhaarApi.py CHANGED Viewed

@@ -1,70 +1,70 @@
-import re
-from flask import request, jsonify, current_app
-from PIL import Image, UnidentifiedImageError
-from io import BytesIO
-import base64
-import requests
-from  ..services.adhaarServices.ocr import process_results
-import io
-def ocrAdhaar(mode, session):
-    try:
-        print("API HIT ************* AADHAAROCR")
-        data = request.get_json()
-        if not data:
-            return jsonify({"error": "Invalid request payload"}), 400
-        if mode == "prod":
-            if not data.get('image'):
-                return jsonify({"error": "Image data/buffer is required"}), 400
-            #removing 'data:image/png,base64 ' from buffer.
-            imgBuffer = data.get('image')
-            imgBuffer = re.sub("^data:image/.+;base64,", "", imgBuffer)
-            # Adjust base64 string padding
-            if len(imgBuffer) % 4:
-                imgBuffer += '=' * (4 - len(imgBuffer) % 4)
-            try:
-                img_data = base64.b64decode(imgBuffer)
-                img = Image.open(BytesIO(img_data))
-                img.verify()  # Verify image format
-                img = Image.open(io.BytesIO(img_data))  # Re-open image after verification
-            except (base64.binascii.Error, ValueError) as decode_err:
-                return jsonify({"error": f"Image decoding failed: {str(decode_err)}"}), 400
-            except UnidentifiedImageError:
-                return jsonify({"error": "Unable to identify image format."}), 400
-        elif mode == "dev":
-            if not data.get('imgUrl'):
-                return jsonify({"error": "Image URL is required"}), 400
-            img_url = data.get('imgUrl')
-            response = session.get(img_url)
-            response.raise_for_status()
-            img = Image.open(BytesIO(response.content))
-            img.verify()  # Verify image format
-            img = Image.open(BytesIO(response.content))  # Re-open image after verification
-        else:
-            return jsonify({"error": "Invalid mode configuration"}), 500
-        # Check image format
-        if img.format not in ['JPEG', 'JPG', 'PNG']:
-            return jsonify({"error": "Invalid image format. Only JPG and PNG are supported."}), 400
-        # Run detection
-        model = current_app.models.get('adhaarModel')
-        results = model.predict(source=img, save=False)
-        extracted_data = process_results(results, img)
-        if extracted_data.get('statusCode') == 400:
-            return jsonify(extracted_data), 400
-        return jsonify(extracted_data), 200
-    except requests.RequestException as req_err:
-        return jsonify({"error": f"Image download failed: {str(req_err)}"}), 500
-    except UnidentifiedImageError:
-        return jsonify({"error": "Unable to identify image format."}), 400
-    except Exception as e:
-        current_app.logger.error(f"Unexpected error: {str(e)}")
-        return jsonify({"error": "An unexpected error occurred."}), 500

+# import re
+# from flask import request, jsonify, current_app
+# from PIL import Image, UnidentifiedImageError
+# from io import BytesIO
+# import base64
+# import requests
+# from  ..services.adhaarServices.ocr import process_results
+# import io
+# def ocrAdhaar(mode, session):
+#     try:
+#         print("API HIT ************* AADHAAROCR")
+#         data = request.get_json()
+#         if not data:
+#             return jsonify({"error": "Invalid request payload"}), 400
+#         if mode == "prod":
+#             if not data.get('image'):
+#                 return jsonify({"error": "Image data/buffer is required"}), 400
+#             #removing 'data:image/png,base64 ' from buffer.
+#             imgBuffer = data.get('image')
+#             imgBuffer = re.sub("^data:image/.+;base64,", "", imgBuffer)
+#             # Adjust base64 string padding
+#             if len(imgBuffer) % 4:
+#                 imgBuffer += '=' * (4 - len(imgBuffer) % 4)
+#             try:
+#                 img_data = base64.b64decode(imgBuffer)
+#                 img = Image.open(BytesIO(img_data))
+#                 img.verify()  # Verify image format
+#                 img = Image.open(io.BytesIO(img_data))  # Re-open image after verification
+#             except (base64.binascii.Error, ValueError) as decode_err:
+#                 return jsonify({"error": f"Image decoding failed: {str(decode_err)}"}), 400
+#             except UnidentifiedImageError:
+#                 return jsonify({"error": "Unable to identify image format."}), 400
+#         elif mode == "dev":
+#             if not data.get('imgUrl'):
+#                 return jsonify({"error": "Image URL is required"}), 400
+#             img_url = data.get('imgUrl')
+#             response = session.get(img_url)
+#             response.raise_for_status()
+#             img = Image.open(BytesIO(response.content))
+#             img.verify()  # Verify image format
+#             img = Image.open(BytesIO(response.content))  # Re-open image after verification
+#         else:
+#             return jsonify({"error": "Invalid mode configuration"}), 500
+#         # Check image format
+#         if img.format not in ['JPEG', 'JPG', 'PNG']:
+#             return jsonify({"error": "Invalid image format. Only JPG and PNG are supported."}), 400
+#         # Run detection
+#         model = current_app.models.get('adhaarModel')
+#         results = model.predict(source=img, save=False)
+#         extracted_data = process_results(results, img)
+#         if extracted_data.get('statusCode') == 400:
+#             return jsonify(extracted_data), 400
+#         return jsonify(extracted_data), 200
+#     except requests.RequestException as req_err:
+#         return jsonify({"error": f"Image download failed: {str(req_err)}"}), 500
+#     except UnidentifiedImageError:
+#         return jsonify({"error": "Unable to identify image format."}), 400
+#     except Exception as e:
+#         current_app.logger.error(f"Unexpected error: {str(e)}")
+#         return jsonify({"error": "An unexpected error occurred."}), 500

app/routes/panApi.py CHANGED Viewed

@@ -1,74 +1,74 @@
-import re
-from flask import request, jsonify, current_app
-from PIL import Image, UnidentifiedImageError
-from io import BytesIO
-import base64
-import requests
-from  ..services.panServices.panOcr import process_results
-import io
-def ocrPan(mode, session):
-    try:
-        print("API HIT ************* PANOCR")
-        data = request.get_json()
-        if not data:
-            return jsonify({"error": "Invalid request payload"}), 400
-        if mode == "prod":
-            if not data.get('image'):
-                return jsonify({"error": "Image data/buffer is required"}), 400
-            #removing 'data:image/png,base64 ' from buffer.
-            imgBuffer = data.get('image')
-            imgBuffer = re.sub("^data:image/.+;base64,", "", imgBuffer)
-            # Adjust base64 string padding
-            if len(imgBuffer) % 4:
-                imgBuffer += '=' * (4 - len(imgBuffer) % 4)
-            try:
-                img_data = base64.b64decode(imgBuffer)
-                img = Image.open(BytesIO(img_data))
-                img.verify()  # Verify image format
-                print(img, "img")
-                img = Image.open(io.BytesIO(img_data))  # Re-open image after verification
-            except (base64.binascii.Error, ValueError) as decode_err:
-                return jsonify({"error": f"Image decoding failed: {str(decode_err)}"}), 400
-            except UnidentifiedImageError:
-                return jsonify({"error": "Unable to identify image format."}), 400
-        elif mode == "dev":
-            if not data.get('imgUrl'):
-                return jsonify({"error": "Image URL is required"}), 400
-            img_url = data.get('imgUrl')
-            response = session.get(img_url)
-            response.raise_for_status()
-            img = Image.open(BytesIO(response.content))
-            print(img, "img")
-            img.verify()  # Verify image format
-            img = Image.open(BytesIO(response.content))  # Re-open image after verification
-        else:
-            return jsonify({"error": "Invalid mode configuration"}), 500
-        # Check image format
-        if img.format not in ['JPEG', 'JPG', 'PNG']:
-            return jsonify({"error": "Invalid image format. Only JPG and PNG are supported."}), 400
-        # Run detection
-        model = current_app.models.get('panModel')
-        results = model.predict(source=img, imgsz=680, iou=0.7, augment=True)
-        # print(results,"model result")
-        extracted_data = process_results(results, img)
-        # print(extracted_data, "extracted data")
-        if extracted_data.get('statusCode') == 400:
-            return jsonify(extracted_data), 400
-        return jsonify(extracted_data), 200
-    except requests.RequestException as req_err:
-        return jsonify({"error": f"Image download failed: {str(req_err)}"}), 500
-    except UnidentifiedImageError:
-        return jsonify({"error": "Unable to identify image format."}), 400
-    except Exception as e:
-        current_app.logger.error(f"Unexpected error: {str(e)}")
-        return jsonify({"error": "An unexpected error occurred on api call."}), 500

+# import re
+# from flask import request, jsonify, current_app
+# from PIL import Image, UnidentifiedImageError
+# from io import BytesIO
+# import base64
+# import requests
+# from  ..services.panServices.panOcr import process_results
+# import io
+# def ocrPan(mode, session):
+#     try:
+#         print("API HIT ************* PANOCR")
+#         data = request.get_json()
+#         if not data:
+#             return jsonify({"error": "Invalid request payload"}), 400
+#         if mode == "prod":
+#             if not data.get('image'):
+#                 return jsonify({"error": "Image data/buffer is required"}), 400
+#             #removing 'data:image/png,base64 ' from buffer.
+#             imgBuffer = data.get('image')
+#             imgBuffer = re.sub("^data:image/.+;base64,", "", imgBuffer)
+#             # Adjust base64 string padding
+#             if len(imgBuffer) % 4:
+#                 imgBuffer += '=' * (4 - len(imgBuffer) % 4)
+#             try:
+#                 img_data = base64.b64decode(imgBuffer)
+#                 img = Image.open(BytesIO(img_data))
+#                 img.verify()  # Verify image format
+#                 print(img, "img")
+#                 img = Image.open(io.BytesIO(img_data))  # Re-open image after verification
+#             except (base64.binascii.Error, ValueError) as decode_err:
+#                 return jsonify({"error": f"Image decoding failed: {str(decode_err)}"}), 400
+#             except UnidentifiedImageError:
+#                 return jsonify({"error": "Unable to identify image format."}), 400
+#         elif mode == "dev":
+#             if not data.get('imgUrl'):
+#                 return jsonify({"error": "Image URL is required"}), 400
+#             img_url = data.get('imgUrl')
+#             response = session.get(img_url)
+#             response.raise_for_status()
+#             img = Image.open(BytesIO(response.content))
+#             print(img, "img")
+#             img.verify()  # Verify image format
+#             img = Image.open(BytesIO(response.content))  # Re-open image after verification
+#         else:
+#             return jsonify({"error": "Invalid mode configuration"}), 500
+#         # Check image format
+#         if img.format not in ['JPEG', 'JPG', 'PNG']:
+#             return jsonify({"error": "Invalid image format. Only JPG and PNG are supported."}), 400
+#         # Run detection
+#         model = current_app.models.get('panModel')
+#         results = model.predict(source=img, imgsz=680, iou=0.7, augment=True)
+#         # print(results,"model result")
+#         extracted_data = process_results(results, img)
+#         # print(extracted_data, "extracted data")
+#         if extracted_data.get('statusCode') == 400:
+#             return jsonify(extracted_data), 400
+#         return jsonify(extracted_data), 200
+#     except requests.RequestException as req_err:
+#         return jsonify({"error": f"Image download failed: {str(req_err)}"}), 500
+#     except UnidentifiedImageError:
+#         return jsonify({"error": "Unable to identify image format."}), 400
+#     except Exception as e:
+#         current_app.logger.error(f"Unexpected error: {str(e)}")
+#         return jsonify({"error": "An unexpected error occurred on api call."}), 500

app/services/adhaarServices/adhaarDataExtractor.py CHANGED Viewed

@@ -1,59 +1,59 @@
-import datetime
-def extract_details(texts):
-    details = {'name': '', 'gender': '', 'dob': '', 'aadhaarNo': ''}
-    current_year = datetime.datetime.now().year
-    for text in texts:
-        # Check if colon exists in text and split accordingly
-        if ':' in text:
-            text = text.split(':')[1].strip()
-        cleaned_text = text.replace(':', '').strip()
-        # Remove leading non-alphabetic characters for gender detection and strip spaces
-        cleaned_gender = cleaned_text.lstrip('.-/').strip()
-        # Check if the text is the name (only alphabets, spaces, and possibly dots)
-        if (all(char.isalpha() or char.isspace() or char == '.' for char in cleaned_text)
-                and cleaned_gender.lower() not in ['male', 'female']):
-            details['name'] = cleaned_text
-        # Check if the text is the DOB (format: dd/mm/yyyy or yyyy)
-        elif (len(cleaned_text) == 4 and
-              cleaned_text.isdigit() and
-              1900 < int(cleaned_text) < current_year):
-            details['dob'] = cleaned_text
-        # Check if the text is the DOB (format: dd/mm/yyyy or dd-mm-yyyy)
-        elif (len(cleaned_text) == 10 and
-              (cleaned_text[2] in ['/', '-']) and
-              (cleaned_text[5] in ['/', '-']) and
-              cleaned_text.replace('/', '').replace('-', '').isdigit()):
-            details['dob'] = cleaned_text
-        # Check if the text is the gender (either 'Male' or 'Female')
-        elif cleaned_gender.lower() in ['male', 'female']:
-            details['gender'] = cleaned_gender.capitalize()
-        # Check if the text is the Aadhaar number (12 digits after removing spaces)
-        elif cleaned_text.replace(' ', '').isdigit() and len(cleaned_text.replace(' ', '')) == 12:
-            details['aadhaarNo'] = cleaned_text
-    # Check if any key's value is empty
-    if any(value == '' for value in details.values()):
-        error_key = next(key for key, value in details.items() if value == '')
-        result = {
-            'statusCode': 400,
-            'result': details,
-            'error': f'{error_key} value is not found due to bad image.'
-        }
-    else:
-        result = {
-            'statusCode': 200,
-            'result': details,
-            'error': ''
-        }
-    return result

+# import datetime
+# def extract_details(texts):
+#     details = {'name': '', 'gender': '', 'dob': '', 'aadhaarNo': ''}
+#     current_year = datetime.datetime.now().year
+#     for text in texts:
+#         # Check if colon exists in text and split accordingly
+#         if ':' in text:
+#             text = text.split(':')[1].strip()
+#         cleaned_text = text.replace(':', '').strip()
+#         # Remove leading non-alphabetic characters for gender detection and strip spaces
+#         cleaned_gender = cleaned_text.lstrip('.-/').strip()
+#         # Check if the text is the name (only alphabets, spaces, and possibly dots)
+#         if (all(char.isalpha() or char.isspace() or char == '.' for char in cleaned_text)
+#                 and cleaned_gender.lower() not in ['male', 'female']):
+#             details['name'] = cleaned_text
+#         # Check if the text is the DOB (format: dd/mm/yyyy or yyyy)
+#         elif (len(cleaned_text) == 4 and
+#               cleaned_text.isdigit() and
+#               1900 < int(cleaned_text) < current_year):
+#             details['dob'] = cleaned_text
+#         # Check if the text is the DOB (format: dd/mm/yyyy or dd-mm-yyyy)
+#         elif (len(cleaned_text) == 10 and
+#               (cleaned_text[2] in ['/', '-']) and
+#               (cleaned_text[5] in ['/', '-']) and
+#               cleaned_text.replace('/', '').replace('-', '').isdigit()):
+#             details['dob'] = cleaned_text
+#         # Check if the text is the gender (either 'Male' or 'Female')
+#         elif cleaned_gender.lower() in ['male', 'female']:
+#             details['gender'] = cleaned_gender.capitalize()
+#         # Check if the text is the Aadhaar number (12 digits after removing spaces)
+#         elif cleaned_text.replace(' ', '').isdigit() and len(cleaned_text.replace(' ', '')) == 12:
+#             details['aadhaarNo'] = cleaned_text
+#     # Check if any key's value is empty
+#     if any(value == '' for value in details.values()):
+#         error_key = next(key for key, value in details.items() if value == '')
+#         result = {
+#             'statusCode': 400,
+#             'result': details,
+#             'error': f'{error_key} value is not found due to bad image.'
+#         }
+#     else:
+#         result = {
+#             'statusCode': 200,
+#             'result': details,
+#             'error': ''
+#         }
+#     return result

app/services/adhaarServices/ocr.py CHANGED Viewed

@@ -1,44 +1,44 @@
-from io import BytesIO
-from ...utils.azureOCR import analyze_image
-from ...utils.imageUtils import resize_if_needed, all_cropped_images_to_one_image
-from app.services.adhaarServices.adhaarDataExtractor import extract_details
-def process_results(results, img):
-    precision_data = {label: {"correct": 0, "total": 0} for label in ["aadharNo", "name", "dob", "gender", "address"]}
-    confidence_threshold = 0.3
-    input_image_format = img.format if img.format else "PNG"
-    valid_formats = ["JPEG", "PNG", "BMP", "GIF", "TIFF"]
-    input_image_format = input_image_format if input_image_format in valid_formats else "PNG"
-    label_to_image = {}
-    extracted_data = {"adhaarNo": "", "dob": "", "gender": "", "name": "", "address": ""}
-    for result in results:
-        for bbox, cls, conf in zip(result.boxes.xyxy, result.boxes.cls, result.boxes.conf):
-            label = ["aadharNo", "dob", "gender", "name", "address"][int(cls)]
-            print(label, conf)
-            if conf < confidence_threshold or label == "address":
-                continue
-            x1, y1, x2, y2 = map(int, bbox.tolist())
-            crop_img = img.crop((x1, y1, x2, y2))
-            crop_img = resize_if_needed(crop_img)
-            if label not in label_to_image or label_to_image[label][1] < conf:
-                label_to_image[label] = (crop_img, conf)
-                precision_data[label]["total"] += 1
-                precision_data[label]["correct"] += 1  # Replace with actual OCR validation check
-    cropped_images = [img for label, (img, conf) in sorted(label_to_image.items()) if label != "address"]
-    final_image = all_cropped_images_to_one_image(cropped_images, separator_image_path='app/utils/seprator3.png')
-    buffer = BytesIO()
-    final_image.save(buffer, format=input_image_format)
-    buffer.seek(0)
-    response = analyze_image(buffer.getvalue(), input_image_format)
-    # print(response)
-    lines = response['readResult']['blocks'][0]['lines']
-    texts = [line['text'] for line in lines]
-    print(texts)
-    extracted_data = extract_details(texts)
-    return extracted_data

+# from io import BytesIO
+# from ...utils.azureOCR import analyze_image
+# from ...utils.imageUtils import resize_if_needed, all_cropped_images_to_one_image
+# from app.services.adhaarServices.adhaarDataExtractor import extract_details
+# def process_results(results, img):
+#     precision_data = {label: {"correct": 0, "total": 0} for label in ["aadharNo", "name", "dob", "gender", "address"]}
+#     confidence_threshold = 0.3
+#     input_image_format = img.format if img.format else "PNG"
+#     valid_formats = ["JPEG", "PNG", "BMP", "GIF", "TIFF"]
+#     input_image_format = input_image_format if input_image_format in valid_formats else "PNG"
+#     label_to_image = {}
+#     extracted_data = {"adhaarNo": "", "dob": "", "gender": "", "name": "", "address": ""}
+#     for result in results:
+#         for bbox, cls, conf in zip(result.boxes.xyxy, result.boxes.cls, result.boxes.conf):
+#             label = ["aadharNo", "dob", "gender", "name", "address"][int(cls)]
+#             print(label, conf)
+#             if conf < confidence_threshold or label == "address":
+#                 continue
+#             x1, y1, x2, y2 = map(int, bbox.tolist())
+#             crop_img = img.crop((x1, y1, x2, y2))
+#             crop_img = resize_if_needed(crop_img)
+#             if label not in label_to_image or label_to_image[label][1] < conf:
+#                 label_to_image[label] = (crop_img, conf)
+#                 precision_data[label]["total"] += 1
+#                 precision_data[label]["correct"] += 1  # Replace with actual OCR validation check
+#     cropped_images = [img for label, (img, conf) in sorted(label_to_image.items()) if label != "address"]
+#     final_image = all_cropped_images_to_one_image(cropped_images, separator_image_path='app/utils/seprator3.png')
+#     buffer = BytesIO()
+#     final_image.save(buffer, format=input_image_format)
+#     buffer.seek(0)
+#     response = analyze_image(buffer.getvalue(), input_image_format)
+#     # print(response)
+#     lines = response['readResult']['blocks'][0]['lines']
+#     texts = [line['text'] for line in lines]
+#     print(texts)
+#     extracted_data = extract_details(texts)
+#     return extracted_data

app/services/panServices/panDataExtractor.py CHANGED Viewed

@@ -1,95 +1,95 @@
-import re
-def filter_array(arr):
-    # Define the regex patterns
-    pattern_alphanumeric_special = re.compile(r'[\w]+[^.\s\w]+|[^.\s\w]+[\w]+')
-    pattern_numeric = re.compile(r'^[0-9]+$')
-    pattern_special_chars = re.compile(r'[^a-zA-Z.\s]+')
-    # Filter the array
-    filtered_array = [
-        item for item in arr
-        if not (pattern_alphanumeric_special.search(item) or
-                pattern_numeric.match(item) or
-                pattern_special_chars.search(item))
-    ]
-    return filtered_array
-def extract_panData(data):
-    unwanted_words = ["Name", "/Name", 'Permanent', 'Account', 'Number', 'Card', 'नाम', '/Name',
-                      "पिता का नाम", 'नाम / Name', "पिता का नाम/ Father's Name", '414 / Name', 'पिता का नाम / Fath',
-                      "VIT VE Hra / Father's Nama", 'पिता का नाम/ Fal', 'पिता का नाम / Fathe', "पिता का नाम / Father's Na",
-                      'जन्म की तारीख /।', 'जन्म का ताराख', "पिता का नाम/ Father's Nam", 'नाम /Name', "पिता का नाम / Father's Name",
-                      'जन्म का वाराज़', 'Date of Birth', 'Permanent Account Number Card', "Date of Birth", "/Date of Birth",
-                      "Permanent Account Number", "Father's Name", "14 /Name", "/Father's Name", 'HTH / Name',"inent Account Number", "anent Account Number C","Permanent Account Number Car",
-                      'ugr Name']
-    # Initialize result object
-    result = {
-        "statusCode": 200,
-        "error": '',
-        "data": {
-            "panNo": '',
-            "name": '',
-            "fatherName": '',
-            "dob": ''
-        }
-    }
-    # Clean the array by removing unwanted words and invalid entries
-    cleaned_data = []
-    combination_pattern = re.compile(r'(?=.*[0-9])(?=.*[!@#$%^&*(),?":{}|<>])')
-    for item in data:
-        if item not in unwanted_words and not combination_pattern.search(item):
-            cleaned_data.append(item)
-    # Check and extract PAN number
-    pan_pattern = re.compile(r'^[A-Z]{5}\s*[0-9]{4}\s*[A-Z]$')
-    for item in cleaned_data:
-        if pan_pattern.match(item):
-            result["data"]["panNo"] = item
-            cleaned_data.remove(item)
-            break
-    # Check and extract date of birth
-    dob_pattern = re.compile(r'^\d{2}[-/]\d{2}[-/]\d{4}$')
-    for item in cleaned_data:
-        if dob_pattern.match(item):
-            result["data"]["dob"] = item
-            cleaned_data.remove(item)
-            break
-    # If only two values are left, assume they are name and father's name
-    cleaned_data = filter_array(cleaned_data)
-    if len(cleaned_data) == 2:
-        result["data"]["name"] = cleaned_data[0]
-        result["data"]["fatherName"] = cleaned_data[1]
-    else:
-        # Further cleaning of the data array to extract name and father's name
-        cleaned_data = [item for item in cleaned_data if not combination_pattern.search(item) and item not in unwanted_words]
-        print(cleaned_data, "after cleaning")
-        # Check and extract name
-        name_pattern = re.compile(r'^[A-Za-z .]+$')
-        if len(cleaned_data) > 0 and name_pattern.match(cleaned_data[0]):
-            result["data"]["name"] = cleaned_data[0]
-        else:
-            result["data"]["name"] = ''
-        # Check and extract father's name
-        if len(cleaned_data) > 1 and name_pattern.match(cleaned_data[1]):
-            result["data"]["fatherName"] = cleaned_data[1]
-        else:
-            result["data"]["fatherName"] = ''
-    # Check if any value is empty and set error message
-    for key, value in result["data"].items():
-        if value == '':
-            result["statusCode"] = 400
-            result["error"] = f"{key} value is not found due to bad image."
-            break
-    return result

+# import re
+# def filter_array(arr):
+#     # Define the regex patterns
+#     pattern_alphanumeric_special = re.compile(r'[\w]+[^.\s\w]+|[^.\s\w]+[\w]+')
+#     pattern_numeric = re.compile(r'^[0-9]+$')
+#     pattern_special_chars = re.compile(r'[^a-zA-Z.\s]+')
+#     # Filter the array
+#     filtered_array = [
+#         item for item in arr
+#         if not (pattern_alphanumeric_special.search(item) or
+#                 pattern_numeric.match(item) or
+#                 pattern_special_chars.search(item))
+#     ]
+#     return filtered_array
+# def extract_panData(data):
+#     unwanted_words = ["Name", "/Name", 'Permanent', 'Account', 'Number', 'Card', 'नाम', '/Name',
+#                       "पिता का नाम", 'नाम / Name', "पिता का नाम/ Father's Name", '414 / Name', 'पिता का नाम / Fath',
+#                       "VIT VE Hra / Father's Nama", 'पिता का नाम/ Fal', 'पिता का नाम / Fathe', "पिता का नाम / Father's Na",
+#                       'जन्म की तारीख /।', 'जन्म का ताराख', "पिता का नाम/ Father's Nam", 'नाम /Name', "पिता का नाम / Father's Name",
+#                       'जन्म का वाराज़', 'Date of Birth', 'Permanent Account Number Card', "Date of Birth", "/Date of Birth",
+#                       "Permanent Account Number", "Father's Name", "14 /Name", "/Father's Name", 'HTH / Name',"inent Account Number", "anent Account Number C","Permanent Account Number Car",
+#                       'ugr Name']
+#     # Initialize result object
+#     result = {
+#         "statusCode": 200,
+#         "error": '',
+#         "data": {
+#             "panNo": '',
+#             "name": '',
+#             "fatherName": '',
+#             "dob": ''
+#         }
+#     }
+#     # Clean the array by removing unwanted words and invalid entries
+#     cleaned_data = []
+#     combination_pattern = re.compile(r'(?=.*[0-9])(?=.*[!@#$%^&*(),?":{}|<>])')
+#     for item in data:
+#         if item not in unwanted_words and not combination_pattern.search(item):
+#             cleaned_data.append(item)
+#     # Check and extract PAN number
+#     pan_pattern = re.compile(r'^[A-Z]{5}\s*[0-9]{4}\s*[A-Z]$')
+#     for item in cleaned_data:
+#         if pan_pattern.match(item):
+#             result["data"]["panNo"] = item
+#             cleaned_data.remove(item)
+#             break
+#     # Check and extract date of birth
+#     dob_pattern = re.compile(r'^\d{2}[-/]\d{2}[-/]\d{4}$')
+#     for item in cleaned_data:
+#         if dob_pattern.match(item):
+#             result["data"]["dob"] = item
+#             cleaned_data.remove(item)
+#             break
+#     # If only two values are left, assume they are name and father's name
+#     cleaned_data = filter_array(cleaned_data)
+#     if len(cleaned_data) == 2:
+#         result["data"]["name"] = cleaned_data[0]
+#         result["data"]["fatherName"] = cleaned_data[1]
+#     else:
+#         # Further cleaning of the data array to extract name and father's name
+#         cleaned_data = [item for item in cleaned_data if not combination_pattern.search(item) and item not in unwanted_words]
+#         print(cleaned_data, "after cleaning")
+#         # Check and extract name
+#         name_pattern = re.compile(r'^[A-Za-z .]+$')
+#         if len(cleaned_data) > 0 and name_pattern.match(cleaned_data[0]):
+#             result["data"]["name"] = cleaned_data[0]
+#         else:
+#             result["data"]["name"] = ''
+#         # Check and extract father's name
+#         if len(cleaned_data) > 1 and name_pattern.match(cleaned_data[1]):
+#             result["data"]["fatherName"] = cleaned_data[1]
+#         else:
+#             result["data"]["fatherName"] = ''
+#     # Check if any value is empty and set error message
+#     for key, value in result["data"].items():
+#         if value == '':
+#             result["statusCode"] = 400
+#             result["error"] = f"{key} value is not found due to bad image."
+#             break
+#     return result

app/services/panServices/panOcr.py CHANGED Viewed

@@ -1,66 +1,66 @@
-from io import BytesIO
-from ...utils.azureOCR import analyze_image
-from ...utils.imageUtils import resize_if_needed, all_cropped_images_to_one_image
-from .panDataExtractor import extract_panData
-# from collections import defaultdict
-def process_results(results, img):
-    label_indices = {"pan_num": 0, "name": 1, "father": 2, "dob": 3}
-    confidence_threshold = 0.3
-    input_image_format = img.format if img.format else "PNG"
-    valid_formats = ["JPEG", "PNG", "BMP", "GIF", "TIFF"]
-    input_image_format = input_image_format if input_image_format in valid_formats else "PNG"
-    best_crops = {label: (None, -1) for label in label_indices.keys()}  # Store best (image, confidence) pairs
-    precision_data = {label: {"correct": 0, "total": 0} for label in label_indices.keys()}
-    for result in results:
-        for bbox, cls, conf in zip(result.boxes.xyxy, result.boxes.cls, result.boxes.conf):
-            # Ensure the class index is within the bounds of the label list
-            if int(cls) >= len(label_indices):
-                print(f"Warning: Class index {cls} is out of range. Skipping this bbox.")
-                continue
-            label = list(label_indices.keys())[int(cls)]
-            print(label, conf)
-            if conf < confidence_threshold:
-                continue
-            x1, y1, x2, y2 = map(int, bbox.tolist())
-            crop_img = img.crop((x1, y1, x2, y2))
-            crop_img = resize_if_needed(crop_img)
-            crop_img.save(f"temp_{label}.png")
-            # Replace old crop if new one has higher confidence
-            _, best_conf = best_crops[label]
-            if conf > best_conf:
-                best_crops[label] = (crop_img, conf)
-                precision_data[label]["total"] += 1
-                precision_data[label]["correct"] += 1  # Replace with actual OCR validation check
-    # Extract the images for final processing
-    cropped_images_with_labels = [(img, label_indices[label], conf) for label, (img, conf) in best_crops.items() if img is not None]
-    # Sort the images by their label indices in ascending order
-    cropped_images_with_labels.sort(key=lambda x: x[1])
-    print(cropped_images_with_labels, "cropped images with labels")
-    if not cropped_images_with_labels:
-        raise ValueError("No images were cropped.")
-    # Extract only the images for concatenation
-    cropped_images = [img for img, _, _ in cropped_images_with_labels]
-    final_image = all_cropped_images_to_one_image(cropped_images, separator_image_path='app/utils/seprator3.png')
-    buffer = BytesIO()
-    final_image.save(buffer, format=input_image_format)
-    buffer.seek(0)
-    response = analyze_image(buffer.getvalue(), input_image_format)
-    print(response, "response")
-    lines = response['readResult']['blocks'][0]['lines']
-    texts = [line['text'] for line in lines]
-    print(texts, "text after microsoft ocr")
-    extracted_data = extract_panData(texts)
-    return extracted_data

+# from io import BytesIO
+# from ...utils.azureOCR import analyze_image
+# from ...utils.imageUtils import resize_if_needed, all_cropped_images_to_one_image
+# from .panDataExtractor import extract_panData
+# # from collections import defaultdict
+# def process_results(results, img):
+#     label_indices = {"pan_num": 0, "name": 1, "father": 2, "dob": 3}
+#     confidence_threshold = 0.3
+#     input_image_format = img.format if img.format else "PNG"
+#     valid_formats = ["JPEG", "PNG", "BMP", "GIF", "TIFF"]
+#     input_image_format = input_image_format if input_image_format in valid_formats else "PNG"
+#     best_crops = {label: (None, -1) for label in label_indices.keys()}  # Store best (image, confidence) pairs
+#     precision_data = {label: {"correct": 0, "total": 0} for label in label_indices.keys()}
+#     for result in results:
+#         for bbox, cls, conf in zip(result.boxes.xyxy, result.boxes.cls, result.boxes.conf):
+#             # Ensure the class index is within the bounds of the label list
+#             if int(cls) >= len(label_indices):
+#                 print(f"Warning: Class index {cls} is out of range. Skipping this bbox.")
+#                 continue
+#             label = list(label_indices.keys())[int(cls)]
+#             print(label, conf)
+#             if conf < confidence_threshold:
+#                 continue
+#             x1, y1, x2, y2 = map(int, bbox.tolist())
+#             crop_img = img.crop((x1, y1, x2, y2))
+#             crop_img = resize_if_needed(crop_img)
+#             crop_img.save(f"temp_{label}.png")
+#             # Replace old crop if new one has higher confidence
+#             _, best_conf = best_crops[label]
+#             if conf > best_conf:
+#                 best_crops[label] = (crop_img, conf)
+#                 precision_data[label]["total"] += 1
+#                 precision_data[label]["correct"] += 1  # Replace with actual OCR validation check
+#     # Extract the images for final processing
+#     cropped_images_with_labels = [(img, label_indices[label], conf) for label, (img, conf) in best_crops.items() if img is not None]
+#     # Sort the images by their label indices in ascending order
+#     cropped_images_with_labels.sort(key=lambda x: x[1])
+#     print(cropped_images_with_labels, "cropped images with labels")
+#     if not cropped_images_with_labels:
+#         raise ValueError("No images were cropped.")
+#     # Extract only the images for concatenation
+#     cropped_images = [img for img, _, _ in cropped_images_with_labels]
+#     final_image = all_cropped_images_to_one_image(cropped_images, separator_image_path='app/utils/seprator3.png')
+#     buffer = BytesIO()
+#     final_image.save(buffer, format=input_image_format)
+#     buffer.seek(0)
+#     response = analyze_image(buffer.getvalue(), input_image_format)
+#     print(response, "response")
+#     lines = response['readResult']['blocks'][0]['lines']
+#     texts = [line['text'] for line in lines]
+#     print(texts, "text after microsoft ocr")
+#     extracted_data = extract_panData(texts)
+#     return extracted_data

requirements.txt CHANGED Viewed

@@ -3,9 +3,9 @@ Werkzeug==2.0.2
 python-dotenv
 gunicorn
 requests
-torch
-ultralytics
-opencv-python-headless
-Pillow
-pytesseract
-openpyxl

 python-dotenv
 gunicorn
 requests
+# torch
+# ultralytics
+# opencv-python-headless
+# Pillow
+# pytesseract
+# openpyxl