Pavan+2-at-244075126032 committed on
Commit
75a02d0
·
1 Parent(s): cfed0df

Comment out all OCR services to test UIDAI

Browse files
app/api.py CHANGED
@@ -3,20 +3,20 @@ from app.routes.uidaiServices import generate_captcha, validate_aadhaar
3
  from flask import Blueprint, request, jsonify
4
  import requests
5
  import os
6
- from .routes.adhaarApi import ocrAdhaar
7
- from .routes.panApi import ocrPan
8
  import uuid
9
  ocr_bp = Blueprint('ocr', __name__)
10
  session = requests.Session()
11
  mode = os.getenv("PROJECT_MODE")
12
 
13
- @ocr_bp.route('/ocrPan', methods=['POST'])
14
- def getResponse_Pan():
15
- return ocrPan(mode, session)
16
 
17
- @ocr_bp.route('/ocrAdhaar', methods=['POST'])
18
- def getResponse_Adhaar():
19
- return ocrAdhaar(mode, session)
20
 
21
  # Route to generate captcha
22
  @ocr_bp.route('/generateCaptcha', methods=['POST'])
 
3
  from flask import Blueprint, request, jsonify
4
  import requests
5
  import os
6
+ # from .routes.adhaarApi import ocrAdhaar
7
+ # from .routes.panApi import ocrPan
8
  import uuid
9
  ocr_bp = Blueprint('ocr', __name__)
10
  session = requests.Session()
11
  mode = os.getenv("PROJECT_MODE")
12
 
13
+ # @ocr_bp.route('/ocrPan', methods=['POST'])
14
+ # def getResponse_Pan():
15
+ # return ocrPan(mode, session)
16
 
17
+ # @ocr_bp.route('/ocrAdhaar', methods=['POST'])
18
+ # def getResponse_Adhaar():
19
+ # return ocrAdhaar(mode, session)
20
 
21
  # Route to generate captcha
22
  @ocr_bp.route('/generateCaptcha', methods=['POST'])
app/routes/adhaarApi.py CHANGED
@@ -1,70 +1,70 @@
1
- import re
2
- from flask import request, jsonify, current_app
3
- from PIL import Image, UnidentifiedImageError
4
- from io import BytesIO
5
- import base64
6
- import requests
7
- from ..services.adhaarServices.ocr import process_results
8
- import io
9
- def ocrAdhaar(mode, session):
10
- try:
11
- print("API HIT ************* AADHAAROCR")
12
- data = request.get_json()
13
- if not data:
14
- return jsonify({"error": "Invalid request payload"}), 400
15
 
16
- if mode == "prod":
17
- if not data.get('image'):
18
- return jsonify({"error": "Image data/buffer is required"}), 400
19
 
20
- #removing 'data:image/png,base64 ' from buffer.
21
- imgBuffer = data.get('image')
22
- imgBuffer = re.sub("^data:image/.+;base64,", "", imgBuffer)
23
- # Adjust base64 string padding
24
- if len(imgBuffer) % 4:
25
- imgBuffer += '=' * (4 - len(imgBuffer) % 4)
26
 
27
- try:
28
- img_data = base64.b64decode(imgBuffer)
29
- img = Image.open(BytesIO(img_data))
30
- img.verify() # Verify image format
31
- img = Image.open(io.BytesIO(img_data)) # Re-open image after verification
32
- except (base64.binascii.Error, ValueError) as decode_err:
33
- return jsonify({"error": f"Image decoding failed: {str(decode_err)}"}), 400
34
- except UnidentifiedImageError:
35
- return jsonify({"error": "Unable to identify image format."}), 400
36
 
37
- elif mode == "dev":
38
- if not data.get('imgUrl'):
39
- return jsonify({"error": "Image URL is required"}), 400
40
 
41
- img_url = data.get('imgUrl')
42
- response = session.get(img_url)
43
- response.raise_for_status()
44
- img = Image.open(BytesIO(response.content))
45
- img.verify() # Verify image format
46
- img = Image.open(BytesIO(response.content)) # Re-open image after verification
47
 
48
- else:
49
- return jsonify({"error": "Invalid mode configuration"}), 500
50
 
51
- # Check image format
52
- if img.format not in ['JPEG', 'JPG', 'PNG']:
53
- return jsonify({"error": "Invalid image format. Only JPG and PNG are supported."}), 400
54
 
55
- # Run detection
56
- model = current_app.models.get('adhaarModel')
57
- results = model.predict(source=img, save=False)
58
- extracted_data = process_results(results, img)
59
 
60
- if extracted_data.get('statusCode') == 400:
61
- return jsonify(extracted_data), 400
62
 
63
- return jsonify(extracted_data), 200
64
- except requests.RequestException as req_err:
65
- return jsonify({"error": f"Image download failed: {str(req_err)}"}), 500
66
- except UnidentifiedImageError:
67
- return jsonify({"error": "Unable to identify image format."}), 400
68
- except Exception as e:
69
- current_app.logger.error(f"Unexpected error: {str(e)}")
70
- return jsonify({"error": "An unexpected error occurred."}), 500
 
1
+ # import re
2
+ # from flask import request, jsonify, current_app
3
+ # from PIL import Image, UnidentifiedImageError
4
+ # from io import BytesIO
5
+ # import base64
6
+ # import requests
7
+ # from ..services.adhaarServices.ocr import process_results
8
+ # import io
9
+ # def ocrAdhaar(mode, session):
10
+ # try:
11
+ # print("API HIT ************* AADHAAROCR")
12
+ # data = request.get_json()
13
+ # if not data:
14
+ # return jsonify({"error": "Invalid request payload"}), 400
15
 
16
+ # if mode == "prod":
17
+ # if not data.get('image'):
18
+ # return jsonify({"error": "Image data/buffer is required"}), 400
19
 
20
+ # #removing 'data:image/png,base64 ' from buffer.
21
+ # imgBuffer = data.get('image')
22
+ # imgBuffer = re.sub("^data:image/.+;base64,", "", imgBuffer)
23
+ # # Adjust base64 string padding
24
+ # if len(imgBuffer) % 4:
25
+ # imgBuffer += '=' * (4 - len(imgBuffer) % 4)
26
 
27
+ # try:
28
+ # img_data = base64.b64decode(imgBuffer)
29
+ # img = Image.open(BytesIO(img_data))
30
+ # img.verify() # Verify image format
31
+ # img = Image.open(io.BytesIO(img_data)) # Re-open image after verification
32
+ # except (base64.binascii.Error, ValueError) as decode_err:
33
+ # return jsonify({"error": f"Image decoding failed: {str(decode_err)}"}), 400
34
+ # except UnidentifiedImageError:
35
+ # return jsonify({"error": "Unable to identify image format."}), 400
36
 
37
+ # elif mode == "dev":
38
+ # if not data.get('imgUrl'):
39
+ # return jsonify({"error": "Image URL is required"}), 400
40
 
41
+ # img_url = data.get('imgUrl')
42
+ # response = session.get(img_url)
43
+ # response.raise_for_status()
44
+ # img = Image.open(BytesIO(response.content))
45
+ # img.verify() # Verify image format
46
+ # img = Image.open(BytesIO(response.content)) # Re-open image after verification
47
 
48
+ # else:
49
+ # return jsonify({"error": "Invalid mode configuration"}), 500
50
 
51
+ # # Check image format
52
+ # if img.format not in ['JPEG', 'JPG', 'PNG']:
53
+ # return jsonify({"error": "Invalid image format. Only JPG and PNG are supported."}), 400
54
 
55
+ # # Run detection
56
+ # model = current_app.models.get('adhaarModel')
57
+ # results = model.predict(source=img, save=False)
58
+ # extracted_data = process_results(results, img)
59
 
60
+ # if extracted_data.get('statusCode') == 400:
61
+ # return jsonify(extracted_data), 400
62
 
63
+ # return jsonify(extracted_data), 200
64
+ # except requests.RequestException as req_err:
65
+ # return jsonify({"error": f"Image download failed: {str(req_err)}"}), 500
66
+ # except UnidentifiedImageError:
67
+ # return jsonify({"error": "Unable to identify image format."}), 400
68
+ # except Exception as e:
69
+ # current_app.logger.error(f"Unexpected error: {str(e)}")
70
+ # return jsonify({"error": "An unexpected error occurred."}), 500
app/routes/panApi.py CHANGED
@@ -1,74 +1,74 @@
1
- import re
2
- from flask import request, jsonify, current_app
3
- from PIL import Image, UnidentifiedImageError
4
- from io import BytesIO
5
- import base64
6
- import requests
7
- from ..services.panServices.panOcr import process_results
8
- import io
9
- def ocrPan(mode, session):
10
- try:
11
- print("API HIT ************* PANOCR")
12
- data = request.get_json()
13
- if not data:
14
- return jsonify({"error": "Invalid request payload"}), 400
15
 
16
- if mode == "prod":
17
- if not data.get('image'):
18
- return jsonify({"error": "Image data/buffer is required"}), 400
19
 
20
- #removing 'data:image/png,base64 ' from buffer.
21
- imgBuffer = data.get('image')
22
- imgBuffer = re.sub("^data:image/.+;base64,", "", imgBuffer)
23
- # Adjust base64 string padding
24
- if len(imgBuffer) % 4:
25
- imgBuffer += '=' * (4 - len(imgBuffer) % 4)
26
 
27
- try:
28
- img_data = base64.b64decode(imgBuffer)
29
- img = Image.open(BytesIO(img_data))
30
- img.verify() # Verify image format
31
- print(img, "img")
32
- img = Image.open(io.BytesIO(img_data)) # Re-open image after verification
33
- except (base64.binascii.Error, ValueError) as decode_err:
34
- return jsonify({"error": f"Image decoding failed: {str(decode_err)}"}), 400
35
- except UnidentifiedImageError:
36
- return jsonify({"error": "Unable to identify image format."}), 400
37
 
38
- elif mode == "dev":
39
- if not data.get('imgUrl'):
40
- return jsonify({"error": "Image URL is required"}), 400
41
 
42
- img_url = data.get('imgUrl')
43
- response = session.get(img_url)
44
- response.raise_for_status()
45
- img = Image.open(BytesIO(response.content))
46
- print(img, "img")
47
- img.verify() # Verify image format
48
- img = Image.open(BytesIO(response.content)) # Re-open image after verification
49
 
50
- else:
51
- return jsonify({"error": "Invalid mode configuration"}), 500
52
 
53
- # Check image format
54
- if img.format not in ['JPEG', 'JPG', 'PNG']:
55
- return jsonify({"error": "Invalid image format. Only JPG and PNG are supported."}), 400
56
 
57
- # Run detection
58
- model = current_app.models.get('panModel')
59
- results = model.predict(source=img, imgsz=680, iou=0.7, augment=True)
60
- # print(results,"model result")
61
- extracted_data = process_results(results, img)
62
- # print(extracted_data, "extracted data")
63
 
64
- if extracted_data.get('statusCode') == 400:
65
- return jsonify(extracted_data), 400
66
 
67
- return jsonify(extracted_data), 200
68
- except requests.RequestException as req_err:
69
- return jsonify({"error": f"Image download failed: {str(req_err)}"}), 500
70
- except UnidentifiedImageError:
71
- return jsonify({"error": "Unable to identify image format."}), 400
72
- except Exception as e:
73
- current_app.logger.error(f"Unexpected error: {str(e)}")
74
- return jsonify({"error": "An unexpected error occurred on api call."}), 500
 
1
+ # import re
2
+ # from flask import request, jsonify, current_app
3
+ # from PIL import Image, UnidentifiedImageError
4
+ # from io import BytesIO
5
+ # import base64
6
+ # import requests
7
+ # from ..services.panServices.panOcr import process_results
8
+ # import io
9
+ # def ocrPan(mode, session):
10
+ # try:
11
+ # print("API HIT ************* PANOCR")
12
+ # data = request.get_json()
13
+ # if not data:
14
+ # return jsonify({"error": "Invalid request payload"}), 400
15
 
16
+ # if mode == "prod":
17
+ # if not data.get('image'):
18
+ # return jsonify({"error": "Image data/buffer is required"}), 400
19
 
20
+ # #removing 'data:image/png,base64 ' from buffer.
21
+ # imgBuffer = data.get('image')
22
+ # imgBuffer = re.sub("^data:image/.+;base64,", "", imgBuffer)
23
+ # # Adjust base64 string padding
24
+ # if len(imgBuffer) % 4:
25
+ # imgBuffer += '=' * (4 - len(imgBuffer) % 4)
26
 
27
+ # try:
28
+ # img_data = base64.b64decode(imgBuffer)
29
+ # img = Image.open(BytesIO(img_data))
30
+ # img.verify() # Verify image format
31
+ # print(img, "img")
32
+ # img = Image.open(io.BytesIO(img_data)) # Re-open image after verification
33
+ # except (base64.binascii.Error, ValueError) as decode_err:
34
+ # return jsonify({"error": f"Image decoding failed: {str(decode_err)}"}), 400
35
+ # except UnidentifiedImageError:
36
+ # return jsonify({"error": "Unable to identify image format."}), 400
37
 
38
+ # elif mode == "dev":
39
+ # if not data.get('imgUrl'):
40
+ # return jsonify({"error": "Image URL is required"}), 400
41
 
42
+ # img_url = data.get('imgUrl')
43
+ # response = session.get(img_url)
44
+ # response.raise_for_status()
45
+ # img = Image.open(BytesIO(response.content))
46
+ # print(img, "img")
47
+ # img.verify() # Verify image format
48
+ # img = Image.open(BytesIO(response.content)) # Re-open image after verification
49
 
50
+ # else:
51
+ # return jsonify({"error": "Invalid mode configuration"}), 500
52
 
53
+ # # Check image format
54
+ # if img.format not in ['JPEG', 'JPG', 'PNG']:
55
+ # return jsonify({"error": "Invalid image format. Only JPG and PNG are supported."}), 400
56
 
57
+ # # Run detection
58
+ # model = current_app.models.get('panModel')
59
+ # results = model.predict(source=img, imgsz=680, iou=0.7, augment=True)
60
+ # # print(results,"model result")
61
+ # extracted_data = process_results(results, img)
62
+ # # print(extracted_data, "extracted data")
63
 
64
+ # if extracted_data.get('statusCode') == 400:
65
+ # return jsonify(extracted_data), 400
66
 
67
+ # return jsonify(extracted_data), 200
68
+ # except requests.RequestException as req_err:
69
+ # return jsonify({"error": f"Image download failed: {str(req_err)}"}), 500
70
+ # except UnidentifiedImageError:
71
+ # return jsonify({"error": "Unable to identify image format."}), 400
72
+ # except Exception as e:
73
+ # current_app.logger.error(f"Unexpected error: {str(e)}")
74
+ # return jsonify({"error": "An unexpected error occurred on api call."}), 500
app/services/adhaarServices/adhaarDataExtractor.py CHANGED
@@ -1,59 +1,59 @@
1
- import datetime
2
 
3
- def extract_details(texts):
4
- details = {'name': '', 'gender': '', 'dob': '', 'aadhaarNo': ''}
5
 
6
- current_year = datetime.datetime.now().year
7
 
8
- for text in texts:
9
- # Check if colon exists in text and split accordingly
10
- if ':' in text:
11
- text = text.split(':')[1].strip()
12
 
13
- cleaned_text = text.replace(':', '').strip()
14
 
15
- # Remove leading non-alphabetic characters for gender detection and strip spaces
16
- cleaned_gender = cleaned_text.lstrip('.-/').strip()
17
 
18
- # Check if the text is the name (only alphabets, spaces, and possibly dots)
19
- if (all(char.isalpha() or char.isspace() or char == '.' for char in cleaned_text)
20
- and cleaned_gender.lower() not in ['male', 'female']):
21
- details['name'] = cleaned_text
22
 
23
- # Check if the text is the DOB (format: dd/mm/yyyy or yyyy)
24
- elif (len(cleaned_text) == 4 and
25
- cleaned_text.isdigit() and
26
- 1900 < int(cleaned_text) < current_year):
27
- details['dob'] = cleaned_text
28
 
29
- # Check if the text is the DOB (format: dd/mm/yyyy or dd-mm-yyyy)
30
- elif (len(cleaned_text) == 10 and
31
- (cleaned_text[2] in ['/', '-']) and
32
- (cleaned_text[5] in ['/', '-']) and
33
- cleaned_text.replace('/', '').replace('-', '').isdigit()):
34
- details['dob'] = cleaned_text
35
 
36
- # Check if the text is the gender (either 'Male' or 'Female')
37
- elif cleaned_gender.lower() in ['male', 'female']:
38
- details['gender'] = cleaned_gender.capitalize()
39
 
40
- # Check if the text is the Aadhaar number (12 digits after removing spaces)
41
- elif cleaned_text.replace(' ', '').isdigit() and len(cleaned_text.replace(' ', '')) == 12:
42
- details['aadhaarNo'] = cleaned_text
43
 
44
- # Check if any key's value is empty
45
- if any(value == '' for value in details.values()):
46
- error_key = next(key for key, value in details.items() if value == '')
47
- result = {
48
- 'statusCode': 400,
49
- 'result': details,
50
- 'error': f'{error_key} value is not found due to bad image.'
51
- }
52
- else:
53
- result = {
54
- 'statusCode': 200,
55
- 'result': details,
56
- 'error': ''
57
- }
58
 
59
- return result
 
1
+ # import datetime
2
 
3
+ # def extract_details(texts):
4
+ # details = {'name': '', 'gender': '', 'dob': '', 'aadhaarNo': ''}
5
 
6
+ # current_year = datetime.datetime.now().year
7
 
8
+ # for text in texts:
9
+ # # Check if colon exists in text and split accordingly
10
+ # if ':' in text:
11
+ # text = text.split(':')[1].strip()
12
 
13
+ # cleaned_text = text.replace(':', '').strip()
14
 
15
+ # # Remove leading non-alphabetic characters for gender detection and strip spaces
16
+ # cleaned_gender = cleaned_text.lstrip('.-/').strip()
17
 
18
+ # # Check if the text is the name (only alphabets, spaces, and possibly dots)
19
+ # if (all(char.isalpha() or char.isspace() or char == '.' for char in cleaned_text)
20
+ # and cleaned_gender.lower() not in ['male', 'female']):
21
+ # details['name'] = cleaned_text
22
 
23
+ # # Check if the text is the DOB (format: dd/mm/yyyy or yyyy)
24
+ # elif (len(cleaned_text) == 4 and
25
+ # cleaned_text.isdigit() and
26
+ # 1900 < int(cleaned_text) < current_year):
27
+ # details['dob'] = cleaned_text
28
 
29
+ # # Check if the text is the DOB (format: dd/mm/yyyy or dd-mm-yyyy)
30
+ # elif (len(cleaned_text) == 10 and
31
+ # (cleaned_text[2] in ['/', '-']) and
32
+ # (cleaned_text[5] in ['/', '-']) and
33
+ # cleaned_text.replace('/', '').replace('-', '').isdigit()):
34
+ # details['dob'] = cleaned_text
35
 
36
+ # # Check if the text is the gender (either 'Male' or 'Female')
37
+ # elif cleaned_gender.lower() in ['male', 'female']:
38
+ # details['gender'] = cleaned_gender.capitalize()
39
 
40
+ # # Check if the text is the Aadhaar number (12 digits after removing spaces)
41
+ # elif cleaned_text.replace(' ', '').isdigit() and len(cleaned_text.replace(' ', '')) == 12:
42
+ # details['aadhaarNo'] = cleaned_text
43
 
44
+ # # Check if any key's value is empty
45
+ # if any(value == '' for value in details.values()):
46
+ # error_key = next(key for key, value in details.items() if value == '')
47
+ # result = {
48
+ # 'statusCode': 400,
49
+ # 'result': details,
50
+ # 'error': f'{error_key} value is not found due to bad image.'
51
+ # }
52
+ # else:
53
+ # result = {
54
+ # 'statusCode': 200,
55
+ # 'result': details,
56
+ # 'error': ''
57
+ # }
58
 
59
+ # return result
app/services/adhaarServices/ocr.py CHANGED
@@ -1,44 +1,44 @@
1
- from io import BytesIO
2
- from ...utils.azureOCR import analyze_image
3
- from ...utils.imageUtils import resize_if_needed, all_cropped_images_to_one_image
4
- from app.services.adhaarServices.adhaarDataExtractor import extract_details
5
 
6
- def process_results(results, img):
7
- precision_data = {label: {"correct": 0, "total": 0} for label in ["aadharNo", "name", "dob", "gender", "address"]}
8
- confidence_threshold = 0.3
9
- input_image_format = img.format if img.format else "PNG"
10
- valid_formats = ["JPEG", "PNG", "BMP", "GIF", "TIFF"]
11
- input_image_format = input_image_format if input_image_format in valid_formats else "PNG"
12
 
13
- label_to_image = {}
14
- extracted_data = {"adhaarNo": "", "dob": "", "gender": "", "name": "", "address": ""}
15
- for result in results:
16
- for bbox, cls, conf in zip(result.boxes.xyxy, result.boxes.cls, result.boxes.conf):
17
- label = ["aadharNo", "dob", "gender", "name", "address"][int(cls)]
18
- print(label, conf)
19
- if conf < confidence_threshold or label == "address":
20
- continue
21
 
22
- x1, y1, x2, y2 = map(int, bbox.tolist())
23
- crop_img = img.crop((x1, y1, x2, y2))
24
- crop_img = resize_if_needed(crop_img)
25
 
26
- if label not in label_to_image or label_to_image[label][1] < conf:
27
- label_to_image[label] = (crop_img, conf)
28
- precision_data[label]["total"] += 1
29
- precision_data[label]["correct"] += 1 # Replace with actual OCR validation check
30
 
31
- cropped_images = [img for label, (img, conf) in sorted(label_to_image.items()) if label != "address"]
32
- final_image = all_cropped_images_to_one_image(cropped_images, separator_image_path='app/utils/seprator3.png')
33
 
34
- buffer = BytesIO()
35
- final_image.save(buffer, format=input_image_format)
36
- buffer.seek(0)
37
 
38
- response = analyze_image(buffer.getvalue(), input_image_format)
39
- # print(response)
40
- lines = response['readResult']['blocks'][0]['lines']
41
- texts = [line['text'] for line in lines]
42
- print(texts)
43
- extracted_data = extract_details(texts)
44
- return extracted_data
 
1
+ # from io import BytesIO
2
+ # from ...utils.azureOCR import analyze_image
3
+ # from ...utils.imageUtils import resize_if_needed, all_cropped_images_to_one_image
4
+ # from app.services.adhaarServices.adhaarDataExtractor import extract_details
5
 
6
+ # def process_results(results, img):
7
+ # precision_data = {label: {"correct": 0, "total": 0} for label in ["aadharNo", "name", "dob", "gender", "address"]}
8
+ # confidence_threshold = 0.3
9
+ # input_image_format = img.format if img.format else "PNG"
10
+ # valid_formats = ["JPEG", "PNG", "BMP", "GIF", "TIFF"]
11
+ # input_image_format = input_image_format if input_image_format in valid_formats else "PNG"
12
 
13
+ # label_to_image = {}
14
+ # extracted_data = {"adhaarNo": "", "dob": "", "gender": "", "name": "", "address": ""}
15
+ # for result in results:
16
+ # for bbox, cls, conf in zip(result.boxes.xyxy, result.boxes.cls, result.boxes.conf):
17
+ # label = ["aadharNo", "dob", "gender", "name", "address"][int(cls)]
18
+ # print(label, conf)
19
+ # if conf < confidence_threshold or label == "address":
20
+ # continue
21
 
22
+ # x1, y1, x2, y2 = map(int, bbox.tolist())
23
+ # crop_img = img.crop((x1, y1, x2, y2))
24
+ # crop_img = resize_if_needed(crop_img)
25
 
26
+ # if label not in label_to_image or label_to_image[label][1] < conf:
27
+ # label_to_image[label] = (crop_img, conf)
28
+ # precision_data[label]["total"] += 1
29
+ # precision_data[label]["correct"] += 1 # Replace with actual OCR validation check
30
 
31
+ # cropped_images = [img for label, (img, conf) in sorted(label_to_image.items()) if label != "address"]
32
+ # final_image = all_cropped_images_to_one_image(cropped_images, separator_image_path='app/utils/seprator3.png')
33
 
34
+ # buffer = BytesIO()
35
+ # final_image.save(buffer, format=input_image_format)
36
+ # buffer.seek(0)
37
 
38
+ # response = analyze_image(buffer.getvalue(), input_image_format)
39
+ # # print(response)
40
+ # lines = response['readResult']['blocks'][0]['lines']
41
+ # texts = [line['text'] for line in lines]
42
+ # print(texts)
43
+ # extracted_data = extract_details(texts)
44
+ # return extracted_data
app/services/panServices/panDataExtractor.py CHANGED
@@ -1,95 +1,95 @@
1
- import re
2
- def filter_array(arr):
3
- # Define the regex patterns
4
- pattern_alphanumeric_special = re.compile(r'[\w]+[^.\s\w]+|[^.\s\w]+[\w]+')
5
- pattern_numeric = re.compile(r'^[0-9]+$')
6
- pattern_special_chars = re.compile(r'[^a-zA-Z.\s]+')
7
 
8
- # Filter the array
9
- filtered_array = [
10
- item for item in arr
11
- if not (pattern_alphanumeric_special.search(item) or
12
- pattern_numeric.match(item) or
13
- pattern_special_chars.search(item))
14
- ]
15
- return filtered_array
16
 
17
- def extract_panData(data):
18
- unwanted_words = ["Name", "/Name", 'Permanent', 'Account', 'Number', 'Card', 'नाम', '/Name',
19
- "पिता का नाम", 'नाम / Name', "पिता का नाम/ Father's Name", '414 / Name', 'पिता का नाम / Fath',
20
- "VIT VE Hra / Father's Nama", 'पिता का नाम/ Fal', 'पिता का नाम / Fathe', "पिता का नाम / Father's Na",
21
- 'जन्म की तारीख /।', 'जन्म का ताराख', "पिता का नाम/ Father's Nam", 'नाम /Name', "पिता का नाम / Father's Name",
22
- 'जन्म का वाराज़', 'Date of Birth', 'Permanent Account Number Card', "Date of Birth", "/Date of Birth",
23
- "Permanent Account Number", "Father's Name", "14 /Name", "/Father's Name", 'HTH / Name',"inent Account Number", "anent Account Number C","Permanent Account Number Car",
24
- 'ugr Name']
25
 
26
 
27
 
28
 
29
- # Initialize result object
30
- result = {
31
- "statusCode": 200,
32
- "error": '',
33
- "data": {
34
- "panNo": '',
35
- "name": '',
36
- "fatherName": '',
37
- "dob": ''
38
- }
39
- }
40
 
41
- # Clean the array by removing unwanted words and invalid entries
42
- cleaned_data = []
43
- combination_pattern = re.compile(r'(?=.*[0-9])(?=.*[!@#$%^&*(),?":{}|<>])')
44
 
45
- for item in data:
46
- if item not in unwanted_words and not combination_pattern.search(item):
47
- cleaned_data.append(item)
48
 
49
 
50
- # Check and extract PAN number
51
- pan_pattern = re.compile(r'^[A-Z]{5}\s*[0-9]{4}\s*[A-Z]$')
52
- for item in cleaned_data:
53
- if pan_pattern.match(item):
54
- result["data"]["panNo"] = item
55
- cleaned_data.remove(item)
56
- break
57
 
58
- # Check and extract date of birth
59
- dob_pattern = re.compile(r'^\d{2}[-/]\d{2}[-/]\d{4}$')
60
- for item in cleaned_data:
61
- if dob_pattern.match(item):
62
- result["data"]["dob"] = item
63
- cleaned_data.remove(item)
64
- break
65
 
66
- # If only two values are left, assume they are name and father's name
67
- cleaned_data = filter_array(cleaned_data)
68
- if len(cleaned_data) == 2:
69
- result["data"]["name"] = cleaned_data[0]
70
- result["data"]["fatherName"] = cleaned_data[1]
71
- else:
72
- # Further cleaning of the data array to extract name and father's name
73
- cleaned_data = [item for item in cleaned_data if not combination_pattern.search(item) and item not in unwanted_words]
74
- print(cleaned_data, "after cleaning")
75
- # Check and extract name
76
- name_pattern = re.compile(r'^[A-Za-z .]+$')
77
- if len(cleaned_data) > 0 and name_pattern.match(cleaned_data[0]):
78
- result["data"]["name"] = cleaned_data[0]
79
- else:
80
- result["data"]["name"] = ''
81
 
82
- # Check and extract father's name
83
- if len(cleaned_data) > 1 and name_pattern.match(cleaned_data[1]):
84
- result["data"]["fatherName"] = cleaned_data[1]
85
- else:
86
- result["data"]["fatherName"] = ''
87
 
88
- # Check if any value is empty and set error message
89
- for key, value in result["data"].items():
90
- if value == '':
91
- result["statusCode"] = 400
92
- result["error"] = f"{key} value is not found due to bad image."
93
- break
94
 
95
- return result
 
1
+ # import re
2
+ # def filter_array(arr):
3
+ # # Define the regex patterns
4
+ # pattern_alphanumeric_special = re.compile(r'[\w]+[^.\s\w]+|[^.\s\w]+[\w]+')
5
+ # pattern_numeric = re.compile(r'^[0-9]+$')
6
+ # pattern_special_chars = re.compile(r'[^a-zA-Z.\s]+')
7
 
8
+ # # Filter the array
9
+ # filtered_array = [
10
+ # item for item in arr
11
+ # if not (pattern_alphanumeric_special.search(item) or
12
+ # pattern_numeric.match(item) or
13
+ # pattern_special_chars.search(item))
14
+ # ]
15
+ # return filtered_array
16
 
17
+ # def extract_panData(data):
18
+ # unwanted_words = ["Name", "/Name", 'Permanent', 'Account', 'Number', 'Card', 'नाम', '/Name',
19
+ # "पिता का नाम", 'नाम / Name', "पिता का नाम/ Father's Name", '414 / Name', 'पिता का नाम / Fath',
20
+ # "VIT VE Hra / Father's Nama", 'पिता का नाम/ Fal', 'पिता का नाम / Fathe', "पिता का नाम / Father's Na",
21
+ # 'जन्म की तारीख /।', 'जन्म का ताराख', "पिता का नाम/ Father's Nam", 'नाम /Name', "पिता का नाम / Father's Name",
22
+ # 'जन्म का वाराज़', 'Date of Birth', 'Permanent Account Number Card', "Date of Birth", "/Date of Birth",
23
+ # "Permanent Account Number", "Father's Name", "14 /Name", "/Father's Name", 'HTH / Name',"inent Account Number", "anent Account Number C","Permanent Account Number Car",
24
+ # 'ugr Name']
25
 
26
 
27
 
28
 
29
+ # # Initialize result object
30
+ # result = {
31
+ # "statusCode": 200,
32
+ # "error": '',
33
+ # "data": {
34
+ # "panNo": '',
35
+ # "name": '',
36
+ # "fatherName": '',
37
+ # "dob": ''
38
+ # }
39
+ # }
40
 
41
+ # # Clean the array by removing unwanted words and invalid entries
42
+ # cleaned_data = []
43
+ # combination_pattern = re.compile(r'(?=.*[0-9])(?=.*[!@#$%^&*(),?":{}|<>])')
44
 
45
+ # for item in data:
46
+ # if item not in unwanted_words and not combination_pattern.search(item):
47
+ # cleaned_data.append(item)
48
 
49
 
50
+ # # Check and extract PAN number
51
+ # pan_pattern = re.compile(r'^[A-Z]{5}\s*[0-9]{4}\s*[A-Z]$')
52
+ # for item in cleaned_data:
53
+ # if pan_pattern.match(item):
54
+ # result["data"]["panNo"] = item
55
+ # cleaned_data.remove(item)
56
+ # break
57
 
58
+ # # Check and extract date of birth
59
+ # dob_pattern = re.compile(r'^\d{2}[-/]\d{2}[-/]\d{4}$')
60
+ # for item in cleaned_data:
61
+ # if dob_pattern.match(item):
62
+ # result["data"]["dob"] = item
63
+ # cleaned_data.remove(item)
64
+ # break
65
 
66
+ # # If only two values are left, assume they are name and father's name
67
+ # cleaned_data = filter_array(cleaned_data)
68
+ # if len(cleaned_data) == 2:
69
+ # result["data"]["name"] = cleaned_data[0]
70
+ # result["data"]["fatherName"] = cleaned_data[1]
71
+ # else:
72
+ # # Further cleaning of the data array to extract name and father's name
73
+ # cleaned_data = [item for item in cleaned_data if not combination_pattern.search(item) and item not in unwanted_words]
74
+ # print(cleaned_data, "after cleaning")
75
+ # # Check and extract name
76
+ # name_pattern = re.compile(r'^[A-Za-z .]+$')
77
+ # if len(cleaned_data) > 0 and name_pattern.match(cleaned_data[0]):
78
+ # result["data"]["name"] = cleaned_data[0]
79
+ # else:
80
+ # result["data"]["name"] = ''
81
 
82
+ # # Check and extract father's name
83
+ # if len(cleaned_data) > 1 and name_pattern.match(cleaned_data[1]):
84
+ # result["data"]["fatherName"] = cleaned_data[1]
85
+ # else:
86
+ # result["data"]["fatherName"] = ''
87
 
88
+ # # Check if any value is empty and set error message
89
+ # for key, value in result["data"].items():
90
+ # if value == '':
91
+ # result["statusCode"] = 400
92
+ # result["error"] = f"{key} value is not found due to bad image."
93
+ # break
94
 
95
+ # return result
app/services/panServices/panOcr.py CHANGED
@@ -1,66 +1,66 @@
1
- from io import BytesIO
2
- from ...utils.azureOCR import analyze_image
3
- from ...utils.imageUtils import resize_if_needed, all_cropped_images_to_one_image
4
- from .panDataExtractor import extract_panData
5
- # from collections import defaultdict
6
 
7
- def process_results(results, img):
8
- label_indices = {"pan_num": 0, "name": 1, "father": 2, "dob": 3}
9
- confidence_threshold = 0.3
10
- input_image_format = img.format if img.format else "PNG"
11
- valid_formats = ["JPEG", "PNG", "BMP", "GIF", "TIFF"]
12
- input_image_format = input_image_format if input_image_format in valid_formats else "PNG"
13
 
14
- best_crops = {label: (None, -1) for label in label_indices.keys()} # Store best (image, confidence) pairs
15
 
16
- precision_data = {label: {"correct": 0, "total": 0} for label in label_indices.keys()}
17
 
18
- for result in results:
19
- for bbox, cls, conf in zip(result.boxes.xyxy, result.boxes.cls, result.boxes.conf):
20
- # Ensure the class index is within the bounds of the label list
21
- if int(cls) >= len(label_indices):
22
- print(f"Warning: Class index {cls} is out of range. Skipping this bbox.")
23
- continue
24
 
25
- label = list(label_indices.keys())[int(cls)]
26
- print(label, conf)
27
- if conf < confidence_threshold:
28
- continue
29
 
30
- x1, y1, x2, y2 = map(int, bbox.tolist())
31
- crop_img = img.crop((x1, y1, x2, y2))
32
- crop_img = resize_if_needed(crop_img)
33
- crop_img.save(f"temp_{label}.png")
34
 
35
- # Replace old crop if new one has higher confidence
36
- _, best_conf = best_crops[label]
37
- if conf > best_conf:
38
- best_crops[label] = (crop_img, conf)
39
- precision_data[label]["total"] += 1
40
- precision_data[label]["correct"] += 1 # Replace with actual OCR validation check
41
 
42
- # Extract the images for final processing
43
- cropped_images_with_labels = [(img, label_indices[label], conf) for label, (img, conf) in best_crops.items() if img is not None]
44
 
45
- # Sort the images by their label indices in ascending order
46
- cropped_images_with_labels.sort(key=lambda x: x[1])
47
- print(cropped_images_with_labels, "cropped images with labels")
48
 
49
- if not cropped_images_with_labels:
50
- raise ValueError("No images were cropped.")
51
 
52
- # Extract only the images for concatenation
53
- cropped_images = [img for img, _, _ in cropped_images_with_labels]
54
 
55
- final_image = all_cropped_images_to_one_image(cropped_images, separator_image_path='app/utils/seprator3.png')
56
- buffer = BytesIO()
57
- final_image.save(buffer, format=input_image_format)
58
- buffer.seek(0)
59
 
60
- response = analyze_image(buffer.getvalue(), input_image_format)
61
- print(response, "response")
62
- lines = response['readResult']['blocks'][0]['lines']
63
- texts = [line['text'] for line in lines]
64
- print(texts, "text after microsoft ocr")
65
- extracted_data = extract_panData(texts)
66
- return extracted_data
 
1
+ # from io import BytesIO
2
+ # from ...utils.azureOCR import analyze_image
3
+ # from ...utils.imageUtils import resize_if_needed, all_cropped_images_to_one_image
4
+ # from .panDataExtractor import extract_panData
5
+ # # from collections import defaultdict
6
 
7
+ # def process_results(results, img):
8
+ # label_indices = {"pan_num": 0, "name": 1, "father": 2, "dob": 3}
9
+ # confidence_threshold = 0.3
10
+ # input_image_format = img.format if img.format else "PNG"
11
+ # valid_formats = ["JPEG", "PNG", "BMP", "GIF", "TIFF"]
12
+ # input_image_format = input_image_format if input_image_format in valid_formats else "PNG"
13
 
14
+ # best_crops = {label: (None, -1) for label in label_indices.keys()} # Store best (image, confidence) pairs
15
 
16
+ # precision_data = {label: {"correct": 0, "total": 0} for label in label_indices.keys()}
17
 
18
+ # for result in results:
19
+ # for bbox, cls, conf in zip(result.boxes.xyxy, result.boxes.cls, result.boxes.conf):
20
+ # # Ensure the class index is within the bounds of the label list
21
+ # if int(cls) >= len(label_indices):
22
+ # print(f"Warning: Class index {cls} is out of range. Skipping this bbox.")
23
+ # continue
24
 
25
+ # label = list(label_indices.keys())[int(cls)]
26
+ # print(label, conf)
27
+ # if conf < confidence_threshold:
28
+ # continue
29
 
30
+ # x1, y1, x2, y2 = map(int, bbox.tolist())
31
+ # crop_img = img.crop((x1, y1, x2, y2))
32
+ # crop_img = resize_if_needed(crop_img)
33
+ # crop_img.save(f"temp_{label}.png")
34
 
35
+ # # Replace old crop if new one has higher confidence
36
+ # _, best_conf = best_crops[label]
37
+ # if conf > best_conf:
38
+ # best_crops[label] = (crop_img, conf)
39
+ # precision_data[label]["total"] += 1
40
+ # precision_data[label]["correct"] += 1 # Replace with actual OCR validation check
41
 
42
+ # # Extract the images for final processing
43
+ # cropped_images_with_labels = [(img, label_indices[label], conf) for label, (img, conf) in best_crops.items() if img is not None]
44
 
45
+ # # Sort the images by their label indices in ascending order
46
+ # cropped_images_with_labels.sort(key=lambda x: x[1])
47
+ # print(cropped_images_with_labels, "cropped images with labels")
48
 
49
+ # if not cropped_images_with_labels:
50
+ # raise ValueError("No images were cropped.")
51
 
52
+ # # Extract only the images for concatenation
53
+ # cropped_images = [img for img, _, _ in cropped_images_with_labels]
54
 
55
+ # final_image = all_cropped_images_to_one_image(cropped_images, separator_image_path='app/utils/seprator3.png')
56
+ # buffer = BytesIO()
57
+ # final_image.save(buffer, format=input_image_format)
58
+ # buffer.seek(0)
59
 
60
+ # response = analyze_image(buffer.getvalue(), input_image_format)
61
+ # print(response, "response")
62
+ # lines = response['readResult']['blocks'][0]['lines']
63
+ # texts = [line['text'] for line in lines]
64
+ # print(texts, "text after microsoft ocr")
65
+ # extracted_data = extract_panData(texts)
66
+ # return extracted_data
requirements.txt CHANGED
@@ -3,9 +3,9 @@ Werkzeug==2.0.2
3
  python-dotenv
4
  gunicorn
5
  requests
6
- torch
7
- ultralytics
8
- opencv-python-headless
9
- Pillow
10
- pytesseract
11
- openpyxl
 
3
  python-dotenv
4
  gunicorn
5
  requests
6
+ # torch
7
+ # ultralytics
8
+ # opencv-python-headless
9
+ # Pillow
10
+ # pytesseract
11
+ # openpyxl