Pranay25 committed on
Commit
199272d
·
verified ·
1 Parent(s): 847cfef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -2
app.py CHANGED
@@ -45,22 +45,38 @@ def extract_text(image):
45
  extracted_text.append(line[1][0])
46
  return "\n".join(extracted_text)
47
 
 
 
 
 
 
 
 
 
48
  # Function to extract attributes using regex
49
  def extract_attributes(extracted_text):
50
  attributes = {}
51
 
 
 
 
 
 
52
  # Patterns for extracting personal information
53
  patterns = {
54
- "Name": r"Name[:\-]?\s*([A-Za-z\s]+?)(?=\s*(?:Age|Gender|Phone Number|Phone|Mobile|$|\n))",
55
  "Age": r"Age[:\-]?\s*(\d{1,3})",
56
  "Gender": r"Gender[:\-]?\s*(Male|Female|Other)",
57
  "Phone Number": r"(?:(?:Phone Number)|Phone|Mobile|Phonenumber)[:\-]?\s*(?:\+91)?([6-9]\d{9})"
58
  }
59
 
60
  for readable_attr, pattern in patterns.items():
61
- match = re.search(pattern, extracted_text, re.IGNORECASE)
62
  if match:
63
  attributes[readable_attr] = match.group(1).strip()
 
 
 
64
 
65
  if "Gender" in attributes:
66
  attributes["Gender"] = GENDER_MAPPING.get(attributes["Gender"], attributes["Gender"])
 
45
  extracted_text.append(line[1][0])
46
  return "\n".join(extracted_text)
47
 
48
+ # Function to clean extracted text
49
+ def clean_extracted_text(text):
50
+ # Replace carriage returns and normalize newlines
51
+ text = text.replace('\r\n', '\n').replace('\r', '\n')
52
+ # Strip leading/trailing whitespace and normalize multiple spaces
53
+ text = re.sub(r'\s+', ' ', text.strip())
54
+ return text
55
+
56
  # Function to extract attributes using regex
57
  def extract_attributes(extracted_text):
58
  attributes = {}
59
 
60
+ # Clean the extracted text
61
+ cleaned_text = clean_extracted_text(extracted_text)
62
+ print(f"Raw extracted text: '{extracted_text}'")
63
+ print(f"Cleaned extracted text: '{cleaned_text}'")
64
+
65
  # Patterns for extracting personal information
66
  patterns = {
67
+ "Name": r"Name[:\-]?\s*([A-Za-z\s]+)(?=\s*(?:Age|Gender|Phone Number|Phone|Mobile|$|\n|\r\n|\Z))",
68
  "Age": r"Age[:\-]?\s*(\d{1,3})",
69
  "Gender": r"Gender[:\-]?\s*(Male|Female|Other)",
70
  "Phone Number": r"(?:(?:Phone Number)|Phone|Mobile|Phonenumber)[:\-]?\s*(?:\+91)?([6-9]\d{9})"
71
  }
72
 
73
  for readable_attr, pattern in patterns.items():
74
+ match = re.search(pattern, cleaned_text, re.IGNORECASE)
75
  if match:
76
  attributes[readable_attr] = match.group(1).strip()
77
+ print(f"Extracted {readable_attr}: '{attributes[readable_attr]}'")
78
+ else:
79
+ print(f"No match for {readable_attr} with pattern: {pattern}")
80
 
81
  if "Gender" in attributes:
82
  attributes["Gender"] = GENDER_MAPPING.get(attributes["Gender"], attributes["Gender"])