Update app.py
Browse files
app.py
CHANGED
@@ -45,22 +45,38 @@ def extract_text(image):
|
|
45 |
extracted_text.append(line[1][0])
|
46 |
return "\n".join(extracted_text)
|
47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
# Function to extract attributes using regex
|
49 |
def extract_attributes(extracted_text):
|
50 |
attributes = {}
|
51 |
|
|
|
|
|
|
|
|
|
|
|
52 |
# Patterns for extracting personal information
|
53 |
patterns = {
|
54 |
-
"Name": r"Name[:\-]?\s*([A-Za-z\s]
|
55 |
"Age": r"Age[:\-]?\s*(\d{1,3})",
|
56 |
"Gender": r"Gender[:\-]?\s*(Male|Female|Other)",
|
57 |
"Phone Number": r"(?:(?:Phone Number)|Phone|Mobile|Phonenumber)[:\-]?\s*(?:\+91)?([6-9]\d{9})"
|
58 |
}
|
59 |
|
60 |
for readable_attr, pattern in patterns.items():
|
61 |
-
match = re.search(pattern,
|
62 |
if match:
|
63 |
attributes[readable_attr] = match.group(1).strip()
|
|
|
|
|
|
|
64 |
|
65 |
if "Gender" in attributes:
|
66 |
attributes["Gender"] = GENDER_MAPPING.get(attributes["Gender"], attributes["Gender"])
|
|
|
45 |
extracted_text.append(line[1][0])
|
46 |
return "\n".join(extracted_text)
|
47 |
|
48 |
+
# Function to clean extracted text
|
49 |
+
def clean_extracted_text(text):
    """Return *text* flattened onto one line with single-space separators.

    Every run of whitespace -- spaces, tabs, and any newline flavour
    (``\\r\\n``, ``\\r``, ``\\n``) -- is collapsed to one space, and
    leading/trailing whitespace is removed. The result therefore never
    contains a newline character.
    """
    # str.split() with no separator splits on runs of whitespace and drops
    # empty leading/trailing pieces, so re-joining with one space performs
    # the newline normalisation, the strip, and the collapse in one pass.
    words = text.split()
    return " ".join(words)
|
55 |
+
|
56 |
# Function to extract attributes using regex
|
57 |
def extract_attributes(extracted_text):
|
58 |
attributes = {}
|
59 |
|
60 |
+
# Clean the extracted text
|
61 |
+
cleaned_text = clean_extracted_text(extracted_text)
|
62 |
+
print(f"Raw extracted text: '{extracted_text}'")
|
63 |
+
print(f"Cleaned extracted text: '{cleaned_text}'")
|
64 |
+
|
65 |
# Patterns for extracting personal information
|
66 |
patterns = {
|
67 |
+
"Name": r"Name[:\-]?\s*([A-Za-z\s]+)(?=\s*(?:Age|Gender|Phone Number|Phone|Mobile|$|\n|\r\n|\Z))",
|
68 |
"Age": r"Age[:\-]?\s*(\d{1,3})",
|
69 |
"Gender": r"Gender[:\-]?\s*(Male|Female|Other)",
|
70 |
"Phone Number": r"(?:(?:Phone Number)|Phone|Mobile|Phonenumber)[:\-]?\s*(?:\+91)?([6-9]\d{9})"
|
71 |
}
|
72 |
|
73 |
for readable_attr, pattern in patterns.items():
|
74 |
+
match = re.search(pattern, cleaned_text, re.IGNORECASE)
|
75 |
if match:
|
76 |
attributes[readable_attr] = match.group(1).strip()
|
77 |
+
print(f"Extracted {readable_attr}: '{attributes[readable_attr]}'")
|
78 |
+
else:
|
79 |
+
print(f"No match for {readable_attr} with pattern: {pattern}")
|
80 |
|
81 |
if "Gender" in attributes:
|
82 |
attributes["Gender"] = GENDER_MAPPING.get(attributes["Gender"], attributes["Gender"])
|