Pranay25 committed on
Commit
b85d243
·
verified ·
1 Parent(s): 543b092

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -7
app.py CHANGED
@@ -49,9 +49,10 @@ def extract_text(image):
49
  def clean_extracted_text(text):
50
  # Replace carriage returns and normalize newlines
51
  text = text.replace('\r\n', '\n').replace('\r', '\n')
52
- # Strip leading/trailing whitespace and normalize multiple spaces
53
- text = re.sub(r'\s+', ' ', text.strip())
54
- return text
 
55
 
56
  # Function to extract attributes using regex
57
  def extract_attributes(extracted_text):
@@ -64,10 +65,10 @@ def extract_attributes(extracted_text):
64
 
65
  # Patterns for extracting personal information
66
  patterns = {
67
- "Name": r"Name\s*[:\-]?\s*([\w\s\-\.\']+)(?=\s*(?:Age|Gender|Phone Number|Phone|Mobile|$|\n|\r\n|\Z))",
68
- "Age": r"Age[:\-]?\s*(\d{1,3})",
69
- "Gender": r"Gender[:\-]?\s*(Male|Female|Other)",
70
- "Phone Number": r"(?:(?:Phone Number)|Phone|Mobile|Phonenumber)[:\-]?\s*(?:\+91)?([6-9]\d{9})"
71
  }
72
 
73
  for readable_attr, pattern in patterns.items():
 
49
  def clean_extracted_text(text):
50
  # Replace carriage returns and normalize newlines
51
  text = text.replace('\r\n', '\n').replace('\r', '\n')
52
+ # Split into lines, clean each line, then join back
53
+ lines = text.split('\n')
54
+ cleaned_lines = [re.sub(r'\s+', ' ', line.strip()) for line in lines]
55
+ return '\n'.join(cleaned_lines)
56
 
57
  # Function to extract attributes using regex
58
  def extract_attributes(extracted_text):
 
65
 
66
  # Patterns for extracting personal information
67
  patterns = {
68
+ "Name": r"Name\s*[:\-]?\s*([\w\s\-\.\',]+)(?=\s*(?:Age|Gender|Phone Number|Phone|Mobile|$|\n|\r\n|\Z))",
69
+ "Age": r"Age\s*[:\-]?\s*(\d{1,3})(?=\s*(?:Gender|Phone Number|Phone|Mobile|$|\n|\r\n|\Z))",
70
+ "Gender": r"Gender\s*[:\-]?\s*(Male|Female|Other)(?=\s*(?:Phone Number|Phone|Mobile|$|\n|\r\n|\Z))",
71
+ "Phone Number": r"(?:(?:Phone Number)|Phone|Mobile|Phonenumber)\s*[:\-]?\s*(?:\+91)?([6-9]\d{9})(?=\s*(?:$|\n|\r\n|\Z))"
72
  }
73
 
74
  for readable_attr, pattern in patterns.items():