NLPV committed on
Commit
6a5acd4
·
verified ·
1 Parent(s): e21cf09

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -12
app.py CHANGED
@@ -1,17 +1,19 @@
1
- import pytesseract
2
- from PIL import Image
3
- import re
4
  import gradio as gr
 
 
 
 
 
5
 
6
- # OCR & Parsing Function
7
  def extract_info_from_image(image):
8
- text = pytesseract.image_to_string(image, lang='hin+eng')
9
-
 
10
  def extract_data(text):
11
  data = {}
12
  data['क्रमांक'] = re.search(r'क्रमांक\s*[:\-]?\s*JS\s*(\d+)', text)
13
  data['पूरा नाम'] = re.search(r'पूरा नाम\s*[:\-]?\s*(.+)', text)
14
- data['लिंग'] = 'पुरुष' if '[x]' in text.split('पुरुष')[0][-3:] else 'महिला' if '[x]' in text.split('महिला')[0][-3:] else 'अन्य'
15
  data['उम्र'] = re.search(r'उम्र\s*[:\-]?\s*(\d+)', text)
16
  data['गांव'] = re.search(r'गांव\s*[:\-]?\s*(.+)', text)
17
  data['पंचायत'] = re.search(r'पंचायत\s*[:\-]?\s*(.+)', text)
@@ -48,16 +50,14 @@ def extract_info_from_image(image):
48
  data[k] = v.group(1).strip()
49
  return data
50
 
51
- extracted_info = extract_data(text)
52
- return extracted_info
53
 
54
- # Gradio Interface
55
  iface = gr.Interface(
56
  fn=extract_info_from_image,
57
  inputs=gr.Image(type="pil"),
58
  outputs="json",
59
- title="जन सुराज सदस्यता फॉर्म डेटा एक्सट्रैक्टर",
60
- description="Upload a scanned जन सुराज सदस्यता form and extract the information automatically using OCR."
61
  )
62
 
63
  if __name__ == "__main__":
 
 
 
 
1
  import gradio as gr
2
+ import easyocr
3
+ import re
4
+
5
+ # Load OCR Reader (supports Hindi and English)
6
+ reader = easyocr.Reader(['hi', 'en'], gpu=False)
7
 
 
8
  def extract_info_from_image(image):
9
+ result = reader.readtext(image, detail=0)
10
+ text = '\n'.join(result)
11
+
12
  def extract_data(text):
13
  data = {}
14
  data['क्रमांक'] = re.search(r'क्रमांक\s*[:\-]?\s*JS\s*(\d+)', text)
15
  data['पूरा नाम'] = re.search(r'पूरा नाम\s*[:\-]?\s*(.+)', text)
16
+ data['लिंग'] = 'पुरुष' if 'पुरुष' in text and '[x]' in text.split('पुरुष')[0][-3:] else 'महिला' if 'महिला' in text and '[x]' in text.split('महिला')[0][-3:] else 'अन्य'
17
  data['उम्र'] = re.search(r'उम्र\s*[:\-]?\s*(\d+)', text)
18
  data['गांव'] = re.search(r'गांव\s*[:\-]?\s*(.+)', text)
19
  data['पंचायत'] = re.search(r'पंचायत\s*[:\-]?\s*(.+)', text)
 
50
  data[k] = v.group(1).strip()
51
  return data
52
 
53
+ return extract_data(text)
 
54
 
 
55
  iface = gr.Interface(
56
  fn=extract_info_from_image,
57
  inputs=gr.Image(type="pil"),
58
  outputs="json",
59
+ title="जन सुराज सदस्यता OCR Extractor",
60
+ description="Upload a scanned जन सुराज form to extract structured data using OCR."
61
  )
62
 
63
  if __name__ == "__main__":