Spaces:

NLPV
/

PDF_Xtractor

Runtime error

App Files Community

NLPV committed on Apr 10

Commit

6a5acd4

verified ·

1 Parent(s): e21cf09

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -12

app.py CHANGED Viewed

@@ -1,17 +1,19 @@
-import pytesseract
-from PIL import Image
-import re
 import gradio as gr
-# OCR & Parsing Function
 def extract_info_from_image(image):
-    text = pytesseract.image_to_string(image, lang='hin+eng')
     def extract_data(text):
         data = {}
         data['क्रमांक'] = re.search(r'क्रमांक\s*[:\-]?\s*JS\s*(\d+)', text)
         data['पूरा नाम'] = re.search(r'पूरा नाम\s*[:\-]?\s*(.+)', text)
-        data['लिंग'] = 'पुरुष' if '[x]' in text.split('पुरुष')[0][-3:] else 'महिला' if '[x]' in text.split('महिला')[0][-3:] else 'अन्य'
         data['उम्र'] = re.search(r'उम्र\s*[:\-]?\s*(\d+)', text)
         data['गांव'] = re.search(r'गांव\s*[:\-]?\s*(.+)', text)
         data['पंचायत'] = re.search(r'पंचायत\s*[:\-]?\s*(.+)', text)
@@ -48,16 +50,14 @@ def extract_info_from_image(image):
                 data[k] = v.group(1).strip()
         return data
-    extracted_info = extract_data(text)
-    return extracted_info
-# Gradio Interface
 iface = gr.Interface(
     fn=extract_info_from_image,
     inputs=gr.Image(type="pil"),
     outputs="json",
-    title="जन सुराज सदस्यता फॉर्म डेटा एक्सट्रैक्टर",
-    description="Upload a scanned जन सुराज सदस्यता form and extract the information automatically using OCR."
 )
 if __name__ == "__main__":

 import gradio as gr
+import easyocr
+import re
+# Load OCR Reader (supports Hindi and English)
+reader = easyocr.Reader(['hi', 'en'], gpu=False)
 def extract_info_from_image(image):
+    result = reader.readtext(image, detail=0)
+    text = '\n'.join(result)
     def extract_data(text):
         data = {}
         data['क्रमांक'] = re.search(r'क्रमांक\s*[:\-]?\s*JS\s*(\d+)', text)
         data['पूरा नाम'] = re.search(r'पूरा नाम\s*[:\-]?\s*(.+)', text)
+        data['लिंग'] = 'पुरुष' if 'पुरुष' in text and '[x]' in text.split('पुरुष')[0][-3:] else 'महिला' if 'महिला' in text and '[x]' in text.split('महिला')[0][-3:] else 'अन्य'
         data['उम्र'] = re.search(r'उम्र\s*[:\-]?\s*(\d+)', text)
         data['गांव'] = re.search(r'गांव\s*[:\-]?\s*(.+)', text)
         data['पंचायत'] = re.search(r'पंचायत\s*[:\-]?\s*(.+)', text)
                 data[k] = v.group(1).strip()
         return data
+    return extract_data(text)
 iface = gr.Interface(
     fn=extract_info_from_image,
     inputs=gr.Image(type="pil"),
     outputs="json",
+    title="जन सुराज सदस्यता OCR Extractor",
+    description="Upload a scanned जन सुराज form to extract structured data using OCR."
 )
 if __name__ == "__main__":