Spaces:

DeepDiveDev
/

OCR

Runtime error

App Files Files Community

DeepDiveDev committed on Sep 30, 2024

Commit

8976d30

verified ·

1 Parent(s): 17130f7

Update app.py

Browse files

Files changed (1) hide show

app.py +4 -8

app.py CHANGED Viewed

@@ -34,7 +34,6 @@ translation_model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-hi-en')
 # Define a function for keyword highlighting
 def highlight_keywords(text, keyword):
-    # Escape keyword for regex to avoid issues with special characters
     pattern = re.compile(re.escape(keyword), re.IGNORECASE)
     highlighted_text = pattern.sub(lambda match: f"**{match.group(0)}**", text)
     return highlighted_text
@@ -59,11 +58,11 @@ if image_file is not None:
     if st.button("Run OCR"):
         # Use GOT-OCR2 model for plain text OCR (structured documents)
         with torch.no_grad():
-            res_plain = model.chat(tokenizer, temp_file_path, ocr_type='ocr')  # Removed device parameter
         # Perform formatted text OCR
         with torch.no_grad():
-            res_format = model.chat(tokenizer, temp_file_path, ocr_type='format')  # Removed device parameter
         # Use EasyOCR for both English and Hindi text recognition
         result_easyocr = reader.readtext(temp_file_path, detail=0)
@@ -94,13 +93,13 @@ if image_file is not None:
         # Additional OCR types using GOT-OCR2
         with torch.no_grad():
-            res_fine_grained = model.chat(tokenizer, temp_file_path, ocr_type='ocr', ocr_box='')  # Removed device parameter
         st.subheader("Fine-Grained OCR Results:")
         st.write(res_fine_grained)
         # Render formatted OCR to HTML
         with torch.no_grad():
-            res_render = model.chat(tokenizer, temp_file_path, ocr_type='format', render=True, save_render_file='./demo.html')  # Removed device parameter
         st.subheader("Rendered OCR Results (HTML):")
         st.write(res_render)
@@ -109,11 +108,8 @@ if image_file is not None:
         if keyword:
             st.subheader("Search Results:")
-            # Highlight the matching sections in the extracted text
             highlighted_text = highlight_keywords(extracted_text, keyword)
             st.markdown(highlighted_text)
         # Clean up the temporary file after use
         os.remove(temp_file_path)
-# Note: No need for if __name__ == "__main__": st.run()

 # Define a function for keyword highlighting
 def highlight_keywords(text, keyword):
     """Wrap every case-insensitive occurrence of ``keyword`` in ``**``.

     The keyword is matched literally (regex metacharacters are escaped)
     and each match keeps its original casing inside the ``**`` markers.

     Args:
         text: The string to search.
         keyword: The literal substring to highlight.

     Returns:
         ``text`` with each match surrounded by ``**``. When ``keyword``
         is empty, ``text`` is returned unchanged.
     """
     # An empty pattern matches at every position, which would scatter
     # "****" between all characters; treat it as "nothing to highlight".
     if not keyword:
         return text
     pattern = re.compile(re.escape(keyword), re.IGNORECASE)
     return pattern.sub(lambda match: f"**{match.group(0)}**", text)
     if st.button("Run OCR"):
         # Use GOT-OCR2 model for plain text OCR (structured documents)
         with torch.no_grad():
+            res_plain = model.chat(tokenizer, temp_file_path, ocr_type='ocr')
         # Perform formatted text OCR
         with torch.no_grad():
+            res_format = model.chat(tokenizer, temp_file_path, ocr_type='format')
         # Use EasyOCR for both English and Hindi text recognition
         result_easyocr = reader.readtext(temp_file_path, detail=0)
         # Additional OCR types using GOT-OCR2
         with torch.no_grad():
+            res_fine_grained = model.chat(tokenizer, temp_file_path, ocr_type='ocr', ocr_box='')
         st.subheader("Fine-Grained OCR Results:")
         st.write(res_fine_grained)
         # Render formatted OCR to HTML
         with torch.no_grad():
+            res_render = model.chat(tokenizer, temp_file_path, ocr_type='format', render=True, save_render_file='./demo.html')
         st.subheader("Rendered OCR Results (HTML):")
         st.write(res_render)
         if keyword:
             st.subheader("Search Results:")
             highlighted_text = highlight_keywords(extracted_text, keyword)
             st.markdown(highlighted_text)
         # Clean up the temporary file after use
         os.remove(temp_file_path)