bulentsoykan commited on
Commit
00821bd
·
verified ·
1 Parent(s): 6a1b293

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -9
app.py CHANGED
@@ -7,7 +7,7 @@ import os
7
 
8
  # Page configuration
9
  st.set_page_config(
10
- page_title="Gemma-3 OCR",
11
  page_icon="πŸ”Ž",
12
  layout="wide",
13
  initial_sidebar_state="expanded"
@@ -20,7 +20,8 @@ if not HF_API_KEY:
20
 
21
  # Hugging Face API function
22
  def process_image_with_hf(image_bytes):
23
- API_URL = "https://api-inference.huggingface.co/models/google/gemma-3-vision"
 
24
  headers = {"Authorization": f"Bearer {HF_API_KEY}"}
25
 
26
  # Convert image to base64
@@ -43,17 +44,23 @@ def process_image_with_hf(image_bytes):
43
  if response.status_code != 200:
44
  raise Exception(f"API request failed with status code {response.status_code}: {response.text}")
45
 
46
- return response.json()[0]["generated_text"]
 
 
 
 
 
 
47
 
48
  # Title and description in main area
49
  try:
50
  # Try to load the image from assets folder
51
  st.markdown("""
52
- # <img src="data:image/png;base64,{}" width="50" style="vertical-align: -12px;"> Gemma-3 OCR
53
  """.format(base64.b64encode(open("./assets/gemma3.png", "rb").read()).decode()), unsafe_allow_html=True)
54
  except FileNotFoundError:
55
  # Fallback if image doesn't exist
56
- st.title("Gemma-3 OCR")
57
 
58
  # Add clear button to top right
59
  col1, col2 = st.columns([6,1])
@@ -63,11 +70,27 @@ with col2:
63
  del st.session_state['ocr_result']
64
  st.rerun()
65
 
66
- st.markdown('<p style="margin-top: -20px;">Extract structured text from images using Gemma-3 Vision!</p>', unsafe_allow_html=True)
67
  st.markdown("---")
68
 
69
- # Move upload controls to sidebar
70
  with st.sidebar:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  st.header("Upload Image")
72
  uploaded_file = st.file_uploader("Choose an image...", type=['png', 'jpg', 'jpeg'])
73
 
@@ -81,8 +104,11 @@ with st.sidebar:
81
  st.error("Hugging Face API key is missing. Please set it as an environment variable or in Streamlit secrets.")
82
  else:
83
  if st.button("Extract Text πŸ”", type="primary"):
84
- with st.spinner("Processing image..."):
85
  try:
 
 
 
86
  # Get image bytes
87
  img_bytes = uploaded_file.getvalue()
88
 
@@ -91,6 +117,7 @@ with st.sidebar:
91
  st.session_state['ocr_result'] = result
92
  except Exception as e:
93
  st.error(f"Error processing image: {str(e)}")
 
94
 
95
  # Main content area for results
96
  if 'ocr_result' in st.session_state:
@@ -100,4 +127,4 @@ else:
100
 
101
  # Footer
102
  st.markdown("---")
103
- st.markdown("Made with using Gemma-3 Vision Model | [Report an Issue](https://github.com/bulentsoykan/streamlit-OCR-app/issues)")
 
7
 
8
  # Page configuration
9
  st.set_page_config(
10
+ page_title="Vision OCR",
11
  page_icon="πŸ”Ž",
12
  layout="wide",
13
  initial_sidebar_state="expanded"
 
20
 
21
  # Hugging Face API function
22
  def process_image_with_hf(image_bytes):
23
+ # Use an available multimodal model that can handle images and text
24
+ API_URL = "https://api-inference.huggingface.co/models/llava-hf/llava-1.5-7b-hf"
25
  headers = {"Authorization": f"Bearer {HF_API_KEY}"}
26
 
27
  # Convert image to base64
 
44
  if response.status_code != 200:
45
  raise Exception(f"API request failed with status code {response.status_code}: {response.text}")
46
 
47
+ # Handle different response formats
48
+ if isinstance(response.json(), list):
49
+ return response.json()[0]["generated_text"]
50
+ elif isinstance(response.json(), dict) and "generated_text" in response.json():
51
+ return response.json()["generated_text"]
52
+ else:
53
+ return str(response.json())
54
 
55
  # Title and description in main area
56
  try:
57
  # Try to load the image from assets folder
58
  st.markdown("""
59
+ # <img src="data:image/png;base64,{}" width="50" style="vertical-align: -12px;"> Vision OCR
60
  """.format(base64.b64encode(open("./assets/gemma3.png", "rb").read()).decode()), unsafe_allow_html=True)
61
  except FileNotFoundError:
62
  # Fallback if image doesn't exist
63
+ st.title("Vision OCR")
64
 
65
  # Add clear button to top right
66
  col1, col2 = st.columns([6,1])
 
70
  del st.session_state['ocr_result']
71
  st.rerun()
72
 
73
+ st.markdown('<p style="margin-top: -20px;">Extract structured text from images using advanced vision models!</p>', unsafe_allow_html=True)
74
  st.markdown("---")
75
 
76
+ # Add model selection
77
  with st.sidebar:
78
+ st.header("Settings")
79
+ model_option = st.selectbox(
80
+ "Select Vision Model",
81
+ ["LLaVA 1.5 (7B)", "CLIP-ViT", "BLIP-2"],
82
+ index=0
83
+ )
84
+
85
+ # Map selection to model ID
86
+ model_mapping = {
87
+ "LLaVA 1.5 (7B)": "llava-hf/llava-1.5-7b-hf",
88
+ "CLIP-ViT": "openai/clip-vit-base-patch32",
89
+ "BLIP-2": "Salesforce/blip2-opt-2.7b"
90
+ }
91
+
92
+ selected_model = model_mapping[model_option]
93
+
94
  st.header("Upload Image")
95
  uploaded_file = st.file_uploader("Choose an image...", type=['png', 'jpg', 'jpeg'])
96
 
 
104
  st.error("Hugging Face API key is missing. Please set it as an environment variable or in Streamlit secrets.")
105
  else:
106
  if st.button("Extract Text πŸ”", type="primary"):
107
+ with st.spinner(f"Processing image with {model_option}..."):
108
  try:
109
+ # Update the model URL
110
+ API_URL = f"https://api-inference.huggingface.co/models/{selected_model}"
111
+
112
  # Get image bytes
113
  img_bytes = uploaded_file.getvalue()
114
 
 
117
  st.session_state['ocr_result'] = result
118
  except Exception as e:
119
  st.error(f"Error processing image: {str(e)}")
120
+ st.info("Try selecting a different model from the dropdown.")
121
 
122
  # Main content area for results
123
  if 'ocr_result' in st.session_state:
 
127
 
128
  # Footer
129
  st.markdown("---")
130
+ st.markdown("Made with ❀️ using Hugging Face Vision Models | [Report an Issue](https://github.com/bulentsoykan/streamlit-OCR-app/issues)")