import streamlit as st
from PIL import Image
import io
import base64
import requests
import os
# Page configuration: browser-tab title/icon plus a wide layout with the
# sidebar opened on first load.
_PAGE_SETTINGS = dict(
    page_title="Vision OCR",
    page_icon="🔎",
    layout="wide",
    initial_sidebar_state="expanded",
)
st.set_page_config(**_PAGE_SETTINGS)
# Set up Hugging Face API
# The environment variable wins; Streamlit secrets are only consulted when it
# is unset or empty.
HF_API_KEY = os.environ.get("HF_API_KEY", "")
if not HF_API_KEY:
    try:
        HF_API_KEY = st.secrets.get("HF_API_KEY", "")
    except Exception:
        # Streamlit raises (the exact exception type varies by version) when
        # no secrets file exists. Treat that as "no key configured" so the UI
        # can show its own missing-key message instead of crashing on start.
        HF_API_KEY = ""
# Hugging Face API function

# Instruction sent alongside every image, whichever payload shape is used.
_OCR_PROMPT = (
    "Analyze the text in the provided image. Extract all readable content\n"
    "and present it in a structured Markdown format that is clear, concise,\n"
    "and well-organized. Ensure proper formatting (e.g., headings, lists, or\n"
    "code blocks) as necessary to represent the content effectively."
)


def _extract_generated_text(response_json):
    """Return the generated text from a decoded response, else its str() form."""
    # A list response is represented by its first element; an empty list
    # falls through to the string fallback instead of raising IndexError.
    if isinstance(response_json, list) and response_json:
        candidate = response_json[0]
    else:
        candidate = response_json
    if isinstance(candidate, dict):
        for key in ("generated_text", "text"):
            if key in candidate:
                return candidate[key]
    # Fallback: unknown shape, hand back something displayable.
    return str(response_json)


def process_image_with_hf(image_bytes, model_id, timeout=120):
    """Send an image to a Hugging Face inference endpoint and return its text.

    Args:
        image_bytes: Raw bytes of the image; they are base64-encoded before
            being placed in the JSON payload.
        model_id: Model repository id appended to the inference API URL. Ids
            containing "llava" (case-insensitive) get a ``prompt`` field and
            ``max_new_tokens``; all others get a generic ``text`` field.
        timeout: Seconds passed to ``requests.post`` so a stalled request
            cannot block the app forever.

    Returns:
        The ``generated_text`` (or ``text``) value from the response when
        present; otherwise the string form of the decoded response, or the
        raw body when it is not valid JSON.

    Raises:
        RuntimeError: If the API answers with a status code other than 200.
        Exceptions raised by ``requests.post`` itself (connection errors,
        timeouts) are not caught here and propagate to the caller.
    """
    api_url = f"https://api-inference.huggingface.co/models/{model_id}"
    headers = {"Authorization": f"Bearer {HF_API_KEY}"}

    # Convert image to base64 so it can travel inside the JSON body
    image_b64 = base64.b64encode(image_bytes).decode("utf-8")

    # Prepare payload based on model type
    if "llava" in model_id.lower():
        payload = {
            "inputs": {"image": image_b64, "prompt": _OCR_PROMPT},
            "parameters": {"max_new_tokens": 1024},
        }
    else:
        # Generic payload format for other models
        payload = {"inputs": {"image": image_b64, "text": _OCR_PROMPT}}

    # Make API request
    response = requests.post(
        api_url, headers=headers, json=payload, timeout=timeout
    )
    if response.status_code != 200:
        raise RuntimeError(
            f"API request failed with status code {response.status_code}: {response.text}"
        )

    # Handle different response formats
    try:
        response_json = response.json()
    except ValueError:
        # Body was not JSON; return it verbatim rather than failing.
        return response.text
    return _extract_generated_text(response_json)
# Title and description in main area
try:
    # Load the logo from the assets folder; the context manager guarantees
    # the file handle is closed once the bytes are read.
    with open("./assets/gemma3.png", "rb") as logo_file:
        logo_b64 = base64.b64encode(logo_file.read()).decode()
except FileNotFoundError:
    # Fallback if image doesn't exist
    st.title("Vision OCR")
else:
    # Inline the logo as a data URI so the encoded image is actually used in
    # the heading (raw HTML is why unsafe_allow_html is required).
    st.markdown(
        f'# <img src="data:image/png;base64,{logo_b64}" width="50" '
        'style="vertical-align: -12px;"> Vision OCR',
        unsafe_allow_html=True,
    )
# Clear button, placed in the narrow right-hand column of a 6:1 split
col1, col2 = st.columns([6, 1])
with col2:
    clear_requested = st.button("Clear 🗑️")
    if clear_requested:
        # Discard any stored OCR result (nothing happens when there is none),
        # then rerun the script so the page redraws.
        st.session_state.pop('ocr_result', None)
        st.rerun()
st.markdown(
    'Extract structured text from images using advanced vision models!',
    unsafe_allow_html=True,
)
st.markdown("---")

# Sidebar: model selection and image upload
with st.sidebar:
    st.header("Settings")

    # Display name -> Hugging Face model id. The selectbox options are taken
    # from this mapping so the two can never drift apart.
    model_mapping = {
        "LLaVA-1.5-7B": "llava-hf/llava-1.5-7b-hf",
        "MiniGPT-4": "Vision-CAIR/MiniGPT-4",
        "Idefics": "HuggingFaceM4/idefics-9b-instruct",
    }
    model_option = st.selectbox(
        "Select Vision Model",
        list(model_mapping),
        index=0,
    )
    selected_model = model_mapping[model_option]

    st.header("Upload Image")
    uploaded_file = st.file_uploader("Choose an image...", type=['png', 'jpg', 'jpeg'])

    if uploaded_file is not None:
        # Display the uploaded image
        image = Image.open(uploaded_file)
        st.image(image, caption="Uploaded Image")

        # Check if API key is available before offering the extract action
        if not HF_API_KEY:
            st.error("Hugging Face API key is missing. Please set it as an environment variable or in Streamlit secrets.")
        elif st.button("Extract Text 🔍", type="primary"):
            with st.spinner(f"Processing image with {model_option}..."):
                try:
                    # Get image bytes
                    img_bytes = uploaded_file.getvalue()
                    # Process with Hugging Face API using selected model
                    result = process_image_with_hf(img_bytes, selected_model)
                    # Stored in session state so the result survives reruns
                    st.session_state['ocr_result'] = result
                except Exception as e:
                    # UI boundary: report any failure to the user instead of
                    # letting the script crash.
                    st.error(f"Error processing image: {e}")
                    st.info("Try selecting a different model from the dropdown.")

# Main content area for results
if 'ocr_result' in st.session_state:
    st.markdown(st.session_state['ocr_result'])
else:
    st.info("Upload an image and click 'Extract Text' to see the results here.")

# Footer
st.markdown("---")
st.markdown("Made with ❤️ using Hugging Face Vision Models | [Report an Issue](https://github.com/bulentsoykan/streamlit-OCR-app/issues)")