"""Streamlit front end that sends an uploaded image to a Hugging Face
vision model and renders the text the model returns as Markdown."""

import base64
import io
import os

import requests
import streamlit as st
from PIL import Image

# Page configuration
st.set_page_config(
    page_title="Vision OCR",
    page_icon="🔎",
    layout="wide",
    initial_sidebar_state="expanded",
)


def _load_api_key():
    """Return the Hugging Face API key, or "" when none is configured.

    The HF_API_KEY environment variable takes precedence; Streamlit
    secrets are only consulted when the variable is unset or empty.
    """
    key = os.environ.get("HF_API_KEY", "")
    if key:
        return key
    try:
        return st.secrets.get("HF_API_KEY", "")
    except FileNotFoundError:
        # NOTE(review): Streamlit raises when no secrets file exists; this
        # assumes the error is FileNotFoundError or a subclass - confirm
        # for the pinned Streamlit version. Returning "" lets the UI show
        # its own "API key is missing" message instead of crashing.
        return ""


# Set up Hugging Face API
HF_API_KEY = _load_api_key()

# Instruction sent to the model. The generic variant keeps the line break
# that the original literal had between its two sentences groups.
_PROMPT_HEAD = (
    "Analyze the text in the provided image. Extract all readable content "
    "and present it in a structured Markdown format that is clear, concise, "
    "and well-organized. "
)
_PROMPT_TAIL = (
    "Ensure proper formatting (e.g., headings, lists, or code blocks) as "
    "necessary to represent the content effectively."
)
_LLAVA_PROMPT = _PROMPT_HEAD + _PROMPT_TAIL
_GENERIC_PROMPT = _PROMPT_HEAD + "\n" + _PROMPT_TAIL


def _build_payload(model_id, image_b64):
    """Build the JSON request body for the given model id."""
    if "llava" in model_id.lower():
        return {
            "inputs": {"image": image_b64, "prompt": _LLAVA_PROMPT},
            "parameters": {"max_new_tokens": 1024},
        }
    # Generic payload format for other models
    return {"inputs": {"image": image_b64, "text": _GENERIC_PROMPT}}


def _extract_generated_text(response_json):
    """Pull the generated text out of a decoded API response.

    Accepts either a dict or a list whose first item is a dict, and looks
    for a "generated_text" key, then a "text" key. Any other shape
    (including an empty list) falls back to ``str(response_json)`` rather
    than raising.
    """
    candidate = response_json
    if isinstance(candidate, list) and candidate:
        candidate = candidate[0]
    if isinstance(candidate, dict):
        for key in ("generated_text", "text"):
            if key in candidate:
                return candidate[key]
    # Fallback
    return str(response_json)


# Hugging Face API function
def process_image_with_hf(image_bytes, model_id, timeout=120):
    """Send an image to the Hugging Face inference endpoint for a model.

    Args:
        image_bytes: Raw bytes of the image; sent base64-encoded.
        model_id: Hugging Face model id, appended to the endpoint URL.
        timeout: Seconds passed to ``requests.post``. Without a timeout
            the request could block indefinitely on a stalled server.

    Returns:
        The generated text from the response, or the stringified response
        when no known text field is present.

    Raises:
        RuntimeError: If the API answers with a status code other than 200.
        requests.exceptions.RequestException: On connection errors or
            when the timeout elapses.
    """
    api_url = f"https://api-inference.huggingface.co/models/{model_id}"
    headers = {"Authorization": f"Bearer {HF_API_KEY}"}

    # Convert image to base64
    image_b64 = base64.b64encode(image_bytes).decode("utf-8")
    payload = _build_payload(model_id, image_b64)

    # Make API request
    response = requests.post(
        api_url, headers=headers, json=payload, timeout=timeout
    )
    if response.status_code != 200:
        raise RuntimeError(
            f"API request failed with status code {response.status_code}: "
            f"{response.text}"
        )

    # Handle different response formats
    return _extract_generated_text(response.json())


# Title and description in main area
try:
    # Try to load the image from assets folder; the context manager makes
    # sure the file handle is closed.
    with open("./assets/gemma3.png", "rb") as logo_file:
        logo_b64 = base64.b64encode(logo_file.read()).decode()
    # NOTE(review): the template has no "{}" placeholder, so the encoded
    # logo is not actually rendered - presumably an <img> tag was lost
    # from this string; confirm and restore it.
    st.markdown(""" # Vision OCR """.format(logo_b64), unsafe_allow_html=True)
except FileNotFoundError:
    # Fallback if image doesn't exist
    st.title("Vision OCR")

# Add clear button to top right
col1, col2 = st.columns([6, 1])
with col2:
    if st.button("Clear 🗑️"):
        st.session_state.pop('ocr_result', None)
        st.rerun()

st.markdown(
    "\n\nExtract structured text from images using advanced vision models!\n\n",
    unsafe_allow_html=True,
)
st.markdown("---")

# Add model selection
with st.sidebar:
    st.header("Settings")
    model_option = st.selectbox(
        "Select Vision Model",
        ["LLaVA-1.5-7B", "MiniGPT-4", "Idefics"],
        index=0,
    )

    # Maps the display name chosen above to its Hugging Face model id.
    model_mapping = {
        "LLaVA-1.5-7B": "llava-hf/llava-1.5-7b-hf",
        "MiniGPT-4": "Vision-CAIR/MiniGPT-4",
        "Idefics": "HuggingFaceM4/idefics-9b-instruct",
    }
    selected_model = model_mapping[model_option]

    st.header("Upload Image")
    uploaded_file = st.file_uploader(
        "Choose an image...", type=['png', 'jpg', 'jpeg']
    )

    if uploaded_file is not None:
        # Display the uploaded image
        image = Image.open(uploaded_file)
        st.image(image, caption="Uploaded Image")

        # Check if API key is available
        if not HF_API_KEY:
            st.error(
                "Hugging Face API key is missing. Please set it as an "
                "environment variable or in Streamlit secrets."
            )
        elif st.button("Extract Text 🔍", type="primary"):
            with st.spinner(f"Processing image with {model_option}..."):
                try:
                    # Get image bytes
                    img_bytes = uploaded_file.getvalue()
                    # Process with Hugging Face API using selected model
                    result = process_image_with_hf(img_bytes, selected_model)
                    st.session_state['ocr_result'] = result
                except Exception as e:
                    # Top-level UI boundary: report any failure to the
                    # user instead of crashing the app.
                    st.error(f"Error processing image: {e}")
                    st.info("Try selecting a different model from the dropdown.")

# Main content area for results
if 'ocr_result' in st.session_state:
    st.markdown(st.session_state['ocr_result'])
else:
    st.info("Upload an image and click 'Extract Text' to see the results here.")

# Footer
st.markdown("---")
st.markdown("Made with ❤️ using Hugging Face Vision Models | [Report an Issue](https://github.com/bulentsoykan/streamlit-OCR-app/issues)")