Spaces:
Running
Running
import streamlit as st | |
from PIL import Image | |
import io | |
import base64 | |
import requests | |
import os | |
# Page configuration: gather the settings in one mapping, then apply them
# with a single call.
# NOTE(review): the page_icon value looks like a mis-encoded emoji - confirm
# the intended glyph before changing it.
_PAGE_SETTINGS = {
    "page_title": "Vision OCR",
    "page_icon": "π",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)
# Set up Hugging Face API.
# The HF_API_KEY environment variable takes precedence; Streamlit secrets are
# the fallback. An empty string means "no key configured".
HF_API_KEY = os.environ.get("HF_API_KEY", "")
if not HF_API_KEY:
    try:
        HF_API_KEY = st.secrets.get("HF_API_KEY", "")
    except FileNotFoundError:
        # Reading st.secrets when no secrets file exists raises instead of
        # returning the default. Treat that as "no key" so the script keeps
        # running and can report the missing key itself rather than crashing
        # at start-up.
        HF_API_KEY = ""
# Hugging Face API function

# Instruction sent alongside the image; shared by every payload shape.
_OCR_PROMPT = (
    "Analyze the text in the provided image. Extract all readable content\n"
    "and present it in a structured Markdown format that is clear, concise,\n"
    "and well-organized. Ensure proper formatting (e.g., headings, lists, or\n"
    "code blocks) as necessary to represent the content effectively."
)


def _build_payload(image_bytes, model_id):
    """Return the JSON payload for *model_id* with the image base64-encoded."""
    image_b64 = base64.b64encode(image_bytes).decode("utf-8")
    if "llava" in model_id.lower():
        # LLaVA models take the instruction under "prompt" plus a token cap.
        return {
            "inputs": {"image": image_b64, "prompt": _OCR_PROMPT},
            "parameters": {"max_new_tokens": 1024},
        }
    # Generic payload format for other models.
    return {"inputs": {"image": image_b64, "text": _OCR_PROMPT}}


def _extract_generated_text(response_json):
    """Pull the generated text out of a decoded response, or stringify it."""
    candidate = response_json
    if isinstance(candidate, list) and candidate:
        # List responses carry the result in their first element.
        candidate = candidate[0]
    if isinstance(candidate, dict):
        for key in ("generated_text", "text"):
            if key in candidate:
                return candidate[key]
    # Fallback: unknown shape (including an empty list) - show it verbatim
    # instead of failing with IndexError/KeyError.
    return str(response_json)


def process_image_with_hf(image_bytes, model_id, timeout=120):
    """Send an image to the Hugging Face Inference API and return its text.

    Args:
        image_bytes: Raw bytes of the image to analyse.
        model_id: Model identifier appended to the inference endpoint URL,
            e.g. ``"llava-hf/llava-1.5-7b-hf"``. Identifiers containing
            "llava" (case-insensitive) get the LLaVA payload shape.
        timeout: Seconds to wait for the HTTP response; prevents the call
            from blocking indefinitely on an unresponsive endpoint.

    Returns:
        The value of the response's "generated_text" (or "text") field when
        present, otherwise the whole decoded response rendered with ``str()``.

    Raises:
        RuntimeError: If the API answers with a status code other than 200.
        requests.RequestException: On connection errors or timeout.
    """
    api_url = f"https://api-inference.huggingface.co/models/{model_id}"
    headers = {"Authorization": f"Bearer {HF_API_KEY}"}

    response = requests.post(
        api_url,
        headers=headers,
        json=_build_payload(image_bytes, model_id),
        timeout=timeout,
    )
    if response.status_code != 200:
        raise RuntimeError(
            f"API request failed with status code {response.status_code}: {response.text}"
        )
    return _extract_generated_text(response.json())
# Title and description in main area
try:
    # Try to load the logo from the assets folder. The context manager
    # closes the file handle as soon as the bytes are read; only the file
    # access sits inside the try so unrelated errors are not masked.
    with open("./assets/gemma3.png", "rb") as logo_file:
        logo_b64 = base64.b64encode(logo_file.read()).decode()
except FileNotFoundError:
    # Fallback if image doesn't exist
    st.title("Vision OCR")
else:
    # Inline the logo as a data URI next to the heading text.
    title_template = (
        '\n# <img src="data:image/png;base64,{}" width="50" '
        'style="vertical-align: -12px;"> Vision OCR\n'
    )
    st.markdown(title_template.format(logo_b64), unsafe_allow_html=True)
# Clear button in the narrow right-hand column (6:1 split).
# NOTE(review): the button label looks like a mis-encoded emoji - confirm
# the intended glyph before changing it.
_spacer_column, clear_column = st.columns([6, 1])
with clear_column:
    clear_clicked = st.button("Clear ποΈ")
    if clear_clicked:
        # Drop any stored result, then rerun the script so the page
        # redraws without it.
        if 'ocr_result' in st.session_state:
            del st.session_state['ocr_result']
        st.rerun()

# Tagline pulled up under the title, followed by a divider.
tagline_html = '<p style="margin-top: -20px;">Extract structured text from images using advanced vision models!</p>'
st.markdown(tagline_html, unsafe_allow_html=True)
st.markdown("---")
# Sidebar: model selection, image upload, and the extraction trigger
with st.sidebar:
    st.header("Settings")

    # Display name -> Hugging Face model id. The dict's insertion order is
    # also the order of the options shown in the select box.
    model_mapping = {
        "LLaVA-1.5-7B": "llava-hf/llava-1.5-7b-hf",
        "MiniGPT-4": "Vision-CAIR/MiniGPT-4",
        "Idefics": "HuggingFaceM4/idefics-9b-instruct",
    }
    model_option = st.selectbox(
        "Select Vision Model",
        list(model_mapping),
        index=0,
    )
    selected_model = model_mapping[model_option]

    st.header("Upload Image")
    uploaded_file = st.file_uploader("Choose an image...", type=['png', 'jpg', 'jpeg'])

    if uploaded_file is not None:
        # Preview the uploaded image
        st.image(Image.open(uploaded_file), caption="Uploaded Image")

        if not HF_API_KEY:
            # Without a key the request cannot be authorised, so the
            # extract button is not offered at all.
            st.error("Hugging Face API key is missing. Please set it as an environment variable or in Streamlit secrets.")
        # NOTE(review): the button label looks like a mis-encoded emoji -
        # confirm the intended glyph before changing it.
        elif st.button("Extract Text π", type="primary"):
            with st.spinner(f"Processing image with {model_option}..."):
                try:
                    # Send the raw upload bytes to the selected model and
                    # keep the answer in session state for the main area.
                    st.session_state['ocr_result'] = process_image_with_hf(
                        uploaded_file.getvalue(), selected_model
                    )
                except Exception as exc:
                    # UI boundary: report any failure instead of crashing.
                    st.error(f"Error processing image: {exc}")
                    st.info("Try selecting a different model from the dropdown.")
# Main content area: show a hint until a result has been stored, then render
# the stored result as Markdown.
if 'ocr_result' not in st.session_state:
    st.info("Upload an image and click 'Extract Text' to see the results here.")
else:
    st.markdown(st.session_state['ocr_result'])
# Footer: divider followed by the credit line and issue-tracker link.
st.markdown("---")
# The heart emoji was stored mis-decoded (shown as Greek letters and a euro
# sign); the literal below restores the intended character.
st.markdown("Made with ❤️ using Hugging Face Vision Models | [Report an Issue](https://github.com/bulentsoykan/streamlit-OCR-app/issues)")