import streamlit as st
from PIL import Image
import io
import base64
import requests
import os

# Page-level settings: browser tab title/icon, wide layout, sidebar open on load
page_settings = {
    "page_title": "Vision OCR",
    "page_icon": "🔎",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**page_settings)

# Set up Hugging Face API
# The HF_API_KEY environment variable takes precedence; Streamlit secrets are
# only consulted when the environment variable is unset or empty.
HF_API_KEY = os.environ.get("HF_API_KEY", "")
if not HF_API_KEY:
    try:
        HF_API_KEY = st.secrets.get("HF_API_KEY", "")
    except Exception:
        # Accessing st.secrets raises when no secrets.toml file exists, and
        # the exception class differs between Streamlit releases, hence the
        # broad catch. Leaving the key empty lets the UI show its own
        # "API key is missing" message instead of crashing at import time.
        HF_API_KEY = ""

# Hugging Face API function
# Instruction sent to the model. The continuation-line indentation and the
# trailing space after "concise," reproduce the original prompt text exactly.
_PROMPT_INDENT = " " * 24
_OCR_PROMPT = (
    "Analyze the text in the provided image. Extract all readable content\n"
    + _PROMPT_INDENT
    + "and present it in a structured Markdown format that is clear, concise, \n"
    + _PROMPT_INDENT
    + "and well-organized. Ensure proper formatting (e.g., headings, lists, or\n"
    + _PROMPT_INDENT
    + "code blocks) as necessary to represent the content effectively."
)


def _extract_generated_text(response_json):
    """Pull the generated text out of a decoded API response, or stringify it."""
    candidate = response_json
    if isinstance(response_json, list) and response_json:
        # List-shaped responses carry the result in their first element
        candidate = response_json[0]

    if isinstance(candidate, dict):
        for key in ("generated_text", "text"):
            if key in candidate:
                return candidate[key]

    # Fallback: unknown shape (including an empty list) -> show the raw payload
    return str(response_json)


def process_image_with_hf(image_bytes, model_id, timeout=120):
    """Send an image to a Hugging Face Inference API model and return its output.

    Args:
        image_bytes: Raw image bytes; they are base64-encoded before sending.
        model_id: Model identifier appended to the inference endpoint URL.
            Identifiers containing "llava" (case-insensitive) get a
            ``prompt``-style payload with ``max_new_tokens``; all others get
            a generic ``text``-style payload.
        timeout: Seconds to wait on the HTTP request before ``requests``
            raises, so a stalled connection cannot hang the app forever.

    Returns:
        The ``generated_text`` (or ``text``) field of the response when
        present, otherwise the string form of the decoded JSON response.

    Raises:
        RuntimeError: If the API answers with a non-200 status code.
        requests.RequestException: On connection failures or timeouts.
        ValueError: If the response body is not valid JSON.
    """
    API_URL = f"https://api-inference.huggingface.co/models/{model_id}"
    headers = {"Authorization": f"Bearer {HF_API_KEY}"}

    # Convert image to base64
    image_b64 = base64.b64encode(image_bytes).decode('utf-8')

    # Prepare payload based on model type
    is_llava = "llava" in model_id.lower()
    prompt_key = "prompt" if is_llava else "text"
    payload = {"inputs": {"image": image_b64, prompt_key: _OCR_PROMPT}}
    if is_llava:
        payload["parameters"] = {"max_new_tokens": 1024}

    # Make API request
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)

    if response.status_code != 200:
        raise RuntimeError(
            f"API request failed with status code {response.status_code}: {response.text}"
        )

    # Handle different response formats
    return _extract_generated_text(response.json())

# Title and description in main area
try:
    # Read the logo through a context manager so the file handle is always
    # closed, even if reading fails part-way.
    with open("./assets/gemma3.png", "rb") as logo_file:
        logo_b64 = base64.b64encode(logo_file.read()).decode()
except OSError:
    # Fallback if the image doesn't exist or cannot be read
    st.title("Vision OCR")
else:
    # Embed the logo inline as a base64 data URI next to the heading text
    st.markdown("""
        # <img src="data:image/png;base64,{}" width="50" style="vertical-align: -12px;"> Vision OCR
    """.format(logo_b64), unsafe_allow_html=True)

# Clear button sits in the narrow right-hand column of a 6:1 split
col1, col2 = st.columns([6, 1])
with col2:
    clear_requested = st.button("Clear 🗑️")
    if clear_requested:
        # Drop any stored OCR result (no-op when absent), then rerun the script
        st.session_state.pop('ocr_result', None)
        st.rerun()

# Tagline under the title, followed by a horizontal rule
tagline_html = '<p style="margin-top: -20px;">Extract structured text from images using advanced vision models!</p>'
st.markdown(tagline_html, unsafe_allow_html=True)
st.markdown("---")

# Sidebar: model selection, image upload, and the extraction trigger
with st.sidebar:
    st.header("Settings")

    # Display name -> Hugging Face model id. The selectbox options are derived
    # from these keys so the dropdown and the lookup can never drift apart.
    # NOTE(review): whether these models are served by the hosted Inference
    # API cannot be verified from this file — confirm before relying on them.
    model_mapping = {
        "LLaVA-1.5-7B": "llava-hf/llava-1.5-7b-hf",
        "MiniGPT-4": "Vision-CAIR/MiniGPT-4",
        "Idefics": "HuggingFaceM4/idefics-9b-instruct",
    }
    model_option = st.selectbox(
        "Select Vision Model",
        list(model_mapping),
        index=0,
    )
    selected_model = model_mapping[model_option]

    st.header("Upload Image")
    uploaded_file = st.file_uploader("Choose an image...", type=['png', 'jpg', 'jpeg'])

    if uploaded_file is not None:
        # The uploader only filters by file extension, so the content may
        # still be corrupt or not an image at all; report that instead of
        # letting the decode error crash the script.
        try:
            image = Image.open(uploaded_file)
            st.image(image, caption="Uploaded Image")
        except (OSError, ValueError):
            image = None
            st.error("Could not read the uploaded image. Please upload a valid PNG or JPEG file.")

        if image is not None:
            # Check if API key is available
            if not HF_API_KEY:
                st.error("Hugging Face API key is missing. Please set it as an environment variable or in Streamlit secrets.")
            elif st.button("Extract Text 🔍", type="primary"):
                with st.spinner(f"Processing image with {model_option}..."):
                    # Broad catch is deliberate: this is the UI boundary, and
                    # any failure (HTTP, timeout, bad JSON) should surface as
                    # a message rather than a traceback.
                    try:
                        # Get image bytes
                        img_bytes = uploaded_file.getvalue()

                        # Process with Hugging Face API using selected model
                        result = process_image_with_hf(img_bytes, selected_model)
                        st.session_state['ocr_result'] = result
                    except Exception as e:
                        st.error(f"Error processing image: {e}")
                        st.info("Try selecting a different model from the dropdown.")

# Main content area: show a hint until a result has been stored, then render it
if 'ocr_result' not in st.session_state:
    st.info("Upload an image and click 'Extract Text' to see the results here.")
else:
    st.markdown(st.session_state['ocr_result'])

# Footer: horizontal rule followed by the attribution / issue-tracker link
st.markdown("---")
footer_text = (
    "Made with ❤️ using Hugging Face Vision Models | "
    "[Report an Issue](https://github.com/bulentsoykan/streamlit-OCR-app/issues)"
)
st.markdown(footer_text)