File size: 3,219 Bytes
f9300e0
 
 
 
3a7616c
f9300e0
 
5037d5c
f9300e0
 
 
 
 
1d9753d
f9300e0
3a7616c
 
 
 
1d9753d
 
 
 
 
 
 
 
 
 
f9300e0
ef9bfa1
 
 
1d9753d
 
 
 
ef9bfa1
 
 
 
1d9753d
3a7616c
f9300e0
1d9753d
 
 
 
 
3a7616c
1d9753d
 
 
 
 
 
 
 
 
f9300e0
 
 
5037d5c
f9300e0
 
 
 
 
 
 
 
 
1d9753d
 
 
 
f9300e0
1d9753d
 
5037d5c
 
 
1d9753d
 
 
f9300e0
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import streamlit as st
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import torch
import time  # To simulate progress bar updates

# Load model and processor
@st.cache_resource
def load_model():
    processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
    model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
    return processor, model

# Function to preprocess image and handle model execution
def extract_text(image, processor, model):
    # Initialize progress bar
    progress_bar = st.progress(0)
    time.sleep(0.5)

    # Resize the image to fixed dimensions
    try:
        required_size = (224, 224)  # Explicit resizing for model input
        image_resized = image.resize(required_size)
        progress_bar.progress(20)  # Step 1: Image resized
        time.sleep(0.5)

        # Preprocess image (extract pixel values)
        inputs = processor(images=image_resized, return_tensors="pt", do_resize=False).to("cpu")
        pixel_values = inputs.get("pixel_values")

        # Debugging: Check the pixel_values tensor shape
        st.write(f"Pixel Values Shape: {pixel_values.shape}")

        # Check if pixel values are valid
        if pixel_values is None or pixel_values.shape[0] == 0:
            raise ValueError("Preprocessing failed: Empty tensor generated for image.")

        # Additional check to ensure it has expected shape
        if pixel_values.shape[0] != 81 or pixel_values.shape[1] != 2048:
            raise ValueError(f"Unexpected tensor shape: {pixel_values.shape}. Expected shape: [batch_size, 2048].")

        progress_bar.progress(50)  # Step 2: Image preprocessed
        time.sleep(0.5)

        # Perform inference
        with torch.no_grad():
            outputs = model.generate(pixel_values=pixel_values)
            progress_bar.progress(80)  # Step 3: Model processing
            time.sleep(0.5)

        # Decode outputs to text
        result = processor.batch_decode(outputs, skip_special_tokens=True)[0]
        progress_bar.progress(100)  # Step 4: Completed
        time.sleep(0.5)

        return result

    except Exception as e:
        raise RuntimeError(f"Error during text extraction: {str(e)}")

# Streamlit UI
def main():
    st.title("🖼️ OCR App using SmolVLM-Instruct")
    st.write("Upload an image, and I will extract the text for you!")

    # Load the model and processor
    processor, model = load_model()

    # File uploader
    uploaded_file = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])

    if uploaded_file is not None:
        try:
            # Open and display image
            image = Image.open(uploaded_file).convert("RGB")
            st.image(image, caption="Uploaded Image", use_container_width=True)

            # Extract text with progress bar
            with st.spinner("Extracting text... Please wait!"):
                extracted_text = extract_text(image, processor, model)
                st.subheader("📝 Extracted Text:")
                st.write(extracted_text)

        except Exception as e:
            st.error(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()