File size: 3,038 Bytes
76c5b28
 
 
 
 
 
 
 
 
3930e60
76c5b28
 
 
3930e60
76c5b28
 
 
 
 
 
 
 
0c3f7ae
 
 
 
 
 
 
3930e60
0c3f7ae
 
76c5b28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0c3f7ae
 
 
 
 
 
 
 
76c5b28
0c3f7ae
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import streamlit as st
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
import torch

# Load the Florence model and processor
@st.cache_resource
def load_model():
    """Load the Florence-2 large checkpoint and its processor.

    The model is put in eval mode and converted to float32. The function is
    decorated with ``st.cache_resource``.

    Returns:
        A ``(model, processor)`` tuple.
    """
    checkpoint = 'microsoft/Florence-2-large'

    # Load the weights first, then switch to eval mode and float32.
    florence = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True)
    florence = florence.eval().to(torch.float32)

    florence_processor = AutoProcessor.from_pretrained(checkpoint, trust_remote_code=True)
    return florence, florence_processor

# Load the model and processor once at module level. load_model is wrapped in
# st.cache_resource, so the pair is cached rather than rebuilt on each script
# run; run_example reads these two module-level names directly.
model, processor = load_model()

# Function to run the model
def run_example(task_prompt, image, text_input=None):
    """Run one Florence-2 task on an image and return the post-processed result.

    Uses the module-level ``model`` and ``processor``.

    Args:
        task_prompt: Task token given to the model (e.g. ``'<CAPTION>'``); it is
            also forwarded as ``task`` to the post-processing step.
        image: Image exposing ``width`` and ``height`` attributes (the caller
            in this file passes an RGB PIL image).
        text_input: Optional extra text appended directly to ``task_prompt``.

    Returns:
        Whatever ``processor.post_process_generation`` returns for the task.
    """
    prompt = task_prompt if text_input is None else task_prompt + text_input

    # Prepare inputs
    inputs = processor(text=prompt, images=image, return_tensors="pt")

    # Token ids are indices into the embedding table and must remain an
    # integer tensor; casting them to float32 makes the embedding lookup
    # inside generate() raise. Only the pixel data is kept in float32 to
    # match the float32 model.
    input_ids = inputs["input_ids"]
    pixel_values = inputs["pixel_values"].to(torch.float32)

    # Generate predictions (deterministic beam search)
    generated_ids = model.generate(
        input_ids=input_ids,
        pixel_values=pixel_values,
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )

    # Special tokens are kept because the post-processor receives the raw
    # decoded text together with the task token.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height)
    )
    return parsed_answer

# Streamlit UI
st.title("Microsoft Florence Image Captioning (CPU)")

# File uploader
uploaded_file = st.file_uploader("Upload an image (PNG or JPG)", type=["png", "jpg", "jpeg"])

if uploaded_file is not None:
    # Convert and display the image
    image = Image.open(uploaded_file).convert("RGB")
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # Generate captions
    st.subheader("Generated Captions")

    # Streamlit reruns this whole script on every widget interaction, including
    # a click on "Save Captions". Cache the captions in session state, keyed by
    # the uploaded file's name and content hash, so the three slow CPU
    # generations run once per image instead of once per rerun.
    upload_key = (uploaded_file.name, hash(uploaded_file.getvalue()))

    try:
        if st.session_state.get("caption_key") != upload_key:
            with st.spinner("Generating caption..."):
                st.session_state["caption_values"] = (
                    run_example('<CAPTION>', image),
                    run_example('<DETAILED_CAPTION>', image),
                    run_example('<MORE_DETAILED_CAPTION>', image),
                )
            # Record the key only after all three succeeded, so a failed
            # generation is retried on the next rerun.
            st.session_state["caption_key"] = upload_key

        caption, detailed_caption, more_detailed_caption = st.session_state["caption_values"]

        # NOTE(review): run_example returns the raw post-processing result,
        # presumably a dict keyed by the task token — confirm, and consider
        # displaying only the caption text.
        st.write("**Caption:**", caption)
        st.write("**Detailed Caption:**", detailed_caption)
        st.write("**More Detailed Caption:**", more_detailed_caption)

        # Option to save the output
        if st.button("Save Captions"):
            output_path = "captions.txt"
            # Explicit UTF-8 so non-ASCII caption text is written the same way
            # regardless of the host locale.
            with open(output_path, "w", encoding="utf-8") as file:
                file.write(f"Caption: {caption}\n")
                file.write(f"Detailed Caption: {detailed_caption}\n")
                file.write(f"More Detailed Caption: {more_detailed_caption}\n")
            st.success(f"Captions saved to {output_path}!")
    except Exception as e:
        # Top-level UI boundary: surface any failure to the user instead of
        # crashing the app.
        st.error(f"Error: {e}")