File size: 2,214 Bytes
6aba0d7
 
1b5849e
6aba0d7
be1d27c
6aba0d7
7f7b97d
 
 
6aba0d7
7f7b97d
6aba0d7
7f7b97d
 
6aba0d7
7f7b97d
 
 
 
 
1b5849e
be1d27c
 
 
 
 
05dc25c
 
 
be1d27c
 
 
05dc25c
 
 
be1d27c
1b5849e
 
 
 
 
 
 
 
 
 
 
 
 
6aba0d7
 
1b5849e
6aba0d7
1b5849e
 
 
 
 
 
 
6aba0d7
be1d27c
05dc25c
be1d27c
 
1b5849e
 
6aba0d7
05dc25c
6aba0d7
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import streamlit as st
from PIL import Image
from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor

# Page heading rendered at the top of the Streamlit app.
st.title("Coffe machine captioning app")

@st.cache_resource()
def load_model():
    """Load the fine-tuned PaliGemma captioning model and its processor.

    The function is wrapped in ``st.cache_resource`` so Streamlit can reuse
    the returned objects instead of reloading the weights on every script run.
    A spinner is shown while the weights are being fetched, followed by a
    success banner.

    Returns:
        A ``(model, processor)`` tuple, both loaded from the same checkpoint.
    """
    checkpoint = "Fer14/paligemma_coffe_machine_caption"

    with st.spinner("Loading model and tokenizer..."):
        captioner = PaliGemmaForConditionalGeneration.from_pretrained(checkpoint)
        preprocessor = PaliGemmaProcessor.from_pretrained(checkpoint)

    st.success("Model loaded!")
    return captioner, preprocessor


# Obtain the model/processor pair used by the inference code further down.
model, processor = load_model()


# Sidebar: short usage guide for the user.
st.sidebar.title("Instructions")
st.sidebar.write(
    """
    1. Upload an image using the file uploader.
    2. Wait for the app to process and generate the caption.
    3. The caption will be displayed in the text area.
    4. Enjoy your caption!
    """
)

# Restrict uploads to common raster formats; the value stays None until the
# user has actually picked a file.
uploaded_image = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])


# Instruction text sent to the model together with the uploaded image. It
# describes the caption template and the meaning of each placeholder.
# NOTE(review): the "butons" spelling is deliberately left untouched here;
# presumably it matches the prompt used during fine-tuning -- confirm before
# correcting it, since changing the prompt may change the model's output.
prompt = (
    "Generate a caption for the following coffee maker image. The caption has to be of the following structure:\n"
    '"A <color> <type>, <accessories>, <shape> shaped, with <screen> and <number> <b_color> butons"\n\n'
    "in which:\n"
    "- color: red, black, blue...\n"
    "- type: coffee machine, coffee maker, espresso coffee machine...\n"
    "- accessories: a list of accessories like the ones described above\n"
    "- shape: cubed, round...\n"
    "- screen: screen, no screen.\n"
    "- number: amount of buttons to add\n"
    "- b_color: color of the buttons"
)

if uploaded_image is not None:
    # Display the uploaded image. It is converted to RGB first so that every
    # accepted upload reaches the processor in the same colour mode.
    image = Image.open(uploaded_image).convert("RGB")
    st.image(image, caption='Uploaded Image.', use_column_width=True)

    inputs = processor(
        text=prompt,
        images=image,
        return_tensors="pt",
        padding="longest",
    )
    # Length of the full model input (image placeholder tokens + text prompt).
    # Needed below to separate the prompt from the generated continuation.
    prompt_token_count = inputs["input_ids"].shape[-1]

    with st.spinner('Generating caption...'):
        # `max_new_tokens` bounds only the generated caption. The previous
        # `max_length` also counted the prompt and image tokens, so the real
        # generation budget depended on (and could be exhausted by) the input.
        output = model.generate(**inputs, max_new_tokens=1000)

    # `generate` returns the input tokens followed by the continuation. Drop
    # the input by token position rather than by character count: the decoded
    # prompt is not guaranteed to have the same length as the original string.
    generated_ids = output[0][prompt_token_count:]
    out = processor.decode(generated_ids, skip_special_tokens=True).strip()

    # Display the extracted text
    st.text_area("Coffe machine caption", out, height=300)