Spaces:

Fer14
/

coffee_machine_captioning

Sleeping

File size: 2,111 Bytes

6aba0d7
 
1b5849e
be1d27c
6aba0d7
be1d27c
6aba0d7
 
 
 
a6920b2
6aba0d7
a6920b2
 
1b5849e
be1d27c
 
 
 
 
 
 
 
 
 
 
 
1b5849e
 
 
 
 
 
 
 
 
 
 
 
 
6aba0d7
 
1b5849e
6aba0d7
1b5849e
 
 
 
 
 
 
6aba0d7
be1d27c
 
 
 
 
 
1b5849e
 
6aba0d7
1b5849e
6aba0d7

import streamlit as st
from PIL import Image
from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
from tqdm import tqdm

st.title("Coffe machine captioning app")


uploaded_image = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

model_id = "Fer14/paligemma_coffe_machine_caption"


@st.cache_resource
def _load_model_and_processor(repo_id):
    """Load the PaliGemma model and its processor, cached across reruns.

    Streamlit re-executes this script from the top on every widget
    interaction; caching the loaded objects avoids calling
    ``from_pretrained`` again on each rerun.

    Args:
        repo_id: Hub identifier passed to both ``from_pretrained`` calls.

    Returns:
        A ``(model, processor)`` tuple.
    """
    loaded_model = PaliGemmaForConditionalGeneration.from_pretrained(repo_id)
    loaded_processor = PaliGemmaProcessor.from_pretrained(repo_id)
    return loaded_model, loaded_processor


model, processor = _load_model_and_processor(model_id)


# Sidebar usage instructions for the captioning workflow.
# (The previous text described OCR text extraction, which this app does not do.)
st.sidebar.title("Instructions")
st.sidebar.write(
    """
    1. Upload an image using the file uploader.
    2. Wait for the app to process the image and generate a caption.
    3. The generated caption will be displayed in the text area.
    """
)


# Instruction text sent to the model together with the image. It spells out
# the caption template and the meaning of each placeholder. The wording
# (including spelling) is kept exactly as-is, since the text is a model input.
_caption_template = (
    '"A <color> <type>, <accessories>, <shape> shaped, '
    'with <screen> and <number> <b_color> butons"'
)
_placeholder_help = [
    "- color: red, black, blue...",
    "- type: coffee machine, coffee maker, espresso coffee machine...",
    "- accessories: a list of accessories like the ones described above",
    "- shape: cubed, round...",
    "- screen: screen, no screen.",
    "- number: amount of buttons to add",
    "- b_color: color of the buttons",
]
prompt = "\n".join(
    [
        "Generate a caption for the following coffee maker image. "
        "The caption has to be of the following structure:",
        _caption_template,
        "",  # blank line between the template and the legend
        "in which:",
        *_placeholder_help,
    ]
)

if uploaded_image is not None:
    # Open the upload, convert it to RGB, and show it back to the user.
    image = Image.open(uploaded_image).convert("RGB")
    st.image(image, caption='Uploaded Image.', use_column_width=True)

    inputs = processor(
        text=prompt,
        images=image,
        return_tensors="pt",
        padding="longest",
    )
    # Length of the tokenized input; used below to drop the echoed prompt
    # from the generated sequence.
    prompt_token_count = inputs["input_ids"].shape[-1]

    # Show progress in the page itself; a tqdm bar only writes to the
    # server's stderr and is never visible to the user.
    with st.spinner("Generating caption for the image..."):
        # NOTE(review): max_length presumably counts the input tokens as well;
        # consider max_new_tokens if captions get truncated — confirm.
        output = model.generate(**inputs, max_length=1000)

    # Slice by token count rather than by len(prompt) characters: the decoded
    # text is not guaranteed to reproduce the prompt string character-for-character.
    generated_tokens = output[0][prompt_token_count:]
    out = processor.decode(generated_tokens, skip_special_tokens=True)

    # Display the generated caption
    st.text_area("Coffe machine description", out, height=300)