import streamlit as st
from PIL import Image
from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor

st.title("Coffee machine captioning app")

uploaded_image = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

# Load the fine-tuned PaliGemma model and its processor
model_id = "Fer14/paligemma_coffe_machine_caption"
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id)
processor = PaliGemmaProcessor.from_pretrained(model_id)

# Sidebar instructions for using the app
st.sidebar.title("Instructions")
st.sidebar.write(
    """
    1. Upload an image using the file uploader.
    2. Wait for the app to process the image and generate a caption.
    3. The generated caption will be displayed in the text area.
    """
)

prompt = (
    "Generate a caption for the following coffee maker image. The caption has to be of the following structure:\n"
    "\"A <color> <type>, <accessories>, <shape> shaped, with <screen> and <number> <b_color> buttons\"\n\n"
    "in which:\n"
    "- color: red, black, blue...\n"
    "- type: coffee machine, coffee maker, espresso coffee machine...\n"
    "- accessories: a list of accessories like the ones described above\n"
    "- shape: cubed, round...\n"
    "- screen: screen, no screen.\n"
    "- number: amount of buttons to add\n"
    "- b_color: color of the buttons"
)

if uploaded_image is not None:
    # Display the uploaded image
    image = Image.open(uploaded_image).convert("RGB")
    st.image(image, caption="Uploaded Image.", use_column_width=True)

    # Preprocess the prompt and image into model inputs
    inputs = processor(
        text=prompt,
        images=image,
        return_tensors="pt",
        padding="longest",
    )

    st.write("Generating caption for the image...")
    with st.spinner("Running the model..."):
        output = model.generate(**inputs, max_length=1000)

    # Keep only the newly generated tokens (drop the prompt) before decoding
    input_len = inputs["input_ids"].shape[-1]
    out = processor.decode(output[0][input_len:], skip_special_tokens=True)

    # Display the generated caption
    st.text_area("Coffee machine description", out, height=300)