Spaces:

Fer14
/

coffee_machine_captioning

Sleeping

App Files Files Community

coffee_machine_captioning / app.py

vicgalle

update app

05dc25c 5 months ago

raw

history blame

2.15 kB

	import streamlit as st
	from PIL import Image
	from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
	from tqdm import tqdm

	st.title("Coffe machine captioning app")

	# Hub checkpoint holding both the fine-tuned model weights and its processor.
	model_id = "Fer14/paligemma_coffe_machine_caption"


	@st.cache_resource(show_spinner=False)
	def load_model_and_processor(checkpoint):
		"""Load the PaliGemma model and its processor for *checkpoint*.

		Streamlit re-executes this script from the top on every widget
		interaction, so without caching the weights would be loaded again on
		each upload. ``st.cache_resource`` keeps a single shared instance
		alive across reruns and sessions.

		Args:
			checkpoint: Model id (or local path) accepted by ``from_pretrained``.

		Returns:
			A ``(model, processor)`` tuple.
		"""
		loaded_model = PaliGemmaForConditionalGeneration.from_pretrained(checkpoint)
		loaded_processor = PaliGemmaProcessor.from_pretrained(checkpoint)
		return loaded_model, loaded_processor


	# The cache's own spinner is disabled above so that only this message shows.
	with st.spinner('Loading model and tokenizer...'):
		model, processor = load_model_and_processor(model_id)

	st.success('Model loaded!')



	# Sidebar: step-by-step usage instructions for the captioning app.
	st.sidebar.title("Instructions")
	st.sidebar.write(
	"""
	1. Upload an image using the file uploader.
	2. Wait for the app to process and generate the caption.
	3. The caption will be displayed in the text area.
	4. Enjoy your caption!
	"""
	)

	# Upload widget limited to jpg/jpeg/png files. The code further down only
	# runs the model when this value is not None.
	uploaded_image = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])



	# Instruction text passed to the processor together with the uploaded image.
	# The wording is reproduced character-for-character (including the "butons"
	# spelling) because it is model input, not UI copy.
	# NOTE(review): presumably this matches the prompt used at fine-tuning time;
	# confirm before correcting the spelling.
	_CAPTION_TEMPLATE = (
		'"A <color> <type>, <accessories>, <shape> shaped, '
		'with <screen> and <number> <b_color> butons"'
	)
	prompt = "\n".join(
		[
			"Generate a caption for the following coffee maker image. "
			"The caption has to be of the following structure:",
			_CAPTION_TEMPLATE,
			"",  # blank line between the template and the field glossary
			"in which:",
			"- color: red, black, blue...",
			"- type: coffee machine, coffee maker, espresso coffee machine...",
			"- accessories: a list of accessories like the ones described above",
			"- shape: cubed, round...",
			"- screen: screen, no screen.",
			"- number: amount of buttons to add",
			"- b_color: color of the buttons",
		]
	)

	# Runs only once the user has uploaded a file; otherwise the page just shows
	# the uploader and the sidebar instructions.
	if uploaded_image is not None:
		# Decode the upload and normalise it to 3-channel RGB. Pillow raises an
		# OSError subclass for files it cannot identify or read, so report that
		# to the user instead of crashing the app with a traceback.
		try:
			image = Image.open(uploaded_image).convert("RGB")
		except OSError:
			st.error("The uploaded file could not be read as an image.")
			st.stop()

		# Display the uploaded image
		st.image(image, caption='Uploaded Image.', use_column_width=True)

		inputs = processor(
			text=prompt,
			images=image,
			return_tensors="pt",
			padding="longest",
		)
		# Number of tokens fed to the model (image placeholders + prompt). The
		# generated sequence starts with these, so they are skipped below.
		prompt_token_count = inputs["input_ids"].shape[-1]

		with st.spinner('Generating caption...'):
			# max_new_tokens bounds only the generated caption. max_length would
			# also count the prompt and image tokens, leaving an unpredictable
			# (possibly empty) budget for the answer.
			output = model.generate(**inputs, max_new_tokens=256)

		# Keep only the newly generated tokens. Slicing the decoded text by
		# len(prompt) is unreliable because decoding is not guaranteed to
		# reproduce the prompt string character-for-character.
		generated_ids = output[0][prompt_token_count:]
		out = processor.decode(generated_ids, skip_special_tokens=True).strip()

		# Display the generated caption
		st.text_area("Coffe machine caption", out, height=300)