vicgalle's picture
update modle_id
a6920b2
raw
history blame
2.05 kB
import streamlit as st
from PIL import Image
from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
st.title("Image to Text Converter")
uploaded_image = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

model_id = "Fer14/paligemma_coffe_machine_caption"


@st.cache_resource
def _load_model_and_processor(checkpoint):
    """Load the PaliGemma model and its processor for *checkpoint*.

    Streamlit re-executes this script from top to bottom on every widget
    interaction. Caching the loaded objects as a shared resource keeps the
    checkpoint from being re-read on each rerun.

    Args:
        checkpoint: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(model, processor)`` tuple.
    """
    loaded_model = PaliGemmaForConditionalGeneration.from_pretrained(checkpoint)
    loaded_processor = PaliGemmaProcessor.from_pretrained(checkpoint)
    return loaded_model, loaded_processor


# Module-level names kept so the rest of the script can use them unchanged.
model, processor = _load_model_and_processor(model_id)
# Instruction text sent to the model together with the uploaded image.
# It describes the caption template and what each placeholder means.
# NOTE(review): "butons" is misspelled; it is left untouched because the
# checkpoint may have been fine-tuned on this exact wording — confirm before
# correcting it.
prompt = (
    "Generate a caption for the following coffee maker image. "
    "The caption has to be of the following structure:\n"
    '"A <color> <type>, <accessories>, <shape> shaped, '
    'with <screen> and <number> <b_color> butons"\n\n'
    "in which:\n"
    "- color: red, black, blue...\n"
    "- type: coffee machine, coffee maker, espresso coffee machine...\n"
    "- accessories: a list of accessories like the ones described above\n"
    "- shape: cubed, round...\n"
    "- screen: screen, no screen.\n"
    "- number: amount of buttons to add\n"
    "- b_color: color of the buttons"
)
# Run the captioning pipeline only once the user has uploaded a file.
if uploaded_image is not None:
    # Display the uploaded image
    image = Image.open(uploaded_image).convert("RGB")
    st.image(image, caption='Uploaded Image.', use_column_width=True)

    # Show the progress message while the model is actually running, rather
    # than writing it to the page after generation has already finished.
    with st.spinner("Extracting text from the image..."):
        inputs = processor(
            text=prompt,
            images=image,
            return_tensors="pt",
            padding="longest",
        )
        output = model.generate(**inputs, max_length=1000)

        # The generated sequence starts with the input tokens. Drop them by
        # token count instead of slicing the decoded string by len(prompt),
        # which breaks whenever decoding does not reproduce the prompt
        # character-for-character.
        prompt_token_count = inputs["input_ids"].shape[-1]
        generated_tokens = output[0][prompt_token_count:]
        out = processor.decode(generated_tokens, skip_special_tokens=True)

    # Display the extracted text
    st.text_area("Coffe machine description", out, height=300)
# Sidebar: a short usage guide for the app.
_instructions_text = """
1. Upload an image using the file uploader.
2. Wait for the app to process and extract text from the image.
3. The extracted text will be displayed in the text area.
"""
st.sidebar.title("Instructions")
st.sidebar.write(_instructions_text)