Spaces:

Fer14
/

coffee_machine_captioning

Sleeping

App Files Files Community

vicgalle committed on Jun 10, 2024

Commit

1b5849e

1 Parent(s): 6aba0d7

Add application file

Browse files

Files changed (1) hide show

app.py +29 -2

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import streamlit as st
 from PIL import Image
 st.title("Image to Text Converter")
@@ -7,15 +8,41 @@ st.title("Image to Text Converter")
 uploaded_image = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
 if uploaded_image is not None:
     # Display the uploaded image
-    image = Image.open(uploaded_image)
     st.image(image, caption='Uploaded Image.', use_column_width=True)
     # Extract text from the image
     st.write("Extracting text from the image...")
     # Display the extracted text
-    st.text_area("Extracted Text", "desc", height=200)

 # Streamlit app: upload a picture of a coffee machine and display a caption
 # generated by a PaliGemma checkpoint.
 import streamlit as st
 from PIL import Image
 from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor

 MODEL_ID = "Fer14/paligemma_coffe_describer"

 # Upper bound on the number of *generated* tokens. `max_new_tokens` is used
 # instead of `max_length` because `max_length` also counts the prompt and
 # image tokens, which can consume most (or all) of the budget.
 MAX_NEW_TOKENS = 256


 @st.cache_resource
 def load_model_and_processor():
     """Load the captioning model and its processor once per server process.

     Streamlit re-executes this script on every widget interaction; caching
     the loaded objects avoids reloading the checkpoint on each rerun.

     Returns:
         A ``(model, processor)`` tuple loaded from ``MODEL_ID``.
     """
     loaded_model = PaliGemmaForConditionalGeneration.from_pretrained(MODEL_ID)
     loaded_processor = PaliGemmaProcessor.from_pretrained(MODEL_ID)
     return loaded_model, loaded_processor


 st.title("Image to Text Converter")
 uploaded_image = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
 model, processor = load_model_and_processor()

 # Instruction text sent to the model together with the image.
 # NOTE(review): the wording (including the "butons" spelling) is kept
 # verbatim; presumably it matches the prompt the checkpoint was trained
 # with - confirm before editing.
 prompt = (
     "Generate a caption for the following coffee maker image. The caption has to be of the following structure:\n"
     "\"A <color> <type>, <accessories>, <shape> shaped, with <screen> and <number> <b_color> butons\"\n\n"
     "in which:\n"
     "- color: red, black, blue...\n"
     "- type: coffee machine, coffee maker, espresso coffee machine...\n"
     "- accessories: a list of accessories like the ones described above\n"
     "- shape: cubed, round...\n"
     "- screen: screen, no screen.\n"
     "- number: amount of buttons to add\n"
     "- b_color: color of the buttons"
 )

 if uploaded_image is not None:
     # Display the uploaded image. Uploads may be PNGs with an alpha channel
     # or a palette, so normalise to RGB before handing it to the processor.
     image = Image.open(uploaded_image).convert("RGB")
     st.image(image, caption='Uploaded Image.', use_column_width=True)

     # Show the status message before the (slow) generation step, not after.
     st.write("Extracting text from the image...")

     inputs = processor(
         text=prompt,
         images=image,
         return_tensors="pt",
         padding="longest",
     )
     # generate() returns the input tokens followed by the new ones. Drop the
     # input part by token count rather than by len(prompt) characters, which
     # breaks whenever the decoded text does not reproduce the prompt exactly.
     input_len = inputs["input_ids"].shape[-1]
     output = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
     out = processor.decode(output[0][input_len:], skip_special_tokens=True).strip()

     # Display the generated caption
     st.text_area("Coffe machine description", out, height=300)