# NOTE: removed Hugging Face Spaces page residue ("Spaces: / Sleeping") that was
# captured by the scrape — it was not part of the program.
import streamlit as st
import torch
from PIL import Image
from transformers import GPT2Tokenizer, ViTImageProcessor, VisionEncoderDecoderModel

# Fine-tuned ViT-GPT2 checkpoint used for both the encoder-decoder model and
# its image processor.
MODEL_ID = "ashok2216/vit-gpt2-image-captioning_COCO_FineTuned"


@st.cache_resource
def _load_captioning_assets():
    """Load and cache the captioning model, image processor, and tokenizer.

    Streamlit reruns the whole script on every widget interaction; without
    ``st.cache_resource`` the (large) model weights would be reloaded from the
    Hub on each rerun.

    Returns:
        tuple: (model, processor, tokenizer)
    """
    model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID)
    model.eval()  # inference only — disables dropout etc.
    processor = ViTImageProcessor.from_pretrained(MODEL_ID)
    # NOTE(review): tokenizer is the base "gpt2" one, as in the original code —
    # this assumes the fine-tune kept the stock GPT-2 vocabulary; confirm
    # against the model card.
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    return model, processor, tokenizer


model, processor, tokenizer = _load_captioning_assets()

# Streamlit app title
st.title("Image Captioning with ViT-GPT2 Model")
st.write("Upload an image, and the model will generate a descriptive caption.")

# File uploader for image input
uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "png", "jpeg"])

if uploaded_file is not None:
    # Load and display the uploaded image. Convert to RGB so that grayscale
    # or RGBA uploads don't break the ViT image processor, which expects
    # 3-channel input.
    image = Image.open(uploaded_file).convert("RGB")
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # Preprocess the image for the model
    pixel_values = processor(images=image, return_tensors="pt").pixel_values

    # Generate the caption
    with st.spinner("Generating caption..."):
        with torch.no_grad():  # no gradients needed at inference time
            output = model.generate(pixel_values)
        caption = tokenizer.decode(output[0], skip_special_tokens=True)

    # Display the generated caption
    st.success("Generated Caption:")
    st.write(caption)