"""Streamlit app that captions an uploaded image with a ViT-GPT2 model."""

from io import BytesIO

import requests
import streamlit as st
from PIL import Image
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    VisionEncoderDecoderModel,
    ViTImageProcessor,
)

model_name = "nlpconnect/vit-gpt2-image-captioning"

# Upper bound on the number of tokens generated for one caption.
MAX_CAPTION_TOKENS = 30


@st.cache_resource
def _load_captioner(name):
    """Load the captioning model, its image processor and its tokenizer.

    Streamlit re-executes this script on every interaction; caching the
    loaded objects avoids calling ``from_pretrained`` again on each rerun.

    Args:
        name: Hugging Face model identifier to load.

    Returns:
        A ``(model, image_processor, tokenizer)`` tuple.
    """
    # The checkpoint pairs a vision encoder with a text decoder, so it is
    # loaded as an encoder-decoder model rather than a plain causal LM.
    captioner = VisionEncoderDecoderModel.from_pretrained(name)
    processor = ViTImageProcessor.from_pretrained(name)
    text_tokenizer = AutoTokenizer.from_pretrained(name)
    return captioner, processor, text_tokenizer


model, image_processor, tokenizer = _load_captioner(model_name)


def generate_caption(image: Image.Image) -> str:
    """Generate a caption for a PIL image.

    Args:
        image: The image to describe. It is converted to RGB before being
            handed to the image processor.

    Returns:
        The decoded caption with special tokens removed and surrounding
        whitespace stripped.
    """
    # Uploaded PNGs may carry an alpha channel or a palette; normalise to RGB.
    rgb_image = image.convert("RGB")
    # The image processor performs the resizing/normalisation the encoder
    # expects, so no manual resize is done here.
    pixel_values = image_processor(
        images=rgb_image, return_tensors="pt"
    ).pixel_values

    with st.spinner("Generating caption..."):
        # Generation is conditioned on the image pixels, not on a text prompt.
        caption_ids = model.generate(
            pixel_values, max_length=MAX_CAPTION_TOKENS
        )

    generated_caption = tokenizer.decode(
        caption_ids[0], skip_special_tokens=True
    )
    return generated_caption.strip()


def main() -> None:
    """Render the upload form and show a caption once the form is submitted."""
    st.title("Image Captioning App")

    # Bound up front so the post-form check never touches an undefined name.
    image = None
    with st.form("my_form"):
        uploaded_file = st.file_uploader(
            "Choose an image file", type=["jpg", "jpeg", "png"]
        )
        if uploaded_file is not None:
            image = Image.open(uploaded_file)
            st.image(image, caption="Uploaded Image", use_column_width=True)
        clicked = st.form_submit_button("Generate Caption")

    if clicked and image is not None:
        caption = generate_caption(image)
        st.success("Generated Caption:")
        st.write(caption)


if __name__ == "__main__":
    main()