import streamlit as st
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Title of the Streamlit app
st.title("Image and Text Combined in One Message")

# Load the pre-trained BLIP captioning model and its processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Image upload
uploaded_file = st.file_uploader("Upload a product image (JPG, JPEG, PNG):", type=["jpg", "jpeg", "png"])

if uploaded_file:
    # Open and display the uploaded image
    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # Generate the description using the BLIP model
    st.write("Processing the image...")

    # Preprocess the image and generate a caption
    inputs = processor(images=image, return_tensors="pt")
    out = model.generate(**inputs)

    # Decode the generated tokens into a text description
    generated_description = processor.decode(out[0], skip_special_tokens=True)

    # Combine the image and text in one message
    st.markdown(f"**Generated Product Description:** {generated_description}")
    st.markdown("**Here is your product image:**")
    st.image(image, caption="Generated Product Image", use_column_width=True)
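
# A minimal sketch of how to try this locally, assuming the script is saved as
# app.py (the filename is an assumption) and the required packages are
# installed. Streamlit apps are launched with the `streamlit run` command
# rather than executed directly with `python`:
#
#   pip install streamlit pillow transformers torch
#   streamlit run app.py
#
# Note that the BLIP model weights are downloaded from the Hugging Face Hub on
# first run, so the initial launch may take a while.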