import streamlit as st
import io
from PIL import Image
from huggingface_hub import InferenceClient

# Streamlit page setup
st.set_page_config(page_title="MTSS Image Accessibility Alt Text Generator", layout="centered")

# Add the logo image with a specified width
image_width = 300  # Desired width in pixels
st.image('MTSS.ai_Logo.png', width=image_width)

st.header('VisionTexts™ | Accessibility')
st.subheader('Image Alt Text Creator')

# Retrieve the Hugging Face API key from secrets
huggingface_api_key = st.secrets["huggingface_api_key"]

# Initialize the Hugging Face inference client
client = InferenceClient(token=huggingface_api_key)

# File uploader allows the user to add their own image
uploaded_file = st.file_uploader("Upload an image", type=["jpg", "png", "jpeg"])

if uploaded_file:
    # Display the uploaded image
    image = Image.open(uploaded_file).convert('RGB')
    display_width = 200  # Desired display width in pixels
    with st.expander("Image", expanded=True):
        st.image(image, caption=uploaded_file.name, width=display_width)
else:
    st.warning("Please upload an image.")

# Option for adding additional details
show_details = st.checkbox("Add additional details about the image.", value=False)

if show_details:
    # Text input for additional details about the image
    additional_details = st.text_area(
        "Provide specific information that is important to include in the alt text or reflect why the image is being used:"
    )
else:
    additional_details = ""

# Button to trigger the analysis
analyze_button = st.button("Analyze the Image", type="secondary")

# Prompt for complex image description
complex_image_prompt_text = (
    "As an expert in image accessibility and alternative text, thoroughly describe the image based on the caption provided. "
    "Provide a detailed description of no more than 500 characters that conveys the essential information in eight or fewer clear and concise sentences. "
    "Skip phrases like 'image of' or 'picture of.' "
    "Your description should form a clear, well-structured, and factual paragraph that avoids bullet points, focusing on creating a seamless narrative. "
    "Importantly, only describe what is visibly present in the image and avoid making assumptions or adding extraneous information. "
    "Stick to the facts and ensure the description is accurate and reliable."
)

# Functions to query the Hugging Face Inference API
def query_image_caption(image):
    # Convert the PIL image to JPEG bytes
    buffered = io.BytesIO()
    image.save(buffered, format="JPEG")
    image_bytes = buffered.getvalue()

    # Use the InferenceClient's image_to_text method
    response = client.image_to_text(
        # model="Salesforce/blip-image-captioning-large",
        model="nlpconnect/vit-gpt2-image-captioning",
        image=image_bytes,
    )

    # Recent huggingface_hub versions return an ImageToTextOutput object;
    # older ones return a plain string. Normalize to a string either way.
    return getattr(response, "generated_text", response)
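
# Note: `query_llm` below uses the OpenAI-compatible
# `client.chat.completions.create(...)` entry point, which assumes a recent
# huggingface_hub release (roughly v0.22 or later). On older releases,
# `client.chat_completion(...)` accepts the same arguments.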
def query_llm(prompt):
    # System prompt steering the model toward accessibility expertise
    system_prompt = "You are an expert in image accessibility and alternative text."

    # Generate the response using the Hugging Face InferenceClient's chat completion
    response = client.chat.completions.create(
        model="meta-llama/Llama-2-7b-chat-hf",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        stream=True,
        temperature=0.5,
        max_tokens=1024,
        top_p=0.7,
    )

    # Collect the streamed response; each chunk carries a partial delta,
    # and delta.content can be None (e.g., on the final chunk)
    response_content = ""
    for chunk in response:
        if chunk.choices:
            response_content += chunk.choices[0].delta.content or ""

    return response_content.strip()


# Run the analysis once an image has been uploaded and the button is pressed
if uploaded_file is not None and analyze_button:
    with st.spinner("Analyzing the image..."):
        # Get a caption for the image from the captioning model.
        # The InferenceClient raises an exception on HTTP or model errors
        # rather than returning an error dict, so catch it here.
        try:
            image_caption = query_image_caption(image)
        except Exception as e:
            st.error(f"Error with image captioning model: {e}")
            st.stop()

        # Start from the complex image prompt text
        prompt_text = complex_image_prompt_text

        # Include additional details if provided
        if additional_details:
            prompt_text += f"\n\nAdditional context provided by the user:\n{additional_details}"

        # Create the full prompt
        full_prompt = f"{prompt_text}\n\nImage Caption: {image_caption}"

        # Use the language model to generate the alt text description
        llm_response = query_llm(full_prompt)

        # Display the generated alt text
        st.markdown("### Generated Alt Text:")
        st.write(llm_response)

        st.success('Powered by MTSS GPT. AI can make mistakes. Consider checking important information.')
else:
    st.write("Please upload an image and click 'Analyze the Image' to generate alt text.")
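
# Usage (a sketch, assuming this file is saved as app.py and that the
# Streamlit secrets file defines the API key read above):
#
#   # .streamlit/secrets.toml
#   huggingface_api_key = "hf_..."
#
#   $ streamlit run app.py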