File size: 5,399 Bytes
5f1f9d6
9fc4609
0075144
9fc4609
ee54df8
7956ee3
5f1f9d6
ee54df8
5f1f9d6
1e06fb8
0075144
5f1f9d6
 
 
 
 
0075144
 
 
ee54df8
e4af4b8
0075144
 
5f1f9d6
 
 
7956ee3
9fc4609
0075144
5f1f9d6
0075144
 
 
5f1f9d6
1e06fb8
 
5f1f9d6
 
 
 
1e06fb8
5f1f9d6
0075144
 
5f1f9d6
 
0075144
5f1f9d6
1e06fb8
7956ee3
0075144
1e06fb8
7956ee3
71826d4
 
 
7956ee3
 
71826d4
9fc4609
ee54df8
9fc4609
 
 
 
 
 
dbe5bdc
 
e264991
7f1c702
dbe5bdc
ee54df8
 
9fc4609
 
ee54df8
 
 
 
7fac068
ee54df8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9fc4609
7956ee3
5f1f9d6
0075144
9fc4609
 
bbec170
9fc4609
 
 
7956ee3
c082e41
 
9fc4609
1e06fb8
 
9fc4609
 
 
1e06fb8
9fc4609
ee54df8
9fc4609
 
 
 
 
ee54df8
 
 
9fc4609
ee54df8
5f1f9d6
0075144
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import streamlit as st
import requests
from PIL import Image
import io
from huggingface_hub import InferenceClient

# ---------------------------------------------------------------------------
# Page setup and static UI. Everything below runs top-to-bottom on every
# Streamlit rerun; names defined here (client, image, uploaded_file,
# additional_details, analyze_button, complex_image_prompt_text) are consumed
# by the analysis section at the bottom of the file.
# ---------------------------------------------------------------------------

# Streamlit page setup
st.set_page_config(page_title="MTSS Image Accessibility Alt Text Generator", layout="centered")

# Add the logo image with a specified width
image_width = 300  # Set the desired width in pixels
# NOTE(review): path is relative to the app's working directory — confirm the
# logo asset is deployed alongside the script.
st.image('MTSS.ai_Logo.png', width=image_width)

st.header('VisionTexts™ | Accessibility')
st.subheader('Image Alt Text Creator')

# Retrieve the Hugging Face API Key from secrets
# (st.secrets raises if the key is missing, halting the app with an error).
huggingface_api_key = st.secrets["huggingface_api_key"]

# Initialize the Hugging Face inference client (shared by both query helpers).
client = InferenceClient(token=huggingface_api_key)

# File uploader allows user to add their own image
uploaded_file = st.file_uploader("Upload an image", type=["jpg", "png", "jpeg"])

if uploaded_file:
    # Display the uploaded image.
    # Convert to RGB up front so the later JPEG re-encode in
    # query_image_caption cannot fail on an alpha channel.
    image = Image.open(uploaded_file).convert('RGB')
    image_width = 200  # Display width in pixels (rebinds the logo width above)
    with st.expander("Image", expanded=True):
        # NOTE(review): use_column_width is deprecated in newer Streamlit
        # releases in favor of use_container_width — confirm pinned version.
        st.image(image, caption=uploaded_file.name, width=image_width, use_column_width=False)
else:
    # Warning only — the script keeps running; the analysis section re-checks
    # uploaded_file before touching `image`.
    st.warning("Please upload an image.")

# Option for adding additional details
show_details = st.checkbox("Add additional details about the image.", value=False)

if show_details:
    # Text input for additional details about the image
    additional_details = st.text_area(
        "Provide specific information that is important to include in the alt text or reflect why the image is being used:"
    )
else:
    additional_details = ""

# Button to trigger the analysis
analyze_button = st.button("Analyze the Image", type="secondary")

# Prompt for complex image description: the fixed instruction block sent to
# the LLM ahead of the machine-generated caption.
complex_image_prompt_text = (
    "As an expert in image accessibility and alternative text, thoroughly describe the image caption provided. "
    "Provide a detailed description using not more than 500 characters that conveys the essential information in eight or fewer clear and concise sentences. "
    "Skip phrases like 'image of' or 'picture of.' "
    "Your description should form a clear, well-structured, and factual paragraph that avoids bullet points, focusing on creating a seamless narrative. "
    "Importantly, only describe what is visibly present in the image and avoid making assumptions or adding extraneous information. "
    "Stick to the facts and ensure the description is accurate and reliable."
)


# Functions to query the Hugging Face Inference API

def query_image_caption(image, model="nlpconnect/vit-gpt2-image-captioning"):
    """Caption a PIL image via the Hugging Face image-to-text API.

    Parameters
    ----------
    image : PIL.Image.Image
        Image to caption. Any mode is accepted: it is converted to RGB
        before encoding, because JPEG cannot represent an alpha channel
        and ``save(format="JPEG")`` on an RGBA/P image raises ``OSError``.
    model : str, optional
        Inference model id. Defaults to the previously hard-coded model,
        so existing callers are unaffected.

    Returns
    -------
    Whatever ``InferenceClient.image_to_text`` returns for this client
    version — the caller treats it as caption text, or as a dict carrying
    an ``"error"`` key on failure.
    """
    # Serialize the PIL image to JPEG bytes for the HTTP API.
    buffered = io.BytesIO()
    image.convert("RGB").save(buffered, format="JPEG")
    image_bytes = buffered.getvalue()

    # Use the InferenceClient's image_to_text method
    # (alternative model kept for reference: "Salesforce/blip-image-captioning-large")
    response = client.image_to_text(
        model=model,
        image=image_bytes,
    )
    return response

def query_llm(prompt, model="meta-llama/Llama-2-7b-chat-hf"):
    """Generate alt text from *prompt* via a streamed chat completion.

    Parameters
    ----------
    prompt : str
        Full user prompt (instructions + image caption + optional context).
    model : str, optional
        Chat model id. Defaults to the previously hard-coded model, so
        existing callers are unaffected.

    Returns
    -------
    str
        The concatenated streamed response with surrounding whitespace
        stripped.
    """
    # Fixed system role framing the assistant as an accessibility expert.
    system_prompt = "You are an expert in image accessibility and alternative text."

    # Generate the response using the Hugging Face InferenceClient's
    # (OpenAI-compatible) streaming chat completion.
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ],
        stream=True,
        temperature=0.5,
        max_tokens=1024,
        top_p=0.7
    )

    # Collect the streamed response chunk by chunk.
    response_content = ""
    for message in response:
        if "choices" in message and len(message["choices"]) > 0:
            delta = message["choices"][0].get("delta", {})
            # Bug fix: a chunk may carry content=None (e.g. the role-only
            # first delta). dict.get("content", "") still returns None when
            # the key is present with a None value, so `+= content` would
            # raise TypeError; `or ""` normalizes None to an empty string.
            response_content += delta.get("content") or ""

    return response_content.strip()

# ---------------------------------------------------------------------------
# Analysis trigger: run the caption -> LLM pipeline only when an image is
# present AND the user pressed the button; otherwise show the usage hint.
# ---------------------------------------------------------------------------
if not (uploaded_file is not None and analyze_button):
    st.write("Please upload an image and click 'Analyze the Image' to generate alt text.")
else:
    with st.spinner("Analyzing the image..."):
        # Step 1: machine caption for the uploaded image.
        caption_response = query_image_caption(image)

        # The captioning API signals failure with a dict carrying "error".
        api_failed = isinstance(caption_response, dict) and caption_response.get("error")
        if api_failed:
            st.error(f"Error with image captioning model: {caption_response['error']}")
        else:
            # Step 2: assemble the LLM prompt — fixed instructions, optional
            # user-supplied context, then the generated caption, joined by
            # blank lines.
            segments = [complex_image_prompt_text]
            if additional_details:
                segments.append(f"Additional context provided by the user:\n{additional_details}")
            segments.append(f"Image Caption: {caption_response}")
            full_prompt = "\n\n".join(segments)

            # Step 3: expand the caption into accessible alt text.
            llm_response = query_llm(full_prompt)

            st.markdown("### Generated Alt Text:")
            st.write(llm_response)

            st.success('Powered by MTSS GPT. AI can make mistakes. Consider checking important information.')