File size: 6,713 Bytes
592c01e
 
 
 
 
 
 
fdb9c67
592c01e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74de6d6
592c01e
 
 
 
 
74de6d6
592c01e
 
 
 
 
 
958f56e
592c01e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fdb9c67
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import os
import secrets
import tempfile
from pathlib import Path

import gradio as gr
from langchain.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEndpoint
from PIL import Image

# Remote Hugging Face inference endpoints used by the app. The API token is
# read from the HUGGING_FACE_API environment variable when this module loads.

# BLIP image-captioning endpoint, used to describe the submitted image.
image_captioning_model = HuggingFaceEndpoint(
    endpoint_url=(
        "https://api-inference.huggingface.co/models/"
        "Salesforce/blip-image-captioning-base"
    ),
    huggingfacehub_api_token=os.environ.get("HUGGING_FACE_API"),
    temperature=0.7,
    max_new_tokens=1024,
)

# Llama 3.2 instruct endpoint, used to answer the math question.
math_llm = HuggingFaceEndpoint(
    endpoint_url=(
        "https://api-inference.huggingface.co/models/"
        "meta-llama/Llama-3.2-3B-Instruct"
    ),
    huggingfacehub_api_token=os.environ.get("HUGGING_FACE_API"),
    temperature=0.7,
    max_new_tokens=1024,
)
def _flatten_to_rgb(image):
    """Return an RGB copy of ``image`` composited onto a white background."""
    # Converting to RGBA first guarantees an alpha band, so the image is
    # always a valid paste mask -- even when the input is fully opaque.
    rgba = image.convert("RGBA")
    canvas = Image.new("RGB", rgba.size, color=(255, 255, 255))
    canvas.paste(rgba, (0, 0), mask=rgba)
    return canvas


# Function to process the image
def process_image(image, shouldConvert=False):
    """Save ``image`` as a temporary JPEG and query the captioning model.

    Args:
        image: PIL image to describe.
        shouldConvert: When true, always composite the image onto a white
            background before saving. Images whose mode is not ``"RGB"``
            are flattened the same way regardless of this flag, because
            they may carry transparency that JPEG cannot store.

    Returns:
        Whatever ``image_captioning_model`` returns for the request.
    """
    # Ensure temporary directory exists
    uploaded_file_dir = os.environ.get("GRADIO_TEMP_DIR") or str(
        Path(tempfile.gettempdir()) / "gradio"
    )
    os.makedirs(uploaded_file_dir, exist_ok=True)

    # The file is written with a .jpg extension, and Pillow's JPEG writer
    # rejects modes with an alpha channel (RGBA, LA, P with transparency).
    if shouldConvert or image.mode != "RGB":
        image = _flatten_to_rgb(image)

    # Random name so concurrent requests never collide.
    name = f"tmp{secrets.token_hex(20)}.jpg"
    filename = os.path.join(uploaded_file_dir, name)

    # Define a PromptTemplate for text instruction
    template = """
    You are a helpful AI assistant.
    Please describe the math-related content in this image, ensuring that any LaTeX formulas are correctly transcribed. 
    Non-mathematical details do not need to be described.

    Image Path: {image}
    """
    prompt_template = PromptTemplate(
        input_variables=["image"],  # Dynamically insert the image path
        template=template,
    )

    # Create the text instruction by rendering the prompt template
    prompt = prompt_template.format(image=f"file://{filename}")

    try:
        image.save(filename)

        # NOTE(review): HuggingFaceEndpoint is a text LLM wrapper; passing a
        # dict with an open file handle looks unsupported -- confirm against
        # the langchain_huggingface API before relying on this call.
        with open(filename, "rb") as img_file:
            response = image_captioning_model({
                "inputs": {
                    "image": img_file,
                    "text": prompt
                }
            })
    finally:
        # The temporary file is only needed for the duration of the request;
        # remove it so the temp directory does not grow without bound.
        try:
            os.remove(filename)
        except FileNotFoundError:
            pass

    # Return the model's response
    return response

def get_math_response(image_description, user_question):
    """Ask the math LLM to answer ``user_question`` about a described image.

    This is a generator: it yields the complete model answer exactly once.

    Args:
        image_description: Text describing the image; inserted into the
            prompt as-is (including ``None`` if that is what is passed).
        user_question: The question to solve; a ``?`` is appended to it in
            the prompt.

    Yields:
        The response returned by ``math_llm`` for the formatted prompt.
    """
    template = """
    You are a helpful AI assistant specialized in solving math reasoning problems. 
    Analyze the following question carefully and provide a step-by-step explanation along with the answer.
    Image description : {image_description}
    Question: {user_question}?
    """

    prompt_template = PromptTemplate(
        input_variables=["user_question", "image_description"],  # Define the placeholder(s) in the template
        template=template,
    )
    formatted_prompt = prompt_template.format(
        user_question=user_question,
        image_description=image_description,
    )

    # Use the Runnable ``invoke`` API: calling the LLM object directly
    # (``math_llm(prompt)``) is deprecated in LangChain.
    response = math_llm.invoke(formatted_prompt)

    # Hand the answer to the caller
    yield response

def math_chat_bot(image, sketchpad, question, state):
    """Describe the image from the active tab, then stream the math answer.

    Args:
        image: Image from the Upload tab, or None when nothing was uploaded.
        sketchpad: Sketch tab value; its ``"composite"`` entry is the image
            that gets described.
        question: The user's question, forwarded to ``get_math_response``.
        state: Dict whose ``"tab_index"`` selects the input source
            (0 = Upload, 1 = Sketch).

    Yields:
        Everything yielded by ``get_math_response``. The image description
        passed to it stays None when the active tab has no usable image.
    """
    active_tab = state["tab_index"]
    description = None

    if active_tab == 1:
        # Sketch
        print(sketchpad)
        if sketchpad and sketchpad["composite"]:
            description = process_image(sketchpad["composite"], True)
    elif active_tab == 0 and image is not None:
        # Upload
        description = process_image(image)

    yield from get_math_response(description, question)
# Custom CSS for the element with id "qwen-md": lays out KaTeX display-mode
# math inline rather than as separate blocks.
css = """
#qwen-md .katex-display { display: inline; }
#qwen-md .katex-display>.katex { display: inline; }
#qwen-md .katex-display>.katex>.katex-html { display: inline; }
"""
def tabs_select(e: gr.SelectData, _state):
    """Record the index of the newly selected tab in the shared state.

    Args:
        e: Select event; its ``index`` identifies the selected tab.
        _state: State dict, updated in place under the ``"tab_index"`` key.
    """
    _state.update(tab_index=e.index)

# Build the UI: image/sketch inputs and the question box in the left column,
# the rendered answer in the right column.
with gr.Blocks(css=css) as demo:

    # Shared state; "tab_index" tracks the active input tab and is kept up to
    # date by tabs_select (0 = Upload, 1 = Sketch).
    state = gr.State({"tab_index": 0})
    with gr.Row():
        with gr.Column():
            with gr.Tabs() as input_tabs:
                with gr.Tab("Upload"):
                    # No trailing comma here: it would turn the component
                    # into a 1-tuple.
                    input_image = gr.Image(type="pil", label="Upload")
                with gr.Tab("Sketch"):
                    input_sketchpad = gr.Sketchpad(type="pil", label="Sketch", layers=False)
            input_tabs.select(fn=tabs_select, inputs=[state])
            input_text = gr.Textbox(label="input your question")
            with gr.Row():
                with gr.Column():
                    clear_btn = gr.ClearButton(
                        [input_image, input_sketchpad, input_text])
                with gr.Column():
                    submit_btn = gr.Button("Submit", variant="primary")
        with gr.Column():
            # Delimiters are literal strings, so LaTeX environments are
            # spelled with plain braces (raw strings keep the backslashes).
            output_md = gr.Markdown(
                label="answer",
                latex_delimiters=[
                    {"left": r"\(", "right": r"\)", "display": True},
                    {
                        "left": r"\begin{equation}",
                        "right": r"\end{equation}",
                        "display": True,
                    },
                    {
                        "left": r"\begin{align}",
                        "right": r"\end{align}",
                        "display": True,
                    },
                    {
                        "left": r"\begin{alignat}",
                        "right": r"\end{alignat}",
                        "display": True,
                    },
                    {
                        "left": r"\begin{gather}",
                        "right": r"\end{gather}",
                        "display": True,
                    },
                    {
                        "left": r"\begin{CD}",
                        "right": r"\end{CD}",
                        "display": True,
                    },
                    {"left": r"\[", "right": r"\]", "display": True},
                ],
                elem_id="qwen-md",
            )
        # Submitting runs the chatbot generator and streams into the answer.
        submit_btn.click(
            fn=math_chat_bot,
            inputs=[input_image, input_sketchpad, input_text, state],
            outputs=output_md)
demo.launch()