import gradio as gr
import subprocess
import time

# import os  # only needed if environment variables (e.g., GRADIO_SHARE) are read at launch

# --- Ollama Helper Functions ---


def check_ollama_running():
    """Checks if the Ollama service is accessible."""
    try:
        subprocess.run(["ollama", "ps"], check=True, capture_output=True, timeout=5)
        return True
    except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired):
        return False


def get_ollama_models():
    """Gets a list of locally available Ollama models."""
    # Relies on the Ollama service being reachable; returns an empty list on any failure.
    try:
        result = subprocess.run(["ollama", "list"], check=True, capture_output=True, text=True, timeout=10)
        models = []
        lines = result.stdout.strip().split("\n")
        if len(lines) > 1:
            for line in lines[1:]:
                parts = line.split()
                if parts:
                    models.append(parts[0])
        # Ensure models are sorted and unique for consistent dropdown
        return sorted(list(set(models)))
    except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired) as e:
        print(f"Error in get_ollama_models: {e}")
        return []


# --- Core Logic ---

# Typing speed simulation
CHAR_DELAY = 0.02  # Adjust for desired speed (0.01 is fast, 0.05 is slower)


def reasoning_ollama_stream(model_name, prompt, mode):
    """
    Streams response from an Ollama model with simulated typing speed.
    """
    if not model_name:
        yield "Error: No model selected. Please choose a model."
        return
    if not prompt.strip():
        yield "Error: Prompt cannot be empty."
        return

    # Guard against the Ollama service being down or unreachable, even inside the container.
    if not check_ollama_running():
        yield "Error: Ollama service does not seem to be running or accessible. Please start Ollama."
        return

    # Runtime check: the Dockerfile is expected to pull the models, but confirm the selected one is actually available.
    available_models_runtime = get_ollama_models()
    if model_name not in available_models_runtime:
        yield f"Error: Model '{model_name}' selected, but not found by Ollama at runtime. Available: {available_models_runtime}. Please ensure it was pulled."
        return

    # Qwen3 honors the /think and /no_think soft switches appended to the prompt.
    prompt_with_mode = f"{prompt.strip()} /{mode}"
    command = ["ollama", "run", model_name]

    displayed_response = ""
    try:
        process = subprocess.Popen(
            command,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            bufsize=1,
        )

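        # Write the prompt and close stdin so `ollama run` sees EOF and starts generating.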
        process.stdin.write(prompt_with_mode + "\n")
        process.stdin.close()

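        # Each yield replaces the whole textbox in Gradio, so the full accumulated text is
        # re-yielded one extra character at a time to produce the typing effect.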
        for line_chunk in iter(process.stdout.readline, ""):
            for char in line_chunk:
                displayed_response += char
                yield displayed_response
                if char.strip():  # Only sleep for non-whitespace characters
                    time.sleep(CHAR_DELAY)

        process.stdout.close()
        return_code = process.wait(timeout=10)

        if return_code != 0:
            error_output = process.stderr.read()
            error_message = f"\n\n--- Ollama Error (code {return_code}) ---\n{error_output.strip()}"
            displayed_response = (displayed_response + error_message) if displayed_response else error_message.strip()
            yield displayed_response
            return

        if not displayed_response.strip() and return_code == 0:
            yield "Model returned an empty response."
        elif displayed_response:
            yield displayed_response

    except FileNotFoundError:
        yield "Error: 'ollama' command not found. Please ensure Ollama is installed and in your PATH (or Dockerfile is correct)."
    except subprocess.TimeoutExpired:  # Catch timeout from process.wait()
        yield "Error: Ollama process timed out while waiting for completion."
        if displayed_response:
            yield displayed_response
    except Exception as e:
        yield f"An unexpected error occurred: {str(e)}"
        if displayed_response:
            yield displayed_response


# --- Gradio UI ---

# This runs once when the script starts.
# In Docker, this will query the Ollama instance inside the container AFTER models are pulled by CMD.
AVAILABLE_MODELS = get_ollama_models()
QWEN_MODELS = [m for m in AVAILABLE_MODELS if "qwen" in m.lower()]
INITIAL_MODEL = None

# Prefer qwen3:4b, then any other Qwen model, then the first available model.
if "qwen3:4b" in AVAILABLE_MODELS:
    INITIAL_MODEL = "qwen3:4b"
elif QWEN_MODELS:
    INITIAL_MODEL = QWEN_MODELS[0]
elif AVAILABLE_MODELS:
    INITIAL_MODEL = AVAILABLE_MODELS[0]
# If no models, INITIAL_MODEL remains None, and dropdown will show "No models found..."

with gr.Blocks(title="Qwen3 x Ollama", theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
        <h1 style='text-align: center'>
        Qwen3 Reasoning with Ollama
        </h1>
    """
    )
    gr.HTML(
        """
        <h3 style='text-align: center'>
        <a href='https://opencv.org/university/' target='_blank'>OpenCV Courses</a> | <a href='https://github.com/OpenCV-University' target='_blank'>Github</a>
        </h3>
        """
    )
    gr.Markdown(
        """
        - Interact with a Qwen3 model hosted on Ollama.
        - Switch between `/think` and `/no_think` modes to explore the thinking process.
        - The response will stream with a simulated typing effect.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            model_selector = gr.Dropdown(
                label="Select Model",
                choices=AVAILABLE_MODELS if AVAILABLE_MODELS else ["No models found - check Ollama setup"],
                value=INITIAL_MODEL,
                interactive=True,
            )
            prompt_input = gr.Textbox(
                label="Enter your prompt",
                placeholder="e.g., Explain quantum entanglement in simple terms.",
                lines=5,
                elem_id="prompt-input",
            )
            mode_radio = gr.Radio(
                ["think", "no_think"],
                label="Reasoning Mode",
                value="think",
                info="`/think` encourages step-by-step reasoning. `/no_think` aims for a direct answer.",
            )
            with gr.Row():
                submit_button = gr.Button("Generate Response", variant="primary")
                clear_button = gr.ClearButton()

        with gr.Column(scale=2):
            status_output = gr.Textbox(
                label="Status",
                interactive=False,
                lines=1,
                placeholder="Awaiting submission...",
                elem_id="status-output",
            )
            response_output = gr.Textbox(
                label="Model Response", lines=20, interactive=False, show_copy_button=True, elem_id="response-output"
            )

    def handle_submit_wrapper(model, prompt, mode):
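        # Yielding dicts keyed by output components lets a single generator update both the
        # status box and the streamed response box.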
        yield {status_output: "Processing... Preparing to stream response.", response_output: ""}

        final_chunk = ""
        for chunk in reasoning_ollama_stream(model, prompt, mode):
            final_chunk = chunk
            yield {status_output: "Streaming response...", response_output: chunk}

        if "Error:" in final_chunk or "--- Ollama Error" in final_chunk:
            yield {status_output: "Completed with issues.", response_output: final_chunk}
        elif "Model returned an empty response." in final_chunk:
            yield {status_output: "Model returned an empty response.", response_output: final_chunk}
        elif not final_chunk.strip():
            yield {status_output: "Completed, but no substantive output received.", response_output: ""}
        else:
            yield {status_output: "Response generated successfully!", response_output: final_chunk}

    submit_button.click(
        fn=handle_submit_wrapper,
        inputs=[model_selector, prompt_input, mode_radio],
        outputs=[status_output, response_output],
    )
    clear_button.add([prompt_input, response_output, status_output])

    # Choose a model for the Examples section; if no models were detected, fall back to
    # "qwen3:4b", which may not actually be available at runtime.
    example_model_for_ui = INITIAL_MODEL
    if not example_model_for_ui and AVAILABLE_MODELS:
        example_model_for_ui = AVAILABLE_MODELS[0]
    elif not example_model_for_ui:  # Fallback if no models and INITIAL_MODEL is None
        example_model_for_ui = "qwen3:4b"  # Default example model

    gr.Examples(
        examples=[
            [example_model_for_ui, "What are the main pros and cons of using nuclear energy?", "think"],
            # For the second example, prefer a model other than qwen3:4b when one is available;
            # otherwise fall back to qwen3:1.7b.
            [
                (
                    example_model_for_ui
                    if example_model_for_ui != "qwen3:4b"
                    else (INITIAL_MODEL if INITIAL_MODEL and INITIAL_MODEL != "qwen3:4b" else "qwen3:1.7b")
                ),
                "Write a short poem about a rainy day.",
                "no_think",
            ],
            [example_model_for_ui, "Plan a 3-day trip to Paris, focusing on historical sites.", "think"],
        ],
        inputs=[model_selector, prompt_input, mode_radio],
        outputs=[status_output, response_output],
        fn=handle_submit_wrapper,
        cache_examples=False,  # could be True if the inputs were static and fn deterministic
    )
    gr.HTML(
        """
        <h3 style='text-align: center'>
        Developed with ❤️ by OpenCV
        </h3>
        """
    )

if __name__ == "__main__":
    print("--- Gradio App Starting ---")
    print(f"Attempting to fetch Ollama models (initial load)... Result: {AVAILABLE_MODELS}")
    print(f"Initial model for UI (if any): {INITIAL_MODEL}")
    print(f"Gradio version: {gr.__version__}")
    print("---------------------------")

    # For local Docker testing, server_name="0.0.0.0" is important.
    # For Hugging Face Spaces, demo.launch() is usually enough as it handles proxying.
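    # .queue() enables request queuing, which generator-based handlers use to stream partial outputs.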
    demo.queue().launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,  # Set to True if you need a public link for local testing (requires internet)
        # share=os.getenv("GRADIO_SHARE", "False").lower() == "true" # If using env var for share
    )