# app.py
import streamlit as st
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from gtts import gTTS
import os
import time
import torch
from threading import Thread
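
# Streamlit demo: stream Qwen3-1.7B with thinking mode enabled, split the
# reasoning trace from the final answer, and read the answer aloud via gTTS.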

# Initialize models
@st.cache_resource
def load_models():
    model_name = "Qwen/Qwen3-1.7B"
    
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True
    )
    
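    # torch_dtype="auto" uses the dtype recorded in the checkpoint config and
    # device_map="auto" places weights across available devices (this path
    # requires the `accelerate` package).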
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="auto",
        trust_remote_code=True
    )
    
    return model, tokenizer

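# Qwen3 wraps its reasoning in <think>...</think>; token id 151668 is the
# </think> marker in the Qwen3 tokenizer. Everything before its last
# occurrence is the thinking trace, everything after is the final answer.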
def parse_thinking_output(output_ids, tokenizer, thinking_token_id=151668):
    try:
        index = len(output_ids) - output_ids[::-1].index(thinking_token_id)
    except ValueError:
        index = 0
        
    thinking = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
    content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
    return thinking, content

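# model.generate blocks until generation finishes, so it runs on a worker
# thread while TextIteratorStreamer hands decoded text chunks to this
# generator as they arrive.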
def generate_response(prompt, model, tokenizer, max_new_tokens=1024, temperature=0.7):
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=True
    )
    
    # skip_prompt=True keeps the formatted chat prompt out of the streamed text
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        do_sample=True
    )
    
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    
    full_response = ""
    for new_text in streamer:
        full_response += new_text
        try:
            # Re-encode the accumulated text so the </think> boundary is visible
            current_ids = tokenizer.encode(full_response, return_tensors="pt")[0]
            thinking, content = parse_thinking_output(current_ids, tokenizer)
            yield thinking, content
        except Exception:
            yield "", full_response
    thread.join()

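# gTTS synthesizes speech through Google Translate's TTS endpoint, so this
# step needs outbound network access.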
def text_to_speech(text):
    tts = gTTS(text=text, lang='en', slow=False)
    audio_file = f"audio_{int(time.time())}.mp3"
    tts.save(audio_file)
    return audio_file

# Streamlit UI
def main():
    st.title("🧠 Qwen3-1.7B Thinking Mode Demo")
    
    model, tokenizer = load_models()
    
    with st.sidebar:
        st.header("Settings")
        max_length = st.slider("Max Tokens", 100, 4096, 1024)
        temperature = st.slider("Temperature", 0.1, 1.0, 0.7)
    
    prompt = st.text_area("Enter your prompt:", 
                         "Explain quantum computing in simple terms")
    
    if st.button("Generate Response"):
        with st.spinner("Generating response..."):
            # Placeholders that can be overwritten in place as tokens stream in
            thinking_box = st.container(border=True)
            thinking_placeholder = thinking_box.empty()
            response_container = st.empty()
            audio_container = st.empty()
            
            full_content = ""
            current_thinking = ""
            
            for thinking, content in generate_response(
                prompt, model, tokenizer, max_length, temperature
            ):
                if thinking != current_thinking:
                    thinking_placeholder.markdown(f"**Thinking Process:**\n{thinking}")
                    current_thinking = thinking
                
                if content != full_content:
                    response_container.markdown(f"**Final Answer:**\n{content}")
                    full_content = content
            
            # Add an audio version of the final answer
            if full_content:
                audio_file = text_to_speech(full_content)
                with open(audio_file, "rb") as f:
                    audio_container.audio(f.read(), format="audio/mp3")
                os.remove(audio_file)  # clean up the temporary mp3
            
            # Add download button
            st.download_button(
                label="Download Response",
                data=full_content,
                file_name="qwen_response.txt",
                mime="text/plain"
            )

if __name__ == "__main__":
    main()