# app.py
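# Streamlit demo: streams Qwen3-1.7B "thinking mode" output and reads the final
# answer aloud with gTTS. Run locally with `streamlit run app.py` (assumes the
# streamlit, transformers, torch, and gTTS packages are installed).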
import streamlit as st
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from gtts import gTTS
import os
import time
import torch
from threading import Thread
# Initialize models
@st.cache_resource
def load_models():
    model_name = "Qwen/Qwen3-1.7B"
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="auto",
        trust_remote_code=True
    )
    return model, tokenizer
def parse_thinking_output(output_ids, tokenizer, thinking_token_id=151668):
    # Split the generated ids at the last </think> token (id 151668 in the
    # Qwen3 tokenizer); everything before it is the thinking trace.
    try:
        index = len(output_ids) - output_ids[::-1].index(thinking_token_id)
    except ValueError:
        # No </think> token found yet: treat the whole output as the answer.
        index = 0
    thinking = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
    content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
    return thinking, content
def generate_response(prompt, model, tokenizer, max_new_tokens=1024, temperature=0.7):
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=True  # ask Qwen3 to emit a <think>...</think> trace
    )
    # skip_prompt=True so only newly generated text is streamed back
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        do_sample=True
    )
    # Run generation in a background thread so the streamer can be consumed here
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    full_response = ""
    for new_text in streamer:
        full_response += new_text
        try:
            # Re-encode the accumulated text (as a plain list of ids, not a
            # tensor) and split it at the </think> token.
            current_ids = tokenizer.encode(full_response)
            thinking, content = parse_thinking_output(current_ids, tokenizer)
            yield thinking, content
        except Exception:
            yield "", full_response
def text_to_speech(text):
    tts = gTTS(text=text, lang='en', slow=False)
    audio_file = f"audio_{int(time.time())}.mp3"
    tts.save(audio_file)
    return audio_file
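# Note: generated audio files (audio_<timestamp>.mp3) accumulate in the working
# directory; a cleanup step (e.g. os.remove after playback) may be worth adding.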
# Streamlit UI
def main():
    st.title("🧠 Qwen3-1.7B Thinking Mode Demo")
    model, tokenizer = load_models()

    with st.sidebar:
        st.header("Settings")
        max_length = st.slider("Max Tokens", 100, 4096, 1024)
        temperature = st.slider("Temperature", 0.1, 1.0, 0.7)

    prompt = st.text_area("Enter your prompt:",
                          "Explain quantum computing in simple terms")

    if st.button("Generate Response"):
        with st.spinner("Generating response..."):
            # Setup containers for the streamed thinking trace, answer, and audio
            thinking_container = st.container(border=True)
            response_container = st.empty()
            audio_container = st.empty()

            full_content = ""
            current_thinking = ""
            for thinking, content in generate_response(
                prompt, model, tokenizer,
                max_new_tokens=max_length,
                temperature=temperature
            ):
                if thinking != current_thinking:
                    thinking_container.markdown(f"**Thinking Process:**\n{thinking}")
                    current_thinking = thinking
                if content != full_content:
                    response_container.markdown(f"**Final Answer:**\n{content}")
                    full_content = content

            # Add audio version of the final answer
            if full_content:
                audio_file = text_to_speech(full_content)
                audio_container.audio(audio_file, format='audio/mp3')

            # Add download button
            st.download_button(
                label="Download Response",
                data=full_content,
                file_name="qwen_response.txt",
                mime="text/plain"
            )

if __name__ == "__main__":
    main()