import gradio as gr
from transformers import LlamaTokenizer, AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread
import torch

# Load DeepSeek LLM
model_name = "deepseek-ai/deepseek-llm-7b-chat"
# llm = ChatOllama(model="deepseek-r1:1.5b", temperature=0)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# tokenizer = LlamaTokenizer.from_pretrained(model_name)  # Explicitly use LlamaTokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    offload_folder="offload_weights",  # required when device_map offloads weights to disk
)
'''
Without offload_folder, loading can fail with:
ValueError: The current device_map had weights offloaded to the disk. Please provide
an offload_folder for them. Alternatively, make sure you have safetensors installed
if the model you are using offers the weights in this format.
'''

SYSTEM_PROMPT = "You are a helpful AI assistant. Keep responses concise and informative."

import wikipediaapi

# wiki_wiki = wikipediaapi.Wikipedia('en')
# Specify a valid user-agent string
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent='Chatbot/1.0 (rydjorker121@yahoo.com)'
)


def fetch_wikipedia(query):
    """Fetch a short summary from Wikipedia."""
    page = wiki_wiki.page(query)
    if page.exists():
        return page.summary[:500]  # Limit to 500 chars
    return "I couldn't find relevant Wikipedia information on that topic."


def generate_response(message, history):
    """Yield the bot reply incrementally. `history` is a list of (user, bot) pairs."""
    history = history or []

    # If the user asks for factual info, answer from Wikipedia instead of the model
    if "wikipedia" in message.lower():
        query = message.lower().replace("wikipedia", "").strip()
        yield fetch_wikipedia(query)
        return

    # Default chatbot behavior: keep the last 5 exchanges to avoid exceeding the token limit
    chat_history = ""
    for user, bot in history[-5:]:
        chat_history += f"User: {user}\nBot: {bot}\n"
    input_text = f"[SYSTEM] {SYSTEM_PROMPT}\n{chat_history}User: {message}\nBot:"
    # Alternative prompt construction using the model's chat template:
    # input_text = f"[SYSTEM] {SYSTEM_PROMPT}\n" + tokenizer.apply_chat_template(
    #     history, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Stream tokens as they are generated; generate() blocks, so run it in a background thread
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=512,
        pad_token_id=tokenizer.eos_token_id,
    )
    Thread(target=model.generate, kwargs=generation_kwargs).start()

    bot_message = ""
    for token in streamer:
        bot_message += token
        yield bot_message

with gr.Blocks() as demo:
    gr.Markdown("# 🚀 DeepSeek LLM Chatbot with Memory & Wikipedia API")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Ask me anything...", label="Your Message")
    clear_btn = gr.Button("Clear Chat")

    def respond(message, history):
        history = history or []
        # Stream partial replies into the chat window; clear the textbox on each update
        for partial in generate_response(message, history):
            yield history + [(message, partial)], ""

    msg.submit(respond, inputs=[msg, chatbot], outputs=[chatbot, msg])
    clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])

demo.launch()

# Alternative minimal UI:
# with gr.Blocks() as demo:
#     gr.Markdown("### 🚀 DeepSeek LLM Chatbot (Streaming Enabled)")
#     chat = gr.ChatInterface(fn=generate_response)
# demo.launch()

'''
✅ Uses streaming (TextIteratorStreamer + a background generate() thread)
✅ Returns tokens in real-time instead of waiting for full response
✅ Improved UI with gr.Blocks()
✅ System prompt ensures responses are concise & helpful
✅ Chat history is structured more clearly
✅ Retains chat history
✅ "Clear Chat" button
✅ Better UI layout with Markdown & structured input boxes
'''
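
# Note on the ValueError documented above: instead of offloading weights to disk, the
# model can be loaded in 4-bit so it fits in GPU memory. This is a minimal sketch, not
# part of the original script, and assumes the optional `bitsandbytes` package is installed:
#
# from transformers import BitsAndBytesConfig
#
# quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=quant_config,
#     device_map="auto",
# )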