import threading

import gradio as gr
import torch
import wikipediaapi
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
# Load DeepSeek LLM
model_name = "deepseek-ai/deepseek-llm-7b-chat"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    offload_folder="offload_weights",
)
# Note: offload_folder is required here because device_map="auto" may offload some
# weights to disk. Without it, loading fails with:
# "ValueError: The current device_map had weights offloaded to the disk. Please
#  provide an offload_folder for them. Alternatively, make sure you have safetensors
#  installed if the model you are using offers the weights in this format."
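# Alternative (untested sketch, not part of the original app): if the model does not
# fit in GPU memory, 4-bit quantization can avoid disk offload entirely. This assumes
# the optional `bitsandbytes` package is installed and a CUDA GPU is available.
# from transformers import BitsAndBytesConfig
# bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
#     device_map="auto",
# )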
SYSTEM_PROMPT = "You are a helpful AI assistant. Keep responses concise and informative."
# Wikipedia client; the wikipediaapi library requires a descriptive user-agent string
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent='Chatbot/1.0 ([email protected])'
)
def fetch_wikipedia(query):
    """Fetch a summary from Wikipedia."""
    page = wiki_wiki.page(query)
    if page.exists():
        return page.summary[:500]  # Limit to 500 chars
    return "I couldn't find relevant Wikipedia information on that topic."
def generate_response(message, history):
    history = history or []

    # Wikipedia lookup: if the user mentions "wikipedia", answer from the API instead of the model
    if "wikipedia" in message.lower():
        query = message.lower().replace("wikipedia", "").strip()
        wiki_info = fetch_wikipedia(query)
        history.append((message, wiki_info))
        yield history, ""
        return

    # Default chatbot behavior: rebuild a prompt from the last 5 exchanges
    chat_history = ""
    for user, bot in history[-5:]:
        chat_history += f"User: {user}\nBot: {bot}\n"
    input_text = f"[SYSTEM] {SYSTEM_PROMPT}\n{chat_history}User: {message}\nBot:"
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Stream tokens as they are generated; generation runs in a background thread
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=512,
        pad_token_id=tokenizer.eos_token_id,
    )
    threading.Thread(target=model.generate, kwargs=generation_kwargs).start()

    bot_message = ""
    history.append((message, ""))
    for token in streamer:
        bot_message += token
        history[-1] = (message, bot_message)
        yield history, ""
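# For reference, the prompt assembled above looks like this (illustrative example):
#   [SYSTEM] You are a helpful AI assistant. Keep responses concise and informative.
#   User: <previous user message>
#   Bot: <previous bot reply>
#   User: <current message>
#   Bot: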
with gr.Blocks() as demo:
    gr.Markdown("# 🚀 DeepSeek LLM Chatbot with Memory & Wikipedia API")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Ask me anything...", label="Your Message")
    clear_btn = gr.Button("Clear Chat")
    def respond(message, history):
        # generate_response is a generator, so relay its streaming updates to the UI
        yield from generate_response(message, history)
    msg.submit(respond, inputs=[msg, chatbot], outputs=[chatbot, msg])
    clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])
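# Note (assumption about the Gradio version in use): on Gradio 3.x, generator-based
# streaming requires enabling the queue, e.g. `demo.queue().launch()`; on Gradio 4.x
# the queue is on by default, so a plain launch() is sufficient.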
demo.launch()
'''
✅ Streams tokens in real time via transformers' TextIteratorStreamer
✅ Improved UI with gr.Blocks()
✅ System prompt keeps responses concise and helpful
✅ Chat history is structured clearly and retained (last 5 exchanges are fed back to the model)
✅ Wikipedia lookups when a message mentions "wikipedia"
✅ "Clear Chat" button
✅ Better UI layout with Markdown and labeled input boxes
'''