import gradio as gr
import torch
import wikipediaapi
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
# Load DeepSeek LLM
model_name = "deepseek-ai/deepseek-llm-7b-chat"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    offload_folder="offload_weights",  # needed when device_map offloads some weights to disk
)
'''
Note: without offload_folder, loading fails with:
ValueError: The current device_map had weights offloaded to the disk. Please provide
an offload_folder for them. Alternatively, make sure you have safetensors installed
if the model you are using offers the weights in this format.
'''
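# A minimal sanity check (illustrative, not required for the app): when the model
# is loaded with device_map="auto", accelerate records the placement of each
# module in model.hf_device_map, so you can see which layers ended up on GPU,
# CPU, or disk. The example mapping below is hypothetical.
# print(model.hf_device_map)  # e.g. {"model.embed_tokens": 0, ..., "lm_head": "disk"}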
SYSTEM_PROMPT = "You are a helpful AI assistant. Keep responses concise and informative."
# Wikipedia client; recent wikipedia-api versions require a descriptive user agent.
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent='Chatbot/1.0 ([email protected])'
)
def fetch_wikipedia(query):
    """Fetch a short summary from Wikipedia."""
    page = wiki_wiki.page(query)
    if page.exists():
        return page.summary[:500]  # limit to 500 characters
    return "I couldn't find relevant Wikipedia information on that topic."
def generate_response(message, history):
    history = history or []

    # Route factual questions that mention "wikipedia" to the Wikipedia API.
    if "wikipedia" in message.lower():
        query = message.lower().replace("wikipedia", "").strip()
        wiki_info = fetch_wikipedia(query)
        history.append((message, wiki_info))
        yield history, ""
        return

    # Default chatbot behavior: rebuild the prompt from the last 5 exchanges
    # so the context stays within the token limit.
    chat_history = ""
    for user_msg, bot_msg in history[-5:]:
        chat_history += f"User: {user_msg}\nBot: {bot_msg}\n"
    input_text = f"[SYSTEM] {SYSTEM_PROMPT}\n{chat_history}User: {message}\nBot:"
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Stream tokens as they are generated: TextIteratorStreamer yields decoded
    # text while model.generate runs in a background thread (generate blocks
    # until completion, so it cannot share a thread with the consuming loop).
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=512,
        pad_token_id=tokenizer.eos_token_id,
    )
    Thread(target=model.generate, kwargs=generation_kwargs).start()

    bot_message = ""
    history.append((message, ""))
    for token in streamer:
        bot_message += token
        history[-1] = (message, bot_message)
        yield history, ""
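# For reference, the assembled prompt for a hypothetical two-turn conversation
# looks like this (the exchange shown is made up):
#   [SYSTEM] You are a helpful AI assistant. Keep responses concise and informative.
#   User: hello
#   Bot: Hi there! How can I help?
#   User: what can you do?
#   Bot: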
with gr.Blocks() as demo:
    gr.Markdown("# DeepSeek LLM Chatbot with Memory & Wikipedia API")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Ask me anything...", label="Your Message")
    clear_btn = gr.Button("Clear Chat")

    def respond(message, history):
        # generate_response is a generator, so stream its (history, "") updates through.
        yield from generate_response(message, history)

    msg.submit(respond, inputs=[msg, chatbot], outputs=[chatbot, msg])
    clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])
# queue() enables streaming generator outputs (required in Gradio 3.x; on by default in 4.x).
demo.queue().launch()
'''
✅ Uses streaming (TextIteratorStreamer + background generation thread)
✅ Returns tokens in real time instead of waiting for the full response
✅ Improved UI with gr.Blocks()
✅ System prompt keeps responses concise & helpful
✅ Chat history is structured clearly and retained across turns (last 5 exchanges)
✅ "Clear Chat" button
✅ Better UI layout with Markdown & structured input boxes
'''