import gradio as gr
from transformers import LlamaTokenizer, AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread
import torch

# Load DeepSeek LLM
model_name = "deepseek-ai/deepseek-llm-7b-chat"
# llm = ChatOllama(model="deepseek-r1:1.5b", temperature=0)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# tokenizer = LlamaTokenizer.from_pretrained(model_name)  # Explicitly use LlamaTokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    offload_folder="offload_weights",  # required when device_map offloads weights to disk
)
'''
Without offload_folder, loading can fail with:
ValueError: The current device_map had weights offloaded to the disk. Please provide
an offload_folder for them. Alternatively, make sure you have safetensors installed
if the model you are using offers the weights in this format.
'''

SYSTEM_PROMPT = "You are a helpful AI assistant. Keep responses concise and informative."

import wikipediaapi

# wiki_wiki = wikipediaapi.Wikipedia('en')
# Specify a valid user-agent string
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent='Chatbot/1.0 (rydjorker121@yahoo.com)'
)


def fetch_wikipedia(query):
    """Fetch a short summary from Wikipedia."""
    page = wiki_wiki.page(query)
    if page.exists():
        return page.summary[:500]  # Limit to 500 chars
    return "I couldn't find relevant Wikipedia information on that topic."


def generate_response(message, history):
    """Yield the bot reply incrementally. `history` is a list of (user, bot) pairs."""
    history = history or []

    # If the user asks for factual info, answer from Wikipedia instead of the model
    if "wikipedia" in message.lower():
        query = message.lower().replace("wikipedia", "").strip()
        yield fetch_wikipedia(query)
        return

    # Default chatbot behavior: keep the last 5 exchanges to avoid exceeding the token limit
    chat_history = ""
    for user, bot in history[-5:]:
        chat_history += f"User: {user}\nBot: {bot}\n"
    input_text = f"[SYSTEM] {SYSTEM_PROMPT}\n{chat_history}User: {message}\nBot:"
    # Alternative prompt construction using the model's chat template:
    # input_text = f"[SYSTEM] {SYSTEM_PROMPT}\n" + tokenizer.apply_chat_template(
    #     history, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Stream tokens as they are generated; generate() blocks, so run it in a background thread
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=512,
        pad_token_id=tokenizer.eos_token_id,
    )
    Thread(target=model.generate, kwargs=generation_kwargs).start()

    bot_message = ""
    for token in streamer:
        bot_message += token
        yield bot_message

with gr.Blocks() as demo:
    gr.Markdown("# 🚀 DeepSeek LLM Chatbot with Memory & Wikipedia API")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Ask me anything...", label="Your Message")
    clear_btn = gr.Button("Clear Chat")

    def respond(message, history):
        history = history or []
        # Stream partial replies into the chat window; clear the textbox on each update
        for partial in generate_response(message, history):
            yield history + [(message, partial)], ""

    msg.submit(respond, inputs=[msg, chatbot], outputs=[chatbot, msg])
    clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])

demo.launch()

# Alternative minimal UI:
# with gr.Blocks() as demo:
#     gr.Markdown("### 🚀 DeepSeek LLM Chatbot (Streaming Enabled)")
#     chat = gr.ChatInterface(fn=generate_response)
# demo.launch()

'''
✅ Uses streaming (TextIteratorStreamer + a background generate() thread)
✅ Returns tokens in real-time instead of waiting for full response
✅ Improved UI with gr.Blocks()
✅ System prompt ensures responses are concise & helpful
✅ Chat history is structured more clearly
✅ Retains chat history
✅ "Clear Chat" button
✅ Better UI layout with Markdown & structured input boxes
'''
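
# Note on the ValueError documented above: instead of offloading weights to disk, the
# model can be loaded in 4-bit so it fits in GPU memory. This is a minimal sketch, not
# part of the original script, and assumes the optional `bitsandbytes` package is installed:
#
# from transformers import BitsAndBytesConfig
#
# quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=quant_config,
#     device_map="auto",
# )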