import gradio as gr
from transformers import LlamaTokenizer, AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread

# Load DeepSeek LLM
model_name = "deepseek-ai/deepseek-llm-7b-chat"
# llm = ChatOllama(model="deepseek-r1:1.5b", temperature=0)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# tokenizer = LlamaTokenizer.from_pretrained(model_name)  # Explicitly use LlamaTokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    torch_dtype=torch.float16, 
    device_map="auto", 
    offload_folder="offload_weights"
)
'''
ValueError: The current device_map had weights offloaded to the disk. Please provide an offload_folder
for them. Alternatively, make sure you have safetensors installed if the model you are using offers the
weights in this format.
'''
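# Note (assumption based on the error text above): installing safetensors
# (`pip install safetensors`) is an alternative to providing offload_folder
# when the checkpoint is published in the .safetensors format.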

SYSTEM_PROMPT = "You are a helpful AI assistant. Keep responses concise and informative."


import wikipediaapi

# wiki_wiki = wikipediaapi.Wikipedia('en')

# Specify a valid user-agent string
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent='Chatbot/1.0 ([email protected])'
)

def fetch_wikipedia(query):
    """Fetch a summary from Wikipedia"""
    page = wiki_wiki.page(query)
    if page.exists():
        return page.summary[:500]  # Limit to 500 chars
    return "I couldn't find relevant Wikipedia information on that topic."

def generate_response(message, history):
    """Stream a reply; history is a list of (user_message, bot_message) pairs."""
    history = history or []

    # Check if the user asks for factual info from Wikipedia
    if "wikipedia" in message.lower():
        query = message.lower().replace("wikipedia", "").strip()
        wiki_info = fetch_wikipedia(query)
        history.append((message, wiki_info))
        yield history, ""
        return

    # Default chatbot behavior: build the prompt from the last 5 exchanges
    chat_history = ""
    for user, bot in history[-5:]:
        chat_history += f"User: {user}\nBot: {bot}\n"

    input_text = f"[SYSTEM] {SYSTEM_PROMPT}\n{chat_history}User: {message}\nBot:"
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # TextIteratorStreamer exposes decoded tokens as an iterator; generate() blocks,
    # so it runs in a background thread while the main thread consumes the stream.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=512,
                             pad_token_id=tokenizer.eos_token_id)
    Thread(target=model.generate, kwargs=generation_kwargs).start()

    bot_message = ""
    for token in streamer:
        bot_message += token
        yield history + [(message, bot_message)], ""

    history.append((message, bot_message))
    yield history, ""


# # Function to handle chat with memory
# def generate_response(message, history):
#     history = history or []
#     history.append(("User", message))

#     # Format history for the model
#     chat_history = ""
#     for user, bot in history[-5:]:  # Limit history to last 5 exchanges to avoid exceeding token limit
#         chat_history += f"User: {user}\nBot: {bot}\n"

#     input_text = f"[SYSTEM] {SYSTEM_PROMPT}\n{chat_history}User: {message}\nBot:"
#     inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

#     streamer = tokenizer.streamer()
#     model.generate(**inputs, streamer=streamer, max_length=512, pad_token_id=tokenizer.eos_token_id)

#     bot_message = ""
#     for token in streamer:
#         bot_message += token
#         yield bot_message  

#     history.append(("Bot", bot_message))
#     return history, ""

# def generate_response(message, history):
#     history = history or []
#     history.append(("User", message))
    
#     # Add system message for better guidance
#     input_text = f"[SYSTEM] {SYSTEM_PROMPT}\n" + tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True)
#     inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

#     streamer = tokenizer.streamer()
#     model.generate(**inputs, streamer=streamer, max_length=512, pad_token_id=tokenizer.eos_token_id)
    
#     bot_message = ""
#     for token in streamer:
#         bot_message += token
#         yield bot_message  


with gr.Blocks() as demo:
    gr.Markdown("# πŸš€ DeepSeek LLM Chatbot with Memory & Wikipedia API")
    
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Ask me anything...", label="Your Message")
    clear_btn = gr.Button("Clear Chat")
    
    def respond(message, history):
        # generate_response is a generator; forward each partial (history, "") update
        yield from generate_response(message, history)

    msg.submit(respond, inputs=[msg, chatbot], outputs=[chatbot, msg])
    clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])
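    # Gradio treats generator callbacks as streaming handlers: each yielded
    # (history, "") pair refreshes the Chatbot and clears the textbox as tokens arrive.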



# # Create Gradio Chatbot UI with streaming
# with gr.Blocks() as demo:
#     gr.Markdown("### πŸš€ DeepSeek LLM Chatbot (Streaming & Improved UI)")
    
#     chatbot = gr.Chatbot()
#     msg = gr.Textbox(placeholder="Type your message here...", label="Your Message")
#     clear_btn = gr.Button("Clear Chat")
    
#     def respond(message, history):
#         history = history or []
#         bot_response = generate_response(message, history)
#         return bot_response, history + [("User", message), ("Bot", bot_response)]
    
#     msg.submit(respond, inputs=[msg, chatbot], outputs=[chatbot, msg])
#     clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])


demo.launch()






# with gr.Blocks() as demo:
#     gr.Markdown("### πŸš€ DeepSeek LLM Chatbot (Streaming Enabled)")
#     chat = gr.ChatInterface(fn=generate_response)
    
# demo.launch()

'''
✅ Uses streaming (TextIteratorStreamer)
✅ Returns tokens in real time instead of waiting for the full response
✅ Improved UI with gr.Blocks()

✅ System prompt ensures responses are concise & helpful
✅ Chat history is structured more clearly

✅ Retains chat history
✅ "Clear Chat" button
✅ Better UI layout with Markdown & structured input boxes
'''