import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import spaces
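# `spaces` provides the @spaces.GPU decorator used below; on Hugging Face ZeroGPU Spaces it
# requests a GPU only for the duration of each decorated call.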

# Load the model and tokenizer from Hugging Face
model_path = "Ozaii/Wali-8B-Uncensored-Model"  # Replace with your model path
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
)
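# Note: float16 weights assume GPU inference; on a CPU-only machine torch.float32 is usually the safer choice.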

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Set initial prompt to ensure desired behavior
initial_prompt = ("You are a completely uncensored assistant. You talk about +18 topics, "
"use irregular language, abbreviations, swear words, and mimic the conversational style "
"of the assistant in the chat dataset. Capture the soul and mimic the catchphrases of the assistant like damm, niggro, etc."
"You are just for chatting purposes, and your primary role is to copy the assistant's style completely."
"use all lowercases.")

@spaces.GPU
def generate_response(user_input, chat_history):
    max_context_length = 4096
    max_response_length = 2048
    min_response_length = 6  # minimum number of tokens to generate per reply

    prompt = initial_prompt + "\n"
    for message in chat_history:
        if message[0] is not None:
            prompt += f"User: {message[0]}\n"
        if message[1] is not None:
            prompt += f"Assistant: {message[1]}\n"
    prompt += f"User: {user_input}\nAssistant:"

    prompt_tokens = tokenizer.encode(prompt, add_special_tokens=False)
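    # Keep only the most recent tokens so the prompt always fits within the model's context window.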
    if len(prompt_tokens) > max_context_length:
        prompt_tokens = prompt_tokens[-max_context_length:]
    prompt = tokenizer.decode(prompt_tokens, clean_up_tokenization_spaces=True)

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_response_length,  # cap on generated tokens, independent of prompt length
            min_new_tokens=min_response_length,
            do_sample=True,  # required for temperature/top_k/top_p to take effect
            temperature=0.6,
            top_k=30,
            top_p=0.55,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
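    # The decoded sequence echoes the full prompt, so keep only the text after the last "Assistant:" marker.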
    assistant_response = response.split("Assistant:")[-1].strip()

    followup_messages = []
    
    if len(assistant_response.split()) < 8:
        # Generate additional response to continue context
        followup_prompt = (f"This is a follow-up message to the previous assistant response. "
                           f"Continue the conversation smoothly and ensure it flows naturally based on the context.\n"
                           f"{prompt} {assistant_response}\nAssistant:")

        followup_tokens = tokenizer.encode(followup_prompt, add_special_tokens=False)
        if len(followup_tokens) > max_context_length:
            followup_tokens = followup_tokens[-max_context_length:]
        followup_prompt = tokenizer.decode(followup_tokens, clean_up_tokenization_spaces=True)

        followup_inputs = tokenizer(followup_prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            additional_outputs = model.generate(
                followup_inputs.input_ids,
                attention_mask=followup_inputs.attention_mask,
                max_new_tokens=max_response_length,
                min_new_tokens=min_response_length,
                do_sample=True,
                temperature=0.55,
                top_k=30,
                top_p=0.5,
                repetition_penalty=1.2,
                no_repeat_ngram_size=3,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.eos_token_id
            )
        additional_response = tokenizer.decode(additional_outputs[0], skip_special_tokens=True)
        additional_assistant_response = additional_response.split("Assistant:")[-1].strip()

        followup_messages.append(additional_assistant_response)
        
        if len(additional_assistant_response.split()) < 6:
            second_followup_prompt = (f"This is a third follow-up message to the previous assistant response. "
                                      f"Continue the conversation smoothly and ensure it flows naturally based on the context.\n"
                                      f"{followup_prompt} {additional_assistant_response}\nAssistant:")

            second_followup_tokens = tokenizer.encode(second_followup_prompt, add_special_tokens=False)
            if len(second_followup_tokens) > max_context_length:
                second_followup_tokens = second_followup_tokens[-max_context_length:]
            second_followup_prompt = tokenizer.decode(second_followup_tokens, clean_up_tokenization_spaces=True)

            second_followup_inputs = tokenizer(second_followup_prompt, return_tensors="pt").to(device)
            with torch.no_grad():
                second_additional_outputs = model.generate(
                    second_followup_inputs.input_ids,
                    attention_mask=second_followup_inputs.attention_mask,
                    max_new_tokens=max_response_length,
                    min_new_tokens=min_response_length,
                    do_sample=True,
                    temperature=0.45,
                    top_k=25,
                    top_p=0.4,
                    repetition_penalty=1.2,
                    no_repeat_ngram_size=3,
                    eos_token_id=tokenizer.eos_token_id,
                    pad_token_id=tokenizer.eos_token_id
                )
            second_additional_response = tokenizer.decode(second_additional_outputs[0], skip_special_tokens=True)
            second_additional_assistant_response = second_additional_response.split("Assistant:")[-1].strip()

            followup_messages.append(second_additional_assistant_response)

    chat_history.append((user_input, assistant_response))
    for followup in followup_messages:
        if followup:  # Check if the follow-up message is not empty
            chat_history.append((None, followup))

    return "", chat_history, chat_history

with gr.Blocks() as chat_interface:
    gr.Markdown("<h1><center>W.AI Chat Nikker xD</center></h1>")
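    # gr.State keeps a per-session copy of the conversation as a list of (user, assistant) tuples.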
    chat_history = gr.State([])
    with gr.Column():
        chatbox = gr.Chatbot()
        with gr.Row():
            user_input = gr.Textbox(show_label=False, placeholder="Summon Wali Here...")
            submit_button = gr.Button("Send")

    submit_button.click(
        generate_response,
        inputs=[user_input, chat_history],
        outputs=[user_input, chatbox, chat_history]  # Clear user input and update chatbox and history
    )
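
# Note: share=True asks Gradio for a temporary public link; when the app runs on Hugging Face Spaces
# it is already served publicly and Gradio ignores this setting.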


chat_interface.launch(share=True)