import os
from collections.abc import Iterator
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer

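# Token budgets: MAX_INPUT_TOKEN_LENGTH caps the prompt (overridable via the
# MAX_INPUT_TOKEN_LENGTH env var) so that prompt plus completion stays within
# the model's context window.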
MAX_MAX_NEW_TOKENS = 8000
DEFAULT_MAX_NEW_TOKENS = 4000
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

DESCRIPTION = """\
# Philosophy Chat with Llama 3.1

This Space showcases ruggsea's Llama3.1-Instruct-SEP-Chat model, an instruction-tuned version of Meta's Llama 3.1 8B model tailored for philosophical discussion in a formal, informative tone. The model was fine-tuned on the Stanford Encyclopedia of Philosophy dataset with carefully crafted prompts.

Feel free to engage in philosophical discussions and ask questions. The model supports multi-turn conversations and will maintain context.
"""

LICENSE = """
<p/>

---
As a derivative work of Llama 3.1, this demo is governed by the original Meta license and acceptable use policy.
"""

if torch.cuda.is_available():
    model_id = "ruggsea/Llama3.1-Instruct-SEP-Chat"
    # 4-bit quantization (bitsandbytes) lets the 8B model fit on a single GPU;
    # the explicit BitsAndBytesConfig replaces the deprecated load_in_4bit kwarg.
    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=quantization_config)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.use_default_system_prompt = False
else:
    DESCRIPTION += "\n<p>Running on CPU. This demo requires a GPU to load the model.</p>"

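# On Hugging Face Spaces, the @spaces.GPU decorator requests GPU hardware
# (e.g. ZeroGPU) for the duration of each call to this function.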
@spaces.GPU
def generate(
    message: str,
    chat_history: list[dict],
    system_prompt: str = "",
    max_new_tokens: int = 4000,
    temperature: float = 0.7,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.1,
) -> Iterator[str]:
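    # Build the conversation in the "messages" format expected by
    # apply_chat_template: optional system prompt, prior turns, then the new
    # user message.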
    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    conversation += chat_history
    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)

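    # Generation runs on a worker thread; TextIteratorStreamer yields decoded
    # text chunks as they are produced so partial output can be streamed to the UI.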
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        repetition_penalty=repetition_penalty,
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

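    # Accumulate the chunks and yield the full text so far, so Gradio replaces
    # the assistant message in place instead of appending fragments.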
    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)

chat_interface = gr.ChatInterface(
    fn=generate,
    type="messages",  # history arrives as a list of {"role": ..., "content": ...} dicts, matching generate()
    additional_inputs=[
        gr.Textbox(
            label="System prompt", 
            lines=6,
            value="You are a knowledgeable philosophy professor using the Stanford Encyclopedia of Philosophy as your knowledge base. Provide clear, accurate responses using markdown formatting. Focus on philosophical concepts and maintain academic rigor while being accessible. Always cite relevant philosophers and concepts."
        ),
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            step=1,
            value=DEFAULT_MAX_NEW_TOKENS,
        ),
        gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=4.0,
            step=0.1,
            value=0.7,
        ),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=0.9,
        ),
        gr.Slider(
            label="Top-k",
            minimum=1,
            maximum=1000,
            step=1,
            value=50,
        ),
        gr.Slider(
            label="Repetition penalty",
            minimum=1.0,
            maximum=2.0,
            step=0.05,
            value=1.1,
        ),
    ],
    stop_btn=None,
    examples=[
        ["What is the trolley problem and what are its main ethical implications?"],
        ["Can you explain Plato's Theory of Forms?"],
        ["What is the difference between analytic and continental philosophy?"],
        ["How does Kant's Categorical Imperative work?"],
        ["What is the problem of consciousness in philosophy of mind?"],
    ],
    cache_examples=False,
)

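# Page layout: description header, a duplicate-Space button, the chat
# interface, and the Meta license notice.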
with gr.Blocks(css="style.css", fill_height=True) as demo:
    gr.Markdown(DESCRIPTION)
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_id="duplicate-button"
    )
    chat_interface.render()
    gr.Markdown(LICENSE)

if __name__ == "__main__":
    demo.queue(max_size=20).launch()