from collections.abc import Iterator
from threading import Thread
import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Markdown shown at the top of the demo page: the title, links to the model
# list / blog / Twitter, and a short Japanese introduction to the chatbot.
# NOTE(review): the line that starts with "[からまる v1]" stops right after the
# Sakana AI link and reads like an unfinished sentence -- confirm the intended
# wording with the authors before editing this user-facing text.
DESCRIPTION = """\
# 🐟 Karamaru v1
🤗 [モデル一覧](https://huggingface.co/SakanaAI) | 📝 [ブログ](https://sakana.ai/karamaru/) | 🐦 [Twitter](https://twitter.com/SakanaAILabs)  
[からまる v1](https://huggingface.co/SakanaAI/Llama-3-Karamaru-v1)は[Sakana AI](https://sakana.ai/)
「からまる」はSakana AIが開発した江戸古文風チャットボットです。現代日本語で質問すると、江戸時代の世界観と当時の古文風テキストで回答してくれます。より詳しくは、上記のブログをご参照ください。
"""

# Markdown shown below the chat widget: a Japanese disclaimer stating that the
# model is an experimental prototype offered for education and research only,
# that it comes with no warranty or liability, and that it was trained on
# Edo-period texts whose norms and biases may surface in its answers.
FOOTER = """
⚠️ 本モデルは実験段階のプロトタイプであり、教育および研究開発の目的でのみ提供されています。商用利用や、障害が重大な影響を及ぼす可能性のある環境（ミッションクリティカルな環境）での使用には適していません。
本モデルの使用は、利用者の自己責任で行われ、その性能や結果については何ら保証されません。
Sakana AIは、本モデルの使用によって生じた直接的または間接的な損失に対して、結果に関わらず、一切の責任を負いません。
また、からまるは、江戸時代の書物をもとに学習されています。これらの文献には、当時の社会的規範、価値観、そして偏見が反映されている可能性があります。そのため、本モデルが生成する応答には、現代の基準から見ると不適切であったり、時代遅れ、または不快と感じられる内容が含まれる場合があります。研究、教育、あるいは一般向けの用途で本モデルを利用する際は、利用者自身がその点に十分注意する必要があります。
"""

# Hugging Face Hub identifier shared by the model and tokenizer loaders below.
checkpoint = "SakanaAI/Llama-3-Karamaru-v1"
# Preferred torch device: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the causal LM weights in bfloat16 and let `device_map="auto"` decide
# where they are placed, then switch the module to evaluation mode.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Guarantee a padding id: reuse the EOS id when the tokenizer has one, fall
# back to 0 otherwise, and mirror the chosen id into the model config.
if tokenizer.pad_token_id is None:
    if tokenizer.eos_token_id is None:
        tokenizer.pad_token_id = 0
    else:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

# NOTE(review): `spaces.GPU(duration=90)` presumably reserves a GPU for up to
# 90 seconds per call -- confirm against the `spaces` package documentation.
@spaces.GPU(duration=90)
def generate(
    message: str,
    chat_history: list[dict],
    max_new_tokens: int = 500,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.1,
) -> Iterator[str]:
    """Stream the model's reply to ``message`` given the prior conversation.

    Generation runs in a background thread that feeds a
    ``TextIteratorStreamer``; this generator consumes the streamer and yields
    the reply as it grows.

    Args:
        message: Latest user message.
        chat_history: Earlier turns as chat-template message dicts; only the
            last 10 entries are used as context.
        max_new_tokens: Forwarded to ``model.generate`` as ``max_new_tokens``.
        temperature: Forwarded to ``model.generate`` as ``temperature``.
        top_p: Forwarded to ``model.generate`` as ``top_p``.
        top_k: Forwarded to ``model.generate`` as ``top_k``.
        repetition_penalty: Forwarded to ``model.generate`` as
            ``repetition_penalty``.

    Yields:
        The reply text accumulated so far, once per chunk produced by the
        streamer.

    Raises:
        Exception: Any exception raised by ``model.generate`` in the worker
            thread is re-raised here once the stream has ended.
    """
    conversation = [*chat_history[-10:], {"role": "user", "content": message}]

    # Request a dict so the tokenizer's own attention mask can be used instead
    # of assuming the template call returns a bare tensor of input ids.
    encoded = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )
    input_ids = encoded["input_ids"].to(model.device)
    attention_mask = encoded.get("attention_mask")
    if attention_mask is None:
        # Single unpadded sequence: every position is a real token.
        attention_mask = torch.ones_like(input_ids)
    else:
        attention_mask = attention_mask.to(model.device)

    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "top_p": top_p,
        "top_k": top_k,
        "temperature": temperature,
        "num_beams": 1,
        "repetition_penalty": repetition_penalty,
        "pad_token_id": tokenizer.pad_token_id,
    }

    worker_errors: list[BaseException] = []

    def _run_generation() -> None:
        """Run ``model.generate`` and record a failure instead of losing it."""
        try:
            model.generate(**generate_kwargs)
        except Exception as exc:
            worker_errors.append(exc)
            # Close the stream so the consumer loop stops right away rather
            # than waiting for the streamer timeout.
            streamer.end()

    worker = Thread(target=_run_generation)
    worker.start()

    response = ""
    for text in streamer:
        response += text
        yield response

    # The stream has ended, so the worker is finishing; wait for it and
    # surface any generation error to the caller.
    worker.join()
    if worker_errors:
        raise worker_errors[0]
    
# Example prompts offered in the chat UI. Each example is a one-element list
# holding a single Japanese user message.
examples = [
    ["あなたの名前は何ですか。"],
    ["AIにとって大事な物は何ですか。"],
    ["からまるは暇なとき何をしますか。"],
    ["からまるの人生の目標は何ですか。"],
    # The question particle "か" was missing here, unlike the other questions.
    ["からまるはどんな本が好きですか。"],
    ["江戸の観光地は例えばどこですか。"],
    ["人生のアドバイスを一つください。"],
    ["江戸の桜の名所はどこですか。"],
]

# Page layout, top to bottom: description header, chat widget, disclaimer.
with gr.Blocks(fill_height=True, css="style.css") as demo:
    gr.Markdown(DESCRIPTION)

    # Chat widget backed by the streaming `generate` callback, using the
    # "messages" history format and the example prompts defined above.
    gr.ChatInterface(
        fn=generate,
        type="messages",
        examples=examples,
        cache_examples=True,
        stop_btn=None,
        fill_height=True,
    )

    gr.Markdown(FOOTER)

if __name__ == "__main__":
    # Script entry point: configure the request queue with max_size=20, then
    # launch the app with share=True.
    queued_demo = demo.queue(max_size=20)
    queued_demo.launch(share=True)