Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,144 +1,128 @@
|
|
1 |
-
import gradio as gr
|
2 |
import os
|
|
|
3 |
import spaces
|
4 |
-
|
5 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
|
6 |
-
from threading import Thread
|
7 |
-
|
8 |
-
# Set an environment variable
|
9 |
-
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
10 |
|
|
|
|
|
|
|
11 |
|
12 |
-
|
13 |
-
<div>
|
14 |
-
<h1 style="text-align: center;">Meta Llama3 8B</h1>
|
15 |
-
<p>This Space demonstrates the instruction-tuned model <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"><b>Meta Llama3 8b Chat</b></a>. Meta Llama3 is the new open LLM and comes in two sizes: 8b and 70b. Feel free to play with it, or duplicate to run privately!</p>
|
16 |
-
<p>🔎 For more details about the Llama3 release and how to use the model with <code>transformers</code>, take a look <a href="https://huggingface.co/blog/llama3">at our blog post</a>.</p>
|
17 |
-
<p>🦕 Looking for an even more powerful model? Check out the <a href="https://huggingface.co/chat/"><b>Hugging Chat</b></a> integration for Meta Llama 3 70b</p>
|
18 |
-
</div>
|
19 |
-
'''
|
20 |
|
21 |
-
|
22 |
-
<p/>
|
23 |
-
---
|
24 |
-
Built with Meta Llama 3
|
25 |
-
"""
|
26 |
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
</div>
|
33 |
-
"""
|
34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
display: block;
|
40 |
-
}
|
41 |
-
#duplicate-button {
|
42 |
-
margin: auto;
|
43 |
-
color: white;
|
44 |
-
background: #1565c0;
|
45 |
-
border-radius: 100vh;
|
46 |
-
}
|
47 |
-
"""
|
48 |
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
terminators = [
|
53 |
-
tokenizer.eos_token_id,
|
54 |
-
tokenizer.convert_tokens_to_ids("<|eot_id|>")
|
55 |
-
]
|
56 |
|
57 |
-
@spaces.GPU(duration=120)
|
58 |
-
def chat_llama3_8b(message: str,
|
59 |
-
history: list,
|
60 |
-
temperature: float,
|
61 |
-
max_new_tokens: int
|
62 |
-
) -> str:
|
63 |
-
"""
|
64 |
-
Generate a streaming response using the llama3-8b model.
|
65 |
-
Args:
|
66 |
-
message (str): The input message.
|
67 |
-
history (list): The conversation history used by ChatInterface.
|
68 |
-
temperature (float): The temperature for generating the response.
|
69 |
-
max_new_tokens (int): The maximum number of new tokens to generate.
|
70 |
-
Returns:
|
71 |
-
str: The generated response.
|
72 |
-
"""
|
73 |
-
conversation = []
|
74 |
-
for user, assistant in history:
|
75 |
-
conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
|
76 |
conversation.append({"role": "user", "content": message})
|
77 |
|
78 |
-
|
79 |
-
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
|
82 |
generate_kwargs = dict(
|
83 |
-
input_ids
|
84 |
streamer=streamer,
|
85 |
-
max_new_tokens=max_new_tokens,
|
86 |
do_sample=True,
|
87 |
temperature=temperature,
|
|
|
88 |
eos_token_id=terminators,
|
|
|
|
|
|
|
89 |
)
|
90 |
-
|
91 |
-
if temperature == 0:
|
92 |
-
generate_kwargs['do_sample'] = False
|
93 |
-
|
94 |
t = Thread(target=model.generate, kwargs=generate_kwargs)
|
95 |
t.start()
|
96 |
-
|
97 |
outputs = []
|
98 |
-
for
|
99 |
-
outputs.append(
|
100 |
-
#print(outputs)
|
101 |
yield "".join(outputs)
|
102 |
-
|
103 |
|
104 |
-
# Gradio block
|
105 |
-
chatbot=gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')
|
106 |
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

import torch
import spaces
import gradio as gr
from threading import Thread
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

# Authenticate only when a token is actually configured. Calling
# login(None) makes huggingface_hub fall back to an interactive prompt,
# which crashes in a headless Space; skipping login is the correct
# behavior for public checkpoints.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(hf_token)

# Gated instruction-tuned checkpoint served by this Space.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto"  # let accelerate place the weights (GPU when available)
)
|
|
|
|
|
19 |
|
20 |
+
@spaces.GPU()
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    system_prompt: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    repetition_penalty: float,
):
    """Stream a chat completion from the Llama-3 model.

    Args:
        message: The latest user message.
        chat_history: Prior (user, assistant) turn pairs supplied by
            gr.ChatInterface.
        system_prompt: Optional system instruction prepended to the
            conversation (skipped when empty).
        max_new_tokens: Cap on the number of generated tokens.
        temperature: Sampling temperature (the UI slider keeps it >= 0.1,
            so do_sample=True is always valid here).
        top_p: Nucleus-sampling probability mass.
        top_k: Top-k sampling cutoff.
        repetition_penalty: Penalty applied to already-generated tokens.

    Yields:
        The response text accumulated so far, once per streamed token.
    """
    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    for user, assistant in chat_history:
        conversation.append({"role": "user", "content": user})
        conversation.append({"role": "assistant", "content": assistant})
    conversation.append({"role": "user", "content": message})

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Index the BatchEncoding by key rather than unpacking .values(),
    # which silently depends on the dict's key order.
    inputs = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    # Llama-3 ends a turn with <|eot_id|> in addition to the regular eos.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    generate_kwargs = dict(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        streamer=streamer,
        do_sample=True,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        eos_token_id=terminators,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
    )

    # model.generate blocks until completion, so run it on a worker
    # thread and stream partial output from the main one.
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    outputs = []
    for new_token in streamer:
        outputs.append(new_token)
        yield "".join(outputs)
|
|
|
73 |
|
|
|
|
|
74 |
|
75 |
+
# Gradio UI: chat front-end for generate() with sampling controls.
demo = gr.ChatInterface(
    fn=generate,
    title="🦙 Llama-3 8B Chat",
    description="",
    additional_inputs=[
        gr.Textbox(
            label="System prompt",
            lines=5,
            value="Anda adalah asisten cerdas yang mahir berbahasa Indonesia. Anda dapat memahami dan merespons pertanyaan dalam berbagai bahasa, tetapi selalu menggunakan bahasa Indonesia yang baik dan benar dalam merespons. Anda ramah, sopan, dan berusaha memberikan jawaban yang jelas dan bermanfaat bagi pengguna. Jangan merespon dengan bahasa selain bahasa Indonesia!"
        ),
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=2048,
            step=1,
            value=1024,
        ),
        gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=4.0,
            step=0.1,
            value=0.6,
        ),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=0.9,
        ),
        gr.Slider(
            label="Top-k",
            minimum=1,
            maximum=1000,
            step=1,
            value=50,
        ),
        gr.Slider(
            label="Repetition penalty",
            minimum=1.0,
            maximum=2.0,
            step=0.05,
            value=1.2,
        ),
    ],
    stop_btn=None,
    examples=[
        ["Halo apa kabar?"],
        ["Apa manfaat berolahraga secara teratur?"],
        ["Jika Budi berjalan sejauh 5 meter, berapa jumlah anak ayam bapaknya Budi?"],
        ["Siapa presiden pertama Indonesia?"]
    ],
)

# Guarding launch() keeps the module importable (e.g. by the Spaces
# runtime, which discovers `demo` itself) while still serving when the
# script is run directly.
if __name__ == "__main__":
    demo.queue().launch()
|