TenzinGayche committed (verified)
Commit 717452a · 1 Parent(s): 4fa525d

Update app.py

Files changed (1): app.py +127 -40
app.py CHANGED
@@ -1,50 +1,137 @@
  import gradio as gr
  import torch
- from transformers import AutoModelForCausalLM, GemmaTokenizerFast, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
- from threading import Thread
-
- # Load tokenizer and model
- tokenizer = GemmaTokenizerFast.from_pretrained("buddhist-nlp/gemma2-mitra-bo-instruct")
- model = AutoModelForCausalLM.from_pretrained("buddhist-nlp/gemma2-mitra-bo-instruct", torch_dtype=torch.float16).to('cuda:0')
-
- # Define custom stopping criteria
- class StopOnTokens(StoppingCriteria):
-     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
-         # Define stop tokens (adjust based on your model's tokenizer)
-         stop_ids = [29, 0]  # These should be the token IDs for end of response or similar tokens
-         for stop_id in stop_ids:
-             if input_ids[0][-1] == stop_id:
-                 return True
-         return False
-
- # Define prediction function for the chat interface
- def predict(message, history):
-     # Format the input according to your specified structure
-     formatted_input = f"### user : {message} ### input: ### answer:"
-
-     # Tokenize the input
-     model_inputs = tokenizer([formatted_input], return_tensors="pt").to("cuda")
-
-     # Set up the streamer for partial message output
-     streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
-
-     # Generate settings
      generate_kwargs = dict(
-         model_inputs,
          streamer=streamer,
-         max_new_tokens=1024
      )

-     # Run generation in a separate thread
      t = Thread(target=model.generate, kwargs=generate_kwargs)
      t.start()

-     # Stream partial messages as they are generated
-     partial_message = ""
-     for new_token in streamer:
-         if new_token != '<':  # Skip specific tokens if necessary
-             partial_message += new_token
-             yield partial_message

- # Create the chat interface using Gradio
- gr.ChatInterface(fn=predict, title="Monlam LLM", description="").launch(share=True)

+ import os
+ from threading import Thread, Event
+ from typing import Iterator
+
  import gradio as gr
+
  import torch
+ from transformers import AutoModelForCausalLM, GemmaTokenizerFast, TextIteratorStreamer
+
+ DESCRIPTION = """\
+ # Gemma 2 2B IT
+ Gemma 2 is Google's latest iteration of open LLMs.
+ This is a demo of [`google/gemma-2-2b-it`](https://huggingface.co/google/gemma-2-2b-it), fine-tuned for instruction following.
+ For more details, please check [our post](https://huggingface.co/blog/gemma2).
+ 👉 Looking for a larger and more powerful version? Try the 27B version in [HuggingChat](https://huggingface.co/chat/models/google/gemma-2-27b-it) and the 9B version in [this Space](https://huggingface.co/spaces/huggingface-projects/gemma-2-9b-it).
+ """
+
+ MAX_MAX_NEW_TOKENS = 2048
+ DEFAULT_MAX_NEW_TOKENS = 1024
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+
+ # Load the model and tokenizer
+ tokenizer = GemmaTokenizerFast.from_pretrained("TenzinGayche/example")
+ model = AutoModelForCausalLM.from_pretrained("TenzinGayche/example", torch_dtype=torch.float16).to("cuda")
+
+ model.config.sliding_window = 4096  # cap Gemma 2's sliding-window attention at 4096 tokens
+ model.eval()
+
+ # Create a shared stop event (module-level, so one Stop click cancels any in-flight stream)
+ stop_event = Event()
+
+ def generate(
+     message: str,
+     chat_history: list[dict],
+     max_new_tokens: int = 1024,
+     temperature: float = 0.6,
+     top_p: float = 0.9,
+     top_k: int = 50,
+     repetition_penalty: float = 1.2,
+ ) -> Iterator[str]:
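+     # gr.ChatInterface treats a generator handler as a streaming response:
+     # each yielded string replaces the assistant message shown so far.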
+     # Clear the stop event before starting a new generation
+     stop_event.clear()
+
+     conversation = chat_history.copy()
+     conversation.append({"role": "user", "content": message})
+
+     input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
+     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+     input_ids = input_ids.to(model.device)
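+     # apply_chat_template renders the conversation with the tokenizer's own
+     # chat template; add_generation_prompt=True appends the tokens that open
+     # the assistant turn, so decoding continues as the model's reply.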
+
+     streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
      generate_kwargs = dict(
+         {"input_ids": input_ids},
          streamer=streamer,
+         max_new_tokens=max_new_tokens,
+         # Wire the slider-controlled sampling settings into generation
+         do_sample=True,
+         temperature=temperature,
+         top_p=top_p,
+         top_k=top_k,
+         repetition_penalty=repetition_penalty,
      )

      t = Thread(target=model.generate, kwargs=generate_kwargs)
      t.start()

+     outputs = []
+     for text in streamer:
+         if stop_event.is_set():
+             break  # Stop if the stop button is pressed
+         outputs.append(text)
+         yield "".join(outputs)
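+     # Caveat: breaking out of this loop only ends the UI stream; the
+     # model.generate thread keeps decoding until it finishes or reaches
+     # max_new_tokens (see the stopping-criteria sketch after the diff).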
+
+ # Define a function to stop the generation
+ def stop_generation():
+     stop_event.set()
+
+ # Create the chat interface with additional inputs and the stop button
+ with gr.Blocks(css="style.css", fill_height=True) as demo:
+     gr.Markdown(DESCRIPTION)
+
+     # Create the chat interface
+     chat_interface = gr.ChatInterface(
+         fn=generate,
+         additional_inputs=[
+             gr.Slider(
+                 label="Max new tokens",
+                 minimum=1,
+                 maximum=MAX_MAX_NEW_TOKENS,
+                 step=1,
+                 value=DEFAULT_MAX_NEW_TOKENS,
+             ),
+             gr.Slider(
+                 label="Temperature",
+                 minimum=0.1,
+                 maximum=4.0,
+                 step=0.1,
+                 value=0.6,
+             ),
+             gr.Slider(
+                 label="Top-p (nucleus sampling)",
+                 minimum=0.05,
+                 maximum=1.0,
+                 step=0.05,
+                 value=0.9,
+             ),
+             gr.Slider(
+                 label="Top-k",
+                 minimum=1,
+                 maximum=1000,
+                 step=1,
+                 value=50,
+             ),
+             gr.Slider(
+                 label="Repetition penalty",
+                 minimum=1.0,
+                 maximum=2.0,
+                 step=0.05,
+                 value=1.2,
+             ),
+         ],
+         examples=[
+             ["Hello there! How are you doing?"],
+             ["Can you explain briefly to me what is the Python programming language?"],
+             ["Explain the plot of Cinderella in a sentence."],
+             ["How many hours does it take a man to eat a Helicopter?"],
+             ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
+         ],
+         cache_examples=False,
+         type="messages",
+     )
+
+     # Create the stop button inside the Blocks context
+     stop_button = gr.Button("Stop", elem_id="stop-btn")
+     stop_button.click(fn=stop_generation, inputs=[], outputs=[])
+
+     gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
+     chat_interface.render()

+ if __name__ == "__main__":
+     demo.queue(max_size=20).launch(share=True)
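A note on the Stop button as committed: setting stop_event only breaks the streaming loop in generate, so the UI stops updating while the background model.generate thread keeps decoding until it finishes or hits max_new_tokens. A minimal sketch of how the same shared event could cancel decoding itself, via transformers' StoppingCriteria hook; the StopOnEvent name is our illustration, not part of the commit:

from transformers import StoppingCriteria, StoppingCriteriaList

class StopOnEvent(StoppingCriteria):
    # Checked after every decoding step; returning True halts model.generate.
    def __call__(self, input_ids, scores, **kwargs) -> bool:
        return stop_event.is_set()

# Hypothetical wiring inside generate(), next to the existing kwargs:
# generate_kwargs["stopping_criteria"] = StoppingCriteriaList([StopOnEvent()])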