ZhongJingGPT

Running on Zero

App Files Files Community

CMLL committed on Jun 18, 2024

Commit

f3b7005

verified ·

1 Parent(s): 6e999ef

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -29

app.py CHANGED Viewed

@@ -1,38 +1,43 @@
 import os
 import gradio as gr
 import spaces
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from threading import Thread
-from typing import Iterator
-# Constants
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 DESCRIPTION = """\
-# Llama-2 7B Chat
-This Space demonstrates model [Llama-2-7b-chat](https://huggingface.co/meta-llama/Llama-2-7b-chat) by Meta, a Llama 2 model with 7B parameters fine-tuned for chat instructions. Feel free to play with it, or duplicate to run generations without a queue! If you want to run your own service, you can also [deploy the model on Inference Endpoints](https://huggingface.co/inference-endpoints).
-🔎 For more details about the Llama 2 family of models and how to use them with `transformers`, take a look [at our blog post](https://huggingface.co/blog/llama2).
-🔨 Looking for an even more powerful model? Check out the [13B version](https://huggingface.co/spaces/huggingface-projects/llama-2-13b-chat) or the large [70B model demo](https://huggingface.co/spaces/ysharma/Explore_llamav2_with_TGI).
 """
 LICENSE = """
 <p/>
 ---
-As a derivate work of [Llama-2-7b-chat](https://huggingface.co/meta-llama/Llama-2-7b-chat) by Meta,
-this demo is governed by the original [license](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/LICENSE.txt) and [acceptable use policy](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/USE_POLICY.md).
 """
-# Set the device
-device = "cuda" if torch.cuda.is_available() else "cpu"
-# Model loading with the replacement setup
-base_model_id = "Qwen/Qwen1.5-1.8B-Chat"
-model = AutoModelForCausalLM.from_pretrained(base_model_id, device_map="auto")
-model.load_adapter("CMLM/ZhongJing-2-1_8b")
-tokenizer = AutoTokenizer.from_pretrained("CMLM/ZhongJing-2-1_8b", padding_side="right", trust_remote_code=True, pad_token='')
 @spaces.GPU
 def generate(
@@ -56,26 +61,27 @@ def generate(
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-    input_ids = input_ids.to(device)  # Ensure the input tensor is on the correct device
-    outputs = []
-    generated_ids = model.generate(
-        input_ids,
         max_new_tokens=max_new_tokens,
         do_sample=True,
         top_p=top_p,
         top_k=top_k,
         temperature=temperature,
         num_beams=1,
-        repetition_penalty=repetition_penalty
     )
-    generated_ids = generated_ids.to(device)  # Ensure the generated ids are moved to the device
-    outputs.append(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
-    return "".join(outputs)
 chat_interface = gr.ChatInterface(
     fn=generate,
@@ -135,4 +141,3 @@ with gr.Blocks(css="style.css") as demo:
 if __name__ == "__main__":
     demo.queue(max_size=20).launch()

 import os
+from threading import Thread
+from typing import Iterator
 import gradio as gr
 import spaces
 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 DESCRIPTION = """\
+ZhongJing-2-1_8b Chat
+This Space demonstrates the ZhongJing-2-1_8b model, a fine-tuned model for chat instructions. Feel free to play with it, or duplicate to run generations without a queue! If you want to run your own service, you can also deploy the model on Inference Endpoints.
 """
 LICENSE = """
 <p/>
 ---
+As a derivate work of [ZhongJing-2-1_8b](https://huggingface.co/CMLM/ZhongJing-2-1_8b) by 医哲未来 of Fudan University, this demo is governed by the original license and acceptable use policy.
 """
+if not torch.cuda.is_available():
+    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
+if torch.cuda.is_available():
+    base_model_id = "Qwen/Qwen1.5-1.8B-Chat"
+    peft_model_id = "CMLM/ZhongJing-2-1_8b"
+    model = AutoModelForCausalLM.from_pretrained(base_model_id, torch_dtype=torch.float16, device_map="auto")
+    model.load_adapter(peft_model_id)
+    tokenizer = AutoTokenizer.from_pretrained(
+        peft_model_id,
+        padding_side="right",
+        trust_remote_code=True,
+        pad_token=''
+    )
 @spaces.GPU
 def generate(
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+    input_ids = input_ids.to(model.device)
+    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+    generate_kwargs = dict(
+        input_ids=input_ids,
+        streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,
         top_p=top_p,
         top_k=top_k,
         temperature=temperature,
         num_beams=1,
+        repetition_penalty=repetition_penalty,
     )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+        yield "".join(outputs)
 chat_interface = gr.ChatInterface(
     fn=generate,
 if __name__ == "__main__":
     demo.queue(max_size=20).launch()