lamhieu committed
Commit 0a30342 • 1 Parent(s): 8fee735

chore: enable flash attention 2

Files changed (2):
  1. README.md +1 -1
  2. app.py +14 -2
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Ghost 8B Beta
-emoji: 👻
+emoji: 👻 / 🥸
 colorFrom: indigo
 colorTo: pink
 sdk: gradio
app.py CHANGED
@@ -1,3 +1,13 @@
+# pylint: skip-file
+
+import subprocess
+
+subprocess.run(
+    f"pip install flash-attn --no-build-isolation",
+    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+    shell=True,
+)
+
 import os
 from threading import Thread
 from typing import Iterator
@@ -7,6 +17,7 @@ import spaces
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 
+
 MAX_MAX_NEW_TOKENS = 4096
 DEFAULT_MAX_NEW_TOKENS = 1536
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "8192"))
@@ -18,7 +29,7 @@ DESCRIPTION = """\
 
 The languages supported are 🇺🇸 English, 🇫🇷 French, 🇮🇹 Italian, 🇪🇸 Spanish, 🇵🇹 Portuguese, 🇩🇪 German, 🇻🇳 Vietnamese, 🇰🇷 Korean and 🇨🇳 Chinese.
 
-📋 Note: current model version is "disl-0x5-8k" (10 Jul 2024), context length 8k and current status is "moderating / previewing". For detailed information about the model, see [here](https://ghost-x.org/docs/models/ghost-8b-beta/). Try to experience it the way you want!
+📋 Note: current model version is "disl-0x5" (10 Jul 2024), context length 8k (8192 tokens) and current status is "moderating / previewing". For detailed information about the model, see [here](https://ghost-x.org/docs/models/ghost-8b-beta/). Try to experience it the way you want!
 """
 
 
@@ -241,6 +252,8 @@ if torch.cuda.is_available():
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
         device_map="auto",
+        torch_dtype=torch.bfloat16,
+        attn_implementation="flash_attention_2",
         trust_remote_code=True,
         token=model_tk,
     )
@@ -363,4 +376,3 @@ with gr.Blocks(fill_height=True, css="style.css") as demo:
 
 if __name__ == "__main__":
     demo.queue(max_size=20).launch(share=True)
-# demo.launch(share=True)
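
For readers reproducing this change: setting FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE makes the flash-attn install skip compiling its CUDA kernels, so the runtime pip install only succeeds when a prebuilt wheel matches the environment; attn_implementation="flash_attention_2" in turn requires an Ampere-or-newer GPU and half-precision weights, which is why the commit also passes torch_dtype=torch.bfloat16. Two caveats in the commit as written: subprocess.run is not given check=True, so a failed install only surfaces later when from_pretrained requests flash_attention_2, and env={...} replaces the child's entire environment rather than extending os.environ. A minimal defensive sketch — hypothetical, not part of this commit, with pick_attn_implementation and the model id as placeholders for the Space's own code — would fall back to PyTorch's built-in SDPA attention when flash-attn is unavailable:

# Hypothetical sketch (not in this commit): use FlashAttention-2 only when
# the flash_attn package is importable and a CUDA GPU is present; otherwise
# fall back to PyTorch's scaled-dot-product attention ("sdpa").
import importlib.util

import torch
from transformers import AutoModelForCausalLM


def pick_attn_implementation() -> str:
    # flash-attn ships CUDA-only kernels, so require a GPU first.
    if torch.cuda.is_available() and importlib.util.find_spec("flash_attn"):
        return "flash_attention_2"
    return "sdpa"  # native PyTorch attention, accepted by transformers


model = AutoModelForCausalLM.from_pretrained(
    "ghost-x/ghost-8b-beta",  # placeholder for this Space's model_id
    device_map="auto",
    torch_dtype=torch.bfloat16,  # FlashAttention-2 needs fp16/bf16 weights
    attn_implementation=pick_attn_implementation(),
)

With a guard like this the Space still boots on hardware where the wheel install fails, at the cost of slower attention.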