hfl-rc committed on
Commit
802ccb7
1 Parent(s): ff72155

Update app.py

Files changed (1)
app.py +2 -2
app.py CHANGED
@@ -4,7 +4,7 @@ import gradio as gr
 import spaces
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import subprocess
-subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+#subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
 BANNER_HTML = """
 <p align="center">
@@ -37,7 +37,7 @@ def load_model(version):
     model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto", attn_implementation="flash_attention_2")
     return f"Model {model_name} loaded."
 
-@spaces.GPU(duration=50)
+@spaces.GPU(duration=60)
 def stream_chat(message: str, history: list, system_prompt: str, model_version: str, temperature: float, max_new_tokens: int):
     conversation = [{"role": "system", "content": system_prompt or DEFAULT_SYSTEM_PROMPT}]
     for prompt, answer in history:
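
For context, a minimal sketch of how the two changed lines are typically used on a ZeroGPU Space: the commented-out pip install means flash-attn must already be available in the environment for attn_implementation="flash_attention_2" to work, and @spaces.GPU(duration=60) now reserves the GPU for up to 60 seconds per call. The model id, function name, and generation settings below are illustrative assumptions, not the Space's actual code.

# Minimal sketch, assuming flash-attn is preinstalled and a placeholder model id.
from threading import Thread

import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"  # assumption: placeholder model id

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",  # needs flash-attn available at runtime
)

@spaces.GPU(duration=60)  # ZeroGPU allocates the GPU for up to 60 s per call
def stream_generate(prompt: str):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # Run generation in a background thread and yield text chunks as they arrive.
    thread = Thread(target=model.generate, kwargs=dict(**inputs, streamer=streamer, max_new_tokens=256))
    thread.start()
    for text in streamer:
        yield text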