Spaces: Running on T4
ffreemt committed · commit c48ba74 · 1 Parent(s): 584239a
Update API ready, TODO: fix info
app.py CHANGED
@@ -6,6 +6,30 @@ transformers 4.31.0
 import torch
 torch.cuda.empty_cache()
 
+model.chat(
+    tokenizer: transformers.tokenization_utils.PreTrainedTokenizer,
+    query: str,
+    history: Optional[List[Tuple[str, str]]],
+    system: str = 'You are a helpful assistant.',
+    append_history: bool = True,
+    stream: Optional[bool] = <object object at 0x7f905797ec20>,
+    stop_words_ids: Optional[List[List[int]]] = None,
+    **kwargs) -> Tuple[str, List[Tuple[str, str]]]
+)
+
+model.generation_config
+GenerationConfig {
+  "chat_format": "chatml",
+  "do_sample": true,
+  "eos_token_id": 151643,
+  "max_new_tokens": 512,
+  "max_window_size": 6144,
+  "pad_token_id": 151643,
+  "top_k": 0,
+  "top_p": 0.5,
+  "transformers_version": "4.31.0",
+  "trust_remote_code": true
+}
 """
 # pylint: disable=line-too-long, invalid-name, no-member, redefined-outer-name, missing-function-docstring, missing-class-docstring, broad-except,
 import gc
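The docstring notes above record the upstream `model.chat` signature and the default `generation_config`. For reference, a minimal sketch of exercising that interface directly (the model id and loading code here are illustrative assumptions, not part of this commit; app.py resolves `model_name` itself):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen-7B-Chat"  # assumed checkpoint

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", trust_remote_code=True
).eval()

# Per the signature above, chat returns (response, updated_history).
response, history = model.chat(tokenizer, "Hello!", history=None)
response, history = model.chat(tokenizer, "Summarize that in one line.", history=history)
print(response)
```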
@@ -14,7 +38,9 @@ import sys
 import time
 from collections import deque
 from dataclasses import asdict, dataclass
+from textwrap import dedent
 from types import SimpleNamespace
+from typing import List, Optional
 
 import gradio as gr
 import torch
@@ -100,6 +126,10 @@ model = None
 gc.collect()
 torch.cuda.empty_cache()
 
+if not torch.cuda.is_available():
+    # raise gr.Error("GPU not available, cant run. Turn on GPU and retry")
+    raise SystemExit("GPU not available, cant run. Turn on GPU and retry")
+
 model = gen_model(model_name)
 
 
@@ -136,6 +166,7 @@ def bot(chat_history, **kwargs):
         chat_history[:-1].append(["message", str(exc)])
         return chat_history
 
+
 def bot_stream(chat_history, **kwargs):
     logger.trace(f"{chat_history=}")
     logger.trace(f"{kwargs=}")
@@ -149,14 +180,17 @@ def bot_stream(chat_history, **kwargs):
 
     # for elm in model.chat_stream(tokenizer, message, chat_history):
     model.generation_config.update(**kwargs)
+    response = ""
     for elm in model.chat_stream(tokenizer, message, chat_history):
         chat_history[-1] = [message, elm]
+        response = elm
         yield chat_history
-    logger.debug(f"
+    logger.debug(f"{model.generation_config=}")
+    logger.debug(f"{response=}")
 
 
 SYSTEM_PROMPT = "You are a helpful assistant."
-MAX_MAX_NEW_TOKENS =
+MAX_MAX_NEW_TOKENS = 2048  # sequence length 2048
 MAX_NEW_TOKENS = 256
 
 
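For context on the change above: `model.chat_stream` yields progressively longer partial replies, so the last yielded chunk is the complete response, which is what the new `response` variable captures for logging. A standalone sketch of the same idea (hypothetical helper; `model` and `tokenizer` as loaded in app.py):

```python
def stream_once(model, tokenizer, message, history=None):
    """Drain chat_stream and return the final, complete chunk."""
    history = history or []
    response = ""
    for chunk in model.chat_stream(tokenizer, message, history):
        response = chunk  # each chunk supersedes the previous partial text
    return response
```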
@@ -172,6 +206,72 @@ class Config:
 # stats_default = SimpleNamespace(llm=model, system_prompt=SYSTEM_PROMPT, config=Config())
 stats_default = SimpleNamespace(llm=None, system_prompt=SYSTEM_PROMPT, config=Config())
 
+
+# input max_new_tokens temperature repetition_penalty top_k top_p system_prompt history
+def api_fn(  # pylint: disable=too-many-arguments
+    input_text: Optional[str],
+    # max_length: int = 256,
+    max_new_tokens: int = stats_default.config.max_new_tokens,
+    temperature: float = stats_default.config.temperature,
+    repetition_penalty: float = stats_default.config.repetition_penalty,
+    top_k: int = stats_default.config.top_k,
+    top_p: int = stats_default.config.top_p,
+    system_prompt: Optional[str] = None,
+    history: Optional[List[str]] = None,
+):
+    if input_text is None:
+        input_text = ""
+    try:
+        input_text = str(input_text).strip()
+    except Exception as exc:
+        logger.error(exc)
+        input_text = ""
+    if not input_text:
+        return ""
+    if history is None:
+        history = []
+    try:
+        temperature = float(temperature)
+    except Exception:
+        temperature = stats_default.config.temperature
+
+    if system_prompt is None:
+        system_prompt = stats_default.system_prompt
+    # if max_length < 10: max_length = 4096
+    if max_new_tokens < 10:
+        max_new_tokens = stats_default.config.max_new_tokens
+    if top_p < 0.1 or top_p > 1:
+        top_p = stats_default.config.top_p
+    if temperature <= 0.5:
+        temperature = stats_default.config.temperature
+
+    _ = {
+        "max_new_tokens": max_new_tokens,
+        "temperature": temperature,
+        "repetition_penalty": repetition_penalty,
+        "top_k": top_k,
+        "top_p": top_p,
+    }
+    model.generation_config.update(**_)
+    try:
+        res, _ = model.chat(
+            tokenizer,
+            input_text,
+            history=history,
+            # max_length=max_length,
+            append_history=False,
+        )
+        # logger.debug(f"{res=} \n{_=}")
+    except Exception as exc:
+        logger.error(f"{exc=}")
+        res = str(exc)
+
+    logger.debug(f"api {model.generation_config=}")
+    logger.debug(f"api {res=}")
+
+    return res
+
+
 theme = gr.themes.Soft(text_size="sm")
 with gr.Blocks(
     theme=theme,
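Because `api_fn` is a plain function, it can be smoke-tested in-process before it is wired to a button further down; a rough sketch (argument values are only examples, and `model`/`tokenizer` must already be loaded):

```python
reply = api_fn(
    "Translate to English: 天下无难事",
    max_new_tokens=128,
    temperature=0.95,
    repetition_penalty=1.1,
    top_k=0,
    top_p=0.9,
    system_prompt=None,
    history=None,
)
print(reply)
```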
@@ -179,24 +279,69 @@ with gr.Blocks(
     css=css,
 ) as block:
     stats = gr.State(stats_default)
-    if not torch.cuda.is_available():
-        raise gr.Error("GPU not available, cant run. Turn on GPU and restart")
 
+    # would this reset model?
+    model.generation_config = GenerationConfig.from_pretrained(
+        model_name,
+        trust_remote_code=True,
+    )
     config = asdict(stats.value.config)
+
     def bot_stream_state(chat_history):
         logger.trace(f"{chat_history=}")
         yield from bot_stream(chat_history, **config)
 
     with gr.Accordion("🎈 Info", open=False):
         gr.Markdown(
-
-
-
-
-
-
-
-
+            dedent(
+                f"""
+                ## {model_name.lower()}
+
+                * temperature range: .51 and up; a higher temperature implies more random outputs. A suggested temperature for chatting and creative writing is around 1.1, while 0.51-1.0 works better for summarizing and translation, for example.
+
+                * Set `repetition_penalty` to 2.1 or higher for a chatty conversation (more unpredictable and undesirable output). Lower it to 1.1 or smaller if more focused answers are desired (for example for translations or fact-oriented queries).
+
+                * A smaller `top_k` will probably result in smoother sentences.
+                (`top_k=0` is equivalent to a very large `top_k`, though.) Consult the `transformers` documentation for more details.
+
+                * If you inadvertently messed up the parameters or the model, reset it in Advanced Options or reload the browser.
+
+                <p></p>
+                An API is available at https://mikeee-qwen-7b-chat.hf.space/, e.g. in Python:
+
+                ```python
+                from gradio_client import Client
+
+                client = Client("https://7cff5e13976c7ba889.gradio.live/")
+                result = client.predict(
+                    "你好!",  # user prompt
+                    256,  # max_new_tokens
+                    0.951,  # temperature
+                    1.1,  # repetition_penalty
+                    0,  # top_k
+                    0.9,  # top_p
+                    "You are a helpful assistant",  # system_prompt
+                    None,  # history
+                    api_name="/api"
+                )
+                print(result)
+                ```
+
+                or in JavaScript:
+                ```js
+                import {{ client }} from "@gradio/client";
+
+                const app = await client("https://mikeee-qwen-7b-chat.hf.space/");
+                const result = await app.predict("api", [...]);
+                console.log(result.data);
+                ```
+                Check documentation and examples by clicking `Use via API` at the very bottom of this page.
+
+                <p></p>
+                Most examples are meant for another model.
+                You should probably test
+                some related prompts instead."""
+            ),
             elem_classes="xsmall",
         )
 
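The argument order in the `gradio_client` example above has to match the inputs list registered under `api_name="api"` in the last hunk. If in doubt, `gradio_client` can print the live signature; a small sketch (same Space URL as in the info text, assuming the Space is running):

```python
from gradio_client import Client

client = Client("https://mikeee-qwen-7b-chat.hf.space/")
client.view_api(all_endpoints=True)  # prints each endpoint's parameters and their order
```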
@@ -367,5 +512,31 @@ with gr.Blocks(
         elem_classes=["disclaimer"],
     )
 
+    with gr.Accordion("For Chat/Translation API", open=False, visible=False):
+        input_text = gr.Text()
+        api_history = gr.Chatbot(value=[])
+        api_btn = gr.Button("Go", variant="primary")
+        out_text = gr.Text()
+
+    # api_fn args order
+    # input_text max_new_tokens temperature repetition_penalty top_k top_p system_prompt history
+    api_btn.click(
+        api_fn,
+        [
+            input_text,
+            max_new_tokens,
+            temperature,
+            repetition_penalty,
+            top_k,
+            top_p,
+            system_prompt,
+            api_history,  # dont know how to pass this in gradio_client.Client calls
+        ],
+        out_text,
+        api_name="api",
+    )
+
+
 if __name__ == "__main__":
+    logger.info("Just record start time")
     block.queue(max_size=8).launch(debug=True)