fix chat streaming
modeling_qwen.py  +79 −10  CHANGED
@@ -5,7 +5,7 @@
 
 import importlib
 import math
-from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List
+from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List, Any, Generator
 
 import torch
 import torch.nn.functional as F
@@ -53,6 +53,13 @@ _CONFIG_FOR_DOC = "QWenConfig"
 
 QWen_PRETRAINED_MODEL_ARCHIVE_LIST = ["qwen-7b"]
 
+_ERROR_BAD_CHAT_FORMAT = """\
+We detect you are probably using the pretrained model (rather than chat model) for chatting, since the chat_format in generation_config is not "chatml".
+If you are directly using the model downloaded from Huggingface, please make sure you are using our "Qwen/Qwen-7B-Chat" Huggingface model (rather than "Qwen/Qwen-7B") when you call model.chat().
+我们检测到您可能在使用预训练模型(而非chat模型)进行多轮chat,因为您当前在generation_config指定的chat_format,并未设置为我们在对话中所支持的"chatml"格式。
+如果您在直接使用我们从Huggingface提供的模型,请确保您在调用model.chat()时,使用的是"Qwen/Qwen-7B-Chat"模型(而非"Qwen/Qwen-7B"预训练模型)。
+"""
+
 apply_rotary_emb_func = None
 rms_norm = None
 flash_attn_unpadded_func = None
@@ -971,6 +978,7 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         stop_words_ids: Optional[List[List[int]]] = None,
         **kwargs,
     ) -> Tuple[str, HistoryType]:
+        assert self.generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT
         if history is None:
             history = []
         if stop_words_ids is None:
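With this guard in place, chat() refuses to run when generation_config.chat_format is not "chatml", which is exactly the situation the bilingual _ERROR_BAD_CHAT_FORMAT message above describes (typically the base pretrained checkpoint loaded instead of the chat checkpoint). A minimal sketch of the resulting behavior; the model loading is illustrative only and assumes the usual trust_remote_code setup:

from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustration: the base (non-chat) checkpoint ships a generation_config whose
# chat_format is not "chatml", so the new assert fires as soon as chat() is called.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True).eval()

try:
    response, history = model.chat(tokenizer, "Hello", history=None)
except AssertionError as err:
    print(err)  # prints _ERROR_BAD_CHAT_FORMAT, pointing the user to Qwen/Qwen-7B-Chat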
@@ -990,14 +998,17 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         ))
         input_ids = torch.tensor([context_tokens]).to(self.device)
         if stream:
-            assert self.generation_config.chat_format == 'chatml'
+            logger.warn(
+                "[WARNING] This usage is deprecated and marked for removal."
+                "Please use chat_stream() instead of chat(stream=True)."
+            )
             from transformers_stream_generator.main import NewGenerationMixin, StreamGenerationConfig
-            self.__class__.generate = NewGenerationMixin.generate
+            self.__class__.generate_stream = NewGenerationMixin.generate
             self.__class__.sample_stream = NewGenerationMixin.sample_stream
             stream_config = StreamGenerationConfig(**self.generation_config.to_dict(), do_stream=True)
             def stream_generator():
                 outputs = []
-                for token in self.generate(
+                for token in self.generate_stream(
                         input_ids, return_dict_in_generate=False, generation_config=stream_config, **kwargs):
                     outputs.append(token.item())
                     if outputs[-1] in (tokenizer.im_end_id, tokenizer.im_start_id):
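The two renamed lines are the heart of the "fix chat streaming" change: binding the streaming mixin's generate onto self.__class__.generate replaces QWenLMHeadModel's own generate() for every later call on the class, streaming or not, while binding it under the separate name generate_stream leaves the regular generate() untouched. A simplified, self-contained sketch of that failure mode (plain Python, not Qwen code):

# Assigning to the *class* attribute swaps the method for all future calls.
class Model:
    def generate(self):
        return "normal decoding"

def stream_generate(self):
    return "streaming decoding"

m = Model()
Model.generate = stream_generate     # old approach: clobbers the original method
print(m.generate())                  # "streaming decoding", even when streaming was not requested

class Model2:
    def generate(self):
        return "normal decoding"

Model2.generate_stream = stream_generate   # new approach: bind under a separate name
m2 = Model2()
print(m2.generate())                 # "normal decoding" is preserved
print(m2.generate_stream())          # "streaming decoding" available on demand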
@@ -1027,6 +1038,62 @@ class QWenLMHeadModel(QWenPreTrainedModel):
 
         return response, history
 
+    def chat_stream(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        query: str,
+        history: Optional[HistoryType],
+        system: str = "You are a helpful assistant.",
+        stop_words_ids: Optional[List[List[int]]] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        **kwargs,
+    ) -> Generator[str, Any, None]:
+        assert self.generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT
+        if history is None:
+            history = []
+        if stop_words_ids is None:
+            stop_words_ids = []
+
+        raw_text, context_tokens = make_context(
+            tokenizer,
+            query,
+            history=history,
+            system=system,
+            max_window_size=6144,
+            chat_format=self.generation_config.chat_format,
+        )
+
+        stop_words_ids.extend(get_stop_words_ids(
+            self.generation_config.chat_format, tokenizer
+        ))
+        if stop_words_ids is not None:
+            stop_words_logits_processor = StopWordsLogitsProcessor(
+                stop_words_ids=stop_words_ids,
+                eos_token_id=self.generation_config.eos_token_id,
+            )
+            if logits_processor is None:
+                logits_processor = LogitsProcessorList([stop_words_logits_processor])
+            else:
+                logits_processor.append(stop_words_logits_processor)
+        input_ids = torch.tensor([context_tokens]).to(self.device)
+
+        from transformers_stream_generator.main import NewGenerationMixin, StreamGenerationConfig
+        self.__class__.generate_stream = NewGenerationMixin.generate
+        self.__class__.sample_stream = NewGenerationMixin.sample_stream
+        stream_config = StreamGenerationConfig(**self.generation_config.to_dict(), do_stream=True)
+        def stream_generator():
+            outputs = []
+            for token in self.generate_stream(
+                    input_ids,
+                    return_dict_in_generate=False,
+                    generation_config=stream_config,
+                    logits_processor=logits_processor,
+                    **kwargs):
+                outputs.append(token.item())
+                yield tokenizer.decode(outputs, skip_special_tokens=True, errors='ignore')
+
+        return stream_generator()
+
     def generate(
         self,
         inputs: Optional[torch.Tensor] = None,
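The new chat_stream() returns a generator that, for every generated token, yields the decoded response accumulated so far. A minimal usage sketch; the checkpoint name and printing loop are illustrative, and the transformers_stream_generator package must be installed:

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True).eval()

# Each yielded value is the full response decoded so far, so printing only the new
# suffix gives a live "typing" effect (assumes the already-decoded prefix stays stable).
printed = ""
for partial in model.chat_stream(tokenizer, "Tell me a short joke.", history=None):
    print(partial[len(printed):], end="", flush=True)
    printed = partial
print()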
@@ -1037,6 +1104,7 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             Callable[[int, torch.Tensor], List[int]]
         ] = None,
         synced_gpus: Optional[bool] = None,
+        assistant_model: Optional["PreTrainedModel"] = None,
         streamer: Optional["BaseStreamer"] = None,
         **kwargs,
     ) -> Union[GenerateOutput, torch.LongTensor]:
@@ -1059,12 +1127,13 @@ class QWenLMHeadModel(QWenPreTrainedModel):
 
         return super().generate(
             inputs,
-            generation_config,
-            logits_processor,
-            stopping_criteria,
-            prefix_allowed_tokens_fn,
-            synced_gpus,
-            streamer,
+            generation_config=generation_config,
+            logits_processor=logits_processor,
+            stopping_criteria=stopping_criteria,
+            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
+            synced_gpus=synced_gpus,
+            assistant_model=assistant_model,
+            streamer=streamer,
             **kwargs,
         )
 
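Forwarding the arguments to super().generate() by keyword, and passing the new assistant_model through, matters because newer transformers releases place assistant_model ahead of streamer in GenerationMixin.generate's parameter list; with the old positional call, streamer would silently bind to the wrong parameter. A simplified, self-contained sketch of that hazard (plain Python, not the transformers API):

# Hypothetical parent: an older signature was (inputs, generation_config=None, streamer=None);
# the newer one inserts assistant_model just before streamer.
class ParentV2:
    def generate(self, inputs, generation_config=None, assistant_model=None, streamer=None):
        return {"config": generation_config, "assistant": assistant_model, "streamer": streamer}

config, streamer = "cfg", "my-streamer"

# Positional forwarding written against the old order misbinds under the new signature:
print(ParentV2().generate("ids", config, streamer))
# {'config': 'cfg', 'assistant': 'my-streamer', 'streamer': None}

# Keyword forwarding stays correct no matter where new parameters are inserted:
print(ParentV2().generate("ids", generation_config=config, streamer=streamer))
# {'config': 'cfg', 'assistant': None, 'streamer': 'my-streamer'}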