Commit 4721aa1
Parent(s): 0801e4f

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes; see the raw diff for the complete change set.
- .gitattributes +3 -0
- .github/ISSUE_TEMPLATE/bug_report.yaml +77 -0
- .github/ISSUE_TEMPLATE/feature-request.yaml +34 -0
- .github/PULL_REQUEST_TEMPLATE/pr_template.md +34 -0
- .gitignore +10 -0
- .ipynb_checkpoints/Untitled-checkpoint.ipynb +6 -0
- DEPLOYMENT.md +39 -0
- DEPLOYMENT_en.md +43 -0
- MODEL_LICENSE +65 -0
- PROMPT.md +203 -0
- PROMPT_en.md +201 -0
- README_en.md +250 -0
- Untitled.ipynb +192 -0
- basic_demo/cli_demo.py +79 -0
- basic_demo/cli_demo_bad_word_ids.py +83 -0
- basic_demo/utils.py +58 -0
- basic_demo/web_demo.py +117 -0
- basic_demo/web_demo2.py +89 -0
- chatglm3-6b/.gitattributes +35 -0
- chatglm3-6b/MODEL_LICENSE +65 -0
- chatglm3-6b/README.md +102 -0
- chatglm3-6b/config.json +42 -0
- chatglm3-6b/configuration_chatglm.py +61 -0
- chatglm3-6b/modeling_chatglm.py +1293 -0
- chatglm3-6b/pytorch_model-00001-of-00007.bin +3 -0
- chatglm3-6b/pytorch_model-00002-of-00007.bin +3 -0
- chatglm3-6b/pytorch_model-00003-of-00007.bin +3 -0
- chatglm3-6b/pytorch_model-00004-of-00007.bin +3 -0
- chatglm3-6b/pytorch_model-00005-of-00007.bin +3 -0
- chatglm3-6b/pytorch_model-00006-of-00007.bin +3 -0
- chatglm3-6b/pytorch_model-00007-of-00007.bin +3 -0
- chatglm3-6b/pytorch_model.bin.index.json +207 -0
- chatglm3-6b/quantization.py +188 -0
- chatglm3-6b/tokenization_chatglm.py +300 -0
- chatglm3-6b/tokenizer.model +3 -0
- chatglm3-6b/tokenizer_config.json +12 -0
- composite_demo/.streamlit/config.toml +2 -0
- composite_demo/README.md +85 -0
- composite_demo/README_en.md +85 -0
- composite_demo/assets/demo.png +0 -0
- composite_demo/assets/emojis.png +0 -0
- composite_demo/assets/heart.png +0 -0
- composite_demo/assets/tool.png +0 -0
- composite_demo/client.py +197 -0
- composite_demo/conversation.py +119 -0
- composite_demo/demo_chat.py +80 -0
- composite_demo/demo_ci.py +329 -0
- composite_demo/demo_tool.py +192 -0
- composite_demo/main.py +63 -0
- composite_demo/requirements.txt +12 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+resources/code_en.gif filter=lfs diff=lfs merge=lfs -text
+resources/web-demo.gif filter=lfs diff=lfs merge=lfs -text
+resources/web-demo2.gif filter=lfs diff=lfs merge=lfs -text
.github/ISSUE_TEMPLATE/bug_report.yaml
ADDED
@@ -0,0 +1,77 @@
name: "\U0001F41B Bug Report"
description: Submit a bug report to help us improve ChatGLM3 / 提交一个 Bug 问题报告来帮助我们改进 ChatGLM3
body:
  - type: textarea
    id: system-info
    attributes:
      label: System Info / 系統信息
      description: Your operating environment / 您的运行环境信息
      placeholder: Includes Cuda version, Transformers version, Python version, operating system, hardware information (if you suspect a hardware problem)... / 包括Cuda版本,Transformers版本,Python版本,操作系统,硬件信息(如果您怀疑是硬件方面的问题)...
    validations:
      required: true

  - type: textarea
    id: who-can-help
    attributes:
      label: Who can help? / 谁可以帮助到您?
      description: |
        Your issue will be replied to more quickly if you can figure out the right person to tag with @
        All issues are read by one of the maintainers, so if you don't know who to tag, just leave this blank and our maintainer will ping the right person.

        Please tag fewer than 3 people.

        如果您能找到合适的标签 @,您的问题会更快得到回复。
        所有问题都会由我们的维护者阅读,如果您不知道该标记谁,只需留空,我们的维护人员会找到合适的开发组成员来解决问题。

        标记的人数应该不超过 3 个人。

        Related demo leader / 相关demo负责人:
        - finetune_demo: @Btlmd
        - langchain_demo: @yincf
        - composite_demo: @abmfy

        If it's not a bug in these three subsections, you may not specify the helper. Our maintainer will find the right person in the development group to solve the problem.

        如果不是这三个子版块的bug,您可以不指明帮助者,我们的维护人员会找到合适的开发组成员来解决问题。

      placeholder: "@Username ..."

  - type: checkboxes
    id: information-scripts-examples
    attributes:
      label: Information / 问题信息
      description: 'The problem arises when using: / 问题出现在'
      options:
        - label: "The official example scripts / 官方的示例脚本"
        - label: "My own modified scripts / 我自己修改的脚本和任务"

  - type: textarea
    id: reproduction
    validations:
      required: true
    attributes:
      label: Reproduction / 复现过程
      description: |
        Please provide a code example that reproduces the problem you encountered, preferably with a minimal reproduction unit.
        If you have code snippets, error messages, stack traces, please provide them here as well.
        Please format your code correctly using code tags. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
        Do not use screenshots, as they are difficult to read and (more importantly) do not allow others to copy and paste your code.

        请提供能重现您遇到的问题的代码示例,最好是最小复现单元。
        如果您有代码片段、错误信息、堆栈跟踪,也请在此提供。
        请使用代码标签正确格式化您的代码。请参见 https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
        请勿使用截图,因为截图难以阅读,而且(更重要的是)不允许他人复制粘贴您的代码。

      placeholder: |
        Steps to reproduce the behavior/复现Bug的步骤:

        1.
        2.
        3.

  - type: textarea
    id: expected-behavior
    validations:
      required: true
    attributes:
      label: Expected behavior / 期待表现
      description: "A clear and concise description of what you would expect to happen. /简单描述您期望发生的事情。"
.github/ISSUE_TEMPLATE/feature-request.yaml
ADDED
@@ -0,0 +1,34 @@
name: "\U0001F680 Feature request"
description: Submit a request for a new ChatGLM3 feature / 提交一个新的 ChatGLM3 的功能建议
labels: [ "feature" ]
body:
  - type: textarea
    id: feature-request
    validations:
      required: true
    attributes:
      label: Feature request / 功能建议
      description: |
        A brief description of the functional proposal. Links to corresponding papers and code are desirable.
        对功能建议的简述。最好提供对应的论文和代码链接

  - type: textarea
    id: motivation
    validations:
      required: true
    attributes:
      label: Motivation / 动机
      description: |
        Your motivation for making the suggestion. If that motivation is related to another GitHub issue, link to it here.
        您提出建议的动机。如果该动机与另一个 GitHub 问题有关,请在此处提供对应的链接。

  - type: textarea
    id: contribution
    validations:
      required: true
    attributes:
      label: Your contribution / 您的贡献
      description: |
        Your PR link or any other link you can help with.
        您的PR链接或者其他您能提供帮助的链接。
.github/PULL_REQUEST_TEMPLATE/pr_template.md
ADDED
@@ -0,0 +1,34 @@
# Raise valuable PR / 提出有价值的PR

## Caution / 注意事项:
Users should keep the following points in mind when submitting PRs:

1. The proposed PR should be about this project.
2. The proposed PR should be relevant; if there are multiple ideas and optimizations, they should be assigned to different PRs.

用户在提交PR时候应该注意以下几点:

1. 提出的PR应该是关于本项目的。
2. 提出的PR应该具有针对性,如果具有多个不同的想法和优化方案,应该分配到不同的PR中。

## PRs that should not be proposed / 不应该提出的PR

If a developer proposes a PR about any of the following, it may be closed or rejected:

1. PRs that don't describe an improvement plan.
2. Multiple issues of different types combined in one PR.
3. PRs that are highly duplicative of already existing PRs.

如果开发者提出关于以下方面的PR,则可能会被直接关闭或拒绝通过。

1. 没有说明改进方案的。
2. 多个不同类型的问题合并在一个PR中的。
3. 提出的PR与已经存在的PR高度重复的。


# Check your PR / 检查您的PR
- [ ] Have you read the Contributor Guidelines, Pull Request section? / 您是否阅读了贡献者指南、Pull Request 部分?
- [ ] Has this been discussed/approved via a GitHub issue or forum? If so, add a link. / 是否通过 Github 问题或论坛讨论/批准过?如果是,请添加链接。
- [ ] Did you make sure you updated the documentation with your changes? Here are the Documentation Guidelines, and here are the Documentation Formatting Tips. / 您是否确保根据您的更改更新了文档?这里是文档指南,这里是文档格式化技巧。
- [ ] Did you write new required tests? / 您是否编写了新的必要测试?
- [ ] Is your PR for only one issue? / 您的PR是否仅针对一个问题
.gitignore
ADDED
@@ -0,0 +1,10 @@
__pycache__

# finetune_demo: generated & downloaded files
finetune_demo/output
finetune_demo/data
finetune_demo/formatted_data
ToolAlpaca/
AdvertiseGen/
*.gz
*.idea
.ipynb_checkpoints/Untitled-checkpoint.ipynb
ADDED
@@ -0,0 +1,6 @@
{
 "cells": [],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
}
DEPLOYMENT.md
ADDED
@@ -0,0 +1,39 @@
## 低成本部署

### 模型量化

默认情况下,模型以 FP16 精度加载,运行上述代码需要大概 13GB 显存。如果你的 GPU 显存有限,可以尝试以量化方式加载模型,使用方法如下:

```python
model = AutoModel.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True).quantize(4).cuda()
```

模型量化会带来一定的性能损失,经过测试,ChatGLM3-6B 在 4-bit 量化下仍然能够进行自然流畅的生成。

### CPU 部署

如果你没有 GPU 硬件的话,也可以在 CPU 上进行推理,但是推理速度会更慢。使用方法如下(需要大概 32GB 内存):

```python
model = AutoModel.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True).float()
```

### Mac 部署

对于搭载了 Apple Silicon 或者 AMD GPU 的 Mac,可以使用 MPS 后端来在 GPU 上运行 ChatGLM3-6B。需要参考 Apple 的 [官方说明](https://developer.apple.com/metal/pytorch) 安装 PyTorch-Nightly(正确的版本号应该是 2.x.x.dev2023xxxx,而不是 2.x.x)。

目前在 MacOS 上只支持[从本地加载模型](README.md#从本地加载模型)。将代码中的模型加载改为从本地加载,并使用 mps 后端:

```python
model = AutoModel.from_pretrained("your local path", trust_remote_code=True).to('mps')
```

加载半精度的 ChatGLM3-6B 模型需要大概 13GB 内存。内存较小的机器(比如 16GB 内存的 MacBook Pro),在空余内存不足的情况下会使用硬盘上的虚拟内存,导致推理速度严重变慢。

### 多卡部署

如果你有多张 GPU,但是每张 GPU 的显存大小都不足以容纳完整的模型,那么可以将模型切分在多张 GPU 上。首先安装 accelerate:`pip install accelerate`,然后通过如下方法加载模型:

```python
from utils import load_model_on_gpus

model = load_model_on_gpus("THUDM/chatglm3-6b", num_gpus=2)
```

即可将模型部署到两张 GPU 上进行推理。你可以将 `num_gpus` 改为你希望使用的 GPU 数。默认是均匀切分的,你也可以传入 `device_map` 参数来自己指定。
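As a usage note, the quantized load above composes with the `chat` API shown in this repository's README; a minimal sketch, assuming a CUDA GPU and the remote-code `quantize`/`chat` methods exposed by THUDM/chatglm3-6b:

```python
from transformers import AutoTokenizer, AutoModel

# 4-bit quantized load (per DEPLOYMENT.md above), then the chat API from the
# repository README; assumes a CUDA GPU is available.
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True).quantize(4).cuda()
model = model.eval()

response, history = model.chat(tokenizer, "你好", history=[])
print(response)
```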
DEPLOYMENT_en.md
ADDED
@@ -0,0 +1,43 @@
## Low-Cost Deployment

### Model Quantization

By default, the model is loaded with FP16 precision; running the above code requires about 13GB of VRAM. If your GPU's VRAM is limited, you can try loading the model with quantization, as follows:

```python
model = AutoModel.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True).quantize(4).cuda()
```

Model quantization will bring some performance loss. Through testing, ChatGLM3-6B can still perform natural and smooth generation under 4-bit quantization.

### CPU Deployment

If you don't have GPU hardware, you can also run inference on the CPU, but the inference speed will be slower. The usage is as follows (requires about 32GB of memory):

```python
model = AutoModel.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True).float()
```

### Mac Deployment

For Macs equipped with Apple Silicon or AMD GPUs, the MPS backend can be used to run ChatGLM3-6B on the GPU. Refer to Apple's [official instructions](https://developer.apple.com/metal/pytorch) to install PyTorch-Nightly (the correct version number should be 2.x.x.dev2023xxxx, not 2.x.x).

Currently, only [loading the model locally](README_en.md#load-model-locally) is supported on MacOS. Change the model loading in the code to load locally and use the MPS backend:

```python
model = AutoModel.from_pretrained("your local path", trust_remote_code=True).to('mps')
```

Loading the half-precision ChatGLM3-6B model requires about 13GB of memory. Machines with less memory (such as a MacBook Pro with 16GB of memory) will use virtual memory on the hard disk when free memory is insufficient, resulting in a significant slowdown in inference speed.

### Multi-GPU Deployment

If you have multiple GPUs, but no single GPU has enough VRAM to hold the complete model, the model can be split across multiple GPUs. First install accelerate (`pip install accelerate`), then load the model as follows:

```python
from utils import load_model_on_gpus

model = load_model_on_gpus("THUDM/chatglm3-6b", num_gpus=2)
```

This deploys the model on two GPUs for inference. You can change `num_gpus` to the number of GPUs you want to use. The model is split evenly by default, but you can also pass a `device_map` argument to specify the placement yourself.
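The `device_map` mentioned in the last line follows the usual Accelerate convention of mapping module names to device indices; a minimal sketch, assuming the repo's `load_model_on_gpus` helper forwards a `device_map` dict, with placeholder module names:

```python
from utils import load_model_on_gpus

# Hypothetical uneven split across two GPUs. The keys are placeholders and must
# match the real module names of the loaded ChatGLM3 model; this also assumes
# load_model_on_gpus accepts a device_map argument, as described above.
device_map = {
    "transformer.embedding": 0,
    "transformer.encoder.layers.0": 0,
    "transformer.encoder.layers.1": 1,
    "transformer.output_layer": 1,
}
model = load_model_on_gpus("THUDM/chatglm3-6b", num_gpus=2, device_map=device_map)
```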
MODEL_LICENSE
ADDED
@@ -0,0 +1,65 @@
The ChatGLM3-6B License

1. 定义

“许可方”是指分发其软件的 ChatGLM3-6B 模型团队。

“软件”是指根据本许可提供的 ChatGLM3-6B 模型参数。

2. 许可授予

根据本许可的条款和条件,许可方特此授予您非排他性、全球性、不可转让、不可再许可、可撤销、免版税的版权许可。

上述版权声明和本许可声明应包含在本软件的所有副本或重要部分中。

3. 限制

您不得出于任何军事或非法目的使用、复制、修改、合并、发布、分发、复制或创建本软件的全部或部分衍生作品。

您不得利用本软件从事任何危害国家安全和国家统一、危害社会公共利益、侵犯人身权益的行为。

4. 免责声明

本软件“按原样”提供,不提供任何明示或暗示的保证,包括但不限于对适销性、特定用途的适用性和非侵权性的保证。在任何情况下,作者或版权持有人均不对任何索赔、损害或其他责任负责,无论是在合同诉讼、侵权行为还是其他方面,由本软件、本软件的使用或其他交易引起或与之相关。

5. 责任限制

除适用法律禁止的范围外,在任何情况下且根据任何法律理论,无论是基于侵权行为、疏忽、合同、责任或其他原因,任何许可方均不对您承担任何直接、间接、特殊、偶然、示范性或间接损害,或任何其他商业损失,即使许可人已被告知此类损害的可能性。

6. 争议解决

本许可受中华人民共和国法律管辖并按其解释。因本许可引起的或与本许可有关的任何争议应提交北京市海淀区人民法院。

请注意,许可证可能会更新到更全面的版本。有关许可和版权的任何问题,请通过 [email protected] 与我们联系。

1. Definitions

“Licensor” means the ChatGLM3-6B Model Team that distributes its Software.

“Software” means the ChatGLM3-6B model parameters made available under this license.

2. License Grant

Subject to the terms and conditions of this License, the Licensor hereby grants to you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license to use the Software.

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

3. Restriction

You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any military or illegal purposes.

You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings.

4. Disclaimer

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

5. Limitation of Liability

EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.

6. Dispute Resolution

This license shall be governed and construed in accordance with the laws of People's Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing.

Note that the license is subject to update to a more comprehensive version. For any questions related to the license and copyright, please contact us at [email protected].
PROMPT.md
ADDED
@@ -0,0 +1,203 @@
## ChatGLM3 对话格式
为了避免用户输入的注入攻击,以及统一 Code Interpreter,Tool & Agent 等任务的输入,ChatGLM3 采用了全新的对话格式。

### 规定
#### 整体结构
ChatGLM3 对话的格式由若干对话组成,其中每个对话包含对话头和内容,一个典型的多轮对话结构如下
```text
<|system|>
You are ChatGLM3, a large language model trained by Zhipu.AI. Follow the user's instructions carefully. Respond using markdown.
<|user|>
Hello
<|assistant|>
Hello, I'm ChatGLM3. What can I assist you today?
```
**实际中每轮对话内容并不一定以换行符结尾,这里只是为了美观,下同**

#### 对话头
对话头占完整的一行,格式为
```text
<|role|>{metadata}
```
其中 `<|role|>` 部分使用 special token 表示,无法从文本形式被 tokenizer 编码以防止注入。metadata 部分采用纯文本表示,为可选内容。
* `<|system|>`:系统信息,设计上可穿插于对话中,**但目前规定仅可以出现在开头**
* `<|user|>`:用户
  - 不会连续出现多个来自 `<|user|>` 的信息
* `<|assistant|>`:AI 助手
  - 在出现之前必须有一个来自 `<|user|>` 的信息
* `<|observation|>`:外部的返回结果
  - 必须在 `<|assistant|>` 的信息之后

### 样例场景

为提升可读性,下列样例场景中表示角色的 special token 前均额外添加了一个换行符。实际使用及 tokenizer 实现中均无需额外添加这一换行。

#### 多轮对话
* 有且仅有 `<|user|>`、`<|assistant|>`、`<|system|>` 三种 role
```text
<|system|>
You are ChatGLM3, a large language model trained by Zhipu.AI. Follow the user's instructions carefully. Respond using markdown.
<|user|>
Hello
<|assistant|>
Hello, I'm ChatGLM3. What can I assist you today?
```

#### 工具调用
````
<|system|>
Answer the following questions as best as you can. You have access to the following tools:
[
    {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "The city and state, e.g. San Francisco, CA",
                },
                "unit": {"type": "string"},
            },
            "required": ["location"],
        },
    }
]
<|user|>
今天北京的天气怎么样?
<|assistant|>
好的,让我们来查看今天的天气
<|assistant|>get_current_weather
```python
tool_call(location="beijing", unit="celsius")
```
<|observation|>
{"temperature": 22}
<|assistant|>
根据查询结果,今天北京的气温为 22 摄氏度。
````

#### 代码执行
* 有 `<|user|>`、`<|assistant|>`、`<|system|>`、`<|observation|>` 四种 role。其中 `<|assistant|>` 的 metadata 只有 interpreter。

`````text
<|system|>
你是一位智能AI助手,你叫ChatGLM3,你连接着一台电脑,但请注意不能联网。在使用Python解决任务时,你可以运行代码并得到结果,如果运行结果有错误,你需要尽可能对代码进行改进。你可以处理用户上传到电脑上的文件,文件默认存储路径是/mnt/data/。
<|user|>
#File: /mnt/data/metadata.jsonl
#Size: 35380
#File uploaded
文件中是否存在缺失值或异常值?
<|assistant|>
首先,我将读取您提供的文件,并查看其内容以确定是否存在缺失值或异常值。

我们从加载文件开始:
<|assistant|>interpreter
```python
import json

# Load the uploaded file
data = []
with open("/mnt/data/metadata.jsonl", 'r') as file:
    for line in file:
        data.append(json.loads(line))

# Display the first few records to get a sense of the data
data[:5]
```
<|observation|>
```result
[{'file_name': 'aceinthehole.png',
  'name': 'Ace in the Hole',
  'type': 'survivor',
  'description': 'Lady Luck always seems to be throwing something good your way.'},
 {'file_name': 'adrenaline.png',
  'name': 'Adrenaline',
  'type': 'survivor',
  'description': 'You are fuelled by unexpected energy when on the verge of escape.'},
 {'file_name': 'aftercare.png',
  'name': 'Aftercare',
  'type': 'survivor',
  'description': "Unlocks potential in one's aura reading ability."},
 {'file_name': 'agitation.png',
  'name': 'Agitation',
  'type': 'killer',
  'description': 'You get excited in anticipation of hooking your prey.'},
 {'file_name': 'alert.png',
  'name': 'Alert',
  'type': 'survivor',
  'description': 'Your acute senses are on high alert.'}]
```
<|assistant|>
该文件看起来包含有关某些条目的元数据,每个条目有以下字段:

- `file_name`: 文件名称
- `name`: 名称
- `type`: 类型(例如 "survivor" 或 "killer")
- `description`: 描述

接下来,我将检查这些字段是否存在缺失值。
<|assistant|>interpreter
```python
# Check for missing values in each column
missing_values = {key: sum(1 for item in data if key not in item) for key in data[0].keys()}

missing_values
```
<|observation|>
```result
{'file_name': 0, 'name': 0, 'type': 0, 'description': 0}
```
<|assistant|>
每个字段都没有缺失值。这意味着每个条目都完整。

接下来,我们可以检查是否存在任何潜在的异常值。这通常需要更深入的分析和对数据的了解,但我可以从一些基本的统计和分布开始进行检查。为了这样做,我们可以查看“type”字段的值分布,看看是否有任何意外的类别。
<|assistant|>interpreter
```python
# Check the distribution of values in the 'type' column
type_distribution = {key: sum(1 for item in data if item['type'] == key) for key in set(item['type'] for item in data)}

type_distribution
```
<|observation|>
```result
{'survivor': 116, 'killer': 103}
```
<|assistant|>
“type”字段有两个值:“survivor”和“killer”。其中,“survivor”有116个条目,而“killer”有103个条目。

从这个初步的检查中,我们没有发现任何明显的异常值。

如果您有任何特定的标准或其他字段需要检查,我可以为您提供更详细的分析。您还希望我检查什么呢?
<|user|>
帮我画一个爱心
<|assistant|>interpreter
```python
import numpy as np
import matplotlib.pyplot as plt

# 爱心形状的参数方程
def heart(t):
    x = 16 * np.sin(t) ** 3
    y = 13 * np.cos(t) - 5 * np.cos(2 * t) - 2 * np.cos(3 * t) - np.cos(4 * t)
    return x, y

t = np.linspace(0, 2 * np.pi, 1000)
x, y = heart(t)

plt.figure(figsize=(6, 6))
plt.plot(x, y, color='red')
plt.axis('equal')
plt.axis('off')
plt.show()
```
<|observation|>
```result
[Image]
```
<|assistant|>
这是一个爱心形状。我使用了参数方程来描述这个形状,并使用matplotlib进行了绘制。如果您有任何其他需求或问题,请随时告诉我。
<|user|> # End
`````
PROMPT_en.md
ADDED
@@ -0,0 +1,201 @@
## ChatGLM3 Chat Format
To avoid injection attacks from user input, and to unify the input of Code Interpreter, Tool & Agent and other tasks, ChatGLM3 adopts a brand-new dialogue format.

### Regulations
#### Overall Structure
The format of the ChatGLM3 dialogue consists of several conversations, each of which contains a dialogue header and content. A typical multi-turn dialogue structure is as follows:
```text
<|system|>
You are ChatGLM3, a large language model trained by Zhipu.AI. Follow the user's instructions carefully. Respond using markdown.
<|user|>
Hello
<|assistant|>
Hello, I'm ChatGLM3. What can I assist you today?
```

#### Chat Header
The chat header occupies a complete line, formatted as:
```text
<|role|>{metadata}
```
The `<|role|>` part is represented by a special token, which cannot be produced by the tokenizer from the text form, to prevent injection. The `metadata` part is plain text and is optional.
* `<|system|>`: System information, which can be interspersed in the dialogue by design, **but currently only appears at the beginning**
* `<|user|>`: User
  - Multiple messages from `<|user|>` will not appear consecutively
* `<|assistant|>`: AI assistant
  - There must be a message from `<|user|>` before it appears
* `<|observation|>`: External return result
  - Must come after a message from `<|assistant|>`

### Example Scenarios

For better readability, an extra `\n` is added before each role special token. This extra `\n` should not be added in actual use or in the tokenizer implementation.

#### Multi-turn Dialogue
* There are only three roles: `<|user|>`, `<|assistant|>`, and `<|system|>`.
```text
<|system|>
You are ChatGLM3, a large language model trained by Zhipu.AI. Follow the user's instructions carefully. Respond using markdown.
<|user|>
Hello
<|assistant|>
Hello, I'm ChatGLM3. What can I assist you today?
```

#### Tool Calling
````
<|system|>
Answer the following questions as best as you can. You have access to the following tools:
[
    {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "The city and state, e.g. San Francisco, CA",
                },
                "unit": {"type": "string"},
            },
            "required": ["location"],
        },
    }
]
<|user|>
What's the weather in Beijing today?
<|assistant|>
Okay, let's look up the weather in Beijing today.
<|assistant|>get_current_weather
```python
tool_call(location="beijing", unit="celsius")
```
<|observation|>
{"temperature": 22}
<|assistant|>
According to the query results, the temperature in Beijing today is 22 degrees Celsius.
````

#### Code Execution
* Four available roles: `<|user|>`, `<|assistant|>`, `<|system|>`, and `<|observation|>`, where the `metadata` of `<|assistant|>` can only be interpreter.

`````text
<|system|>
You are an intelligent AI assistant, named ChatGLM3. You are connected to a computer, but please note that you cannot connect to the internet. When using Python to solve tasks, you can run the code and get the results. If there are errors in the running results, you need to improve the code as much as possible. You can process files uploaded by users to your computer, and the default storage path for files is /mnt/data/.
<|user|>
#File: /mnt/data/metadata.jsonl
#Size: 35380
#File uploaded
Are there missing or abnormal values in the file?
<|assistant|>
Firstly, I will read the file you provided and review its contents to determine if there are missing or abnormal values.

We start by loading the file:
<|assistant|>interpreter
```python
import json

# Load the uploaded file
data = []
with open("/mnt/data/metadata.jsonl", 'r') as file:
    for line in file:
        data.append(json.loads(line))

# Display the first few records to get a sense of the data
data[:5]
```
<|observation|>
```result
[{'file_name': 'aceinthehole.png',
  'name': 'Ace in the Hole',
  'type': 'survivor',
  'description': 'Lady Luck always seems to be throwing something good your way.'},
 {'file_name': 'adrenaline.png',
  'name': 'Adrenaline',
  'type': 'survivor',
  'description': 'You are fuelled by unexpected energy when on the verge of escape.'},
 {'file_name': 'aftercare.png',
  'name': 'Aftercare',
  'type': 'survivor',
  'description': "Unlocks potential in one's aura reading ability."},
 {'file_name': 'agitation.png',
  'name': 'Agitation',
  'type': 'killer',
  'description': 'You get excited in anticipation of hooking your prey.'},
 {'file_name': 'alert.png',
  'name': 'Alert',
  'type': 'survivor',
  'description': 'Your acute senses are on high alert.'}]
```
<|assistant|>
This file appears to contain metadata about certain items, each with the following fields:

- `file_name`
- `name`
- `type` (e.g. "survivor" or "killer")
- `description`

Next, I will check for missing values in these fields.
<|assistant|>interpreter
```python
# Check for missing values in each column
missing_values = {key: sum(1 for item in data if key not in item) for key in data[0].keys()}

missing_values
```
<|observation|>
```result
{'file_name': 0, 'name': 0, 'type': 0, 'description': 0}
```
<|assistant|>
Each field has no missing values. This means that each entry is complete.

Next, we can check for any potential outliers. This usually requires deeper analysis and understanding of the data, but I can start by examining some basic statistics and distributions. To do this, we can check the value distribution of the "type" field to see if there are any unexpected categories.
<|assistant|>interpreter
```python
# Check the distribution of values in the 'type' column
type_distribution = {key: sum(1 for item in data if item['type'] == key) for key in set(item['type'] for item in data)}

type_distribution
```
<|observation|>
```result
{'survivor': 116, 'killer': 103}
```
<|assistant|>
The 'type' field has two values: 'survivor' and 'killer'. Among them, 'survivor' has 116 entries, while 'killer' has 103 entries.

From this preliminary examination, we did not find any obvious abnormal values.

If you have any specific standards or other fields that need to be checked, I can provide you with a more detailed analysis. What else do you want me to check?
<|user|>
Help me draw a heart
<|assistant|>interpreter
```python
import numpy as np
import matplotlib.pyplot as plt

# The parametric equation for the shape of a heart
def heart(t):
    x = 16 * np.sin(t) ** 3
    y = 13 * np.cos(t) - 5 * np.cos(2 * t) - 2 * np.cos(3 * t) - np.cos(4 * t)
    return x, y

t = np.linspace(0, 2 * np.pi, 1000)
x, y = heart(t)

plt.figure(figsize=(6, 6))
plt.plot(x, y, color='red')
plt.axis('equal')
plt.axis('off')
plt.show()
```
<|observation|>
```result
[Image]
```
<|assistant|>
This is a heart shape. I used parametric equations to describe this shape and drew it using Matplotlib. If you have any other needs or questions, please feel free to let me know.
<|user|> # End
`````
README_en.md
ADDED
@@ -0,0 +1,250 @@
# ChatGLM3

<p align="center">
🤗 <a href="https://huggingface.co/THUDM/chatglm3-6b" target="_blank">HF Repo</a> • 🤖 <a href="https://modelscope.cn/models/ZhipuAI/chatglm3-6b" target="_blank">ModelScope</a> • 📔 <a href="https://lslfd0slxc.feishu.cn/wiki/WvQbwIJ9tiPAxGk8ywDck6yfnof" target="_blank">Document</a> • 🐦 <a href="https://twitter.com/thukeg" target="_blank">Twitter</a> • 📃 <a href="https://arxiv.org/abs/2103.10360" target="_blank">[GLM@ACL 22]</a> <a href="https://github.com/THUDM/GLM" target="_blank">[GitHub]</a> • 📃 <a href="https://arxiv.org/abs/2210.02414" target="_blank">[GLM-130B@ICLR 23]</a> <a href="https://github.com/THUDM/GLM-130B" target="_blank">[GitHub]</a> <br>
</p>
<p align="center">
👋 Join our <a href="https://join.slack.com/t/chatglm/shared_invite/zt-25ti5uohv-A_hs~am_D3Q8XPZMpj7wwQ" target="_blank">Slack</a> and <a href="resources/WECHAT.md" target="_blank">WeChat</a>
</p>
<p align="center">
📍 Experience the larger-scale ChatGLM model at <a href="https://www.chatglm.cn">chatglm.cn</a>
</p>

## Introduction

ChatGLM3 is a new generation of pre-trained dialogue models jointly released by Zhipu AI and Tsinghua KEG. ChatGLM3-6B is the open-source model in the ChatGLM3 series, maintaining many excellent features of the first two generations such as smooth dialogue and low deployment threshold, while introducing the following features:

1. **Stronger Base Model:** The base model of ChatGLM3-6B, ChatGLM3-6B-Base, adopts a more diverse training dataset, more sufficient training steps, and a more reasonable training strategy. Evaluations on datasets from various perspectives such as semantics, mathematics, reasoning, code, and knowledge show that **ChatGLM3-6B-Base has the strongest performance among base models below 10B**.

2. **More Complete Function Support:** ChatGLM3-6B adopts a newly designed [Prompt format](PROMPT_en.md), supporting multi-turn dialogues as usual. It also natively supports [tool invocation](tool_using/README_en.md) (Function Call), code execution (Code Interpreter), and Agent tasks in complex scenarios.

3. **More Comprehensive Open-source Series:** In addition to the dialogue model [ChatGLM3-6B](https://huggingface.co/THUDM/chatglm3-6b), the base model [ChatGLM3-6B-Base](https://huggingface.co/THUDM/chatglm3-6b-base) and the long-text dialogue model [ChatGLM3-6B-32K](https://huggingface.co/THUDM/chatglm3-6b-32k) have also been open-sourced. All these weights are **fully open** for academic research, and **free commercial use is also allowed** after registration via a [questionnaire](https://open.bigmodel.cn/mla/form).

-----

The ChatGLM3 open-source model aims to promote the development of large-model technology together with the open-source community. Developers and everyone are earnestly requested to comply with the [open-source protocol](MODEL_LICENSE), and not to use the open-source models, codes, and derivatives for any purposes that might harm the nation and society, or for any services that have not been evaluated and filed for safety. Currently, no applications, including web, Android, Apple iOS, and Windows App, have been developed based on the **ChatGLM3 open-source model** by our project team.

Although every effort has been made to ensure the compliance and accuracy of the data at various stages of model training, due to the smaller scale of the ChatGLM3-6B model and the influence of probabilistic randomness factors, the accuracy of output content cannot be guaranteed. The model output is also easily misled by user input. **This project does not assume risks and liabilities caused by data security, public opinion risks, or any misleading, abuse, dissemination, and improper use of open-source models and codes.**

## Model List

| Model | Seq Length | Download |
| :---: | :---: | :---: |
| ChatGLM3-6B | 8k | [HuggingFace](https://huggingface.co/THUDM/chatglm3-6b) \| [ModelScope](https://modelscope.cn/models/ZhipuAI/chatglm3-6b) |
| ChatGLM3-6B-Base | 8k | [HuggingFace](https://huggingface.co/THUDM/chatglm3-6b-base) \| [ModelScope](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base) |
| ChatGLM3-6B-32K | 32k | [HuggingFace](https://huggingface.co/THUDM/chatglm3-6b-32k) \| [ModelScope](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k) |

## Projects
Open-source projects that accelerate ChatGLM3:
* [chatglm.cpp](https://github.com/li-plus/chatglm.cpp): Real-time inference on your laptop accelerated by quantization, similar to llama.cpp.
* [ChatGLM3-TPU](https://github.com/sophgo/ChatGLM3-TPU): Using the TPU-accelerated inference solution, it runs at about 7.5 token/s in real time on the edge chip BM1684X (16T@FP16, 16G DDR).

## Evaluation Results

### Typical Tasks

We selected 8 typical Chinese-English datasets and conducted performance tests on the ChatGLM3-6B (base) version.

| Model | GSM8K | MATH | BBH | MMLU | C-Eval | CMMLU | MBPP | AGIEval |
|------------------|:-----:|:----:|:----:|:----:|:------:|:-----:|:----:|:-------:|
| ChatGLM2-6B-Base | 32.4 | 6.5 | 33.7 | 47.9 | 51.7 | 50.0 | - | - |
| Best Baseline | 52.1 | 13.1 | 45.0 | 60.1 | 63.5 | 62.2 | 47.5 | 45.8 |
| ChatGLM3-6B-Base | 72.3 | 25.7 | 66.1 | 61.4 | 69.0 | 67.5 | 52.4 | 53.7 |

> "Best Baseline" refers to the pre-trained models that perform best on the corresponding datasets with model parameters below 10B, excluding models that are trained specifically for a single task and do not maintain general capabilities.

> In the tests of ChatGLM3-6B-Base, BBH used a 3-shot test, GSM8K and MATH that require inference used a 0-shot CoT test, MBPP used a 0-shot generation followed by running test cases to calculate Pass@1, and other multiple-choice type datasets all used a 0-shot test.

We have conducted manual evaluation tests on ChatGLM3-6B-32K in multiple long-text application scenarios. Compared with the second-generation model, its effect has improved by more than 50% on average. In applications such as paper reading, document summarization, and financial report analysis, this improvement is particularly significant. In addition, we also tested the model on the LongBench evaluation set, and the specific results are shown in the table below.

| Model | Average | Summary | Single-Doc QA | Multi-Doc QA | Code | Few-shot | Synthetic |
|-----------------|:-----:|:----:|:----:|:----:|:------:|:-----:|:-----:|
| ChatGLM2-6B-32K | 41.5 | 24.8 | 37.6 | 34.7 | 52.8 | 51.3 | 47.7 |
| ChatGLM3-6B-32K | 50.2 | 26.6 | 45.8 | 46.1 | 56.2 | 61.2 | 65 |

## How to Use

### Environment Installation
First, you need to download this repository:
```shell
git clone https://github.com/THUDM/ChatGLM3
cd ChatGLM3
```

Then use pip to install the dependencies:
```
pip install -r requirements.txt
```
+ The `transformers` library version should be `4.30.2` or above, and the `torch` library should be 2.0 or above to obtain the best inference performance.
+ In order to ensure that the version of `torch` is correct, please strictly follow the instructions of the [official documentation](https://pytorch.org/get-started/locally/) for installation.
+ The `gradio` library version should be the `3.x` version.

### Integrated Demo

We provide an integrated demo that incorporates the following three functionalities. Please refer to [Integrated Demo](composite_demo/README_en.md) for how to run it.

- Chat: Dialogue mode, where you can interact with the model.
- Tool: Tool mode, where in addition to dialogue, the model can also perform other operations using tools.
- Code Interpreter: Code interpreter mode, where the model can execute code in a Jupyter environment and obtain results to complete complex tasks.

### Usage

The ChatGLM model can be called to start a conversation using the following code:

```python
>>> from transformers import AutoTokenizer, AutoModel
>>> tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True)
>>> model = AutoModel.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True, device='cuda')
>>> model = model.eval()
>>> response, history = model.chat(tokenizer, "Hello", history=[])
>>> print(response)
Hello 👋! I'm ChatGLM3-6B, the artificial intelligence assistant, nice to meet you. Feel free to ask me any questions.
>>> response, history = model.chat(tokenizer, "What should I do if I can't sleep at night", history=history)
>>> print(response)
If you're having trouble sleeping at night, here are a few suggestions that might help:

1. Create a relaxing sleep environment: Make sure your bedroom is cool, quiet, and dark. Consider using earplugs, a white noise machine, or a fan to help create an optimal environment.
2. Establish a bedtime routine: Try to go to bed and wake up at the same time every day, even on weekends. A consistent routine can help regulate your body's internal clock.
3. Avoid stimulating activities before bedtime: Avoid using electronic devices, watching TV, or engaging in stimulating activities like exercise or puzzle-solving, as these can interfere with your ability to fall asleep.
4. Limit caffeine and alcohol: Avoid consuming caffeine and alcohol close to bedtime, as these can disrupt your sleep patterns.
5. Practice relaxation techniques: Try meditation, deep breathing, or progressive muscle relaxation to help calm your mind and body before sleep.
6. Consider taking a warm bath or shower: A warm bath or shower can help relax your muscles and promote sleep.
7. Get some fresh air: Make sure to get some fresh air during the day, as lack of vitamin D can interfere with sleep quality.

If you continue to have difficulty sleeping, consult with a healthcare professional for further guidance and support.
```

#### Load Model Locally
The above code will automatically download the model implementation and parameters via `transformers`. The complete model implementation is available on [Hugging Face Hub](https://huggingface.co/THUDM/chatglm3-6b). If your network environment is poor, downloading model parameters might take a long time or even fail. In this case, you can first download the model to your local machine, and then load it from there.

To download the model from Hugging Face Hub, you need to [install Git LFS](https://docs.github.com/en/repositories/working-with-files/managing-large-files/installing-git-large-file-storage) first, then run
```Shell
git clone https://huggingface.co/THUDM/chatglm3-6b
```

If the download from HuggingFace is slow, you can also download it from [ModelScope](https://modelscope.cn/models/ZhipuAI/chatglm3-6b).

### Model Fine-tuning
Please refer to the dialogue model fine-tuning [ChatGLM3-6B fine-tuning example](finetune_chatmodel_demo/README.md), or the base model fine-tuning [ChatGLM3-6B-base fine-tuning example](finetune_basemodel_demo/README.md).
Please note that different fine-tuning scripts correspond to different models; select the corresponding model according to your needs.

### Web-based Dialogue Demo

You can launch a web-based demo using Gradio with the following command:
```shell
python web_demo.py
```

You can launch a web-based demo using Streamlit with the following command:
```shell
streamlit run web_demo2.py
```

The web-based demo will run a web server and output an address. You can use it by opening the output address in a browser. Based on tests, the web-based demo using Streamlit runs more smoothly.

### Command Line Dialogue Demo

Run [cli_demo.py](basic_demo/cli_demo.py) in the repository:

```shell
python cli_demo.py
```

The program will interact in the command line: enter instructions in the command line and hit enter to generate a response. Enter `clear` to clear the dialogue history, enter `stop` to terminate the program.

### API Deployment
Thanks to [@xusenlinzy](https://github.com/xusenlinzy) for implementing the OpenAI-format streaming API deployment, which can serve as the backend for any ChatGPT-based application, such as [ChatGPT-Next-Web](https://github.com/Yidadaa/ChatGPT-Next-Web). You can deploy it by running [openai_api.py](openai_api_demo/openai_api.py) in the repository:
```shell
cd openai_api_demo
python openai_api.py
```
We have also written sample code to test the performance of the API calls. This can be tested by running [openai_api_request.py](openai_api_demo/openai_api_request.py) in the repository.
+ Test with Curl
```shell
curl -X POST "http://127.0.0.1:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d "{\"model\": \"chatglm3-6b\", \"messages\": [{\"role\": \"system\", \"content\": \"You are ChatGLM3, a large language model trained by Zhipu. Follow the user's instructions carefully. Respond using markdown.\"}, {\"role\": \"user\", \"content\": \"Hello, tell me a story, about 100 words\"}], \"stream\": false, \"max_tokens\": 100, \"temperature\": 0.8, \"top_p\": 0.8}"
```
+ Test with Python
```shell
cd openai_api_demo
python openai_api_request.py
```
If the test is successful, the model should return a story.

### Tool Invocation

For methods of tool invocation, please refer to [Tool Invocation](tool_using/README_en.md).

## Low-Cost Deployment

### Model Quantization

By default, the model is loaded with FP16 precision; running the above code requires about 13GB of VRAM. If your GPU's VRAM is limited, you can try loading the model with quantization, as follows:

```python
model = AutoModel.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True).quantize(4).cuda()
```

Model quantization will bring some performance loss. Through testing, ChatGLM3-6B can still perform natural and smooth generation under 4-bit quantization.

### CPU Deployment

If you don't have GPU hardware, you can also run inference on the CPU, but the inference speed will be slower. The usage is as follows (requires about 32GB of memory):

```python
model = AutoModel.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True).float()
```

### Mac Deployment

For Macs equipped with Apple Silicon or AMD GPUs, the MPS backend can be used to run ChatGLM3-6B on the GPU. Refer to Apple's [official instructions](https://developer.apple.com/metal/pytorch) to install PyTorch-Nightly (the correct version number should be 2.x.x.dev2023xxxx, not 2.x.x).

Currently, only [loading the model locally](README_en.md#load-model-locally) is supported on MacOS. Change the model loading in the code to load locally and use the MPS backend:

```python
model = AutoModel.from_pretrained("your local path", trust_remote_code=True).to('mps')
```

Loading the half-precision ChatGLM3-6B model requires about 13GB of memory. Machines with less memory (such as a MacBook Pro with 16GB of memory) will use virtual memory on the hard disk when free memory is insufficient, resulting in a significant slowdown in inference speed.

### Multi-GPU Deployment

If you have multiple GPUs, but no single GPU has enough VRAM to hold the complete model, the model can be split across multiple GPUs. First install accelerate (`pip install accelerate`), then load the model as follows:

```python
from utils import load_model_on_gpus

model = load_model_on_gpus("THUDM/chatglm3-6b", num_gpus=2)
```

This deploys the model on two GPUs for inference. You can change `num_gpus` to the number of GPUs you want to use. The model is split evenly by default, but you can also pass a `device_map` argument to specify the placement yourself.

## Citation

If you find our work helpful, please consider citing the following papers.

```
@article{zeng2022glm,
  title={Glm-130b: An open bilingual pre-trained model},
  author={Zeng, Aohan and Liu, Xiao and Du, Zhengxiao and Wang, Zihan and Lai, Hanyu and Ding, Ming and Yang, Zhuoyi and Xu, Yifan and Zheng, Wendi and Xia, Xiao and others},
  journal={arXiv preprint arXiv:2210.02414},
  year={2022}
}
```
```
@inproceedings{du2022glm,
  title={GLM: General Language Model Pretraining with Autoregressive Blank Infilling},
  author={Du, Zhengxiao and Qian, Yujie and Liu, Xiao and Ding, Ming and Qiu, Jiezhong and Yang, Zhilin and Tang, Jie},
  booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages={320--335},
  year={2022}
}
```
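As a companion to the curl test in the API Deployment section, here is a rough Python equivalent using `requests`; it assumes openai_api.py is serving on 127.0.0.1:8000 and reuses the payload fields shown in that example:

```python
import requests

# Same request as the curl example above, sent from Python instead of the shell.
payload = {
    "model": "chatglm3-6b",
    "messages": [
        {"role": "system", "content": "You are ChatGLM3, a large language model trained by Zhipu. Follow the user's instructions carefully. Respond using markdown."},
        {"role": "user", "content": "Hello, tell me a story, about 100 words"},
    ],
    "stream": False,
    "max_tokens": 100,
    "temperature": 0.8,
    "top_p": 0.8,
}
resp = requests.post("http://127.0.0.1:8000/v1/chat/completions", json=payload, timeout=60)
print(resp.json())
```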
Untitled.ipynb
ADDED
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d1389f08-db10-44c0-acc0-08170ac549d0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: huggingface_hub in /opt/conda/lib/python3.10/site-packages (0.19.4)\n",
      "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (3.9.0)\n",
      "Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2023.10.0)\n",
      "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2.31.0)\n",
      "Requirement already satisfied: tqdm>=4.42.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.65.0)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (6.0.1)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.8.0)\n",
      "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (23.1)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2.0.4)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (3.4)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (1.26.18)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2023.7.22)\n",
      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
      "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "pip install huggingface_hub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "37ce339b-5f87-4268-a808-7ec1f9b04055",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e74cef2b1046454abf03203299de8bd7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from huggingface_hub import login\n",
    "login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b338999a-2f2e-45fd-8bfe-ff5c035e79f3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ff4e7de5abd24397a2476b50df747fc0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "pytorch_model-00002-of-00007.bin: 0%| | 0.00/1.97G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4810a0b8ac7c4537ae3b825eaaacd03d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Upload 11 LFS files: 0%| | 0/11 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2c378790ca33452bac16db72d1a93b5d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "pytorch_model-00005-of-00007.bin: 0%| | 0.00/1.97G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "19aa1f962f30403d8487851207ebc11f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "pytorch_model-00003-of-00007.bin: 0%| | 0.00/1.93G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "700a7f07898f476c84dee0481d5dd594",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "pytorch_model-00001-of-00007.bin: 0%| | 0.00/1.83G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0dbd5185b81f4cb380005865a8036e8b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "pytorch_model-00004-of-00007.bin: 0%| | 0.00/1.82G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from huggingface_hub import HfApi\n",
    "api = HfApi()\n",
    "\n",
    "api.upload_folder(\n",
    "    folder_path=\"\",\n",
    "    repo_id=\"kevinwang676/ChatGLM3-demo\",\n",
    "    repo_type=\"space\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ff1ddc96-5925-4978-bb9c-d016b716ade5",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
basic_demo/cli_demo.py
ADDED
@@ -0,0 +1,79 @@
import os
import platform
from transformers import AutoTokenizer, AutoModel
import torch

MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/chatglm3-6b')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# For Apple Silicon Macs (e.g. M1): use a PyTorch build with Metal (MPS) support
# DEVICE = 'mps'

# For AMD GPUs such as the MI100 (not officially supported yet): use a PyTorch build with ROCm
# DEVICE = 'cuda'

# For Intel GPUs such as the A770 (not officially supported yet): use a PyTorch build with oneDNN
# and install intel-extension-for-pytorch
# import intel_extension_for_pytorch as ipex
# DEVICE = 'xpu'

# For Moore Threads GPUs such as the MTT S80 (not officially supported yet): use a PyTorch build with MUSA
# DEVICE = 'musa'


tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
if 'cuda' in DEVICE:  # NVIDIA and AMD GPUs can use half precision
    model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).to(DEVICE).eval()
else:  # CPU, Intel GPUs and other devices fall back to full (float32) precision
    model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).float().to(DEVICE).eval()

os_name = platform.system()
clear_command = 'cls' if os_name == 'Windows' else 'clear'
stop_stream = False

welcome_prompt = "欢迎使用 ChatGLM3-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序"


def build_prompt(history):
    prompt = welcome_prompt
    for query, response in history:
        prompt += f"\n\n用户:{query}"
        prompt += f"\n\nChatGLM3-6B:{response}"
    return prompt


def main():
    past_key_values, history = None, []
    global stop_stream
    print(welcome_prompt)
    while True:
        query = input("\n用户:")
        if query.strip() == "stop":
            break
        if query.strip() == "clear":
            past_key_values, history = None, []
            os.system(clear_command)
            print(welcome_prompt)
            continue
        print("\nChatGLM:", end="")
        current_length = 0
        for response, history, past_key_values in model.stream_chat(tokenizer, query, history=history, top_p=1,
                                                                     temperature=0.01,
                                                                     past_key_values=past_key_values,
                                                                     return_past_key_values=True):
            if stop_stream:
                stop_stream = False
                break
            else:
                print(response[current_length:], end="", flush=True)
                current_length = len(response)
        print("")


if __name__ == "__main__":
    main()
basic_demo/cli_demo_bad_word_ids.py
ADDED
@@ -0,0 +1,83 @@
1 |
+
"""
|
2 |
+
This script demonstrates how to use the `bad_words_ids` argument to filter out.
|
3 |
+
"""
|
4 |
+
import os
|
5 |
+
import platform
|
6 |
+
from transformers import AutoTokenizer, AutoModel
|
7 |
+
import torch
|
8 |
+
|
9 |
+
MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/chatglm3-6b')
|
10 |
+
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)
|
11 |
+
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
|
12 |
+
|
13 |
+
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
|
14 |
+
if 'cuda' in DEVICE: # AMD, NVIDIA GPU can use Half Precision
|
15 |
+
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).to(DEVICE).eval()
|
16 |
+
else: # CPU, Intel GPU and other GPU can use Float16 Precision Only
|
17 |
+
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).float().to(DEVICE).eval()
|
18 |
+
|
19 |
+
os_name = platform.system()
|
20 |
+
clear_command = 'cls' if os_name == 'Windows' else 'clear'
|
21 |
+
stop_stream = False
|
22 |
+
|
23 |
+
welcome_prompt = "欢迎使用 ChatGLM3-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序"
|
24 |
+
|
25 |
+
# 定义不希望出现的词汇, 你可以自定义, 在这个例子中,如果模型回答包含 "你好" 或 "ChatGLM",则会出现这个报错
|
26 |
+
# probability tensor contains either `inf`, `nan` or element < 0
|
27 |
+
|
28 |
+
bad_words = ["你好", "ChatGLM"]
|
29 |
+
# 将这些词汇转换为token ID列表,每个短语是一个子列表
|
30 |
+
bad_word_ids = [tokenizer.encode(bad_word, add_special_tokens=False) for bad_word in bad_words]
|
31 |
+
|
32 |
+
|
33 |
+
def build_prompt(history):
|
34 |
+
prompt = welcome_prompt
|
35 |
+
for query, response in history:
|
36 |
+
prompt += f"\n\n用户:{query}"
|
37 |
+
prompt += f"\n\nChatGLM3-6B:{response}"
|
38 |
+
return prompt
|
39 |
+
|
40 |
+
def main():
|
41 |
+
past_key_values, history = None, []
|
42 |
+
global stop_stream
|
43 |
+
print(welcome_prompt)
|
44 |
+
while True:
|
45 |
+
query = input("\n用户:")
|
46 |
+
if query.strip().lower() == "stop":
|
47 |
+
break
|
48 |
+
if query.strip().lower() == "clear":
|
49 |
+
past_key_values, history = None, []
|
50 |
+
os.system(clear_command)
|
51 |
+
print(welcome_prompt)
|
52 |
+
continue
|
53 |
+
|
54 |
+
# Attempt to generate a response
|
55 |
+
try:
|
56 |
+
print("\nChatGLM:", end="")
|
57 |
+
current_length = 0
|
58 |
+
response_generated = False
|
59 |
+
for response, history, past_key_values in model.stream_chat(
|
60 |
+
tokenizer, query, history=history, top_p=1,
|
61 |
+
temperature=0.01,
|
62 |
+
past_key_values=past_key_values,
|
63 |
+
return_past_key_values=True,
|
64 |
+
bad_words_ids=bad_word_ids # assuming this is implemented correctly
|
65 |
+
):
|
66 |
+
response_generated = True
|
67 |
+
# Check if the response contains any bad words
|
68 |
+
if any(bad_word in response for bad_word in bad_words):
|
69 |
+
print("我的回答涉嫌了bad word")
|
70 |
+
break # Break the loop if a bad word is detected
|
71 |
+
|
72 |
+
# Otherwise, print the generated response
|
73 |
+
print(response[current_length:], end="", flush=True)
|
74 |
+
current_length = len(response)
|
75 |
+
if not response_generated:
|
76 |
+
print("没有生成任何回答。")
|
77 |
+
except RuntimeError as e:
|
78 |
+
print(f"生成文本时发生错误:{e},这可能是涉及到设定的敏感词汇")
|
79 |
+
|
80 |
+
print("")
|
81 |
+
|
82 |
+
if __name__ == "__main__":
|
83 |
+
main()
|
basic_demo/utils.py
ADDED
@@ -0,0 +1,58 @@
import os
from typing import Dict, Union, Optional
from torch.nn import Module
from transformers import AutoModel


def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
    # transformer.word_embeddings occupies 1 layer,
    # transformer.final_layernorm and lm_head occupy 1 layer,
    # transformer.layers occupy 28 layers;
    # in total, 30 layers are distributed across num_gpus cards.
    num_trans_layers = 28
    per_gpu_layers = 30 / num_gpus

    # Bugfix: on Linux, torch.embedding can receive weight and input on different devices, raising a RuntimeError.
    # On Windows, model.device is set to transformer.word_embeddings.device;
    # on Linux, model.device is set to lm_head.device.
    # When chat or stream_chat is called, input_ids is placed on model.device.
    # If transformer.word_embeddings.device and model.device differ, a RuntimeError follows,
    # so transformer.word_embeddings, transformer.final_layernorm and lm_head are all kept on the first card.
    # This file originates from https://github.com/THUDM/ChatGLM-6B/blob/main/utils.py
    # with only minor modifications to support ChatGLM3.
    device_map = {
        'transformer.embedding.word_embeddings': 0,
        'transformer.encoder.final_layernorm': 0,
        'transformer.output_layer': 0,
        'transformer.rotary_pos_emb': 0,
        'lm_head': 0
    }

    used = 2
    gpu_target = 0
    for i in range(num_trans_layers):
        if used >= per_gpu_layers:
            gpu_target += 1
            used = 0
        assert gpu_target < num_gpus
        device_map[f'transformer.encoder.layers.{i}'] = gpu_target
        used += 1

    return device_map


def load_model_on_gpus(checkpoint_path: Union[str, os.PathLike], num_gpus: int = 2,
                       device_map: Optional[Dict[str, int]] = None, **kwargs) -> Module:
    if num_gpus < 2 and device_map is None:
        model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs).half().cuda()
    else:
        from accelerate import dispatch_model

        model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs).half()

        if device_map is None:
            device_map = auto_configure_device_map(num_gpus)

        model = dispatch_model(model, device_map=device_map)

    return model
basic_demo/web_demo.py
ADDED
@@ -0,0 +1,117 @@
import os
from transformers import AutoModel, AutoTokenizer
import gradio as gr
import mdtex2html
from utils import load_model_on_gpus
import torch

MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/chatglm3-6b')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
if 'cuda' in DEVICE:  # NVIDIA and AMD GPUs can use half precision
    model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).to(DEVICE).eval()
else:  # CPU, Intel GPUs and other devices fall back to full (float32) precision
    model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).float().to(DEVICE).eval()

# Multi-GPU support: replace the line above with the two lines below and set num_gpus to your actual number of GPUs
# from utils import load_model_on_gpus
# model = load_model_on_gpus("THUDM/chatglm3-6b", num_gpus=2)

"""Override Chatbot.postprocess"""

def postprocess(self, y):
    if y is None:
        return []
    for i, (message, response) in enumerate(y):
        y[i] = (
            None if message is None else mdtex2html.convert((message)),
            None if response is None else mdtex2html.convert(response),
        )
    return y


gr.Chatbot.postprocess = postprocess


def parse_text(text):
    """copy from https://github.com/GaiZhenbiao/ChuanhuChatGPT/"""
    lines = text.split("\n")
    lines = [line for line in lines if line != ""]
    count = 0
    for i, line in enumerate(lines):
        if "```" in line:
            count += 1
            items = line.split('`')
            if count % 2 == 1:
                lines[i] = f'<pre><code class="language-{items[-1]}">'
            else:
                lines[i] = f'<br></code></pre>'
        else:
            if i > 0:
                if count % 2 == 1:
                    line = line.replace("`", "\`")
                    line = line.replace("<", "&lt;")
                    line = line.replace(">", "&gt;")
                    line = line.replace(" ", "&nbsp;")
                    line = line.replace("*", "&ast;")
                    line = line.replace("_", "&lowbar;")
                    line = line.replace("-", "&#45;")
                    line = line.replace(".", "&#46;")
                    line = line.replace("!", "&#33;")
                    line = line.replace("(", "&#40;")
                    line = line.replace(")", "&#41;")
                    line = line.replace("$", "&#36;")
                lines[i] = "<br>" + line
    text = "".join(lines)
    return text


def predict(input, chatbot, max_length, top_p, temperature, history, past_key_values):
    chatbot.append((parse_text(input), ""))
    for response, history, past_key_values in model.stream_chat(tokenizer, input, history,
                                                                 past_key_values=past_key_values,
                                                                 return_past_key_values=True,
                                                                 max_length=max_length, top_p=top_p,
                                                                 temperature=temperature):
        chatbot[-1] = (parse_text(input), parse_text(response))

        yield chatbot, history, past_key_values


def reset_user_input():
    return gr.update(value='')


def reset_state():
    return [], [], None


with gr.Blocks() as demo:
    gr.HTML("""<h1 align="center">ChatGLM3-6B</h1>""")

    chatbot = gr.Chatbot()
    with gr.Row():
        with gr.Column(scale=4):
            with gr.Column(scale=12):
                user_input = gr.Textbox(show_label=False, placeholder="Input...", lines=10).style(
                    container=False)
            with gr.Column(min_width=32, scale=1):
                submitBtn = gr.Button("Submit", variant="primary")
        with gr.Column(scale=1):
            emptyBtn = gr.Button("Clear History")
            max_length = gr.Slider(0, 32768, value=8192, step=1.0, label="Maximum length", interactive=True)
            top_p = gr.Slider(0, 1, value=0.8, step=0.01, label="Top P", interactive=True)
            temperature = gr.Slider(0, 1, value=0.6, step=0.01, label="Temperature", interactive=True)

    history = gr.State([])
    past_key_values = gr.State(None)

    submitBtn.click(predict, [user_input, chatbot, max_length, top_p, temperature, history, past_key_values],
                    [chatbot, history, past_key_values], show_progress=True)
    submitBtn.click(reset_user_input, [], [user_input])

    emptyBtn.click(reset_state, outputs=[chatbot, history, past_key_values], show_progress=True)

demo.queue().launch(share=False, server_name="127.0.0.1", server_port=8501, inbrowser=True)
basic_demo/web_demo2.py
ADDED
@@ -0,0 +1,89 @@
import os
import streamlit as st
import torch
from transformers import AutoModel, AutoTokenizer

MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/chatglm3-6b')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Set the page title, icon and layout
st.set_page_config(
    page_title="ChatGLM3-6B 演示",
    page_icon=":robot:",
    layout="wide"
)

@st.cache_resource
def get_model():
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
    if 'cuda' in DEVICE:  # NVIDIA and AMD GPUs can use half precision
        model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).to(DEVICE).eval()
    else:  # CPU, Intel GPUs and other devices fall back to full (float32) precision
        model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).float().to(DEVICE).eval()
    # Multi-GPU support: replace the line above with the two lines below and set num_gpus to your actual number of GPUs
    # from utils import load_model_on_gpus
    # model = load_model_on_gpus("THUDM/chatglm3-6b", num_gpus=2)
    return tokenizer, model

# Load the ChatGLM3 model and tokenizer
tokenizer, model = get_model()

# Initialize the chat history and past key values
if "history" not in st.session_state:
    st.session_state.history = []
if "past_key_values" not in st.session_state:
    st.session_state.past_key_values = None

# Sliders for max_length, top_p and temperature
max_length = st.sidebar.slider("max_length", 0, 32768, 8192, step=1)
top_p = st.sidebar.slider("top_p", 0.0, 1.0, 0.8, step=0.01)
temperature = st.sidebar.slider("temperature", 0.0, 1.0, 0.6, step=0.01)

# Button to clear the conversation history
buttonClean = st.sidebar.button("清理会话历史", key="clean")
if buttonClean:
    st.session_state.history = []
    st.session_state.past_key_values = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    st.rerun()

# Render the chat history
for i, message in enumerate(st.session_state.history):
    if message["role"] == "user":
        with st.chat_message(name="user", avatar="user"):
            st.markdown(message["content"])
    else:
        with st.chat_message(name="assistant", avatar="assistant"):
            st.markdown(message["content"])

# Input and output placeholders
with st.chat_message(name="user", avatar="user"):
    input_placeholder = st.empty()
with st.chat_message(name="assistant", avatar="assistant"):
    message_placeholder = st.empty()

# Get the user input
prompt_text = st.chat_input("请输入您的问题")

# If the user entered something, generate a reply
if prompt_text:

    input_placeholder.markdown(prompt_text)
    history = st.session_state.history
    past_key_values = st.session_state.past_key_values
    for response, history, past_key_values in model.stream_chat(
        tokenizer,
        prompt_text,
        history,
        past_key_values=past_key_values,
        max_length=max_length,
        top_p=top_p,
        temperature=temperature,
        return_past_key_values=True,
    ):
        message_placeholder.markdown(response)

    # Update the history and past key values
    st.session_state.history = history
    st.session_state.past_key_values = past_key_values
chatglm3-6b/.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
chatglm3-6b/MODEL_LICENSE
ADDED
@@ -0,0 +1,65 @@
The ChatGLM3-6B License

1. 定义

“许可方”是指分发其软件的 ChatGLM3-6B 模型团队。

“软件”是指根据本许可提供的 ChatGLM3-6B 模型参数。

2. 许可授予

根据本许可的条款和条件,许可方特此授予您非排他性、全球性、不可转让、不可再许可、可撤销、免版税的版权许可。

上述版权声明和本许可声明应包含在本软件的所有副本或重要部分中。

3. 限制

您不得出于任何军事或非法目的使用、复制、修改、合并、发布、分发、复制或创建本软件的全部或部分衍生作品。

您不得利用本软件从事任何危害国家安全和国家统一、危害社会公共利益、侵犯人身权益的行为。

4. 免责声明

本软件“按原样”提供,不提供任何明示或暗示的保证,包括但不限于对适销性、特定用途的适用性和非侵权性的保证。在任何情况下,作者或版权持有人均不对任何索赔、损害或其他责任负责,无论是在合同诉讼、侵权行为还是其他方面,由本软件或本软件的使用或其他交易引起或与之相关。

5. 责任限制

除适用法律禁止的范围外,在任何情况下且根据任何法律理论,无论是基于侵权行为、疏忽、合同、责任或其他原因,任何许可方均不对您承担任何直接、间接、特殊、偶然、示范性或间接损害,或任何其他商业损失,即使许可人已被告知此类损害的可能性。

6. 争议解决

本许可受中华人民共和国法律管辖并按其解释。因本许可引起的或与本许可有关的任何争议应提交北京市海淀区人民法院。

请注意,许可证可能会更新到更全面的版本。有关许可和版权的任何问题,请通过 [email protected] 与我们联系。

1. Definitions

“Licensor” means the ChatGLM3-6B Model Team that distributes its Software.

“Software” means the ChatGLM3-6B model parameters made available under this license.

2. License Grant

Subject to the terms and conditions of this License, the Licensor hereby grants to you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license to use the Software.

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

3. Restriction

You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any military, or illegal purposes.

You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings.

4. Disclaimer

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

5. Limitation of Liability

EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.

6. Dispute Resolution

This license shall be governed and construed in accordance with the laws of People's Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing.

Note that the license is subject to update to a more comprehensive version. For any questions related to the license and copyright, please contact us at [email protected].
chatglm3-6b/README.md
ADDED
@@ -0,0 +1,102 @@
---
language:
- zh
- en
tags:
- glm
- chatglm
- thudm
---
# ChatGLM3-6B
<p align="center">
💻 <a href="https://github.com/THUDM/ChatGLM" target="_blank">Github Repo</a> • 🐦 <a href="https://twitter.com/thukeg" target="_blank">Twitter</a> • 📃 <a href="https://arxiv.org/abs/2103.10360" target="_blank">[GLM@ACL 22]</a> <a href="https://github.com/THUDM/GLM" target="_blank">[GitHub]</a> • 📃 <a href="https://arxiv.org/abs/2210.02414" target="_blank">[GLM-130B@ICLR 23]</a> <a href="https://github.com/THUDM/GLM-130B" target="_blank">[GitHub]</a> <br>
</p>

<p align="center">
👋 Join our <a href="https://join.slack.com/t/chatglm/shared_invite/zt-25ti5uohv-A_hs~am_D3Q8XPZMpj7wwQ" target="_blank">Slack</a> and <a href="https://github.com/THUDM/ChatGLM/blob/main/resources/WECHAT.md" target="_blank">WeChat</a>
</p>
<p align="center">
📍Experience the larger-scale ChatGLM model at <a href="https://www.chatglm.cn">chatglm.cn</a>
</p>

## 介绍 (Introduction)
ChatGLM3-6B 是 ChatGLM 系列最新一代的开源模型,在保留了前两代模型对话流畅、部署门槛低等众多优秀特性的基础上,ChatGLM3-6B 引入了如下特性:

1. **更强大的基础模型:** ChatGLM3-6B 的基础模型 ChatGLM3-6B-Base 采用了更多样的训练数据、更充分的训练步数和更合理的训练策略。在语义、数学、推理、代码、知识等不同角度的数据集上测评显示,ChatGLM3-6B-Base 具有在 10B 以下的预训练模型中最强的性能。
2. **更完整的功能支持:** ChatGLM3-6B 采用了全新设计的 [Prompt 格式](https://github.com/THUDM/ChatGLM3/blob/main/PROMPT.md),除正常的多轮对话外,同时原生支持[工具调用](https://github.com/THUDM/ChatGLM3/blob/main/tool_using/README.md)(Function Call)、代码执行(Code Interpreter)和 Agent 任务等复杂场景。
3. **更全面的开源序列:** 除了对话模型 ChatGLM3-6B 外,还开源了基础模型 ChatGLM-6B-Base、长文本对话模型 ChatGLM3-6B-32K。以上所有权重对学术研究**完全开放**,在填写[问卷](https://open.bigmodel.cn/mla/form)进行登记后**亦允许免费商业使用**。

ChatGLM3-6B is the latest open-source model in the ChatGLM series. While retaining many excellent features such as smooth dialogue and low deployment threshold from the previous two generations, ChatGLM3-6B introduces the following features:

1. **More Powerful Base Model:** The base model of ChatGLM3-6B, ChatGLM3-6B-Base, employs a more diverse training dataset, more sufficient training steps, and a more reasonable training strategy. Evaluations on datasets such as semantics, mathematics, reasoning, code, knowledge, etc., show that ChatGLM3-6B-Base has the strongest performance among pre-trained models under 10B.
2. **More Comprehensive Function Support:** ChatGLM3-6B adopts a newly designed [Prompt format](https://github.com/THUDM/ChatGLM3/blob/main/PROMPT_en.md), in addition to the normal multi-turn dialogue. It also natively supports [function call](https://github.com/THUDM/ChatGLM3/blob/main/tool_using/README_en.md), code interpreter, and complex scenarios such as agent tasks.
3. **More Comprehensive Open-source Series:** In addition to the dialogue model ChatGLM3-6B, the base model ChatGLM-6B-Base and the long-text dialogue model ChatGLM3-6B-32K are also open-sourced. All the weights are **fully open** for academic research, and after completing the [questionnaire](https://open.bigmodel.cn/mla/form) registration, they are also **allowed for free commercial use**.

## 软件依赖 (Dependencies)

```shell
pip install protobuf transformers==4.30.2 cpm_kernels torch>=2.0 gradio mdtex2html sentencepiece accelerate
```

## 代码调用 (Code Usage)

可以通过如下代码调用 ChatGLM3-6B 模型来生成对话:

You can generate dialogue by invoking the ChatGLM3-6B model with the following code:

```ipython
>>> from transformers import AutoTokenizer, AutoModel
>>> tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True)
>>> model = AutoModel.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True).half().cuda()
>>> model = model.eval()
>>> response, history = model.chat(tokenizer, "你好", history=[])
>>> print(response)
你好👋!我是人工智能助手 ChatGLM-6B,很高兴见到你,欢迎问我任何问题。
>>> response, history = model.chat(tokenizer, "晚上睡不着应该怎么办", history=history)
>>> print(response)
晚上睡不着可能会让你感到焦虑或不舒服,但以下是一些可以帮助你入睡的方法:

1. 制定规律的睡眠时间表:保持规律的睡眠时间表可以帮助你建立健康的睡眠习惯,使你更容易入睡。尽量在每天的相同时间上床,并在同一时间起床。
2. 创造一个舒适的睡眠环境:确保睡眠环境舒适,安静,黑暗且温度适宜。可以使用舒适的床上用品,并保持房间通风。
3. 放松身心:在睡前做些放松的活动,例如泡个热水澡,听些轻柔的音乐,阅读一些有趣的书籍等,有助于缓解紧张和焦虑,使你更容易入睡。
4. 避免饮用含有咖啡因的饮料:咖啡因是一种刺激性物质,会影响你的睡眠质量。尽量避免在睡前饮用含有咖啡因的饮料,例如咖啡,茶和可乐。
5. 避免在床上做与睡眠无关的事情:在床上做些与睡眠无关的事情,例如看电影,玩游戏或工作等,可能会干扰你的睡眠。
6. 尝试呼吸技巧:深呼吸是一种放松技巧,可以帮助你缓解紧张和焦虑,使你更容易入睡。试着慢慢吸气,保持几秒钟,然后缓慢呼气。

如果这些方法无法帮助你入睡,你可以考虑咨询医生或睡眠专家,寻求进一步的建议。
```

关于更多的使用说明,包括如何运行命令行和网页版本的 DEMO,以及使用模型量化以节省显存,请参考我们的 [Github Repo](https://github.com/THUDM/ChatGLM)。

For more instructions, including how to run CLI and web demos, and model quantization, please refer to our [Github Repo](https://github.com/THUDM/ChatGLM).

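Since the note above mentions model quantization to save GPU memory without showing it, here is a minimal, hedged sketch of the 4-bit loading path commonly exposed by the ChatGLM remote code through a `quantize(bits)` helper; verify the exact call and memory figures against the official Github Repo before relying on them.

```python
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True)
# Assumption: the remote modeling code provides a quantize(bits) method;
# 4-bit weights trade some answer quality for a much smaller GPU memory
# footprint than the ~13GB needed by the FP16 weights.
model = AutoModel.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True).quantize(4).cuda()
model = model.eval()

response, history = model.chat(tokenizer, "你好", history=[])
print(response)
```
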
## 协议 (License)

本仓库的代码依照 [Apache-2.0](LICENSE) 协议开源,ChatGLM3-6B 模型的权重的使用则需要遵循 [Model License](MODEL_LICENSE)。

The code in this repository is open-sourced under the [Apache-2.0 license](LICENSE), while the use of the ChatGLM3-6B model weights needs to comply with the [Model License](MODEL_LICENSE).

## 引用 (Citation)

如果你觉得我们的工作有帮助的话,请考虑引用下列论文。

If you find our work helpful, please consider citing the following papers.

```
@article{zeng2022glm,
  title={Glm-130b: An open bilingual pre-trained model},
  author={Zeng, Aohan and Liu, Xiao and Du, Zhengxiao and Wang, Zihan and Lai, Hanyu and Ding, Ming and Yang, Zhuoyi and Xu, Yifan and Zheng, Wendi and Xia, Xiao and others},
  journal={arXiv preprint arXiv:2210.02414},
  year={2022}
}
```
```
@inproceedings{du2022glm,
  title={GLM: General Language Model Pretraining with Autoregressive Blank Infilling},
  author={Du, Zhengxiao and Qian, Yujie and Liu, Xiao and Ding, Ming and Qiu, Jiezhong and Yang, Zhilin and Tang, Jie},
  booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages={320--335},
  year={2022}
}
```
chatglm3-6b/config.json
ADDED
@@ -0,0 +1,42 @@
{
  "_name_or_path": "THUDM/chatglm3-6b",
  "model_type": "chatglm",
  "architectures": [
    "ChatGLMModel"
  ],
  "auto_map": {
    "AutoConfig": "configuration_chatglm.ChatGLMConfig",
    "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
    "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
    "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
    "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification"
  },
  "add_bias_linear": false,
  "add_qkv_bias": true,
  "apply_query_key_layer_scaling": true,
  "apply_residual_connection_post_layernorm": false,
  "attention_dropout": 0.0,
  "attention_softmax_in_fp32": true,
  "bias_dropout_fusion": true,
  "ffn_hidden_size": 13696,
  "fp32_residual_connection": false,
  "hidden_dropout": 0.0,
  "hidden_size": 4096,
  "kv_channels": 128,
  "layernorm_epsilon": 1e-05,
  "multi_query_attention": true,
  "multi_query_group_num": 2,
  "num_attention_heads": 32,
  "num_layers": 28,
  "original_rope": true,
  "padded_vocab_size": 65024,
  "post_layer_norm": true,
  "rmsnorm": true,
  "seq_length": 8192,
  "use_cache": true,
  "torch_dtype": "float16",
  "transformers_version": "4.30.2",
  "tie_word_embeddings": false,
  "eos_token_id": 2,
  "pad_token_id": 0
}
chatglm3-6b/configuration_chatglm.py
ADDED
@@ -0,0 +1,61 @@
from transformers import PretrainedConfig


class ChatGLMConfig(PretrainedConfig):
    model_type = "chatglm"
    def __init__(
        self,
        num_layers=28,
        padded_vocab_size=65024,
        hidden_size=4096,
        ffn_hidden_size=13696,
        kv_channels=128,
        num_attention_heads=32,
        seq_length=2048,
        hidden_dropout=0.0,
        classifier_dropout=None,
        attention_dropout=0.0,
        layernorm_epsilon=1e-5,
        rmsnorm=True,
        apply_residual_connection_post_layernorm=False,
        post_layer_norm=True,
        add_bias_linear=False,
        add_qkv_bias=False,
        bias_dropout_fusion=True,
        multi_query_attention=False,
        multi_query_group_num=1,
        apply_query_key_layer_scaling=True,
        attention_softmax_in_fp32=True,
        fp32_residual_connection=False,
        quantization_bit=0,
        pre_seq_len=None,
        prefix_projection=False,
        **kwargs
    ):
        self.num_layers = num_layers
        self.vocab_size = padded_vocab_size
        self.padded_vocab_size = padded_vocab_size
        self.hidden_size = hidden_size
        self.ffn_hidden_size = ffn_hidden_size
        self.kv_channels = kv_channels
        self.num_attention_heads = num_attention_heads
        self.seq_length = seq_length
        self.hidden_dropout = hidden_dropout
        self.classifier_dropout = classifier_dropout
        self.attention_dropout = attention_dropout
        self.layernorm_epsilon = layernorm_epsilon
        self.rmsnorm = rmsnorm
        self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
        self.post_layer_norm = post_layer_norm
        self.add_bias_linear = add_bias_linear
        self.add_qkv_bias = add_qkv_bias
        self.bias_dropout_fusion = bias_dropout_fusion
        self.multi_query_attention = multi_query_attention
        self.multi_query_group_num = multi_query_group_num
        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
        self.fp32_residual_connection = fp32_residual_connection
        self.quantization_bit = quantization_bit
        self.pre_seq_len = pre_seq_len
        self.prefix_projection = prefix_projection
        super().__init__(**kwargs)
chatglm3-6b/modeling_chatglm.py
ADDED
@@ -0,0 +1,1293 @@
1 |
+
""" PyTorch ChatGLM model. """
|
2 |
+
|
3 |
+
import math
|
4 |
+
import copy
|
5 |
+
import warnings
|
6 |
+
import re
|
7 |
+
import sys
|
8 |
+
|
9 |
+
import torch
|
10 |
+
import torch.utils.checkpoint
|
11 |
+
import torch.nn.functional as F
|
12 |
+
from torch import nn
|
13 |
+
from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss
|
14 |
+
from torch.nn.utils import skip_init
|
15 |
+
from typing import Optional, Tuple, Union, List, Callable, Dict, Any
|
16 |
+
from copy import deepcopy
|
17 |
+
|
18 |
+
from transformers.modeling_outputs import (
|
19 |
+
BaseModelOutputWithPast,
|
20 |
+
CausalLMOutputWithPast,
|
21 |
+
SequenceClassifierOutputWithPast,
|
22 |
+
)
|
23 |
+
from transformers.modeling_utils import PreTrainedModel
|
24 |
+
from transformers.utils import logging
|
25 |
+
from transformers.generation.logits_process import LogitsProcessor
|
26 |
+
from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput
|
27 |
+
|
28 |
+
from .configuration_chatglm import ChatGLMConfig
|
29 |
+
|
30 |
+
# flags required to enable jit fusion kernels
|
31 |
+
|
32 |
+
if sys.platform != 'darwin':
|
33 |
+
torch._C._jit_set_profiling_mode(False)
|
34 |
+
torch._C._jit_set_profiling_executor(False)
|
35 |
+
torch._C._jit_override_can_fuse_on_cpu(True)
|
36 |
+
torch._C._jit_override_can_fuse_on_gpu(True)
|
37 |
+
|
38 |
+
logger = logging.get_logger(__name__)
|
39 |
+
|
40 |
+
_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM"
|
41 |
+
_CONFIG_FOR_DOC = "ChatGLMConfig"
|
42 |
+
|
43 |
+
CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
44 |
+
"THUDM/chatglm3-6b",
|
45 |
+
# See all ChatGLM models at https://huggingface.co/models?filter=chatglm
|
46 |
+
]
|
47 |
+
|
48 |
+
|
49 |
+
def default_init(cls, *args, **kwargs):
|
50 |
+
return cls(*args, **kwargs)
|
51 |
+
|
52 |
+
|
53 |
+
class InvalidScoreLogitsProcessor(LogitsProcessor):
|
54 |
+
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
|
55 |
+
if torch.isnan(scores).any() or torch.isinf(scores).any():
|
56 |
+
scores.zero_()
|
57 |
+
scores[..., 5] = 5e4
|
58 |
+
return scores
|
59 |
+
|
60 |
+
|
61 |
+
class PrefixEncoder(torch.nn.Module):
|
62 |
+
"""
|
63 |
+
The torch.nn model to encode the prefix
|
64 |
+
Input shape: (batch-size, prefix-length)
|
65 |
+
Output shape: (batch-size, prefix-length, 2*layers*hidden)
|
66 |
+
"""
|
67 |
+
|
68 |
+
def __init__(self, config: ChatGLMConfig):
|
69 |
+
super().__init__()
|
70 |
+
self.prefix_projection = config.prefix_projection
|
71 |
+
if self.prefix_projection:
|
72 |
+
# Use a two-layer MLP to encode the prefix
|
73 |
+
kv_size = config.num_layers * config.kv_channels * config.multi_query_group_num * 2
|
74 |
+
self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size)
|
75 |
+
self.trans = torch.nn.Sequential(
|
76 |
+
torch.nn.Linear(kv_size, config.hidden_size),
|
77 |
+
torch.nn.Tanh(),
|
78 |
+
torch.nn.Linear(config.hidden_size, kv_size)
|
79 |
+
)
|
80 |
+
else:
|
81 |
+
self.embedding = torch.nn.Embedding(config.pre_seq_len,
|
82 |
+
config.num_layers * config.kv_channels * config.multi_query_group_num * 2)
|
83 |
+
|
84 |
+
def forward(self, prefix: torch.Tensor):
|
85 |
+
if self.prefix_projection:
|
86 |
+
prefix_tokens = self.embedding(prefix)
|
87 |
+
past_key_values = self.trans(prefix_tokens)
|
88 |
+
else:
|
89 |
+
past_key_values = self.embedding(prefix)
|
90 |
+
return past_key_values
|
91 |
+
|
92 |
+
|
93 |
+
def split_tensor_along_last_dim(
|
94 |
+
tensor: torch.Tensor,
|
95 |
+
num_partitions: int,
|
96 |
+
contiguous_split_chunks: bool = False,
|
97 |
+
) -> List[torch.Tensor]:
|
98 |
+
"""Split a tensor along its last dimension.
|
99 |
+
|
100 |
+
Arguments:
|
101 |
+
tensor: input tensor.
|
102 |
+
num_partitions: number of partitions to split the tensor
|
103 |
+
contiguous_split_chunks: If True, make each chunk contiguous
|
104 |
+
in memory.
|
105 |
+
|
106 |
+
Returns:
|
107 |
+
A list of Tensors
|
108 |
+
"""
|
109 |
+
# Get the size and dimension.
|
110 |
+
last_dim = tensor.dim() - 1
|
111 |
+
last_dim_size = tensor.size()[last_dim] // num_partitions
|
112 |
+
# Split.
|
113 |
+
tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
|
114 |
+
# Note: torch.split does not create contiguous tensors by default.
|
115 |
+
if contiguous_split_chunks:
|
116 |
+
return tuple(chunk.contiguous() for chunk in tensor_list)
|
117 |
+
|
118 |
+
return tensor_list
|
119 |
+
|
120 |
+
|
121 |
+
class RotaryEmbedding(nn.Module):
|
122 |
+
def __init__(self, dim, original_impl=False, device=None, dtype=None):
|
123 |
+
super().__init__()
|
124 |
+
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim))
|
125 |
+
self.register_buffer("inv_freq", inv_freq)
|
126 |
+
self.dim = dim
|
127 |
+
self.original_impl = original_impl
|
128 |
+
|
129 |
+
def forward_impl(
|
130 |
+
self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000
|
131 |
+
):
|
132 |
+
"""Enhanced Transformer with Rotary Position Embedding.
|
133 |
+
|
134 |
+
Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
|
135 |
+
transformers/rope/__init__.py. MIT License:
|
136 |
+
https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
|
137 |
+
"""
|
138 |
+
# $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
|
139 |
+
theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem))
|
140 |
+
|
141 |
+
# Create position indexes `[0, 1, ..., seq_len - 1]`
|
142 |
+
seq_idx = torch.arange(seq_len, dtype=torch.float, device=device)
|
143 |
+
|
144 |
+
# Calculate the product of position index and $\theta_i$
|
145 |
+
idx_theta = torch.outer(seq_idx, theta).float()
|
146 |
+
|
147 |
+
cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1)
|
148 |
+
|
149 |
+
# this is to mimic the behaviour of complex32, else we will get different results
|
150 |
+
if dtype in (torch.float16, torch.bfloat16, torch.int8):
|
151 |
+
cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half()
|
152 |
+
return cache
|
153 |
+
|
154 |
+
def forward(self, max_seq_len, offset=0):
|
155 |
+
return self.forward_impl(
|
156 |
+
max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device
|
157 |
+
)
|
158 |
+
|
159 |
+
|
160 |
+
@torch.jit.script
|
161 |
+
def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
|
162 |
+
# x: [sq, b, np, hn]
|
163 |
+
sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3)
|
164 |
+
rot_dim = rope_cache.shape[-2] * 2
|
165 |
+
x, x_pass = x[..., :rot_dim], x[..., rot_dim:]
|
166 |
+
# truncate to support variable sizes
|
167 |
+
rope_cache = rope_cache[:sq]
|
168 |
+
xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2)
|
169 |
+
rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2)
|
170 |
+
x_out2 = torch.stack(
|
171 |
+
[
|
172 |
+
xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1],
|
173 |
+
xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1],
|
174 |
+
],
|
175 |
+
-1,
|
176 |
+
)
|
177 |
+
x_out2 = x_out2.flatten(3)
|
178 |
+
return torch.cat((x_out2, x_pass), dim=-1)
|
179 |
+
|
180 |
+
|
181 |
+
class RMSNorm(torch.nn.Module):
|
182 |
+
def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs):
|
183 |
+
super().__init__()
|
184 |
+
self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype))
|
185 |
+
self.eps = eps
|
186 |
+
|
187 |
+
def forward(self, hidden_states: torch.Tensor):
|
188 |
+
input_dtype = hidden_states.dtype
|
189 |
+
variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
|
190 |
+
hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
|
191 |
+
|
192 |
+
return (self.weight * hidden_states).to(input_dtype)
|
193 |
+
|
194 |
+
|
195 |
+
class CoreAttention(torch.nn.Module):
|
196 |
+
def __init__(self, config: ChatGLMConfig, layer_number):
|
197 |
+
super(CoreAttention, self).__init__()
|
198 |
+
|
199 |
+
self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
|
200 |
+
self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
|
201 |
+
if self.apply_query_key_layer_scaling:
|
202 |
+
self.attention_softmax_in_fp32 = True
|
203 |
+
self.layer_number = max(1, layer_number)
|
204 |
+
|
205 |
+
projection_size = config.kv_channels * config.num_attention_heads
|
206 |
+
|
207 |
+
# Per attention head and per partition values.
|
208 |
+
self.hidden_size_per_partition = projection_size
|
209 |
+
self.hidden_size_per_attention_head = projection_size // config.num_attention_heads
|
210 |
+
self.num_attention_heads_per_partition = config.num_attention_heads
|
211 |
+
|
212 |
+
coeff = None
|
213 |
+
self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
|
214 |
+
if self.apply_query_key_layer_scaling:
|
215 |
+
coeff = self.layer_number
|
216 |
+
self.norm_factor *= coeff
|
217 |
+
self.coeff = coeff
|
218 |
+
|
219 |
+
self.attention_dropout = torch.nn.Dropout(config.attention_dropout)
|
220 |
+
|
221 |
+
def forward(self, query_layer, key_layer, value_layer, attention_mask):
|
222 |
+
pytorch_major_version = int(torch.__version__.split('.')[0])
|
223 |
+
if pytorch_major_version >= 2:
|
224 |
+
query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]]
|
225 |
+
if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
|
226 |
+
context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
|
227 |
+
is_causal=True)
|
228 |
+
else:
|
229 |
+
if attention_mask is not None:
|
230 |
+
attention_mask = ~attention_mask
|
231 |
+
context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
|
232 |
+
attention_mask)
|
233 |
+
context_layer = context_layer.permute(2, 0, 1, 3)
|
234 |
+
new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
|
235 |
+
context_layer = context_layer.reshape(*new_context_layer_shape)
|
236 |
+
else:
|
237 |
+
# Raw attention scores
|
238 |
+
|
239 |
+
# [b, np, sq, sk]
|
240 |
+
output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0))
|
241 |
+
|
242 |
+
# [sq, b, np, hn] -> [sq, b * np, hn]
|
243 |
+
query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1)
|
244 |
+
# [sk, b, np, hn] -> [sk, b * np, hn]
|
245 |
+
key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)
|
246 |
+
|
247 |
+
# preallocating input tensor: [b * np, sq, sk]
|
248 |
+
matmul_input_buffer = torch.empty(
|
249 |
+
output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype,
|
250 |
+
device=query_layer.device
|
251 |
+
)
|
252 |
+
|
253 |
+
# Raw attention scores. [b * np, sq, sk]
|
254 |
+
matmul_result = torch.baddbmm(
|
255 |
+
matmul_input_buffer,
|
256 |
+
query_layer.transpose(0, 1), # [b * np, sq, hn]
|
257 |
+
key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk]
|
258 |
+
beta=0.0,
|
259 |
+
alpha=(1.0 / self.norm_factor),
|
260 |
+
)
|
261 |
+
|
262 |
+
# change view to [b, np, sq, sk]
|
263 |
+
attention_scores = matmul_result.view(*output_size)
|
264 |
+
|
265 |
+
# ===========================
|
266 |
+
# Attention probs and dropout
|
267 |
+
# ===========================
|
268 |
+
|
269 |
+
# attention scores and attention mask [b, np, sq, sk]
|
270 |
+
if self.attention_softmax_in_fp32:
|
271 |
+
attention_scores = attention_scores.float()
|
272 |
+
if self.coeff is not None:
|
273 |
+
attention_scores = attention_scores * self.coeff
|
274 |
+
if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]:
|
275 |
+
attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3],
|
276 |
+
device=attention_scores.device, dtype=torch.bool)
|
277 |
+
attention_mask.tril_()
|
278 |
+
attention_mask = ~attention_mask
|
279 |
+
if attention_mask is not None:
|
280 |
+
attention_scores = attention_scores.masked_fill(attention_mask, float("-inf"))
|
281 |
+
attention_probs = F.softmax(attention_scores, dim=-1)
|
282 |
+
attention_probs = attention_probs.type_as(value_layer)
|
283 |
+
|
284 |
+
# This is actually dropping out entire tokens to attend to, which might
|
285 |
+
# seem a bit unusual, but is taken from the original Transformer paper.
|
286 |
+
attention_probs = self.attention_dropout(attention_probs)
|
287 |
+
# =========================
|
288 |
+
# Context layer. [sq, b, hp]
|
289 |
+
# =========================
|
290 |
+
|
291 |
+
# value_layer -> context layer.
|
292 |
+
# [sk, b, np, hn] --> [b, np, sq, hn]
|
293 |
+
|
294 |
+
# context layer shape: [b, np, sq, hn]
|
295 |
+
output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3))
|
296 |
+
# change view [sk, b * np, hn]
|
297 |
+
value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1)
|
298 |
+
# change view [b * np, sq, sk]
|
299 |
+
attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
|
300 |
+
# matmul: [b * np, sq, hn]
|
301 |
+
context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
|
302 |
+
# change view [b, np, sq, hn]
|
303 |
+
context_layer = context_layer.view(*output_size)
|
304 |
+
# [b, np, sq, hn] --> [sq, b, np, hn]
|
305 |
+
context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
|
306 |
+
# [sq, b, np, hn] --> [sq, b, hp]
|
307 |
+
new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
|
308 |
+
context_layer = context_layer.view(*new_context_layer_shape)
|
309 |
+
|
310 |
+
return context_layer
|
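For PyTorch >= 2 the branch above delegates to scaled_dot_product_attention; a minimal sketch of that path with toy sizes (layout [sq, b, np, hn] as used throughout this file; the numbers are illustrative only):

import torch
import torch.nn.functional as F

sq, b, heads, hn = 4, 1, 2, 8
q, k, v = (torch.randn(sq, b, heads, hn) for _ in range(3))
q2, k2, v2 = (t.permute(1, 2, 0, 3) for t in (q, k, v))           # -> [b, np, sq, hn]
ctx = F.scaled_dot_product_attention(q2, k2, v2, is_causal=True)
ctx = ctx.permute(2, 0, 1, 3).reshape(sq, b, heads * hn)           # -> [sq, b, hp]
print(ctx.shape)  # torch.Size([4, 1, 16])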
311 |
+
|
312 |
+
|
313 |
+
class SelfAttention(torch.nn.Module):
|
314 |
+
"""Parallel self-attention layer abstract class.
|
315 |
+
|
316 |
+
Self-attention layer takes input with size [s, b, h]
|
317 |
+
and returns output of the same size.
|
318 |
+
"""
|
319 |
+
|
320 |
+
def __init__(self, config: ChatGLMConfig, layer_number, device=None):
|
321 |
+
super(SelfAttention, self).__init__()
|
322 |
+
self.layer_number = max(1, layer_number)
|
323 |
+
|
324 |
+
self.projection_size = config.kv_channels * config.num_attention_heads
|
325 |
+
|
326 |
+
# Per attention head and per partition values.
|
327 |
+
self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads
|
328 |
+
self.num_attention_heads_per_partition = config.num_attention_heads
|
329 |
+
|
330 |
+
self.multi_query_attention = config.multi_query_attention
|
331 |
+
self.qkv_hidden_size = 3 * self.projection_size
|
332 |
+
if self.multi_query_attention:
|
333 |
+
self.num_multi_query_groups_per_partition = config.multi_query_group_num
|
334 |
+
self.qkv_hidden_size = (
|
335 |
+
self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num
|
336 |
+
)
|
337 |
+
self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size,
|
338 |
+
bias=config.add_bias_linear or config.add_qkv_bias,
|
339 |
+
device=device, **_config_to_kwargs(config)
|
340 |
+
)
|
341 |
+
|
342 |
+
self.core_attention = CoreAttention(config, self.layer_number)
|
343 |
+
|
344 |
+
# Output.
|
345 |
+
self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear,
|
346 |
+
device=device, **_config_to_kwargs(config)
|
347 |
+
)
|
348 |
+
|
349 |
+
def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None):
|
350 |
+
if self.multi_query_attention:
|
351 |
+
num_attention_heads = self.num_multi_query_groups_per_partition
|
352 |
+
else:
|
353 |
+
num_attention_heads = self.num_attention_heads_per_partition
|
354 |
+
return torch.empty(
|
355 |
+
inference_max_sequence_len,
|
356 |
+
batch_size,
|
357 |
+
num_attention_heads,
|
358 |
+
self.hidden_size_per_attention_head,
|
359 |
+
dtype=dtype,
|
360 |
+
device=device,
|
361 |
+
)
|
362 |
+
|
363 |
+
def forward(
|
364 |
+
self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True
|
365 |
+
):
|
366 |
+
# hidden_states: [sq, b, h]
|
367 |
+
|
368 |
+
# =================================================
|
369 |
+
# Pre-allocate memory for key-values for inference.
|
370 |
+
# =================================================
|
371 |
+
# =====================
|
372 |
+
# Query, Key, and Value
|
373 |
+
# =====================
|
374 |
+
|
375 |
+
# Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
|
376 |
+
mixed_x_layer = self.query_key_value(hidden_states)
|
377 |
+
|
378 |
+
if self.multi_query_attention:
|
379 |
+
(query_layer, key_layer, value_layer) = mixed_x_layer.split(
|
380 |
+
[
|
381 |
+
self.num_attention_heads_per_partition * self.hidden_size_per_attention_head,
|
382 |
+
self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
|
383 |
+
self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
|
384 |
+
],
|
385 |
+
dim=-1,
|
386 |
+
)
|
387 |
+
query_layer = query_layer.view(
|
388 |
+
query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
|
389 |
+
)
|
390 |
+
key_layer = key_layer.view(
|
391 |
+
key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
|
392 |
+
)
|
393 |
+
value_layer = value_layer.view(
|
394 |
+
value_layer.size()[:-1]
|
395 |
+
+ (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
|
396 |
+
)
|
397 |
+
else:
|
398 |
+
new_tensor_shape = mixed_x_layer.size()[:-1] + \
|
399 |
+
(self.num_attention_heads_per_partition,
|
400 |
+
3 * self.hidden_size_per_attention_head)
|
401 |
+
mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
|
402 |
+
|
403 |
+
# [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
|
404 |
+
(query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
|
405 |
+
|
406 |
+
# apply relative positional encoding (rotary embedding)
|
407 |
+
if rotary_pos_emb is not None:
|
408 |
+
query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb)
|
409 |
+
key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb)
|
410 |
+
|
411 |
+
# adjust key and value for inference
|
412 |
+
if kv_cache is not None:
|
413 |
+
cache_k, cache_v = kv_cache
|
414 |
+
key_layer = torch.cat((cache_k, key_layer), dim=0)
|
415 |
+
value_layer = torch.cat((cache_v, value_layer), dim=0)
|
416 |
+
if use_cache:
|
417 |
+
kv_cache = (key_layer, value_layer)
|
418 |
+
else:
|
419 |
+
kv_cache = None
|
420 |
+
|
421 |
+
if self.multi_query_attention:
|
422 |
+
key_layer = key_layer.unsqueeze(-2)
|
423 |
+
key_layer = key_layer.expand(
|
424 |
+
-1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1
|
425 |
+
)
|
426 |
+
key_layer = key_layer.contiguous().view(
|
427 |
+
key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
|
428 |
+
)
|
429 |
+
value_layer = value_layer.unsqueeze(-2)
|
430 |
+
value_layer = value_layer.expand(
|
431 |
+
-1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1
|
432 |
+
)
|
433 |
+
value_layer = value_layer.contiguous().view(
|
434 |
+
value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
|
435 |
+
)
|
436 |
+
|
437 |
+
# ==================================
|
438 |
+
# core attention computation
|
439 |
+
# ==================================
|
440 |
+
|
441 |
+
context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)
|
442 |
+
|
443 |
+
# =================
|
444 |
+
# Output. [sq, b, h]
|
445 |
+
# =================
|
446 |
+
|
447 |
+
output = self.dense(context_layer)
|
448 |
+
|
449 |
+
return output, kv_cache
|
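The multi-query branch above shares each K/V group across several query heads by expanding before core attention; a toy sketch of that reshape (sizes are assumptions for illustration):

import torch

sq, b, n_heads, n_kv_groups, hn = 4, 1, 8, 2, 16
key_layer = torch.randn(sq, b, n_kv_groups, hn)
key_layer = key_layer.unsqueeze(-2).expand(-1, -1, -1, n_heads // n_kv_groups, -1)
key_layer = key_layer.contiguous().view(sq, b, n_heads, hn)
print(key_layer.shape)  # torch.Size([4, 1, 8, 16]); each K/V group now serves 4 query heads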
450 |
+
|
451 |
+
|
452 |
+
def _config_to_kwargs(args):
|
453 |
+
common_kwargs = {
|
454 |
+
"dtype": args.torch_dtype,
|
455 |
+
}
|
456 |
+
return common_kwargs
|
457 |
+
|
458 |
+
|
459 |
+
class MLP(torch.nn.Module):
|
460 |
+
"""MLP.
|
461 |
+
|
462 |
+
MLP will take the input with h hidden state, project it to 4*h
|
463 |
+
hidden dimension, perform nonlinear transformation, and project the
|
464 |
+
state back into h hidden dimension.
|
465 |
+
"""
|
466 |
+
|
467 |
+
def __init__(self, config: ChatGLMConfig, device=None):
|
468 |
+
super(MLP, self).__init__()
|
469 |
+
|
470 |
+
self.add_bias = config.add_bias_linear
|
471 |
+
|
472 |
+
# Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
|
473 |
+
self.dense_h_to_4h = nn.Linear(
|
474 |
+
config.hidden_size,
|
475 |
+
config.ffn_hidden_size * 2,
|
476 |
+
bias=self.add_bias,
|
477 |
+
device=device,
|
478 |
+
**_config_to_kwargs(config)
|
479 |
+
)
|
480 |
+
|
481 |
+
def swiglu(x):
|
482 |
+
x = torch.chunk(x, 2, dim=-1)
|
483 |
+
return F.silu(x[0]) * x[1]
|
484 |
+
|
485 |
+
self.activation_func = swiglu
|
486 |
+
|
487 |
+
# Project back to h.
|
488 |
+
self.dense_4h_to_h = nn.Linear(
|
489 |
+
config.ffn_hidden_size,
|
490 |
+
config.hidden_size,
|
491 |
+
bias=self.add_bias,
|
492 |
+
device=device,
|
493 |
+
**_config_to_kwargs(config)
|
494 |
+
)
|
495 |
+
|
496 |
+
def forward(self, hidden_states):
|
497 |
+
# [s, b, 4hp]
|
498 |
+
intermediate_parallel = self.dense_h_to_4h(hidden_states)
|
499 |
+
intermediate_parallel = self.activation_func(intermediate_parallel)
|
500 |
+
# [s, b, h]
|
501 |
+
output = self.dense_4h_to_h(intermediate_parallel)
|
502 |
+
return output
|
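The MLP above uses SwiGLU, which is why dense_h_to_4h emits twice the FFN width; a standalone sketch with toy sizes (8 -> 32 -> 8 here, versus the much wider projections in the released config):

import torch
import torch.nn.functional as F

h = torch.randn(3, 1, 8)                         # [s, b, h]
up = torch.nn.Linear(8, 2 * 32)(h)               # dense_h_to_4h doubles the width for SwiGLU
gate, value = torch.chunk(up, 2, dim=-1)
act = F.silu(gate) * value                       # the swiglu defined above
out = torch.nn.Linear(32, 8)(act)                # dense_4h_to_h projects back to h
print(out.shape)  # torch.Size([3, 1, 8])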
503 |
+
|
504 |
+
|
505 |
+
class GLMBlock(torch.nn.Module):
|
506 |
+
"""A single transformer layer.
|
507 |
+
|
508 |
+
Transformer layer takes input with size [s, b, h] and returns an
|
509 |
+
output of the same size.
|
510 |
+
"""
|
511 |
+
|
512 |
+
def __init__(self, config: ChatGLMConfig, layer_number, device=None):
|
513 |
+
super(GLMBlock, self).__init__()
|
514 |
+
self.layer_number = layer_number
|
515 |
+
|
516 |
+
self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm
|
517 |
+
|
518 |
+
self.fp32_residual_connection = config.fp32_residual_connection
|
519 |
+
|
520 |
+
LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
|
521 |
+
# Layernorm on the input data.
|
522 |
+
self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
|
523 |
+
dtype=config.torch_dtype)
|
524 |
+
|
525 |
+
# Self attention.
|
526 |
+
self.self_attention = SelfAttention(config, layer_number, device=device)
|
527 |
+
self.hidden_dropout = config.hidden_dropout
|
528 |
+
|
529 |
+
# Layernorm on the attention output
|
530 |
+
self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
|
531 |
+
dtype=config.torch_dtype)
|
532 |
+
|
533 |
+
# MLP
|
534 |
+
self.mlp = MLP(config, device=device)
|
535 |
+
|
536 |
+
def forward(
|
537 |
+
self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True,
|
538 |
+
):
|
539 |
+
# hidden_states: [s, b, h]
|
540 |
+
|
541 |
+
# Layer norm at the beginning of the transformer layer.
|
542 |
+
layernorm_output = self.input_layernorm(hidden_states)
|
543 |
+
# Self attention.
|
544 |
+
attention_output, kv_cache = self.self_attention(
|
545 |
+
layernorm_output,
|
546 |
+
attention_mask,
|
547 |
+
rotary_pos_emb,
|
548 |
+
kv_cache=kv_cache,
|
549 |
+
use_cache=use_cache
|
550 |
+
)
|
551 |
+
|
552 |
+
# Residual connection.
|
553 |
+
if self.apply_residual_connection_post_layernorm:
|
554 |
+
residual = layernorm_output
|
555 |
+
else:
|
556 |
+
residual = hidden_states
|
557 |
+
|
558 |
+
layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training)
|
559 |
+
layernorm_input = residual + layernorm_input
|
560 |
+
|
561 |
+
# Layer norm post the self attention.
|
562 |
+
layernorm_output = self.post_attention_layernorm(layernorm_input)
|
563 |
+
|
564 |
+
# MLP.
|
565 |
+
mlp_output = self.mlp(layernorm_output)
|
566 |
+
|
567 |
+
# Second residual connection.
|
568 |
+
if self.apply_residual_connection_post_layernorm:
|
569 |
+
residual = layernorm_output
|
570 |
+
else:
|
571 |
+
residual = layernorm_input
|
572 |
+
|
573 |
+
output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training)
|
574 |
+
output = residual + output
|
575 |
+
|
576 |
+
return output, kv_cache
|
577 |
+
|
578 |
+
|
579 |
+
class GLMTransformer(torch.nn.Module):
|
580 |
+
"""Transformer class."""
|
581 |
+
|
582 |
+
def __init__(self, config: ChatGLMConfig, device=None):
|
583 |
+
super(GLMTransformer, self).__init__()
|
584 |
+
|
585 |
+
self.fp32_residual_connection = config.fp32_residual_connection
|
586 |
+
self.post_layer_norm = config.post_layer_norm
|
587 |
+
|
588 |
+
# Number of layers.
|
589 |
+
self.num_layers = config.num_layers
|
590 |
+
|
591 |
+
# Transformer layers.
|
592 |
+
def build_layer(layer_number):
|
593 |
+
return GLMBlock(config, layer_number, device=device)
|
594 |
+
|
595 |
+
self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)])
|
596 |
+
|
597 |
+
if self.post_layer_norm:
|
598 |
+
LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
|
599 |
+
# Final layer norm before output.
|
600 |
+
self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
|
601 |
+
dtype=config.torch_dtype)
|
602 |
+
|
603 |
+
self.gradient_checkpointing = False
|
604 |
+
|
605 |
+
def _get_layer(self, layer_number):
|
606 |
+
return self.layers[layer_number]
|
607 |
+
|
608 |
+
def forward(
|
609 |
+
self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None,
|
610 |
+
use_cache: Optional[bool] = True,
|
611 |
+
output_hidden_states: Optional[bool] = False,
|
612 |
+
):
|
613 |
+
if not kv_caches:
|
614 |
+
kv_caches = [None for _ in range(self.num_layers)]
|
615 |
+
presents = () if use_cache else None
|
616 |
+
if self.gradient_checkpointing and self.training:
|
617 |
+
if use_cache:
|
618 |
+
logger.warning_once(
|
619 |
+
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
|
620 |
+
)
|
621 |
+
use_cache = False
|
622 |
+
|
623 |
+
all_self_attentions = None
|
624 |
+
all_hidden_states = () if output_hidden_states else None
|
625 |
+
for index in range(self.num_layers):
|
626 |
+
if output_hidden_states:
|
627 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
628 |
+
|
629 |
+
layer = self._get_layer(index)
|
630 |
+
if self.gradient_checkpointing and self.training:
|
631 |
+
layer_ret = torch.utils.checkpoint.checkpoint(
|
632 |
+
layer,
|
633 |
+
hidden_states,
|
634 |
+
attention_mask,
|
635 |
+
rotary_pos_emb,
|
636 |
+
kv_caches[index],
|
637 |
+
use_cache
|
638 |
+
)
|
639 |
+
else:
|
640 |
+
layer_ret = layer(
|
641 |
+
hidden_states,
|
642 |
+
attention_mask,
|
643 |
+
rotary_pos_emb,
|
644 |
+
kv_cache=kv_caches[index],
|
645 |
+
use_cache=use_cache
|
646 |
+
)
|
647 |
+
hidden_states, kv_cache = layer_ret
|
648 |
+
if use_cache:
|
649 |
+
presents = presents + (kv_cache,)
|
650 |
+
|
651 |
+
if output_hidden_states:
|
652 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
653 |
+
|
654 |
+
# Final layer norm.
|
655 |
+
if self.post_layer_norm:
|
656 |
+
hidden_states = self.final_layernorm(hidden_states)
|
657 |
+
|
658 |
+
return hidden_states, presents, all_hidden_states, all_self_attentions
|
659 |
+
|
660 |
+
|
661 |
+
class ChatGLMPreTrainedModel(PreTrainedModel):
|
662 |
+
"""
|
663 |
+
An abstract class to handle weights initialization and
|
664 |
+
a simple interface for downloading and loading pretrained models.
|
665 |
+
"""
|
666 |
+
|
667 |
+
is_parallelizable = False
|
668 |
+
supports_gradient_checkpointing = True
|
669 |
+
config_class = ChatGLMConfig
|
670 |
+
base_model_prefix = "transformer"
|
671 |
+
_no_split_modules = ["GLMBlock"]
|
672 |
+
|
673 |
+
def _init_weights(self, module: nn.Module):
|
674 |
+
"""Initialize the weights."""
|
675 |
+
return
|
676 |
+
|
677 |
+
def get_masks(self, input_ids, past_key_values, padding_mask=None):
|
678 |
+
batch_size, seq_length = input_ids.shape
|
679 |
+
full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device)
|
680 |
+
full_attention_mask.tril_()
|
681 |
+
past_length = 0
|
682 |
+
if past_key_values:
|
683 |
+
past_length = past_key_values[0][0].shape[0]
|
684 |
+
if past_length:
|
685 |
+
full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length,
|
686 |
+
device=input_ids.device), full_attention_mask), dim=-1)
|
687 |
+
if padding_mask is not None:
|
688 |
+
full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1)
|
689 |
+
if not past_length and padding_mask is not None:
|
690 |
+
full_attention_mask -= padding_mask.unsqueeze(-1) - 1
|
691 |
+
full_attention_mask = (full_attention_mask < 0.5).bool()
|
692 |
+
full_attention_mask.unsqueeze_(1)
|
693 |
+
return full_attention_mask
|
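A small sketch of the mask built by get_masks for a 3-token query attending over 2 cached tokens (True marks positions that must not be attended; the sizes are illustrative):

import torch

batch_size, seq_length, past_length = 1, 3, 2
full = torch.ones(batch_size, seq_length, seq_length).tril_()
full = torch.cat((torch.ones(batch_size, seq_length, past_length), full), dim=-1)
mask = (full < 0.5).unsqueeze(1)   # [b, 1, sq, past + sq], True == masked out
print(mask[0, 0].int())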
694 |
+
|
695 |
+
def get_position_ids(self, input_ids, device):
|
696 |
+
batch_size, seq_length = input_ids.shape
|
697 |
+
position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
|
698 |
+
return position_ids
|
699 |
+
|
700 |
+
def _set_gradient_checkpointing(self, module, value=False):
|
701 |
+
if isinstance(module, GLMTransformer):
|
702 |
+
module.gradient_checkpointing = value
|
703 |
+
|
704 |
+
|
705 |
+
class Embedding(torch.nn.Module):
|
706 |
+
"""Language model embeddings."""
|
707 |
+
|
708 |
+
def __init__(self, config: ChatGLMConfig, device=None):
|
709 |
+
super(Embedding, self).__init__()
|
710 |
+
|
711 |
+
self.hidden_size = config.hidden_size
|
712 |
+
# Word embeddings (parallel).
|
713 |
+
self.word_embeddings = nn.Embedding(
|
714 |
+
config.padded_vocab_size,
|
715 |
+
self.hidden_size,
|
716 |
+
dtype=config.torch_dtype,
|
717 |
+
device=device
|
718 |
+
)
|
719 |
+
self.fp32_residual_connection = config.fp32_residual_connection
|
720 |
+
|
721 |
+
def forward(self, input_ids):
|
722 |
+
# Embeddings.
|
723 |
+
words_embeddings = self.word_embeddings(input_ids)
|
724 |
+
embeddings = words_embeddings
|
725 |
+
# Data format change to avoid explicit transposes: [b s h] --> [s b h].
|
726 |
+
embeddings = embeddings.transpose(0, 1).contiguous()
|
727 |
+
# If the input flag for fp32 residual connection is set, convert to float.
|
728 |
+
if self.fp32_residual_connection:
|
729 |
+
embeddings = embeddings.float()
|
730 |
+
return embeddings
|
731 |
+
|
732 |
+
|
733 |
+
class ChatGLMModel(ChatGLMPreTrainedModel):
|
734 |
+
def __init__(self, config: ChatGLMConfig, device=None, empty_init=True):
|
735 |
+
super().__init__(config)
|
736 |
+
if empty_init:
|
737 |
+
init_method = skip_init
|
738 |
+
else:
|
739 |
+
init_method = default_init
|
740 |
+
init_kwargs = {}
|
741 |
+
if device is not None:
|
742 |
+
init_kwargs["device"] = device
|
743 |
+
self.embedding = init_method(Embedding, config, **init_kwargs)
|
744 |
+
self.num_layers = config.num_layers
|
745 |
+
self.multi_query_group_num = config.multi_query_group_num
|
746 |
+
self.kv_channels = config.kv_channels
|
747 |
+
|
748 |
+
# Rotary positional embeddings
|
749 |
+
self.seq_length = config.seq_length
|
750 |
+
rotary_dim = (
|
751 |
+
config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
|
752 |
+
)
|
753 |
+
|
754 |
+
self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device,
|
755 |
+
dtype=config.torch_dtype)
|
756 |
+
self.encoder = init_method(GLMTransformer, config, **init_kwargs)
|
757 |
+
self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,
|
758 |
+
dtype=config.torch_dtype, **init_kwargs)
|
759 |
+
self.pre_seq_len = config.pre_seq_len
|
760 |
+
self.prefix_projection = config.prefix_projection
|
761 |
+
if self.pre_seq_len is not None:
|
762 |
+
for param in self.parameters():
|
763 |
+
param.requires_grad = False
|
764 |
+
self.prefix_tokens = torch.arange(self.pre_seq_len).long()
|
765 |
+
self.prefix_encoder = PrefixEncoder(config)
|
766 |
+
self.dropout = torch.nn.Dropout(0.1)
|
767 |
+
|
768 |
+
def get_input_embeddings(self):
|
769 |
+
return self.embedding.word_embeddings
|
770 |
+
|
771 |
+
def get_prompt(self, batch_size, device, dtype=torch.half):
|
772 |
+
prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device)
|
773 |
+
past_key_values = self.prefix_encoder(prefix_tokens).type(dtype)
|
774 |
+
past_key_values = past_key_values.view(
|
775 |
+
batch_size,
|
776 |
+
self.pre_seq_len,
|
777 |
+
self.num_layers * 2,
|
778 |
+
self.multi_query_group_num,
|
779 |
+
self.kv_channels
|
780 |
+
)
|
781 |
+
# seq_len, b, nh, hidden_size
|
782 |
+
past_key_values = self.dropout(past_key_values)
|
783 |
+
past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2)
|
784 |
+
return past_key_values
|
785 |
+
|
786 |
+
def forward(
|
787 |
+
self,
|
788 |
+
input_ids,
|
789 |
+
position_ids: Optional[torch.Tensor] = None,
|
790 |
+
attention_mask: Optional[torch.BoolTensor] = None,
|
791 |
+
full_attention_mask: Optional[torch.BoolTensor] = None,
|
792 |
+
past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
|
793 |
+
inputs_embeds: Optional[torch.Tensor] = None,
|
794 |
+
use_cache: Optional[bool] = None,
|
795 |
+
output_hidden_states: Optional[bool] = None,
|
796 |
+
return_dict: Optional[bool] = None,
|
797 |
+
):
|
798 |
+
output_hidden_states = (
|
799 |
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
800 |
+
)
|
801 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
802 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
803 |
+
|
804 |
+
batch_size, seq_length = input_ids.shape
|
805 |
+
|
806 |
+
if inputs_embeds is None:
|
807 |
+
inputs_embeds = self.embedding(input_ids)
|
808 |
+
|
809 |
+
if self.pre_seq_len is not None:
|
810 |
+
if past_key_values is None:
|
811 |
+
past_key_values = self.get_prompt(batch_size=batch_size, device=input_ids.device,
|
812 |
+
dtype=inputs_embeds.dtype)
|
813 |
+
if attention_mask is not None:
|
814 |
+
attention_mask = torch.cat([attention_mask.new_ones((batch_size, self.pre_seq_len)),
|
815 |
+
attention_mask], dim=-1)
|
816 |
+
|
817 |
+
if full_attention_mask is None:
|
818 |
+
if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1):
|
819 |
+
full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask)
|
820 |
+
|
821 |
+
# Rotary positional embeddings
|
822 |
+
rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
|
823 |
+
if position_ids is not None:
|
824 |
+
rotary_pos_emb = rotary_pos_emb[position_ids]
|
825 |
+
else:
|
826 |
+
rotary_pos_emb = rotary_pos_emb[None, :seq_length]
|
827 |
+
rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous()
|
828 |
+
|
829 |
+
# Run encoder.
|
830 |
+
hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
|
831 |
+
inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb,
|
832 |
+
kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states
|
833 |
+
)
|
834 |
+
|
835 |
+
if not return_dict:
|
836 |
+
return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
|
837 |
+
|
838 |
+
return BaseModelOutputWithPast(
|
839 |
+
last_hidden_state=hidden_states,
|
840 |
+
past_key_values=presents,
|
841 |
+
hidden_states=all_hidden_states,
|
842 |
+
attentions=all_self_attentions,
|
843 |
+
)
|
844 |
+
|
845 |
+
def quantize(self, weight_bit_width: int):
|
846 |
+
from .quantization import quantize
|
847 |
+
quantize(self.encoder, weight_bit_width)
|
848 |
+
return self
|
849 |
+
|
850 |
+
|
851 |
+
class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
|
852 |
+
def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
|
853 |
+
super().__init__(config)
|
854 |
+
|
855 |
+
self.max_sequence_length = config.max_length
|
856 |
+
self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)
|
857 |
+
self.config = config
|
858 |
+
self.quantized = False
|
859 |
+
|
860 |
+
if self.config.quantization_bit:
|
861 |
+
self.quantize(self.config.quantization_bit, empty_init=True)
|
862 |
+
|
863 |
+
def _update_model_kwargs_for_generation(
|
864 |
+
self,
|
865 |
+
outputs: ModelOutput,
|
866 |
+
model_kwargs: Dict[str, Any],
|
867 |
+
is_encoder_decoder: bool = False,
|
868 |
+
standardize_cache_format: bool = False,
|
869 |
+
) -> Dict[str, Any]:
|
870 |
+
# update past_key_values
|
871 |
+
model_kwargs["past_key_values"] = self._extract_past_from_model_output(
|
872 |
+
outputs, standardize_cache_format=standardize_cache_format
|
873 |
+
)
|
874 |
+
|
875 |
+
# update attention mask
|
876 |
+
if "attention_mask" in model_kwargs:
|
877 |
+
attention_mask = model_kwargs["attention_mask"]
|
878 |
+
model_kwargs["attention_mask"] = torch.cat(
|
879 |
+
[attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
|
880 |
+
)
|
881 |
+
|
882 |
+
# update position ids
|
883 |
+
if "position_ids" in model_kwargs:
|
884 |
+
position_ids = model_kwargs["position_ids"]
|
885 |
+
new_position_id = position_ids[..., -1:].clone()
|
886 |
+
new_position_id += 1
|
887 |
+
model_kwargs["position_ids"] = torch.cat(
|
888 |
+
[position_ids, new_position_id], dim=-1
|
889 |
+
)
|
890 |
+
|
891 |
+
model_kwargs["is_first_forward"] = False
|
892 |
+
return model_kwargs
|
893 |
+
|
894 |
+
def prepare_inputs_for_generation(
|
895 |
+
self,
|
896 |
+
input_ids: torch.LongTensor,
|
897 |
+
past_key_values: Optional[torch.Tensor] = None,
|
898 |
+
attention_mask: Optional[torch.Tensor] = None,
|
899 |
+
position_ids: Optional[torch.Tensor] = None,
|
900 |
+
use_cache: Optional[bool] = None,
|
901 |
+
is_first_forward: bool = True,
|
902 |
+
**kwargs
|
903 |
+
) -> dict:
|
904 |
+
# only last token for input_ids if past is not None
|
905 |
+
if position_ids is None:
|
906 |
+
position_ids = self.get_position_ids(input_ids, device=input_ids.device)
|
907 |
+
if not is_first_forward:
|
908 |
+
if past_key_values is not None:
|
909 |
+
position_ids = position_ids[..., -1:]
|
910 |
+
input_ids = input_ids[:, -1:]
|
911 |
+
return {
|
912 |
+
"input_ids": input_ids,
|
913 |
+
"past_key_values": past_key_values,
|
914 |
+
"position_ids": position_ids,
|
915 |
+
"attention_mask": attention_mask,
|
916 |
+
"return_last_logit": True,
|
917 |
+
"use_cache": use_cache
|
918 |
+
}
|
919 |
+
|
920 |
+
def forward(
|
921 |
+
self,
|
922 |
+
input_ids: Optional[torch.Tensor] = None,
|
923 |
+
position_ids: Optional[torch.Tensor] = None,
|
924 |
+
attention_mask: Optional[torch.Tensor] = None,
|
925 |
+
past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
|
926 |
+
inputs_embeds: Optional[torch.Tensor] = None,
|
927 |
+
labels: Optional[torch.Tensor] = None,
|
928 |
+
use_cache: Optional[bool] = None,
|
929 |
+
output_attentions: Optional[bool] = None,
|
930 |
+
output_hidden_states: Optional[bool] = None,
|
931 |
+
return_dict: Optional[bool] = None,
|
932 |
+
return_last_logit: Optional[bool] = False,
|
933 |
+
):
|
934 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
935 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
936 |
+
|
937 |
+
transformer_outputs = self.transformer(
|
938 |
+
input_ids=input_ids,
|
939 |
+
position_ids=position_ids,
|
940 |
+
attention_mask=attention_mask,
|
941 |
+
past_key_values=past_key_values,
|
942 |
+
inputs_embeds=inputs_embeds,
|
943 |
+
use_cache=use_cache,
|
944 |
+
output_hidden_states=output_hidden_states,
|
945 |
+
return_dict=return_dict,
|
946 |
+
)
|
947 |
+
|
948 |
+
hidden_states = transformer_outputs[0]
|
949 |
+
if return_last_logit:
|
950 |
+
hidden_states = hidden_states[-1:]
|
951 |
+
lm_logits = self.transformer.output_layer(hidden_states)
|
952 |
+
lm_logits = lm_logits.transpose(0, 1).contiguous()
|
953 |
+
|
954 |
+
loss = None
|
955 |
+
if labels is not None:
|
956 |
+
lm_logits = lm_logits.to(torch.float32)
|
957 |
+
|
958 |
+
# Shift so that tokens < n predict n
|
959 |
+
shift_logits = lm_logits[..., :-1, :].contiguous()
|
960 |
+
shift_labels = labels[..., 1:].contiguous()
|
961 |
+
# Flatten the tokens
|
962 |
+
loss_fct = CrossEntropyLoss(ignore_index=-100)
|
963 |
+
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
|
964 |
+
|
965 |
+
lm_logits = lm_logits.to(hidden_states.dtype)
|
966 |
+
loss = loss.to(hidden_states.dtype)
|
967 |
+
|
968 |
+
if not return_dict:
|
969 |
+
output = (lm_logits,) + transformer_outputs[1:]
|
970 |
+
return ((loss,) + output) if loss is not None else output
|
971 |
+
|
972 |
+
return CausalLMOutputWithPast(
|
973 |
+
loss=loss,
|
974 |
+
logits=lm_logits,
|
975 |
+
past_key_values=transformer_outputs.past_key_values,
|
976 |
+
hidden_states=transformer_outputs.hidden_states,
|
977 |
+
attentions=transformer_outputs.attentions,
|
978 |
+
)
|
979 |
+
|
980 |
+
@staticmethod
|
981 |
+
def _reorder_cache(
|
982 |
+
past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
|
983 |
+
) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
|
984 |
+
"""
|
985 |
+
This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
|
986 |
+
[`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
|
987 |
+
beam_idx at every generation step.
|
988 |
+
|
989 |
+
Output shares the same memory storage as `past`.
|
990 |
+
"""
|
991 |
+
return tuple(
|
992 |
+
(
|
993 |
+
layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)),
|
994 |
+
layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)),
|
995 |
+
)
|
996 |
+
for layer_past in past
|
997 |
+
)
|
998 |
+
|
999 |
+
def process_response(self, output, history):
|
1000 |
+
content = ""
|
1001 |
+
history = deepcopy(history)
|
1002 |
+
for response in output.split("<|assistant|>"):
|
1003 |
+
metadata, content = response.split("\n", maxsplit=1)
|
1004 |
+
if not metadata.strip():
|
1005 |
+
content = content.strip()
|
1006 |
+
history.append({"role": "assistant", "metadata": metadata, "content": content})
|
1007 |
+
content = content.replace("[[训练时间]]", "2023年")
|
1008 |
+
else:
|
1009 |
+
history.append({"role": "assistant", "metadata": metadata, "content": content})
|
1010 |
+
if history[0]["role"] == "system" and "tools" in history[0]:
|
1011 |
+
content = "\n".join(content.split("\n")[1:-1])
|
1012 |
+
def tool_call(**kwargs):
|
1013 |
+
return kwargs
|
1014 |
+
parameters = eval(content)
|
1015 |
+
content = {"name": metadata.strip(), "parameters": parameters}
|
1016 |
+
else:
|
1017 |
+
content = {"name": metadata.strip(), "content": content}
|
1018 |
+
return content, history
|
1019 |
+
|
1020 |
+
@torch.inference_mode()
|
1021 |
+
def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user",
|
1022 |
+
max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None,
|
1023 |
+
**kwargs):
|
1024 |
+
if history is None:
|
1025 |
+
history = []
|
1026 |
+
if logits_processor is None:
|
1027 |
+
logits_processor = LogitsProcessorList()
|
1028 |
+
logits_processor.append(InvalidScoreLogitsProcessor())
|
1029 |
+
gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
|
1030 |
+
"temperature": temperature, "logits_processor": logits_processor, **kwargs}
|
1031 |
+
inputs = tokenizer.build_chat_input(query, history=history, role=role)
|
1032 |
+
inputs = inputs.to(self.device)
|
1033 |
+
eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"),
|
1034 |
+
tokenizer.get_command("<|observation|>")]
|
1035 |
+
outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id)
|
1036 |
+
outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
|
1037 |
+
response = tokenizer.decode(outputs)
|
1038 |
+
history.append({"role": role, "content": query})
|
1039 |
+
response, history = self.process_response(response, history)
|
1040 |
+
return response, history
|
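Typical usage of chat() mirrors the repository README; a hedged sketch (the model id, prompts, and device placement are illustrative, and loading requires trust_remote_code):

from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True).half().cuda().eval()
response, history = model.chat(tokenizer, "Hello", history=[])
print(response)
response, history = model.chat(tokenizer, "What should I do if I cannot sleep?", history=history)
print(response)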
1041 |
+
|
1042 |
+
@torch.inference_mode()
|
1043 |
+
def stream_chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user",
|
1044 |
+
past_key_values=None, max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8,
|
1045 |
+
logits_processor=None, return_past_key_values=False, **kwargs):
|
1046 |
+
if history is None:
|
1047 |
+
history = []
|
1048 |
+
if logits_processor is None:
|
1049 |
+
logits_processor = LogitsProcessorList()
|
1050 |
+
logits_processor.append(InvalidScoreLogitsProcessor())
|
1051 |
+
eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"),
|
1052 |
+
tokenizer.get_command("<|observation|>")]
|
1053 |
+
gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p,
|
1054 |
+
"temperature": temperature, "logits_processor": logits_processor, **kwargs}
|
1055 |
+
if past_key_values is None:
|
1056 |
+
inputs = tokenizer.build_chat_input(query, history=history, role=role)
|
1057 |
+
else:
|
1058 |
+
inputs = tokenizer.build_chat_input(query, role=role)
|
1059 |
+
inputs = inputs.to(self.device)
|
1060 |
+
if past_key_values is not None:
|
1061 |
+
past_length = past_key_values[0][0].shape[0]
|
1062 |
+
if self.transformer.pre_seq_len is not None:
|
1063 |
+
past_length -= self.transformer.pre_seq_len
|
1064 |
+
inputs.position_ids += past_length
|
1065 |
+
attention_mask = inputs.attention_mask
|
1066 |
+
attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1)
|
1067 |
+
inputs['attention_mask'] = attention_mask
|
1068 |
+
history.append({"role": role, "content": query})
|
1069 |
+
for outputs in self.stream_generate(**inputs, past_key_values=past_key_values,
|
1070 |
+
eos_token_id=eos_token_id, return_past_key_values=return_past_key_values,
|
1071 |
+
**gen_kwargs):
|
1072 |
+
if return_past_key_values:
|
1073 |
+
outputs, past_key_values = outputs
|
1074 |
+
outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
|
1075 |
+
response = tokenizer.decode(outputs)
|
1076 |
+
if response and response[-1] != "�":
|
1077 |
+
response, new_history = self.process_response(response, history)
|
1078 |
+
if return_past_key_values:
|
1079 |
+
yield response, new_history, past_key_values
|
1080 |
+
else:
|
1081 |
+
yield response, new_history
|
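stream_chat yields progressively longer decoded responses; an illustrative loop, assuming model and tokenizer are loaded as in the chat() sketch above:

for response, history in model.stream_chat(tokenizer, "Tell me a short story", history=[]):
    print(response)   # each iteration is the partial response decoded so far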
1082 |
+
|
1083 |
+
@torch.inference_mode()
|
1084 |
+
def stream_generate(
|
1085 |
+
self,
|
1086 |
+
input_ids,
|
1087 |
+
generation_config: Optional[GenerationConfig] = None,
|
1088 |
+
logits_processor: Optional[LogitsProcessorList] = None,
|
1089 |
+
stopping_criteria: Optional[StoppingCriteriaList] = None,
|
1090 |
+
prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
|
1091 |
+
return_past_key_values=False,
|
1092 |
+
**kwargs,
|
1093 |
+
):
|
1094 |
+
batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
|
1095 |
+
|
1096 |
+
if generation_config is None:
|
1097 |
+
generation_config = self.generation_config
|
1098 |
+
generation_config = copy.deepcopy(generation_config)
|
1099 |
+
model_kwargs = generation_config.update(**kwargs)
|
1100 |
+
model_kwargs["use_cache"] = generation_config.use_cache
|
1101 |
+
bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id
|
1102 |
+
|
1103 |
+
if isinstance(eos_token_id, int):
|
1104 |
+
eos_token_id = [eos_token_id]
|
1105 |
+
eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None
|
1106 |
+
|
1107 |
+
has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
|
1108 |
+
if has_default_max_length and generation_config.max_new_tokens is None:
|
1109 |
+
warnings.warn(
|
1110 |
+
f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
|
1111 |
+
"This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
|
1112 |
+
" recommend using `max_new_tokens` to control the maximum length of the generation.",
|
1113 |
+
UserWarning,
|
1114 |
+
)
|
1115 |
+
elif generation_config.max_new_tokens is not None:
|
1116 |
+
generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
|
1117 |
+
if not has_default_max_length:
|
1118 |
+
logger.warn(
|
1119 |
+
f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
|
1120 |
+
f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
|
1121 |
+
"Please refer to the documentation for more information. "
|
1122 |
+
"(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)",
|
1123 |
+
UserWarning,
|
1124 |
+
)
|
1125 |
+
|
1126 |
+
if input_ids_seq_length >= generation_config.max_length:
|
1127 |
+
input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
|
1128 |
+
logger.warning(
|
1129 |
+
f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
|
1130 |
+
f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
|
1131 |
+
" increasing `max_new_tokens`."
|
1132 |
+
)
|
1133 |
+
|
1134 |
+
# 2. Set generation parameters if not already defined
|
1135 |
+
logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
|
1136 |
+
stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
|
1137 |
+
|
1138 |
+
logits_processor = self._get_logits_processor(
|
1139 |
+
generation_config=generation_config,
|
1140 |
+
input_ids_seq_length=input_ids_seq_length,
|
1141 |
+
encoder_input_ids=input_ids,
|
1142 |
+
prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
|
1143 |
+
logits_processor=logits_processor,
|
1144 |
+
)
|
1145 |
+
|
1146 |
+
stopping_criteria = self._get_stopping_criteria(
|
1147 |
+
generation_config=generation_config, stopping_criteria=stopping_criteria
|
1148 |
+
)
|
1149 |
+
logits_warper = self._get_logits_warper(generation_config)
|
1150 |
+
|
1151 |
+
unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
|
1152 |
+
scores = None
|
1153 |
+
while True:
|
1154 |
+
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
|
1155 |
+
# forward pass to get next token
|
1156 |
+
outputs = self(
|
1157 |
+
**model_inputs,
|
1158 |
+
return_dict=True,
|
1159 |
+
output_attentions=False,
|
1160 |
+
output_hidden_states=False,
|
1161 |
+
)
|
1162 |
+
|
1163 |
+
next_token_logits = outputs.logits[:, -1, :]
|
1164 |
+
|
1165 |
+
# pre-process distribution
|
1166 |
+
next_token_scores = logits_processor(input_ids, next_token_logits)
|
1167 |
+
next_token_scores = logits_warper(input_ids, next_token_scores)
|
1168 |
+
|
1169 |
+
# sample
|
1170 |
+
probs = nn.functional.softmax(next_token_scores, dim=-1)
|
1171 |
+
if generation_config.do_sample:
|
1172 |
+
next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
|
1173 |
+
else:
|
1174 |
+
next_tokens = torch.argmax(probs, dim=-1)
|
1175 |
+
# update generated ids, model inputs, and length for next step
|
1176 |
+
input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
|
1177 |
+
model_kwargs = self._update_model_kwargs_for_generation(
|
1178 |
+
outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
|
1179 |
+
)
|
1180 |
+
unfinished_sequences = unfinished_sequences.mul(
|
1181 |
+
next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
|
1182 |
+
)
|
1183 |
+
if return_past_key_values:
|
1184 |
+
yield input_ids, outputs.past_key_values
|
1185 |
+
else:
|
1186 |
+
yield input_ids
|
1187 |
+
# stop when each sentence is finished, or if we exceed the maximum length
|
1188 |
+
if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
|
1189 |
+
break
|
1190 |
+
|
1191 |
+
def quantize(self, bits: int, empty_init=False, device=None, **kwargs):
|
1192 |
+
if bits == 0:
|
1193 |
+
return
|
1194 |
+
|
1195 |
+
from .quantization import quantize
|
1196 |
+
|
1197 |
+
if self.quantized:
|
1198 |
+
logger.info("Already quantized.")
|
1199 |
+
return self
|
1200 |
+
|
1201 |
+
self.quantized = True
|
1202 |
+
|
1203 |
+
self.config.quantization_bit = bits
|
1204 |
+
|
1205 |
+
self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device,
|
1206 |
+
**kwargs)
|
1207 |
+
return self
|
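quantize() swaps the encoder's linear weights in place; a hedged usage sketch (the 4-bit setting is illustrative, and the quantization kernels must be available in the runtime):

model = AutoModel.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True).quantize(4).cuda()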
1208 |
+
|
1209 |
+
|
1210 |
+
class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel):
|
1211 |
+
def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
|
1212 |
+
super().__init__(config)
|
1213 |
+
|
1214 |
+
self.num_labels = config.num_labels
|
1215 |
+
self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)
|
1216 |
+
|
1217 |
+
self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half)
|
1218 |
+
if config.classifier_dropout is not None:
|
1219 |
+
self.dropout = nn.Dropout(config.classifier_dropout)
|
1220 |
+
else:
|
1221 |
+
self.dropout = None
|
1222 |
+
self.config = config
|
1223 |
+
|
1224 |
+
if self.config.quantization_bit:
|
1225 |
+
self.quantize(self.config.quantization_bit, empty_init=True)
|
1226 |
+
|
1227 |
+
def forward(
|
1228 |
+
self,
|
1229 |
+
input_ids: Optional[torch.LongTensor] = None,
|
1230 |
+
position_ids: Optional[torch.LongTensor] = None,
|
1231 |
+
attention_mask: Optional[torch.Tensor] = None,
|
1232 |
+
full_attention_mask: Optional[torch.Tensor] = None,
|
1233 |
+
past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
|
1234 |
+
inputs_embeds: Optional[torch.LongTensor] = None,
|
1235 |
+
labels: Optional[torch.LongTensor] = None,
|
1236 |
+
use_cache: Optional[bool] = None,
|
1237 |
+
output_hidden_states: Optional[bool] = None,
|
1238 |
+
return_dict: Optional[bool] = None,
|
1239 |
+
) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]:
|
1240 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
1241 |
+
|
1242 |
+
transformer_outputs = self.transformer(
|
1243 |
+
input_ids=input_ids,
|
1244 |
+
position_ids=position_ids,
|
1245 |
+
attention_mask=attention_mask,
|
1246 |
+
full_attention_mask=full_attention_mask,
|
1247 |
+
past_key_values=past_key_values,
|
1248 |
+
inputs_embeds=inputs_embeds,
|
1249 |
+
use_cache=use_cache,
|
1250 |
+
output_hidden_states=output_hidden_states,
|
1251 |
+
return_dict=return_dict,
|
1252 |
+
)
|
1253 |
+
|
1254 |
+
hidden_states = transformer_outputs[0]
|
1255 |
+
pooled_hidden_states = hidden_states[-1]
|
1256 |
+
if self.dropout is not None:
|
1257 |
+
pooled_hidden_states = self.dropout(pooled_hidden_states)
|
1258 |
+
logits = self.classifier_head(pooled_hidden_states)
|
1259 |
+
|
1260 |
+
loss = None
|
1261 |
+
if labels is not None:
|
1262 |
+
if self.config.problem_type is None:
|
1263 |
+
if self.num_labels == 1:
|
1264 |
+
self.config.problem_type = "regression"
|
1265 |
+
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
|
1266 |
+
self.config.problem_type = "single_label_classification"
|
1267 |
+
else:
|
1268 |
+
self.config.problem_type = "multi_label_classification"
|
1269 |
+
|
1270 |
+
if self.config.problem_type == "regression":
|
1271 |
+
loss_fct = MSELoss()
|
1272 |
+
if self.num_labels == 1:
|
1273 |
+
loss = loss_fct(logits.squeeze().float(), labels.squeeze())
|
1274 |
+
else:
|
1275 |
+
loss = loss_fct(logits.float(), labels)
|
1276 |
+
elif self.config.problem_type == "single_label_classification":
|
1277 |
+
loss_fct = CrossEntropyLoss()
|
1278 |
+
loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1))
|
1279 |
+
elif self.config.problem_type == "multi_label_classification":
|
1280 |
+
loss_fct = BCEWithLogitsLoss()
|
1281 |
+
loss = loss_fct(logits.float(), labels.view(-1, self.num_labels))
|
1282 |
+
|
1283 |
+
if not return_dict:
|
1284 |
+
output = (logits,) + transformer_outputs[1:]
|
1285 |
+
return ((loss,) + output) if loss is not None else output
|
1286 |
+
|
1287 |
+
return SequenceClassifierOutputWithPast(
|
1288 |
+
loss=loss,
|
1289 |
+
logits=logits,
|
1290 |
+
past_key_values=transformer_outputs.past_key_values,
|
1291 |
+
hidden_states=transformer_outputs.hidden_states,
|
1292 |
+
attentions=transformer_outputs.attentions,
|
1293 |
+
)
|
chatglm3-6b/pytorch_model-00001-of-00007.bin
ADDED
@@ -0,0 +1,3 @@
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4d5567466e89625dbd10e51c69a02982f233a10108cf232a379defdbb065ae0b
|
3 |
+
size 1827781090
|
chatglm3-6b/pytorch_model-00002-of-00007.bin
ADDED
@@ -0,0 +1,3 @@
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4ad41534016ac4a2431fa2d4b08efbe28f963744135ec52b2ea13cc86730fa2a
|
3 |
+
size 1968299480
|
chatglm3-6b/pytorch_model-00003-of-00007.bin
ADDED
@@ -0,0 +1,3 @@
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a2be9b17c332a8345e787953d4879caee4747ad4b263f013aa3c4654c94c3d24
|
3 |
+
size 1927415036
|
chatglm3-6b/pytorch_model-00004-of-00007.bin
ADDED
@@ -0,0 +1,3 @@
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b5526891e1b4c8edd2e3688df5156aa107e2140fe7e3f9d0d54f9cbe3b6ee3b5
|
3 |
+
size 1815225998
|
chatglm3-6b/pytorch_model-00005-of-00007.bin
ADDED
@@ -0,0 +1,3 @@
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:84bb18a476f74beaf4d941733bd1c475791eba799b228f78d0165de989cb7a40
|
3 |
+
size 1968299544
|
chatglm3-6b/pytorch_model-00006-of-00007.bin
ADDED
@@ -0,0 +1,3 @@
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1181875a2dc30fba968d72d0fc4628b9a60d3866bf680eb14b9822b5b504830f
|
3 |
+
size 1927415036
|
chatglm3-6b/pytorch_model-00007-of-00007.bin
ADDED
@@ -0,0 +1,3 @@
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1271b638dc0a88206d1c7a51bcaa862410eb56f3e59fd0068a96e96cb5e3f4f0
|
3 |
+
size 1052808542
|
chatglm3-6b/pytorch_model.bin.index.json
ADDED
@@ -0,0 +1,207 @@
|
1 |
+
{
|
2 |
+
"metadata": {
|
3 |
+
"total_size": 12487168064
|
4 |
+
},
|
5 |
+
"weight_map": {
|
6 |
+
"transformer.embedding.word_embeddings.weight": "pytorch_model-00001-of-00007.bin",
|
7 |
+
"transformer.encoder.final_layernorm.weight": "pytorch_model-00007-of-00007.bin",
|
8 |
+
"transformer.encoder.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00007.bin",
|
9 |
+
"transformer.encoder.layers.0.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00007.bin",
|
10 |
+
"transformer.encoder.layers.0.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00007.bin",
|
11 |
+
"transformer.encoder.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00007.bin",
|
12 |
+
"transformer.encoder.layers.0.self_attention.dense.weight": "pytorch_model-00001-of-00007.bin",
|
13 |
+
"transformer.encoder.layers.0.self_attention.query_key_value.bias": "pytorch_model-00001-of-00007.bin",
|
14 |
+
"transformer.encoder.layers.0.self_attention.query_key_value.weight": "pytorch_model-00001-of-00007.bin",
|
15 |
+
"transformer.encoder.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00007.bin",
|
16 |
+
"transformer.encoder.layers.1.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00007.bin",
|
17 |
+
"transformer.encoder.layers.1.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00007.bin",
|
18 |
+
"transformer.encoder.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00007.bin",
|
19 |
+
"transformer.encoder.layers.1.self_attention.dense.weight": "pytorch_model-00001-of-00007.bin",
|
20 |
+
"transformer.encoder.layers.1.self_attention.query_key_value.bias": "pytorch_model-00001-of-00007.bin",
|
21 |
+
"transformer.encoder.layers.1.self_attention.query_key_value.weight": "pytorch_model-00001-of-00007.bin",
|
22 |
+
"transformer.encoder.layers.10.input_layernorm.weight": "pytorch_model-00003-of-00007.bin",
|
23 |
+
"transformer.encoder.layers.10.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00007.bin",
|
24 |
+
"transformer.encoder.layers.10.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00007.bin",
|
25 |
+
"transformer.encoder.layers.10.post_attention_layernorm.weight": "pytorch_model-00003-of-00007.bin",
|
26 |
+
"transformer.encoder.layers.10.self_attention.dense.weight": "pytorch_model-00003-of-00007.bin",
|
27 |
+
"transformer.encoder.layers.10.self_attention.query_key_value.bias": "pytorch_model-00003-of-00007.bin",
|
28 |
+
"transformer.encoder.layers.10.self_attention.query_key_value.weight": "pytorch_model-00003-of-00007.bin",
|
29 |
+
"transformer.encoder.layers.11.input_layernorm.weight": "pytorch_model-00003-of-00007.bin",
|
30 |
+
"transformer.encoder.layers.11.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00007.bin",
|
31 |
+
"transformer.encoder.layers.11.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00007.bin",
|
32 |
+
"transformer.encoder.layers.11.post_attention_layernorm.weight": "pytorch_model-00003-of-00007.bin",
|
33 |
+
"transformer.encoder.layers.11.self_attention.dense.weight": "pytorch_model-00003-of-00007.bin",
|
34 |
+
"transformer.encoder.layers.11.self_attention.query_key_value.bias": "pytorch_model-00003-of-00007.bin",
|
35 |
+
"transformer.encoder.layers.11.self_attention.query_key_value.weight": "pytorch_model-00003-of-00007.bin",
|
36 |
+
"transformer.encoder.layers.12.input_layernorm.weight": "pytorch_model-00003-of-00007.bin",
|
37 |
+
"transformer.encoder.layers.12.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00007.bin",
|
38 |
+
"transformer.encoder.layers.12.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00007.bin",
|
39 |
+
"transformer.encoder.layers.12.post_attention_layernorm.weight": "pytorch_model-00003-of-00007.bin",
|
40 |
+
"transformer.encoder.layers.12.self_attention.dense.weight": "pytorch_model-00003-of-00007.bin",
|
41 |
+
"transformer.encoder.layers.12.self_attention.query_key_value.bias": "pytorch_model-00003-of-00007.bin",
|
42 |
+
"transformer.encoder.layers.12.self_attention.query_key_value.weight": "pytorch_model-00003-of-00007.bin",
|
43 |
+
"transformer.encoder.layers.13.input_layernorm.weight": "pytorch_model-00004-of-00007.bin",
|
44 |
+
"transformer.encoder.layers.13.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00007.bin",
|
45 |
+
"transformer.encoder.layers.13.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00007.bin",
|
46 |
+
"transformer.encoder.layers.13.post_attention_layernorm.weight": "pytorch_model-00004-of-00007.bin",
|
47 |
+
"transformer.encoder.layers.13.self_attention.dense.weight": "pytorch_model-00004-of-00007.bin",
|
48 |
+
"transformer.encoder.layers.13.self_attention.query_key_value.bias": "pytorch_model-00004-of-00007.bin",
|
49 |
+
"transformer.encoder.layers.13.self_attention.query_key_value.weight": "pytorch_model-00004-of-00007.bin",
|
50 |
+
"transformer.encoder.layers.14.input_layernorm.weight": "pytorch_model-00004-of-00007.bin",
|
51 |
+
"transformer.encoder.layers.14.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00007.bin",
|
52 |
+
"transformer.encoder.layers.14.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00007.bin",
|
53 |
+
"transformer.encoder.layers.14.post_attention_layernorm.weight": "pytorch_model-00004-of-00007.bin",
|
54 |
+
"transformer.encoder.layers.14.self_attention.dense.weight": "pytorch_model-00004-of-00007.bin",
|
55 |
+
"transformer.encoder.layers.14.self_attention.query_key_value.bias": "pytorch_model-00004-of-00007.bin",
|
56 |
+
"transformer.encoder.layers.14.self_attention.query_key_value.weight": "pytorch_model-00004-of-00007.bin",
|
57 |
+
"transformer.encoder.layers.15.input_layernorm.weight": "pytorch_model-00004-of-00007.bin",
|
58 |
+
"transformer.encoder.layers.15.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00007.bin",
|
59 |
+
"transformer.encoder.layers.15.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00007.bin",
|
60 |
+
"transformer.encoder.layers.15.post_attention_layernorm.weight": "pytorch_model-00004-of-00007.bin",
|
61 |
+
"transformer.encoder.layers.15.self_attention.dense.weight": "pytorch_model-00004-of-00007.bin",
|
62 |
+
"transformer.encoder.layers.15.self_attention.query_key_value.bias": "pytorch_model-00004-of-00007.bin",
|
63 |
+
"transformer.encoder.layers.15.self_attention.query_key_value.weight": "pytorch_model-00004-of-00007.bin",
|
64 |
+
"transformer.encoder.layers.16.input_layernorm.weight": "pytorch_model-00004-of-00007.bin",
|
65 |
+
"transformer.encoder.layers.16.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00007.bin",
|
66 |
+
"transformer.encoder.layers.16.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00007.bin",
|
67 |
+
"transformer.encoder.layers.16.post_attention_layernorm.weight": "pytorch_model-00004-of-00007.bin",
|
68 |
+
"transformer.encoder.layers.16.self_attention.dense.weight": "pytorch_model-00004-of-00007.bin",
|
69 |
+
"transformer.encoder.layers.16.self_attention.query_key_value.bias": "pytorch_model-00004-of-00007.bin",
|
70 |
+
"transformer.encoder.layers.16.self_attention.query_key_value.weight": "pytorch_model-00004-of-00007.bin",
|
71 |
+
"transformer.encoder.layers.17.input_layernorm.weight": "pytorch_model-00004-of-00007.bin",
|
72 |
+
"transformer.encoder.layers.17.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00007.bin",
|
73 |
+
"transformer.encoder.layers.17.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00007.bin",
|
74 |
+
"transformer.encoder.layers.17.post_attention_layernorm.weight": "pytorch_model-00004-of-00007.bin",
|
75 |
+
"transformer.encoder.layers.17.self_attention.dense.weight": "pytorch_model-00004-of-00007.bin",
|
76 |
+
"transformer.encoder.layers.17.self_attention.query_key_value.bias": "pytorch_model-00004-of-00007.bin",
|
77 |
+
"transformer.encoder.layers.17.self_attention.query_key_value.weight": "pytorch_model-00004-of-00007.bin",
|
78 |
+
"transformer.encoder.layers.18.input_layernorm.weight": "pytorch_model-00005-of-00007.bin",
|
79 |
+
"transformer.encoder.layers.18.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00007.bin",
|
80 |
+
"transformer.encoder.layers.18.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00007.bin",
|
81 |
+
"transformer.encoder.layers.18.post_attention_layernorm.weight": "pytorch_model-00005-of-00007.bin",
|
82 |
+
"transformer.encoder.layers.18.self_attention.dense.weight": "pytorch_model-00005-of-00007.bin",
|
83 |
+
"transformer.encoder.layers.18.self_attention.query_key_value.bias": "pytorch_model-00005-of-00007.bin",
|
84 |
+
"transformer.encoder.layers.18.self_attention.query_key_value.weight": "pytorch_model-00005-of-00007.bin",
|
85 |
+
"transformer.encoder.layers.19.input_layernorm.weight": "pytorch_model-00005-of-00007.bin",
|
86 |
+
"transformer.encoder.layers.19.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00007.bin",
|
87 |
+
"transformer.encoder.layers.19.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00007.bin",
|
88 |
+
"transformer.encoder.layers.19.post_attention_layernorm.weight": "pytorch_model-00005-of-00007.bin",
|
89 |
+
"transformer.encoder.layers.19.self_attention.dense.weight": "pytorch_model-00005-of-00007.bin",
|
90 |
+
"transformer.encoder.layers.19.self_attention.query_key_value.bias": "pytorch_model-00005-of-00007.bin",
|
91 |
+
"transformer.encoder.layers.19.self_attention.query_key_value.weight": "pytorch_model-00005-of-00007.bin",
|
92 |
+
"transformer.encoder.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00007.bin",
|
93 |
+
"transformer.encoder.layers.2.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00007.bin",
|
94 |
+
"transformer.encoder.layers.2.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00007.bin",
|
95 |
+
"transformer.encoder.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00007.bin",
|
96 |
+
"transformer.encoder.layers.2.self_attention.dense.weight": "pytorch_model-00001-of-00007.bin",
|
97 |
+
"transformer.encoder.layers.2.self_attention.query_key_value.bias": "pytorch_model-00001-of-00007.bin",
|
98 |
+
"transformer.encoder.layers.2.self_attention.query_key_value.weight": "pytorch_model-00001-of-00007.bin",
|
99 |
+
"transformer.encoder.layers.20.input_layernorm.weight": "pytorch_model-00005-of-00007.bin",
|
100 |
+
"transformer.encoder.layers.20.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00007.bin",
|
101 |
+
"transformer.encoder.layers.20.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00007.bin",
|
102 |
+
"transformer.encoder.layers.20.post_attention_layernorm.weight": "pytorch_model-00005-of-00007.bin",
|
103 |
+
"transformer.encoder.layers.20.self_attention.dense.weight": "pytorch_model-00005-of-00007.bin",
|
104 |
+
"transformer.encoder.layers.20.self_attention.query_key_value.bias": "pytorch_model-00005-of-00007.bin",
|
105 |
+
"transformer.encoder.layers.20.self_attention.query_key_value.weight": "pytorch_model-00005-of-00007.bin",
|
106 |
+
"transformer.encoder.layers.21.input_layernorm.weight": "pytorch_model-00005-of-00007.bin",
|
107 |
+
"transformer.encoder.layers.21.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00007.bin",
|
108 |
+
"transformer.encoder.layers.21.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00007.bin",
|
109 |
+
"transformer.encoder.layers.21.post_attention_layernorm.weight": "pytorch_model-00005-of-00007.bin",
|
110 |
+
"transformer.encoder.layers.21.self_attention.dense.weight": "pytorch_model-00005-of-00007.bin",
|
111 |
+
"transformer.encoder.layers.21.self_attention.query_key_value.bias": "pytorch_model-00005-of-00007.bin",
|
112 |
+
"transformer.encoder.layers.21.self_attention.query_key_value.weight": "pytorch_model-00005-of-00007.bin",
|
113 |
+
"transformer.encoder.layers.22.input_layernorm.weight": "pytorch_model-00005-of-00007.bin",
|
114 |
+
"transformer.encoder.layers.22.mlp.dense_4h_to_h.weight": "pytorch_model-00006-of-00007.bin",
|
115 |
+
"transformer.encoder.layers.22.mlp.dense_h_to_4h.weight": "pytorch_model-00006-of-00007.bin",
|
116 |
+
"transformer.encoder.layers.22.post_attention_layernorm.weight": "pytorch_model-00006-of-00007.bin",
|
117 |
+
"transformer.encoder.layers.22.self_attention.dense.weight": "pytorch_model-00006-of-00007.bin",
|
118 |
+
"transformer.encoder.layers.22.self_attention.query_key_value.bias": "pytorch_model-00006-of-00007.bin",
|
119 |
+
"transformer.encoder.layers.22.self_attention.query_key_value.weight": "pytorch_model-00006-of-00007.bin",
|
120 |
+
"transformer.encoder.layers.23.input_layernorm.weight": "pytorch_model-00006-of-00007.bin",
|
121 |
+
"transformer.encoder.layers.23.mlp.dense_4h_to_h.weight": "pytorch_model-00006-of-00007.bin",
|
122 |
+
"transformer.encoder.layers.23.mlp.dense_h_to_4h.weight": "pytorch_model-00006-of-00007.bin",
|
123 |
+
"transformer.encoder.layers.23.post_attention_layernorm.weight": "pytorch_model-00006-of-00007.bin",
|
124 |
+
"transformer.encoder.layers.23.self_attention.dense.weight": "pytorch_model-00006-of-00007.bin",
|
125 |
+
"transformer.encoder.layers.23.self_attention.query_key_value.bias": "pytorch_model-00006-of-00007.bin",
|
126 |
+
"transformer.encoder.layers.23.self_attention.query_key_value.weight": "pytorch_model-00006-of-00007.bin",
|
127 |
+
"transformer.encoder.layers.24.input_layernorm.weight": "pytorch_model-00006-of-00007.bin",
|
128 |
+
"transformer.encoder.layers.24.mlp.dense_4h_to_h.weight": "pytorch_model-00006-of-00007.bin",
|
129 |
+
"transformer.encoder.layers.24.mlp.dense_h_to_4h.weight": "pytorch_model-00006-of-00007.bin",
|
130 |
+
"transformer.encoder.layers.24.post_attention_layernorm.weight": "pytorch_model-00006-of-00007.bin",
|
131 |
+
"transformer.encoder.layers.24.self_attention.dense.weight": "pytorch_model-00006-of-00007.bin",
|
132 |
+
"transformer.encoder.layers.24.self_attention.query_key_value.bias": "pytorch_model-00006-of-00007.bin",
|
133 |
+
"transformer.encoder.layers.24.self_attention.query_key_value.weight": "pytorch_model-00006-of-00007.bin",
|
134 |
+
"transformer.encoder.layers.25.input_layernorm.weight": "pytorch_model-00006-of-00007.bin",
|
135 |
+
"transformer.encoder.layers.25.mlp.dense_4h_to_h.weight": "pytorch_model-00006-of-00007.bin",
|
136 |
+
"transformer.encoder.layers.25.mlp.dense_h_to_4h.weight": "pytorch_model-00006-of-00007.bin",
|
137 |
+
"transformer.encoder.layers.25.post_attention_layernorm.weight": "pytorch_model-00006-of-00007.bin",
|
138 |
+
"transformer.encoder.layers.25.self_attention.dense.weight": "pytorch_model-00006-of-00007.bin",
|
139 |
+
"transformer.encoder.layers.25.self_attention.query_key_value.bias": "pytorch_model-00006-of-00007.bin",
|
140 |
+
"transformer.encoder.layers.25.self_attention.query_key_value.weight": "pytorch_model-00006-of-00007.bin",
|
141 |
+
"transformer.encoder.layers.26.input_layernorm.weight": "pytorch_model-00006-of-00007.bin",
|
142 |
+
"transformer.encoder.layers.26.mlp.dense_4h_to_h.weight": "pytorch_model-00007-of-00007.bin",
|
143 |
+
"transformer.encoder.layers.26.mlp.dense_h_to_4h.weight": "pytorch_model-00006-of-00007.bin",
|
144 |
+
"transformer.encoder.layers.26.post_attention_layernorm.weight": "pytorch_model-00006-of-00007.bin",
|
145 |
+
"transformer.encoder.layers.26.self_attention.dense.weight": "pytorch_model-00006-of-00007.bin",
|
146 |
+
"transformer.encoder.layers.26.self_attention.query_key_value.bias": "pytorch_model-00006-of-00007.bin",
|
147 |
+
"transformer.encoder.layers.26.self_attention.query_key_value.weight": "pytorch_model-00006-of-00007.bin",
|
148 |
+
"transformer.encoder.layers.27.input_layernorm.weight": "pytorch_model-00007-of-00007.bin",
|
149 |
+
"transformer.encoder.layers.27.mlp.dense_4h_to_h.weight": "pytorch_model-00007-of-00007.bin",
|
150 |
+
"transformer.encoder.layers.27.mlp.dense_h_to_4h.weight": "pytorch_model-00007-of-00007.bin",
|
151 |
+
"transformer.encoder.layers.27.post_attention_layernorm.weight": "pytorch_model-00007-of-00007.bin",
|
152 |
+
"transformer.encoder.layers.27.self_attention.dense.weight": "pytorch_model-00007-of-00007.bin",
|
153 |
+
"transformer.encoder.layers.27.self_attention.query_key_value.bias": "pytorch_model-00007-of-00007.bin",
|
154 |
+
"transformer.encoder.layers.27.self_attention.query_key_value.weight": "pytorch_model-00007-of-00007.bin",
|
155 |
+
"transformer.encoder.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00007.bin",
|
156 |
+
"transformer.encoder.layers.3.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00007.bin",
|
157 |
+
"transformer.encoder.layers.3.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00007.bin",
|
158 |
+
"transformer.encoder.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00007.bin",
|
159 |
+
"transformer.encoder.layers.3.self_attention.dense.weight": "pytorch_model-00001-of-00007.bin",
|
160 |
+
"transformer.encoder.layers.3.self_attention.query_key_value.bias": "pytorch_model-00001-of-00007.bin",
|
161 |
+
"transformer.encoder.layers.3.self_attention.query_key_value.weight": "pytorch_model-00001-of-00007.bin",
|
162 |
+
"transformer.encoder.layers.4.input_layernorm.weight": "pytorch_model-00002-of-00007.bin",
|
163 |
+
"transformer.encoder.layers.4.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00007.bin",
|
164 |
+
"transformer.encoder.layers.4.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00007.bin",
|
165 |
+
"transformer.encoder.layers.4.post_attention_layernorm.weight": "pytorch_model-00002-of-00007.bin",
|
166 |
+
"transformer.encoder.layers.4.self_attention.dense.weight": "pytorch_model-00002-of-00007.bin",
|
167 |
+
"transformer.encoder.layers.4.self_attention.query_key_value.bias": "pytorch_model-00002-of-00007.bin",
|
168 |
+
"transformer.encoder.layers.4.self_attention.query_key_value.weight": "pytorch_model-00002-of-00007.bin",
|
169 |
+
"transformer.encoder.layers.5.input_layernorm.weight": "pytorch_model-00002-of-00007.bin",
|
170 |
+
"transformer.encoder.layers.5.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00007.bin",
|
171 |
+
"transformer.encoder.layers.5.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00007.bin",
|
172 |
+
"transformer.encoder.layers.5.post_attention_layernorm.weight": "pytorch_model-00002-of-00007.bin",
|
173 |
+
"transformer.encoder.layers.5.self_attention.dense.weight": "pytorch_model-00002-of-00007.bin",
|
174 |
+
"transformer.encoder.layers.5.self_attention.query_key_value.bias": "pytorch_model-00002-of-00007.bin",
|
175 |
+
"transformer.encoder.layers.5.self_attention.query_key_value.weight": "pytorch_model-00002-of-00007.bin",
|
176 |
+
"transformer.encoder.layers.6.input_layernorm.weight": "pytorch_model-00002-of-00007.bin",
|
177 |
+
"transformer.encoder.layers.6.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00007.bin",
|
178 |
+
"transformer.encoder.layers.6.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00007.bin",
|
179 |
+
"transformer.encoder.layers.6.post_attention_layernorm.weight": "pytorch_model-00002-of-00007.bin",
|
180 |
+
"transformer.encoder.layers.6.self_attention.dense.weight": "pytorch_model-00002-of-00007.bin",
|
181 |
+
"transformer.encoder.layers.6.self_attention.query_key_value.bias": "pytorch_model-00002-of-00007.bin",
|
182 |
+
"transformer.encoder.layers.6.self_attention.query_key_value.weight": "pytorch_model-00002-of-00007.bin",
|
183 |
+
"transformer.encoder.layers.7.input_layernorm.weight": "pytorch_model-00002-of-00007.bin",
|
184 |
+
"transformer.encoder.layers.7.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00007.bin",
|
185 |
+
"transformer.encoder.layers.7.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00007.bin",
|
186 |
+
"transformer.encoder.layers.7.post_attention_layernorm.weight": "pytorch_model-00002-of-00007.bin",
|
187 |
+
"transformer.encoder.layers.7.self_attention.dense.weight": "pytorch_model-00002-of-00007.bin",
|
188 |
+
"transformer.encoder.layers.7.self_attention.query_key_value.bias": "pytorch_model-00002-of-00007.bin",
|
189 |
+
"transformer.encoder.layers.7.self_attention.query_key_value.weight": "pytorch_model-00002-of-00007.bin",
|
190 |
+
"transformer.encoder.layers.8.input_layernorm.weight": "pytorch_model-00002-of-00007.bin",
|
191 |
+
"transformer.encoder.layers.8.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00007.bin",
|
192 |
+
"transformer.encoder.layers.8.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00007.bin",
|
193 |
+
"transformer.encoder.layers.8.post_attention_layernorm.weight": "pytorch_model-00003-of-00007.bin",
|
194 |
+
"transformer.encoder.layers.8.self_attention.dense.weight": "pytorch_model-00003-of-00007.bin",
|
195 |
+
"transformer.encoder.layers.8.self_attention.query_key_value.bias": "pytorch_model-00003-of-00007.bin",
|
196 |
+
"transformer.encoder.layers.8.self_attention.query_key_value.weight": "pytorch_model-00003-of-00007.bin",
|
197 |
+
"transformer.encoder.layers.9.input_layernorm.weight": "pytorch_model-00003-of-00007.bin",
|
198 |
+
"transformer.encoder.layers.9.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00007.bin",
|
199 |
+
"transformer.encoder.layers.9.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00007.bin",
|
200 |
+
"transformer.encoder.layers.9.post_attention_layernorm.weight": "pytorch_model-00003-of-00007.bin",
|
201 |
+
"transformer.encoder.layers.9.self_attention.dense.weight": "pytorch_model-00003-of-00007.bin",
|
202 |
+
"transformer.encoder.layers.9.self_attention.query_key_value.bias": "pytorch_model-00003-of-00007.bin",
|
203 |
+
"transformer.encoder.layers.9.self_attention.query_key_value.weight": "pytorch_model-00003-of-00007.bin",
|
204 |
+
"transformer.output_layer.weight": "pytorch_model-00007-of-00007.bin",
|
205 |
+
"transformer.rotary_pos_emb.inv_freq": "pytorch_model-00001-of-00007.bin"
|
206 |
+
}
|
207 |
+
}
|
chatglm3-6b/quantization.py
ADDED
@@ -0,0 +1,188 @@
1 |
+
from torch.nn import Linear
|
2 |
+
from torch.nn.parameter import Parameter
|
3 |
+
|
4 |
+
import bz2
|
5 |
+
import torch
|
6 |
+
import base64
|
7 |
+
import ctypes
|
8 |
+
from transformers.utils import logging
|
9 |
+
|
10 |
+
from typing import List
|
11 |
+
from functools import partial
|
12 |
+
|
13 |
+
logger = logging.get_logger(__name__)
|
14 |
+
|
15 |
+
try:
|
16 |
+
from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up
|
17 |
+
|
18 |
+
class Kernel:
|
19 |
+
def __init__(self, code: bytes, function_names: List[str]):
|
20 |
+
self.code = code
|
21 |
+
self._function_names = function_names
|
22 |
+
self._cmodule = LazyKernelCModule(self.code)
|
23 |
+
|
24 |
+
for name in self._function_names:
|
25 |
+
setattr(self, name, KernelFunction(self._cmodule, name))
|
26 |
+
|
27 |
+
quantization_code = "$QlpoOTFBWSZTWU9yuJUAQHN//////////f/n/8/n///n//bt4dTidcVx8X3V9FV/92/v4B7/AD5FBQFAAAChSgKpFCFAFVSigUAAAEKhSgUUqgFBKigqVREQAABQBQIANDTTIGI00BkZBkNGE0A0BkBkGQGRkaNAaAGQNBoGgDIAAYIGTI0DQAQAaGmmQMRpoDIyDIaMJoBoDIDIMgMjI0aA0AMgaDQNAGQAAwQMmRoGgAgA0NNMgYjTQGRkGQ0YTQDQGQGQZAZGRo0BoAZA0GgaAMgABggZMjQNABABoaaZAxGmgMjIMhowmgGgMgMgyAyMjRoDQAyBoNA0AZAADBAyZGgaAAmqU1NEgJqnptU/Sn4jRR6J6epk2pqb1Q/SgAPUGgyNNGjQ2SBpoAZAAGg0NB6mgDIAAAAA2oaApSREBNAARhGiYEaEwU8pvImlP0k2aam1GaGqbFNM1MHpTwmkepmyU9R6nqPKekHqNNPUxNGhp6n6p6QaZ6o9TG1GMqcoV9ly6nRanHlq6zPNbnGZNi6HSug+2nPiZ13XcnFYZW+45W11CumhzYhchOJ2GLLV1OBjBjGf4TptOddTSOcVxhqYZMYwZXZZY00zI1paX5X9J+b+f4e+x43RXSxXPOdquiGpduatGyXneN696M9t4HU2eR5XX/kPhP261NTx3JO1Ow7LyuDmeo9a7d351T1ZxnvnrvYnrXv/hXxPCeuYx2XsNmO003eg9J3Z6U7b23meJ4ri01OdzTk9BNO96brz+qT5nuvvH3ds/G+m/JcG/F2XYuhXlvO+jP7U3XgrzPN/lr8Sf1n6j4j7jZs+s/T0tNaNNYzTs12rxjwztHlnire3Nzc3N1wuBwOBwXBvZfoHpD7rFmR99V5vj3aXza3xdBbXMalubTg/jIv5dfAi54Pdc75j4z412n3Npj3Ld/ENm7a3b/Cod6h/ret1/5vn/C+l+gdslMvgPSLJ8d8q+U66fevYn/tW1chleEtNTGlcHCbLRlq0tHzF5tsbbZZfHjjLgZu42XCuC3NrdjTasZGNzgxPIrGqp7r3p7L2p5XjnpPSmTd5XtzqnB6U87zzg1Ol0zd0zsLszxR6lkxp35u6/teL0L0W922cR7Lu1lpL9CsHirzuM2T+BgsyViT6LHcm0/Vr6U/7LGGyJeqTEjt0PHWhF5mCT7R9mtlDwriYv0Tyr/OxYt6qp5r0mPVT0608TqnqMZaarU2nFwrTzzlrs1ed7z1ux60wyr4ydCaTi3enW8x68x0zU7tXSlcmPSW1mGpWJMg4zmPC2lK96tp0OE80y4MfEvnZj8zGluR6b22ki1Ou9V2nCd9xovcPvcYMZYy0lvN60ScZ45vN6yeCeeXFb1lVjnnCar5fwXwE2bzJ4HI1XVPXfXZMm44GUsMpYsmLB65TuVdm0cl0b+i/wGNN66XjeV7zuPpHcnK/juhhjdfId5jMdE5nN0dGmmm2zZs2cexD5n9p/dY352XsvXHaZNWWsmmS1atjR452nYudzvqv2HMRyvNNnlMcDl3R2+yx2uVrBubTW9icHDVtbNXlZm7jma1rM4VurZZd2y6nUau7ZXZ7bVU+mnoOVxZGMrVmvX60605JwmzGZhhhjTWtaaaMaaGTGmNMZasY0iX8VMUl8eepaIrzGSpemWOQyZORk2bNpjUybMmxqYmknCGCFynutfksaZpjTNMaaatM0xsxcGR0sociNqxNSmhhR1ZJPbsn8qyF0t2qH6iYBclclalbtTTcHTDsPaX6rlnElph2Jyumumtynv2Kk8GI7rsvXbIcJgHJOSaSXnnGaI3m87RtVXJOZ/YtgdTE6Wpha6ZlE8ayXkef1fh602r2WwvfMXtMdLlkfnLFdYYwYso+bWqm7yJqHXZGw2nrS5ZanSYnWlxBxMF1V940K2wdrI7R6OYf7DGGamMmTSbRhlS45xmVOumF1EyPCmHrrN8wwZOOrdNtLeMtzFzDlWnfTBxMk2NaXIZHBYxYLD4w8yju0ao65Vz1OIXoS9dLanwCe1PWrYuWMqf1if1z2k2yYfKJ741PDgno1ZQ8DRqvUny3mNoWTzGO6m1DkrJI8JiR5cSd+vZdGOO8nrMoc5+NDUFsMSXaZJeNlMmGLtJsovOsUp7I9S5VojKxF6bTVEelXqlfJobQr3LozSh2Jk7VcrVMfhXqszGWMzNqGhqZY0OadxkyyMssKugZR0KNFXBHlqwmJgTE/BNVMk6ItJXZMR0H47GpXv/DMOvNkmVuaV1PRfEdxuqc7Hcd+ZV/zTLaRxWk0nl9CdCeM6mn5rstHIBcpiuwmUZXeq81DacHI2rmrZ5SuE5mOZd6LQrZg9mx32TprA8BMo5jKN6yLTCi3WzQaZSuhzTtM1fUTGVpG8Tw+KXI0tjEpiWxtLYynOlktSbVlaI5kxP8TDH8kx50xoxi5KcA4pcja8KWLRlO/Ks6q06ergnvm1ca3Tq8Uw7LTUsmWyctXPWmpitl/uvGcWTGXGuAXDfhqazGmjkxcJW5hMMMMpYsXl2TZYtVOddG3XCarUt6Ptq9CZXSNzyuRzqRZOjsxdBbFVz6OA5HI43r1jityVlVpVkxmOsyaYWE1NTGq1sOVh36mHMcxtSvcy70edG0ZGR3I1Go1GRlV7mWWo1G0ZGRqlvH40l7o4m5xMWLLLYyNjnqc8556mdPqLJ31n/1nWOncxzG1tizrHs/Z+d2vP/B/l8wdJ6rHUn2nbbDq4p6htFtYzMMMTaZis1K5GKzGNmxhmUx2DDlZ/qNnIx41xnaMfCZWYaZWtNLTNW8ND4Fw1MyZOCdM428suKG1ehW8TesOydg7J+YYcD4cYR+8dFK6M4E3HM9ZfRNNL+Sn6rsl4DsrDl2HpPCnfxjGXtbZtYys1ttlyJ4T+BvexjGWRjMszK4Jpc77D3GyuVD7q0+G8m9G+2+rGm7cOR2y7FdtY2XUYx/oNlfRYxhMYyYZkyyg55enna9Kt/FFi6GMMwYwdwxWgxGMLKYmUyGExTKMZkMFhkymKuh0NOBNnBu+23LdwDoZYYzGGMxtORaTU1pjTGWTTGGtMrNWUsyyTTLLG1qy2ZjbK2DBllWqxMtBMaYZQmcE7zvvRcTkclUwdkxTaSdyySt/7fpL+T1v516Ji97fwr5JbLu305zMn5+GMTTZ9F+y7ExwmGVfG44yxn3dLv6l5i+Wth1jCrDq21nW9LqvvDzz3Vf3LLH/O/32TJ/erx3bXftO4eF+G956D952K/An4NfvOpjFjExjevP/UmE0fIoZXx6/w6lX/no3D0bLt+ixjieBM6ksRd0yB4Lt2SwYNE+gd1detlZWUnpiZfGfFaK+4PyCa/v18V8X75pe9fLXzp7l3VjF76vWZmHwGz1IZNWT7b8yddJ4q5kyrVdfru6atWc7bVYztL9Jf4GXvT+Y8m9/YsXP6H018a8D4XVOqvfzqeR+6yZOD8dPv0+U7/q5Pl+2dNb0MjzGVH5p6MNQ7cOWvw62U9aHE8DprDek+McLyvDz+te+
9Zhq5+YTruufMcWMabqysTmZVWjKPfnK0wyVcrsuhjZRdLkHNvD72b9abriOSGIxiLixMOoalNPXzy+wT/tf+U6HHONfsz+xe8ufHBdQWWGWLA9if0rsnmrxK5LvRZQeWsTCsrmOYy8VteVfuRfcVTtDLItLIsMYxZLdU/DbtSemxF6Z6Zo5WBXE4tFdCyVMMXMTEMZXVlS6Xec2T4e0tHsRcEuWshcJ2YsNF5rUx1E8ifCq6Z+ZP7qdCeu/aTwFd53l16/o0NOw6O3dLavP4Hbi4RdmuDk6DoYaninC0+o4uZjbJ7Rxeu0/FbuFg+q7DVS6fQe0rZ6NDGUNNU6DEqOaLTicKnYZMnBWruljQxoaS3dZhocDge0bSTyOvdAbG5hxe2xji7E/L55xX13wWNDi6HCekcFxfCPGxY0MXC+s7afWaMdDyjyr+o8Rudm/NabOZvdl274zH4f5XK9z6On1Pe/K5TdPAslg77BjuO6Y3eO7GqvOPG/stknp1leyvLL0Z7bl9I4noMvLkzytLhWYzrOZzLXCORe028rORzOg4N/L0HlMOQ3Pgmnbb6KczlabORpu980q37TBqRu0/p3PO6234Bl03Ynuz+9W7gnsEcmvYaYY3aMYY0wx3pYd+ujsXauWdaY5Xkbtl23fPzFHiDB/QMo0yFjBllYxTQYYyxkrwn7JufwJ/PfgJ+C83X69ni6zvXcnyXabv0ncbLwsceS+RNlyN2mnneJtX0ngYO0+e+0+UnA+Wch3ji8hj5an4h+i6XBySU4n+R0roVcbw5yvHrmr4Yw8Y7x6c+9POPYHI5HI5HI5HI5HGXGww4nE4nrVyOR8XeqPEO7PLOiukYa3Novk5hV4cdtYZLI93e+uxff2jRo0aNGjRo0aNG1bVtW1dy3m83m8+tQ5ZzHw3nObwOu8La9Rc1dtkdS8A3eTk823tnktXWlxN6Oixe06zrN70Isd9jiOgZFq9yfkPqP/SLhN2Myl8jDM43bl1nbcb4cO57jlh8Jow6pzXZdL4dyODTuuhu77FyO27DdwdRxmvO+O+3N2+BdqyTwLHVczDVY4UPE4O66/ZO2cx1LFzVdSXtF7G4HMbrauOHRw6c8FdZ5m9fHZHYZXfTlZquyynSyTTKke6vcffSD9pzPA/G7n7jxPmuhc1DHMynPMrGL6AdewYmwu5ko+UUyTwrMv27rPH1v1nGqd87+p6N6LU8k3NEng53xXyHS97+44OSg/sy/hn+Se6yfYNjW0/uTgP+PvWYzLMmjhcLB/gGpri6H83/84eUXWT6T9Hsv7785z/7z4icpW+zfXypuR7rx/gMdZb1/wC678pcs8/2a3mDitGHxl9mfPlll5MafWWqxk/eYuTDgcNMzDGWLWvsuglNxs53GtN6uWpktlW1tZZYcuinMMWmnNnJydze3b2Y1McBxrBkXw799izLMZZYyy0TkbsGM4p03S2uVu5s/XXUdSdec6smVxZYYGpVmT8A+8ajuEyV5FatkvVru2x6uxGXXbH4A+jvgP4GMYy3iPLXzq/6z65+E005ey+cwMZD3fZcqc6xpjTFjQ0P3U+e++cPYmTIwj0nrK5NPTfl3WvpfLtXDcb2HQMudYOxFXQBor4L4T6vrOauFctYXJQ++NUWmJe5bmx1jDiZS1dTqWxo4GR8jm3fttpmPHppk9PEyv4/y8/sO07XacOmcqc0x2Vi9BvNJvN5oW8x4mOsydpidRxMYJPx06m1bqPzq9KtK8sxXNXFodD/+MYYaJTLwOhc9brCsV18oOR1i4tXChyTkq4lf4y1Ke+9axjDHqs1mfBbMXuP4Hzi+X7t8vzv7bHerrUPgPCxhjre4fXdfLNtNM+Jd+Zdh8xd8wP87uNPoPgv4W7/5P2BuxfsMabNnMnza+54Pdi5U671GPZY8CehX8Voeoo7FHpkeEc6715FwHZrIrUrHaviPUbPZHND+IhczrP6FcYvhOZ0Di/ETt0OI+YwNWR9r7tpf6WDeZKZDB1+z2IthOl1mPyb5FluvEx9h9d0NnM0Y1XPFkWIsk1WotJ0PBMmkvjvQTd0e71tfeV+8r8lQ/tpzpsmxJ+InrI/dj2UajUajVTUajatRqNRtGo1Go1Go4wjeMpZFMVV9CHbofPraLsJ3JpWV2XOoanCuFky4y3PPNxucK2uKC1Lbdb1eo+m5XomN6HfeZsabHLHRX/K+offtNGGmHWctcVcG44MdSqsOLY9VzX+Zxfxn2HPdWTpzWvkrtJ8M5zorrKcquRytJ5N5DZmcaW02l76nWO+BqPXm1A2Ry/0q71dH/mqrqeFjkYxjEXtsX8qubTk67rGycyqsdm4tZx5D6D5hhi0waaWmiaMP81Yjii5qxPlPuU/GfTL1Y5E6Jyfiq63qTa39A4J0sOGDgO9WF9bOXl0XfPRbsY2bPNKPy1YrFYrFYmRhhlTIyMjJWJYZHXuCXI8OoXsvfljGLFicNifpp2XunoPiG1wtx3p1Tah+/DD66OnVtVXP9rKbVxOnL0tR/rHtqB5UDErUVcl11D4qqvjpOcxX7armUNJB3LpW6bxVvD08e8h3odKKvyCFZBdSh2FVcST9xV3n3T8t1j7Kr9qgrqXg+13Pt5U7JCvFXVIV1YG5lRhkVYZJYYDDD4KOIMoHCp26WS8GB7uBh2zIdgq/PKyInjV2STShuoapUdCpX1yTwqq/z1VvET7Kh5nVPkO8YyxjLt2MaaMmWTLQvx3qnzltnXW0p2jxgbEtSny/Osv8Y9pLMXYoHVPAhkVdWVeODhR6q9/Sxe2liwwZWMVvFXfRkeIDxAePUPIrdJ4ey6yquzH+PD/bUOWAu05qVHtFd8rrKHSoeNIOUqrYr3FXyToqfYJgwmJdKpXXOwYYegNNGMzfZPp/t3t/DVs4zjNTN61rRqaWaa4NYbRjTa0tWwy2Y2tGN8ZO8ofNKq4j9SL7I+cSm4/6ovLV5HNXLI0jJidwrtk6ynCaP6Z++GjRlWS3tLeW129Mi9evxU9mtz6s5J3Z7M2ngTgnKvmpomxpaLCzPfmx0JWE+m3NLDDGOX47RctdYYNK5jakdqLkRlI39n590T5zctGSwwZZDJj6kW8XSi6ot2MmWWJ0DUT3nuvebBudScjZ79g8cWJ8av0k+/bE5WKd5MdbFpbDVMxu1DVMmtNZGJvq1mtRbn6M+g/kP0FwDwr7quZs7xosNGpbscyxhhd9TyJyFwbLcxlTasg75vW7TsV5K7ji44XPMMrdoj+Y3rT0Hie62nlYV/pwczzOmdLqLhYkzGMzCZWGMQzGMSsZYY6Di1t4nlJ+Em63mJxrVLxPbYxNEdgc1dU2iOKyoYYWjNrEeHTYybVk0atSa7ehuwsWMWTqn1TrnS6hYsi71d1+s+k+ic70e20fzE/VaTdxT9ZtU4GIXdeNx3X77guYYfpHeTQjaMX6brOu4OY4K7Y2d9mbHarI5ox3p4GpJ2Vd/Tst60f7j999pppjR+Q/Qf8J/VaORs3cji7FfFuN61+ui9s8hix1OCh5KGVV23BPXvZfz3CLyHpi
x+exi8z/KnCnosY2eunor+cxyPO/xJ0vKey9OvE9VjqaYu0x3Z3jd6o2b1T12D+F8l232lwaaacD5LE8LBxu7WTlbWraWpew8Xexjel3E+wWD4APITdNqR8F3R3T0lunCQ4GaE9R37DxeCYfcHi4xci5ovKfxVs55y2hf+65E/Xdp6jR5nrebTmi5incpkyOjs50JvrZwstbbW6kfuuQw+2mykf/EXNFzxfKTrxew929TR6bWnGL//F3JFOFCQT3K4lQ"
|
28 |
+
|
29 |
+
kernels = Kernel(
|
30 |
+
bz2.decompress(base64.b64decode(quantization_code)),
|
31 |
+
[
|
32 |
+
"int4WeightCompression",
|
33 |
+
"int4WeightExtractionFloat",
|
34 |
+
"int4WeightExtractionHalf",
|
35 |
+
"int8WeightExtractionFloat",
|
36 |
+
"int8WeightExtractionHalf",
|
37 |
+
],
|
38 |
+
)
|
39 |
+
except Exception as exception:
|
40 |
+
kernels = None
|
41 |
+
logger.warning("Failed to load cpm_kernels:" + str(exception))
|
42 |
+
|
43 |
+
|
44 |
+
class W8A16Linear(torch.autograd.Function):
|
45 |
+
@staticmethod
|
46 |
+
def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width):
|
47 |
+
ctx.inp_shape = inp.size()
|
48 |
+
ctx.weight_bit_width = weight_bit_width
|
49 |
+
out_features = quant_w.size(0)
|
50 |
+
inp = inp.contiguous().view(-1, inp.size(-1))
|
51 |
+
weight = extract_weight_to_half(quant_w, scale_w, weight_bit_width)
|
52 |
+
ctx.weight_shape = weight.size()
|
53 |
+
output = inp.mm(weight.t())
|
54 |
+
ctx.save_for_backward(inp, quant_w, scale_w)
|
55 |
+
return output.view(*(ctx.inp_shape[:-1] + (out_features,)))
|
56 |
+
|
57 |
+
@staticmethod
|
58 |
+
def backward(ctx, grad_output: torch.Tensor):
|
59 |
+
inp, quant_w, scale_w = ctx.saved_tensors
|
60 |
+
weight = extract_weight_to_half(quant_w, scale_w, ctx.weight_bit_width)
|
61 |
+
grad_output = grad_output.contiguous().view(-1, weight.size(0))
|
62 |
+
grad_input = grad_output.mm(weight)
|
63 |
+
grad_weight = grad_output.t().mm(inp)
|
64 |
+
return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None, None
|
65 |
+
|
66 |
+
|
67 |
+
def compress_int4_weight(weight: torch.Tensor): # (n, m)
|
68 |
+
with torch.cuda.device(weight.device):
|
69 |
+
n, m = weight.size(0), weight.size(1)
|
70 |
+
assert m % 2 == 0
|
71 |
+
m = m // 2
|
72 |
+
out = torch.empty(n, m, dtype=torch.int8, device="cuda")
|
73 |
+
stream = torch.cuda.current_stream()
|
74 |
+
|
75 |
+
gridDim = (n, 1, 1)
|
76 |
+
blockDim = (min(round_up(m, 32), 1024), 1, 1)
|
77 |
+
|
78 |
+
kernels.int4WeightCompression(
|
79 |
+
gridDim,
|
80 |
+
blockDim,
|
81 |
+
0,
|
82 |
+
stream,
|
83 |
+
[ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)],
|
84 |
+
)
|
85 |
+
return out
|
86 |
+
|
87 |
+
|
88 |
+
def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int):
|
89 |
+
assert scale_list.dtype in [torch.half, torch.bfloat16]
|
90 |
+
assert weight.dtype in [torch.int8]
|
91 |
+
if source_bit_width == 8:
|
92 |
+
return weight.to(scale_list.dtype) * scale_list[:, None]
|
93 |
+
elif source_bit_width == 4:
|
94 |
+
func = (
|
95 |
+
kernels.int4WeightExtractionHalf if scale_list.dtype == torch.half else kernels.int4WeightExtractionBFloat16
|
96 |
+
)
|
97 |
+
else:
|
98 |
+
assert False, "Unsupported bit-width"
|
99 |
+
|
100 |
+
with torch.cuda.device(weight.device):
|
101 |
+
n, m = weight.size(0), weight.size(1)
|
102 |
+
out = torch.empty(n, m * (8 // source_bit_width), dtype=scale_list.dtype, device="cuda")
|
103 |
+
stream = torch.cuda.current_stream()
|
104 |
+
|
105 |
+
gridDim = (n, 1, 1)
|
106 |
+
blockDim = (min(round_up(m, 32), 1024), 1, 1)
|
107 |
+
|
108 |
+
func(
|
109 |
+
gridDim,
|
110 |
+
blockDim,
|
111 |
+
0,
|
112 |
+
stream,
|
113 |
+
[
|
114 |
+
ctypes.c_void_p(weight.data_ptr()),
|
115 |
+
ctypes.c_void_p(scale_list.data_ptr()),
|
116 |
+
ctypes.c_void_p(out.data_ptr()),
|
117 |
+
ctypes.c_int32(n),
|
118 |
+
ctypes.c_int32(m),
|
119 |
+
],
|
120 |
+
)
|
121 |
+
return out
|
122 |
+
|
123 |
+
|
124 |
+
class QuantizedLinear(torch.nn.Module):
|
125 |
+
def __init__(self, weight_bit_width: int, weight, bias=None, device="cpu", dtype=None, empty_init=False, *args,
|
126 |
+
**kwargs):
|
127 |
+
super().__init__()
|
128 |
+
self.weight_bit_width = weight_bit_width
|
129 |
+
|
130 |
+
shape = weight.shape
|
131 |
+
|
132 |
+
if weight is None or empty_init:
|
133 |
+
self.weight = torch.empty(shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=device)
|
134 |
+
self.weight_scale = torch.empty(shape[0], dtype=dtype, device=device)
|
135 |
+
else:
|
136 |
+
self.weight_scale = weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)
|
137 |
+
self.weight = torch.round(weight / self.weight_scale[:, None]).to(torch.int8)
|
138 |
+
if weight_bit_width == 4:
|
139 |
+
self.weight = compress_int4_weight(self.weight)
|
140 |
+
|
141 |
+
self.weight = Parameter(self.weight.to(device), requires_grad=False)
|
142 |
+
self.weight_scale = Parameter(self.weight_scale.to(device), requires_grad=False)
|
143 |
+
self.bias = Parameter(bias.to(device), requires_grad=False) if bias is not None else None
|
144 |
+
|
145 |
+
def forward(self, input):
|
146 |
+
output = W8A16Linear.apply(input, self.weight, self.weight_scale, self.weight_bit_width)
|
147 |
+
if self.bias is not None:
|
148 |
+
output = output + self.bias
|
149 |
+
return output
|
150 |
+
|
151 |
+
|
152 |
+
def quantize(model, weight_bit_width, empty_init=False, device=None):
|
153 |
+
"""Replace fp16 linear with quantized linear"""
|
154 |
+
for layer in model.layers:
|
155 |
+
layer.self_attention.query_key_value = QuantizedLinear(
|
156 |
+
weight_bit_width=weight_bit_width,
|
157 |
+
weight=layer.self_attention.query_key_value.weight.to(torch.cuda.current_device()),
|
158 |
+
bias=layer.self_attention.query_key_value.bias,
|
159 |
+
dtype=layer.self_attention.query_key_value.weight.dtype,
|
160 |
+
device=layer.self_attention.query_key_value.weight.device if device is None else device,
|
161 |
+
empty_init=empty_init
|
162 |
+
)
|
163 |
+
layer.self_attention.dense = QuantizedLinear(
|
164 |
+
weight_bit_width=weight_bit_width,
|
165 |
+
weight=layer.self_attention.dense.weight.to(torch.cuda.current_device()),
|
166 |
+
bias=layer.self_attention.dense.bias,
|
167 |
+
dtype=layer.self_attention.dense.weight.dtype,
|
168 |
+
device=layer.self_attention.dense.weight.device if device is None else device,
|
169 |
+
empty_init=empty_init
|
170 |
+
)
|
171 |
+
layer.mlp.dense_h_to_4h = QuantizedLinear(
|
172 |
+
weight_bit_width=weight_bit_width,
|
173 |
+
weight=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()),
|
174 |
+
bias=layer.mlp.dense_h_to_4h.bias,
|
175 |
+
dtype=layer.mlp.dense_h_to_4h.weight.dtype,
|
176 |
+
device=layer.mlp.dense_h_to_4h.weight.device if device is None else device,
|
177 |
+
empty_init=empty_init
|
178 |
+
)
|
179 |
+
layer.mlp.dense_4h_to_h = QuantizedLinear(
|
180 |
+
weight_bit_width=weight_bit_width,
|
181 |
+
weight=layer.mlp.dense_4h_to_h.weight.to(torch.cuda.current_device()),
|
182 |
+
bias=layer.mlp.dense_4h_to_h.bias,
|
183 |
+
dtype=layer.mlp.dense_4h_to_h.weight.dtype,
|
184 |
+
device=layer.mlp.dense_4h_to_h.weight.device if device is None else device,
|
185 |
+
empty_init=empty_init
|
186 |
+
)
|
187 |
+
|
188 |
+
return model
|
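The `quantize()` helper above rewrites each transformer layer's four linear projections in place. As a quick orientation, here is a minimal usage sketch; it is an illustration, not part of the repository. The attribute path `model.transformer.encoder` (whose `.layers` the helper iterates) is inferred from the parameter names in `pytorch_model.bin.index.json`, and the direct `from quantization import quantize` import is an illustration shortcut (in normal use the checkpoint's own modelling code wires this helper up when quantization is requested).

```python
# Minimal sketch (illustration only): applying quantize() above to ChatGLM3-6B.
# Assumptions: a CUDA device is available, and the module layout is
# model.transformer.encoder.layers, matching pytorch_model.bin.index.json.
from transformers import AutoModel

from quantization import quantize  # the module shown above, assumed importable

model = AutoModel.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True)
model = model.half().cuda()

# Replace every fp16 Linear in the encoder with a 4-bit QuantizedLinear.
quantize(model.transformer.encoder, weight_bit_width=4)
```

Each `QuantizedLinear` keeps a per-output-channel scale (`weight.abs().max(dim=-1).values / (2**(weight_bit_width - 1) - 1)`) alongside the int8-packed weights, and `W8A16Linear` dequantizes back to half precision on the fly during the forward pass.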
chatglm3-6b/tokenization_chatglm.py
ADDED
@@ -0,0 +1,300 @@
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import re
|
4 |
+
from typing import List, Optional, Union, Dict
|
5 |
+
from sentencepiece import SentencePieceProcessor
|
6 |
+
from transformers import PreTrainedTokenizer
|
7 |
+
from transformers.utils import logging, PaddingStrategy
|
8 |
+
from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
|
9 |
+
|
10 |
+
|
11 |
+
class SPTokenizer:
|
12 |
+
def __init__(self, model_path: str):
|
13 |
+
# reload tokenizer
|
14 |
+
assert os.path.isfile(model_path), model_path
|
15 |
+
self.sp_model = SentencePieceProcessor(model_file=model_path)
|
16 |
+
|
17 |
+
# BOS / EOS token IDs
|
18 |
+
self.n_words: int = self.sp_model.vocab_size()
|
19 |
+
self.bos_id: int = self.sp_model.bos_id()
|
20 |
+
self.eos_id: int = self.sp_model.eos_id()
|
21 |
+
self.pad_id: int = self.sp_model.unk_id()
|
22 |
+
assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
|
23 |
+
|
24 |
+
role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
|
25 |
+
special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
|
26 |
+
self.special_tokens = {}
|
27 |
+
self.index_special_tokens = {}
|
28 |
+
for token in special_tokens:
|
29 |
+
self.special_tokens[token] = self.n_words
|
30 |
+
self.index_special_tokens[self.n_words] = token
|
31 |
+
self.n_words += 1
|
32 |
+
self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens])
|
33 |
+
|
34 |
+
def tokenize(self, s: str, encode_special_tokens=False):
|
35 |
+
if encode_special_tokens:
|
36 |
+
last_index = 0
|
37 |
+
t = []
|
38 |
+
for match in re.finditer(self.role_special_token_expression, s):
|
39 |
+
if last_index < match.start():
|
40 |
+
t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()]))
|
41 |
+
t.append(s[match.start():match.end()])
|
42 |
+
last_index = match.end()
|
43 |
+
if last_index < len(s):
|
44 |
+
t.extend(self.sp_model.EncodeAsPieces(s[last_index:]))
|
45 |
+
return t
|
46 |
+
else:
|
47 |
+
return self.sp_model.EncodeAsPieces(s)
|
48 |
+
|
49 |
+
def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
|
50 |
+
assert type(s) is str
|
51 |
+
t = self.sp_model.encode(s)
|
52 |
+
if bos:
|
53 |
+
t = [self.bos_id] + t
|
54 |
+
if eos:
|
55 |
+
t = t + [self.eos_id]
|
56 |
+
return t
|
57 |
+
|
58 |
+
def decode(self, t: List[int]) -> str:
|
59 |
+
text, buffer = "", []
|
60 |
+
for token in t:
|
61 |
+
if token in self.index_special_tokens:
|
62 |
+
if buffer:
|
63 |
+
text += self.sp_model.decode(buffer)
|
64 |
+
buffer = []
|
65 |
+
text += self.index_special_tokens[token]
|
66 |
+
else:
|
67 |
+
buffer.append(token)
|
68 |
+
if buffer:
|
69 |
+
text += self.sp_model.decode(buffer)
|
70 |
+
return text
|
71 |
+
|
72 |
+
def decode_tokens(self, tokens: List[str]) -> str:
|
73 |
+
text = self.sp_model.DecodePieces(tokens)
|
74 |
+
return text
|
75 |
+
|
76 |
+
def convert_token_to_id(self, token):
|
77 |
+
""" Converts a token (str) in an id using the vocab. """
|
78 |
+
if token in self.special_tokens:
|
79 |
+
return self.special_tokens[token]
|
80 |
+
return self.sp_model.PieceToId(token)
|
81 |
+
|
82 |
+
def convert_id_to_token(self, index):
|
83 |
+
"""Converts an index (integer) in a token (str) using the vocab."""
|
84 |
+
if index in self.index_special_tokens:
|
85 |
+
return self.index_special_tokens[index]
|
86 |
+
if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0:
|
87 |
+
return ""
|
88 |
+
return self.sp_model.IdToPiece(index)
|
89 |
+
|
90 |
+
|
91 |
+
class ChatGLMTokenizer(PreTrainedTokenizer):
|
92 |
+
vocab_files_names = {"vocab_file": "tokenizer.model"}
|
93 |
+
|
94 |
+
model_input_names = ["input_ids", "attention_mask", "position_ids"]
|
95 |
+
|
96 |
+
def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False,
|
97 |
+
**kwargs):
|
98 |
+
self.name = "GLMTokenizer"
|
99 |
+
|
100 |
+
self.vocab_file = vocab_file
|
101 |
+
self.tokenizer = SPTokenizer(vocab_file)
|
102 |
+
self.special_tokens = {
|
103 |
+
"<bos>": self.tokenizer.bos_id,
|
104 |
+
"<eos>": self.tokenizer.eos_id,
|
105 |
+
"<pad>": self.tokenizer.pad_id
|
106 |
+
}
|
107 |
+
self.encode_special_tokens = encode_special_tokens
|
108 |
+
super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
109 |
+
encode_special_tokens=encode_special_tokens,
|
110 |
+
**kwargs)
|
111 |
+
|
112 |
+
def get_command(self, token):
|
113 |
+
if token in self.special_tokens:
|
114 |
+
return self.special_tokens[token]
|
115 |
+
assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
|
116 |
+
return self.tokenizer.special_tokens[token]
|
117 |
+
|
118 |
+
@property
|
119 |
+
def unk_token(self) -> str:
|
120 |
+
return "<unk>"
|
121 |
+
|
122 |
+
@property
|
123 |
+
def pad_token(self) -> str:
|
124 |
+
return "<unk>"
|
125 |
+
|
126 |
+
@property
|
127 |
+
def pad_token_id(self):
|
128 |
+
return self.get_command("<pad>")
|
129 |
+
|
130 |
+
@property
|
131 |
+
def eos_token(self) -> str:
|
132 |
+
return "</s>"
|
133 |
+
|
134 |
+
@property
|
135 |
+
def eos_token_id(self):
|
136 |
+
return self.get_command("<eos>")
|
137 |
+
|
138 |
+
@property
|
139 |
+
def vocab_size(self):
|
140 |
+
return self.tokenizer.n_words
|
141 |
+
|
142 |
+
def get_vocab(self):
|
143 |
+
""" Returns vocab as a dict """
|
144 |
+
vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
|
145 |
+
vocab.update(self.added_tokens_encoder)
|
146 |
+
return vocab
|
147 |
+
|
148 |
+
def _tokenize(self, text, **kwargs):
|
149 |
+
return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens)
|
150 |
+
|
151 |
+
def _convert_token_to_id(self, token):
|
152 |
+
""" Converts a token (str) in an id using the vocab. """
|
153 |
+
return self.tokenizer.convert_token_to_id(token)
|
154 |
+
|
155 |
+
def _convert_id_to_token(self, index):
|
156 |
+
"""Converts an index (integer) in a token (str) using the vocab."""
|
157 |
+
return self.tokenizer.convert_id_to_token(index)
|
158 |
+
|
159 |
+
def convert_tokens_to_string(self, tokens: List[str]) -> str:
|
160 |
+
return self.tokenizer.decode_tokens(tokens)
|
161 |
+
|
162 |
+
def save_vocabulary(self, save_directory, filename_prefix=None):
|
163 |
+
"""
|
164 |
+
Save the vocabulary and special tokens file to a directory.
|
165 |
+
|
166 |
+
Args:
|
167 |
+
save_directory (`str`):
|
168 |
+
The directory in which to save the vocabulary.
|
169 |
+
filename_prefix (`str`, *optional*):
|
170 |
+
An optional prefix to add to the named of the saved files.
|
171 |
+
|
172 |
+
Returns:
|
173 |
+
`Tuple(str)`: Paths to the files saved.
|
174 |
+
"""
|
175 |
+
if os.path.isdir(save_directory):
|
176 |
+
vocab_file = os.path.join(
|
177 |
+
save_directory, self.vocab_files_names["vocab_file"]
|
178 |
+
)
|
179 |
+
else:
|
180 |
+
vocab_file = save_directory
|
181 |
+
|
182 |
+
with open(self.vocab_file, 'rb') as fin:
|
183 |
+
proto_str = fin.read()
|
184 |
+
|
185 |
+
with open(vocab_file, "wb") as writer:
|
186 |
+
writer.write(proto_str)
|
187 |
+
|
188 |
+
return (vocab_file,)
|
189 |
+
|
190 |
+
def get_prefix_tokens(self):
|
191 |
+
prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")]
|
192 |
+
return prefix_tokens
|
193 |
+
|
194 |
+
def build_single_message(self, role, metadata, message):
|
195 |
+
assert role in ["system", "user", "assistant", "observation"], role
|
196 |
+
role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n")
|
197 |
+
message_tokens = self.tokenizer.encode(message)
|
198 |
+
tokens = role_tokens + message_tokens
|
199 |
+
return tokens
|
200 |
+
|
201 |
+
def build_chat_input(self, query, history=None, role="user"):
|
202 |
+
if history is None:
|
203 |
+
history = []
|
204 |
+
input_ids = []
|
205 |
+
for item in history:
|
206 |
+
content = item["content"]
|
207 |
+
if item["role"] == "system" and "tools" in item:
|
208 |
+
content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False)
|
209 |
+
input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content))
|
210 |
+
input_ids.extend(self.build_single_message(role, "", query))
|
211 |
+
input_ids.extend([self.get_command("<|assistant|>")])
|
212 |
+
return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True)
|
213 |
+
|
214 |
+
def build_inputs_with_special_tokens(
|
215 |
+
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
216 |
+
) -> List[int]:
|
217 |
+
"""
|
218 |
+
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
|
219 |
+
adding special tokens. A BERT sequence has the following format:
|
220 |
+
|
221 |
+
- single sequence: `[CLS] X [SEP]`
|
222 |
+
- pair of sequences: `[CLS] A [SEP] B [SEP]`
|
223 |
+
|
224 |
+
Args:
|
225 |
+
token_ids_0 (`List[int]`):
|
226 |
+
List of IDs to which the special tokens will be added.
|
227 |
+
token_ids_1 (`List[int]`, *optional*):
|
228 |
+
Optional second list of IDs for sequence pairs.
|
229 |
+
|
230 |
+
Returns:
|
231 |
+
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
|
232 |
+
"""
|
233 |
+
prefix_tokens = self.get_prefix_tokens()
|
234 |
+
token_ids_0 = prefix_tokens + token_ids_0
|
235 |
+
if token_ids_1 is not None:
|
236 |
+
token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("<eos>")]
|
237 |
+
return token_ids_0
|
238 |
+
|
239 |
+
def _pad(
|
240 |
+
self,
|
241 |
+
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
|
242 |
+
max_length: Optional[int] = None,
|
243 |
+
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
|
244 |
+
pad_to_multiple_of: Optional[int] = None,
|
245 |
+
return_attention_mask: Optional[bool] = None,
|
246 |
+
) -> dict:
|
247 |
+
"""
|
248 |
+
Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
|
249 |
+
|
250 |
+
Args:
|
251 |
+
encoded_inputs:
|
252 |
+
Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
|
253 |
+
max_length: maximum length of the returned list and optionally padding length (see below).
|
254 |
+
Will truncate by taking into account the special tokens.
|
255 |
+
padding_strategy: PaddingStrategy to use for padding.
|
256 |
+
|
257 |
+
- PaddingStrategy.LONGEST Pad to the longest sequence in the batch
|
258 |
+
- PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
|
259 |
+
- PaddingStrategy.DO_NOT_PAD: Do not pad
|
260 |
+
The tokenizer padding sides are defined in self.padding_side:
|
261 |
+
|
262 |
+
- 'left': pads on the left of the sequences
|
263 |
+
- 'right': pads on the right of the sequences
|
264 |
+
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
|
265 |
+
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
|
266 |
+
`>= 7.5` (Volta).
|
267 |
+
return_attention_mask:
|
268 |
+
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
|
269 |
+
"""
|
270 |
+
# Load from model defaults
|
271 |
+
assert self.padding_side == "left"
|
272 |
+
|
273 |
+
required_input = encoded_inputs[self.model_input_names[0]]
|
274 |
+
seq_length = len(required_input)
|
275 |
+
|
276 |
+
if padding_strategy == PaddingStrategy.LONGEST:
|
277 |
+
max_length = len(required_input)
|
278 |
+
|
279 |
+
if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
|
280 |
+
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
|
281 |
+
|
282 |
+
needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
|
283 |
+
|
284 |
+
# Initialize attention mask if not present.
|
285 |
+
if "attention_mask" not in encoded_inputs:
|
286 |
+
encoded_inputs["attention_mask"] = [1] * seq_length
|
287 |
+
|
288 |
+
if "position_ids" not in encoded_inputs:
|
289 |
+
encoded_inputs["position_ids"] = list(range(seq_length))
|
290 |
+
|
291 |
+
if needs_to_be_padded:
|
292 |
+
difference = max_length - len(required_input)
|
293 |
+
|
294 |
+
if "attention_mask" in encoded_inputs:
|
295 |
+
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
|
296 |
+
if "position_ids" in encoded_inputs:
|
297 |
+
encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
|
298 |
+
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
|
299 |
+
|
300 |
+
return encoded_inputs
|
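To make the flow of the tokenizer above concrete, here is a minimal usage sketch. It is an illustration only; it assumes the files are loaded from `THUDM/chatglm3-6b` with `trust_remote_code=True`, which lets `AutoTokenizer` resolve to `ChatGLMTokenizer` through the `auto_map` entry in `tokenizer_config.json` shown below.

```python
# Minimal sketch (illustration only): assembling a multi-turn prompt with
# ChatGLMTokenizer.build_chat_input from the file above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True)

history = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "metadata": "", "content": "Hi! How can I help you?"},
]

# build_chat_input prepends the [gMASK]/sop prefix, wraps every turn in its
# <|role|> special token, and ends with <|assistant|> so that generation
# continues as the assistant's reply.
inputs = tokenizer.build_chat_input("What can you do?", history=history, role="user")
print(inputs["input_ids"].shape)  # (1, sequence_length) tensor of token ids
```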
chatglm3-6b/tokenizer.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2
size 1018370
chatglm3-6b/tokenizer_config.json
ADDED
@@ -0,0 +1,12 @@
{
  "name_or_path": "THUDM/chatglm3-6b",
  "remove_space": false,
  "do_lower_case": false,
  "tokenizer_class": "ChatGLMTokenizer",
  "auto_map": {
    "AutoTokenizer": [
      "tokenization_chatglm.ChatGLMTokenizer",
      null
    ]
  }
}
composite_demo/.streamlit/config.toml
ADDED
@@ -0,0 +1,2 @@
[theme]
font = "monospace"
composite_demo/README.md
ADDED
@@ -0,0 +1,85 @@
# ChatGLM3 Web Demo

![Demo webpage](assets/demo.png)

## Installation

We recommend managing the environment with [Conda](https://docs.conda.io/en/latest/).

Run the following commands to create a new conda environment and install the required dependencies:

```bash
conda create -n chatglm3-demo python=3.10
conda activate chatglm3-demo
pip install -r requirements.txt
```

Please note that this project requires Python 3.10 or higher.

In addition, a Jupyter kernel needs to be installed to use the Code Interpreter:

```bash
ipython kernel install --name chatglm3-demo --user
```

## Running

Run the following command to load the model locally and start the demo:

```bash
streamlit run main.py
```

The demo's address is then shown on the command line; click it to access. The first visit needs to download and load the model, which may take some time.

If the model has already been downloaded locally, you can load it from a local path via `export MODEL_PATH=/path/to/model`. If you need a custom Jupyter kernel, specify it via `export IPYKERNEL=<kernel_name>`.

## Usage

The ChatGLM3 demo has three modes:

- Chat: dialogue mode, in which you can converse with the model.
- Tool: tool mode, in which the model can not only converse but also perform other operations through tools.
- Code Interpreter: code interpreter mode, in which the model can execute code in a Jupyter environment and read the results to complete complex tasks.

### Dialogue Mode

In dialogue mode, users can directly modify parameters such as top_p, temperature, and System Prompt in the sidebar to adjust the model's behavior. For example,

![Settings panel and conversation](assets/emojis.png)

### Tool Mode

You can enhance the model's capabilities by registering new tools in `tool_registry.py`. Simply decorate a function with `@register_tool` to register it. For the tool declaration, the function name is the tool's name and the function docstring is the tool's description; for tool parameters, use `Annotated[typ: type, description: str, required: bool]` to annotate each parameter's type, description, and whether it is required.

For example, the `get_weather` tool is registered as follows:

```python
@register_tool
def get_weather(
    city_name: Annotated[str, 'The name of the city to be queried', True],
) -> str:
    """
    Get the weather for `city_name` in the following week
    """
    ...
```

![The model uses a tool to query the weather in Paris](assets/tool.png)

In addition, you can enter manual mode through `Manual mode` on the page. In this mode, you can specify the tool list directly in YAML, but you need to feed the tools' output back to the model manually.

### Code Interpreter Mode

Because it has a code execution environment, the model in this mode can carry out more complex tasks, such as drawing charts and performing symbolic computation. The model automatically executes multiple code blocks in succession, based on its understanding of how far the task has progressed, until the task is complete. Therefore, in this mode you only need to state the task you want the model to perform.

For example, we can ask ChatGLM3 to draw a heart:

![ChatGLM3 draws a heart](assets/heart.png)

### Additional Tips

- While the model is generating text, you can interrupt it with the `Stop` button at the top right corner of the page.
- Refresh the page to clear the conversation history.

# Enjoy!
composite_demo/README_en.md
ADDED
@@ -0,0 +1,85 @@
# ChatGLM3 Web Demo

![Demo webpage](assets/demo.png)

## Installation

We recommend managing environments through [Conda](https://docs.conda.io/en/latest/).

Execute the following commands to create a new conda environment and install the necessary dependencies:

```bash
conda create -n chatglm3-demo python=3.10
conda activate chatglm3-demo
pip install -r requirements.txt
```

Please note that this project requires Python 3.10 or higher.

Additionally, installing a Jupyter kernel is required for using the Code Interpreter:

```bash
ipython kernel install --name chatglm3-demo --user
```

## Execution

Run the following command to load the model locally and start the demo:

```bash
streamlit run main.py
```

Afterward, the demo's address is shown on the command line; click it to access. The first visit requires downloading and loading the model, which may take some time.

If the model has already been downloaded locally, you can load it from a local path through `export MODEL_PATH=/path/to/model`. If you need to customize the Jupyter kernel, you can specify it through `export IPYKERNEL=<kernel_name>`.

## Usage

ChatGLM3 Demo has three modes:

- Chat: dialogue mode, where you can interact with the model.
- Tool: tool mode, where the model, in addition to dialogue, can perform other operations through tools.
- Code Interpreter: code interpreter mode, where the model can execute code in a Jupyter environment and obtain the results to complete complex tasks.

### Dialogue Mode

In dialogue mode, users can directly modify parameters such as top_p, temperature, and System Prompt in the sidebar to adjust the behavior of the model. For example,

![Conversation with adjusted settings](assets/emojis.png)

### Tool Mode

You can enhance the model's capabilities by registering new tools in `tool_registry.py`. Just use the `@register_tool` decorator to complete the registration. For tool declarations, the function name is the name of the tool, and the function docstring is the description of the tool; for tool parameters, use `Annotated[typ: type, description: str, required: bool]` to annotate each parameter's type, description, and whether it is required. A further, hypothetical registration sketch follows the `get_weather` example below.

For example, the registration of the `get_weather` tool is as follows:

```python
@register_tool
def get_weather(
    city_name: Annotated[str, 'The name of the city to be queried', True],
) -> str:
    """
    Get the weather for `city_name` in the following week
    """
    ...
```
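As a further, purely hypothetical illustration of the same convention, a second tool could be registered as shown below; the `get_stock_price` name, its parameters, and its body are invented for this sketch and are not part of the demo:

```python
@register_tool
def get_stock_price(
    symbol: Annotated[str, 'The stock symbol to look up, e.g. "AAPL"', True],
    currency: Annotated[str, 'The currency to report the price in', False],
) -> str:
    """
    Look up the latest traded price for `symbol`
    """
    # A real implementation would call a market-data API here.
    ...
```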
67 |
+
|
68 |
+

|
69 |
+
|
70 |
+
Additionally, you can enter the manual mode through `Manual mode` on the page. In this mode, you can directly specify the tool list through YAML, but you need to manually feed back the tool's output to the model.
|
71 |
+
|
72 |
+
### Code Interpreter Mode
|
73 |
+
|
74 |
+
Due to having a code execution environment, the model in this mode can perform more complex tasks, such as drawing charts, performing symbolic operations, etc. The model will automatically execute multiple code blocks in succession based on its understanding of the task completion status until the task is completed. Therefore, in this mode, you only need to specify the task you want the model to perform.
|
75 |
+
|
76 |
+
For example, we can ask ChatGLM3 to draw a heart:
|
77 |
+
|
78 |
+

|
79 |
+
|
80 |
+
### Additional Tips
|
81 |
+
|
82 |
+
- While the model is generating text, it can be interrupted by the `Stop` button at the top right corner of the page.
|
83 |
+
- Refreshing the page will clear the dialogue history.
|
84 |
+
|
85 |
+
# Enjoy!
|
composite_demo/assets/demo.png
ADDED
composite_demo/assets/emojis.png
ADDED
composite_demo/assets/heart.png
ADDED
composite_demo/assets/tool.png
ADDED
composite_demo/client.py
ADDED
@@ -0,0 +1,197 @@
from __future__ import annotations

from collections.abc import Iterable
import os
from typing import Any, Protocol

from huggingface_hub.inference._text_generation import TextGenerationStreamResponse, Token
import streamlit as st
import torch
from transformers import AutoModel, AutoTokenizer, AutoConfig

from conversation import Conversation

TOOL_PROMPT = 'Answer the following questions as best as you can. You have access to the following tools:'

MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/chatglm3-6b')
PT_PATH = os.environ.get('PT_PATH', None)
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# For Apple Silicon Macs (e.g. M1): use a PyTorch build compiled with Metal (MPS) support.
# DEVICE = 'mps'

# For AMD GPUs such as the MI100 (not officially supported yet): use a PyTorch build compiled with ROCm.
# DEVICE = 'cuda'

# For Intel GPUs such as the A770 (not officially supported yet): use a PyTorch build compiled with oneDNN
# and install intel-extension-for-pytorch.
# import intel_extension_for_pytorch as ipex
# DEVICE = 'xpu'

# For Moore Threads GPUs such as the MTT S80 (not officially supported yet): use a PyTorch build compiled with MUSA.
# DEVICE = 'musa'


@st.cache_resource
def get_client() -> Client:
    client = HFClient(MODEL_PATH, TOKENIZER_PATH, PT_PATH, DEVICE)
    return client


class Client(Protocol):
    def generate_stream(self,
                        system: str | None,
                        tools: list[dict] | None,
                        history: list[Conversation],
                        **parameters: Any
                        ) -> Iterable[TextGenerationStreamResponse]:
        ...


def stream_chat(self, tokenizer, query: str, history: list[tuple[str, str]] = None, role: str = "user",
                past_key_values=None, max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8,
                repetition_penalty=1.0, length_penalty=1.0, num_beams=1,
                logits_processor=None, return_past_key_values=False, **kwargs):
    from transformers.generation.logits_process import LogitsProcessor
    from transformers.generation.utils import LogitsProcessorList

    class InvalidScoreLogitsProcessor(LogitsProcessor):
        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
            if torch.isnan(scores).any() or torch.isinf(scores).any():
                scores.zero_()
                scores[..., 5] = 5e4
            return scores

    if history is None:
        history = []
    if logits_processor is None:
        logits_processor = LogitsProcessorList()
    logits_processor.append(InvalidScoreLogitsProcessor())
    eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"),
                    tokenizer.get_command("<|observation|>")]
    gen_kwargs = {"max_length": max_length,
                  "do_sample": do_sample,
                  "top_p": top_p,
                  "temperature": temperature,
                  "logits_processor": logits_processor,
                  "repetition_penalty": repetition_penalty,
                  "length_penalty": length_penalty,
                  "num_beams": num_beams,
                  **kwargs
                  }

    print(gen_kwargs)
    if past_key_values is None:
        inputs = tokenizer.build_chat_input(query, history=history, role=role)
    else:
        inputs = tokenizer.build_chat_input(query, role=role)
    inputs = inputs.to(self.device)
    if past_key_values is not None:
        past_length = past_key_values[0][0].shape[0]
        if self.transformer.pre_seq_len is not None:
            past_length -= self.transformer.pre_seq_len
        inputs.position_ids += past_length
        attention_mask = inputs.attention_mask
        attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1)
        inputs['attention_mask'] = attention_mask
    history.append({"role": role, "content": query})
    print("input_shape>", inputs['input_ids'].shape)

    input_sequence_length = inputs['input_ids'].shape[1]

    if max_length < input_sequence_length <= self.config.seq_length:
        yield "Current input sequence length {} exceeds sequence length set in generation parameters {}. The maximum model sequence length is {}. You may adjust the generation parameter to enable longer chat history.".format(
            input_sequence_length, max_length, self.config.seq_length
        ), history
        return

    if input_sequence_length > self.config.seq_length:
        yield "Current input sequence length {} exceeds maximum model sequence length {}. Unable to generate tokens.".format(
            input_sequence_length, self.config.seq_length
        ), history
        return

    for outputs in self.stream_generate(**inputs, past_key_values=past_key_values,
                                        eos_token_id=eos_token_id, return_past_key_values=return_past_key_values,
                                        **gen_kwargs):
        if return_past_key_values:
            outputs, past_key_values = outputs
        outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):]
        response = tokenizer.decode(outputs)
        if response and response[-1] != "�":
            new_history = history
            if return_past_key_values:
                yield response, new_history, past_key_values
            else:
                yield response, new_history


class HFClient(Client):
    def __init__(self, model_path: str, tokenizer_path: str, pt_checkpoint: str | None = None, DEVICE = 'cpu'):
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)

        if pt_checkpoint is not None:
            config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, pre_seq_len=128)
            self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True, config=config)
            prefix_state_dict = torch.load(os.path.join(pt_checkpoint, "pytorch_model.bin"))
            new_prefix_state_dict = {}
            for k, v in prefix_state_dict.items():
                if k.startswith("transformer.prefix_encoder."):
                    new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
            print("Loaded from pt checkpoints", new_prefix_state_dict.keys())
            self.model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
        else:
            self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True)

        self.model = self.model.to(DEVICE).eval() if 'cuda' in DEVICE else self.model.float().to(DEVICE).eval()


    def generate_stream(self,
                        system: str | None,
                        tools: list[dict] | None,
                        history: list[Conversation],
                        **parameters: Any
                        ) -> Iterable[TextGenerationStreamResponse]:
        chat_history = [{
            'role': 'system',
            'content': system if not tools else TOOL_PROMPT,
        }]

        if tools:
            chat_history[0]['tools'] = tools

        for conversation in history[:-1]:
            chat_history.append({
                'role': str(conversation.role).removeprefix('<|').removesuffix('|>'),
                'content': conversation.content,
            })

        query = history[-1].content
        role = str(history[-1].role).removeprefix('<|').removesuffix('|>')

        text = ''

        for new_text, _ in stream_chat(self.model,
                                       self.tokenizer,
                                       query,
                                       chat_history,
                                       role,
                                       **parameters,
                                       ):
            word = new_text.removeprefix(text)
            word_stripped = word.strip()
            text = new_text
            yield TextGenerationStreamResponse(
                generated_text=text,
                token=Token(
                    id=0,
                    logprob=0,
                    text=word,
                    special=word_stripped.startswith('<|') and word_stripped.endswith('|>'),
                )
            )

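Editor's note: the sketch below is not part of the commit; it shows how a caller is expected to consume `get_client()` and `HFClient.generate_stream`. The system prompt and history contents are invented for illustration, and the sampling parameters simply mirror the defaults used elsewhere in the demo.

# Editor's sketch, assuming client.py and conversation.py are importable.
from client import get_client
from conversation import Conversation, Role

client = get_client()  # cached by st.cache_resource across Streamlit reruns
history = [Conversation(Role.USER, "What is the capital of France?")]

text = ""
for response in client.generate_stream(
        system="You are a helpful assistant.",  # invented example prompt
        tools=None,
        history=history,
        do_sample=True,
        max_length=8192,
        temperature=0.8,
        top_p=0.8,
):
    text += response.token.text  # each token carries the newly decoded fragment
print(text)
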
composite_demo/conversation.py
ADDED
@@ -0,0 +1,119 @@
from dataclasses import dataclass
from enum import auto, Enum
import json

from PIL.Image import Image
import streamlit as st
from streamlit.delta_generator import DeltaGenerator

TOOL_PROMPT = 'Answer the following questions as best as you can. You have access to the following tools:\n'

class Role(Enum):
    SYSTEM = auto()
    USER = auto()
    ASSISTANT = auto()
    TOOL = auto()
    INTERPRETER = auto()
    OBSERVATION = auto()

    def __str__(self):
        match self:
            case Role.SYSTEM:
                return "<|system|>"
            case Role.USER:
                return "<|user|>"
            case Role.ASSISTANT | Role.TOOL | Role.INTERPRETER:
                return "<|assistant|>"
            case Role.OBSERVATION:
                return "<|observation|>"

    # Get the message block for the given role
    def get_message(self):
        # Compare by value here, because the enum object in the session state
        # is not the same as the enum cases here, due to streamlit's rerunning
        # behavior.
        match self.value:
            case Role.SYSTEM.value:
                return
            case Role.USER.value:
                return st.chat_message(name="user", avatar="user")
            case Role.ASSISTANT.value:
                return st.chat_message(name="assistant", avatar="assistant")
            case Role.TOOL.value:
                return st.chat_message(name="tool", avatar="assistant")
            case Role.INTERPRETER.value:
                return st.chat_message(name="interpreter", avatar="assistant")
            case Role.OBSERVATION.value:
                return st.chat_message(name="observation", avatar="user")
            case _:
                st.error(f'Unexpected role: {self}')

@dataclass
class Conversation:
    role: Role
    content: str
    tool: str | None = None
    image: Image | None = None

    def __str__(self) -> str:
        print(self.role, self.content, self.tool)
        match self.role:
            case Role.SYSTEM | Role.USER | Role.ASSISTANT | Role.OBSERVATION:
                return f'{self.role}\n{self.content}'
            case Role.TOOL:
                return f'{self.role}{self.tool}\n{self.content}'
            case Role.INTERPRETER:
                return f'{self.role}interpreter\n{self.content}'

    # Human readable format
    def get_text(self) -> str:
        text = postprocess_text(self.content)
        match self.role.value:
            case Role.TOOL.value:
                text = f'Calling tool `{self.tool}`:\n{text}'
            case Role.INTERPRETER.value:
                text = f'{text}'
            case Role.OBSERVATION.value:
                text = f'Observation:\n```\n{text}\n```'
        return text

    # Display as a markdown block
    def show(self, placeholder: DeltaGenerator | None=None) -> str:
        if placeholder:
            message = placeholder
        else:
            message = self.role.get_message()
        if self.image:
            message.image(self.image)
        else:
            text = self.get_text()
            message.markdown(text)

def preprocess_text(
    system: str | None,
    tools: list[dict] | None,
    history: list[Conversation],
) -> str:
    if tools:
        tools = json.dumps(tools, indent=4, ensure_ascii=False)

    prompt = f"{Role.SYSTEM}\n"
    prompt += system if not tools else TOOL_PROMPT
    if tools:
        tools = json.loads(tools)
        prompt += json.dumps(tools, ensure_ascii=False)
    for conversation in history:
        prompt += f'{conversation}'
    prompt += f'{Role.ASSISTANT}\n'
    return prompt

def postprocess_text(text: str) -> str:
    text = text.replace("\(", "$")
    text = text.replace("\)", "$")
    text = text.replace("\[", "$$")
    text = text.replace("\]", "$$")
    text = text.replace("<|assistant|>", "")
    text = text.replace("<|observation|>", "")
    text = text.replace("<|system|>", "")
    text = text.replace("<|user|>", "")
    return text.strip()

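Editor's note: for reference, a minimal sketch (not part of the commit) of the prompt string `preprocess_text` assembles for a plain chat turn; the system prompt and user message are invented examples.

# Editor's sketch of the ChatGLM3 prompt layout produced by conversation.py.
from conversation import Conversation, Role, preprocess_text

history = [Conversation(Role.USER, "Hello!")]
prompt = preprocess_text("You are a helpful assistant.", tools=None, history=history)
print(prompt)
# <|system|>
# You are a helpful assistant.<|user|>
# Hello!<|assistant|>
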
composite_demo/demo_chat.py
ADDED
@@ -0,0 +1,80 @@
import streamlit as st
from streamlit.delta_generator import DeltaGenerator

from client import get_client
from conversation import postprocess_text, preprocess_text, Conversation, Role

MAX_LENGTH = 8192

client = get_client()


# Append a conversation to the history, while showing it in a new markdown block
def append_conversation(
    conversation: Conversation,
    history: list[Conversation],
    placeholder: DeltaGenerator | None = None,
) -> None:
    history.append(conversation)
    conversation.show(placeholder)


def main(top_p: float, temperature: float, system_prompt: str, prompt_text: str, repetition_penalty: float):
    placeholder = st.empty()
    with placeholder.container():
        if 'chat_history' not in st.session_state:
            st.session_state.chat_history = []

        history: list[Conversation] = st.session_state.chat_history

        for conversation in history:
            conversation.show()

        if prompt_text:
            prompt_text = prompt_text.strip()
            append_conversation(Conversation(Role.USER, prompt_text), history)

            input_text = preprocess_text(
                system_prompt,
                tools=None,
                history=history,
            )
            print("=== Input:")
            print(input_text)
            print("=== History:")
            print(history)

            placeholder = st.empty()
            message_placeholder = placeholder.chat_message(name="assistant", avatar="assistant")
            markdown_placeholder = message_placeholder.empty()

            output_text = ''
            for response in client.generate_stream(
                system_prompt,
                tools=None,
                history=history,
                do_sample=True,
                max_length=MAX_LENGTH,
                temperature=temperature,
                top_p=top_p,
                stop_sequences=[str(Role.USER)],
                repetition_penalty=repetition_penalty,
            ):
                token = response.token
                if response.token.special:
                    print("=== Output:")
                    print(output_text)

                    match token.text.strip():
                        case '<|user|>':
                            break
                        case _:
                            st.error(f'Unexpected special token: {token.text.strip()}')
                            break
                output_text += response.token.text
                markdown_placeholder.markdown(postprocess_text(output_text + '▌'))

            append_conversation(Conversation(
                Role.ASSISTANT,
                postprocess_text(output_text),
            ), history, markdown_placeholder)

composite_demo/demo_ci.py
ADDED
@@ -0,0 +1,329 @@
import base64
from io import BytesIO
import os
from pprint import pprint
import queue
import re
from subprocess import PIPE

import jupyter_client
from PIL import Image
import streamlit as st
from streamlit.delta_generator import DeltaGenerator

from client import get_client
from conversation import postprocess_text, preprocess_text, Conversation, Role

IPYKERNEL = os.environ.get('IPYKERNEL', 'chatglm3-demo')

# The (Chinese) system prompt tells the model: it is an AI assistant named ChatGLM, connected to a
# computer without internet access; it may run Python code to solve tasks and should improve the code
# when execution fails; user-uploaded files are stored under /mnt/data/ by default.
SYSTEM_PROMPT = '你是一位智能AI助手,你叫ChatGLM,你连接着一台电脑,但请注意不能联网。在使用Python解决任务时,你可以运行代码并得到结果,如果运行结果有错误,你需要尽可能对代码进行改进。你可以处理用户上传到电脑上的文件,文件默认存储路径是/mnt/data/。'

MAX_LENGTH = 8192
TRUNCATE_LENGTH = 1024

client = get_client()

class CodeKernel(object):
    def __init__(self,
                 kernel_name='kernel',
                 kernel_id=None,
                 kernel_config_path="",
                 python_path=None,
                 ipython_path=None,
                 init_file_path="./startup.py",
                 verbose=1):

        self.kernel_name = kernel_name
        self.kernel_id = kernel_id
        self.kernel_config_path = kernel_config_path
        self.python_path = python_path
        self.ipython_path = ipython_path
        self.init_file_path = init_file_path
        self.verbose = verbose

        if python_path is None and ipython_path is None:
            env = None
        else:
            env = {"PATH": self.python_path + ":$PATH", "PYTHONPATH": self.python_path}

        # Initialize the backend kernel
        self.kernel_manager = jupyter_client.KernelManager(kernel_name=IPYKERNEL,
                                                           connection_file=self.kernel_config_path,
                                                           exec_files=[self.init_file_path],
                                                           env=env)
        if self.kernel_config_path:
            self.kernel_manager.load_connection_file()
            self.kernel_manager.start_kernel(stdout=PIPE, stderr=PIPE)
            print("Backend kernel started with the configuration: {}".format(
                self.kernel_config_path))
        else:
            self.kernel_manager.start_kernel(stdout=PIPE, stderr=PIPE)
            print("Backend kernel started with the configuration: {}".format(
                self.kernel_manager.connection_file))

        if verbose:
            pprint(self.kernel_manager.get_connection_info())

        # Initialize the code kernel
        self.kernel = self.kernel_manager.blocking_client()
        # self.kernel.load_connection_file()
        self.kernel.start_channels()
        print("Code kernel started.")

    def execute(self, code):
        self.kernel.execute(code)
        try:
            shell_msg = self.kernel.get_shell_msg(timeout=30)
            io_msg_content = self.kernel.get_iopub_msg(timeout=30)['content']
            while True:
                msg_out = io_msg_content
                ### Poll the message
                try:
                    io_msg_content = self.kernel.get_iopub_msg(timeout=30)['content']
                    if 'execution_state' in io_msg_content and io_msg_content['execution_state'] == 'idle':
                        break
                except queue.Empty:
                    break

            return shell_msg, msg_out
        except Exception as e:
            print(e)
            return None

    def execute_interactive(self, code, verbose=False):
        shell_msg = self.kernel.execute_interactive(code)
        if shell_msg is queue.Empty:
            if verbose:
                print("Timeout waiting for shell message.")
        self.check_msg(shell_msg, verbose=verbose)

        return shell_msg

    def inspect(self, code, verbose=False):
        msg_id = self.kernel.inspect(code)
        shell_msg = self.kernel.get_shell_msg(timeout=30)
        if shell_msg is queue.Empty:
            if verbose:
                print("Timeout waiting for shell message.")
        self.check_msg(shell_msg, verbose=verbose)

        return shell_msg

    def get_error_msg(self, msg, verbose=False) -> str | None:
        if msg['content']['status'] == 'error':
            try:
                error_msg = msg['content']['traceback']
            except:
                try:
                    error_msg = msg['content']['traceback'][-1].strip()
                except:
                    error_msg = "Traceback Error"
            if verbose:
                print("Error: ", error_msg)
            return error_msg
        return None

    def check_msg(self, msg, verbose=False):
        status = msg['content']['status']
        if status == 'ok':
            if verbose:
                print("Execution succeeded.")
        elif status == 'error':
            for line in msg['content']['traceback']:
                if verbose:
                    print(line)

    def shutdown(self):
        # Shutdown the backend kernel
        self.kernel_manager.shutdown_kernel()
        print("Backend kernel shutdown.")
        # Shutdown the code kernel
        self.kernel.shutdown()
        print("Code kernel shutdown.")

    def restart(self):
        # Restart the backend kernel
        self.kernel_manager.restart_kernel()
        # print("Backend kernel restarted.")

    def interrupt(self):
        # Interrupt the backend kernel
        self.kernel_manager.interrupt_kernel()
        # print("Backend kernel interrupted.")

    def is_alive(self):
        return self.kernel.is_alive()

def b64_2_img(data):
    buff = BytesIO(base64.b64decode(data))
    return Image.open(buff)

def clean_ansi_codes(input_string):
    ansi_escape = re.compile(r'(\x9B|\x1B\[|\u001b\[)[0-?]*[ -/]*[@-~]')
    return ansi_escape.sub('', input_string)

def execute(code, kernel: CodeKernel) -> tuple[str, str | Image.Image]:
    res = ""
    res_type = None
    code = code.replace("<|observation|>", "")
    code = code.replace("<|assistant|>interpreter", "")
    code = code.replace("<|assistant|>", "")
    code = code.replace("<|user|>", "")
    code = code.replace("<|system|>", "")
    msg, output = kernel.execute(code)

    if msg['metadata']['status'] == "timeout":
        return res_type, 'Timed out'
    elif msg['metadata']['status'] == 'error':
        return res_type, clean_ansi_codes('\n'.join(kernel.get_error_msg(msg, verbose=True)))

    if 'text' in output:
        res_type = "text"
        res = output['text']
    elif 'data' in output:
        for key in output['data']:
            if 'text/plain' in key:
                res_type = "text"
                res = output['data'][key]
            elif 'image/png' in key:
                res_type = "image"
                res = output['data'][key]
                break

    if res_type == "image":
        return res_type, b64_2_img(res)
    elif res_type == "text" or res_type == "traceback":
        res = res

    return res_type, res

@st.cache_resource
def get_kernel():
    kernel = CodeKernel()
    return kernel

def extract_code(text: str) -> str:
    pattern = r'```([^\n]*)\n(.*?)```'
    matches = re.findall(pattern, text, re.DOTALL)
    return matches[-1][1]

# Append a conversation to the history, while showing it in a new markdown block
def append_conversation(
    conversation: Conversation,
    history: list[Conversation],
    placeholder: DeltaGenerator | None=None,
) -> None:
    history.append(conversation)
    conversation.show(placeholder)

def main(top_p: float, temperature: float, prompt_text: str, repetition_penalty: float):
    if 'ci_history' not in st.session_state:
        st.session_state.ci_history = []

    history: list[Conversation] = st.session_state.ci_history

    for conversation in history:
        conversation.show()

    if prompt_text:
        prompt_text = prompt_text.strip()
        role = Role.USER
        append_conversation(Conversation(role, prompt_text), history)

        input_text = preprocess_text(
            SYSTEM_PROMPT,
            None,
            history,
        )
        print("=== Input:")
        print(input_text)
        print("=== History:")
        print(history)

        placeholder = st.container()
        message_placeholder = placeholder.chat_message(name="assistant", avatar="assistant")
        markdown_placeholder = message_placeholder.empty()

        for _ in range(5):
            output_text = ''
            for response in client.generate_stream(
                system=SYSTEM_PROMPT,
                tools=None,
                history=history,
                do_sample=True,
                max_length=MAX_LENGTH,
                temperature=temperature,
                top_p=top_p,
                stop_sequences=[str(r) for r in (Role.USER, Role.OBSERVATION)],
                repetition_penalty=repetition_penalty,
            ):
                token = response.token
                if response.token.special:
                    print("=== Output:")
                    print(output_text)

                    match token.text.strip():
                        case '<|user|>':
                            append_conversation(Conversation(
                                Role.ASSISTANT,
                                postprocess_text(output_text),
                            ), history, markdown_placeholder)
                            return
                        # Initiate tool call
                        case '<|assistant|>':
                            append_conversation(Conversation(
                                Role.ASSISTANT,
                                postprocess_text(output_text),
                            ), history, markdown_placeholder)
                            message_placeholder = placeholder.chat_message(name="interpreter", avatar="assistant")
                            markdown_placeholder = message_placeholder.empty()
                            output_text = ''
                            continue
                        case '<|observation|>':
                            code = extract_code(output_text)
                            print("Code:", code)

                            display_text = output_text.split('interpreter')[-1].strip()
                            append_conversation(Conversation(
                                Role.INTERPRETER,
                                postprocess_text(display_text),
                            ), history, markdown_placeholder)
                            message_placeholder = placeholder.chat_message(name="observation", avatar="user")
                            markdown_placeholder = message_placeholder.empty()
                            output_text = ''

                            with markdown_placeholder:
                                with st.spinner('Executing code...'):
                                    try:
                                        res_type, res = execute(code, get_kernel())
                                    except Exception as e:
                                        st.error(f'Error when executing code: {e}')
                                        return
                            print("Received:", res_type, res)

                            if res_type == 'text' and len(res) > TRUNCATE_LENGTH:
                                res = res[:TRUNCATE_LENGTH] + ' [TRUNCATED]'

                            append_conversation(Conversation(
                                Role.OBSERVATION,
                                '[Image]' if res_type == 'image' else postprocess_text(res),
                                tool=None,
                                image=res if res_type == 'image' else None,
                            ), history, markdown_placeholder)
                            message_placeholder = placeholder.chat_message(name="assistant", avatar="assistant")
                            markdown_placeholder = message_placeholder.empty()
                            output_text = ''
                            break
                        case _:
                            st.error(f'Unexpected special token: {token.text.strip()}')
                            break
                output_text += response.token.text
                display_text = output_text.split('interpreter')[-1].strip()
                markdown_placeholder.markdown(postprocess_text(display_text + '▌'))
            else:
                append_conversation(Conversation(
                    Role.ASSISTANT,
                    postprocess_text(output_text),
                ), history, markdown_placeholder)
                return

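Editor's note: a small self-contained sketch (not part of the commit) of what `extract_code` feeds to the Jupyter kernel; it keeps only the body of the last fenced block in the model output. The reply text is an invented example.

# Editor's sketch: extracting the last fenced code block, as demo_ci.py does.
import re

def extract_code(text: str) -> str:
    # Same regex as in demo_ci.py: capture (language, body) pairs of ```...``` blocks.
    pattern = r'```([^\n]*)\n(.*?)```'
    matches = re.findall(pattern, text, re.DOTALL)
    return matches[-1][1]

reply = "Let me compute that.\n```python\nprint(1 + 1)\n```"
print(extract_code(reply))  # -> "print(1 + 1)\n"
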
composite_demo/demo_tool.py
ADDED
@@ -0,0 +1,192 @@
import re
import yaml
from yaml import YAMLError

import streamlit as st
from streamlit.delta_generator import DeltaGenerator

from client import get_client
from conversation import postprocess_text, preprocess_text, Conversation, Role
from tool_registry import dispatch_tool, get_tools

MAX_LENGTH = 8192
TRUNCATE_LENGTH = 1024

EXAMPLE_TOOL = {
    "name": "get_current_weather",
    "description": "Get the current weather in a given location",
    "parameters": {
        "type": "object",
        "properties": {
            "location": {
                "type": "string",
                "description": "The city and state, e.g. San Francisco, CA",
            },
            "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
        },
        "required": ["location"],
    }
}

client = get_client()

def tool_call(*args, **kwargs) -> dict:
    print("=== Tool call:")
    print(args)
    print(kwargs)
    st.session_state.calling_tool = True
    return kwargs

def yaml_to_dict(tools: str) -> list[dict] | None:
    try:
        return yaml.safe_load(tools)
    except YAMLError:
        return None

def extract_code(text: str) -> str:
    pattern = r'```([^\n]*)\n(.*?)```'
    matches = re.findall(pattern, text, re.DOTALL)
    return matches[-1][1]

# Append a conversation to the history, while showing it in a new markdown block
def append_conversation(
    conversation: Conversation,
    history: list[Conversation],
    placeholder: DeltaGenerator | None=None,
) -> None:
    history.append(conversation)
    conversation.show(placeholder)

def main(top_p: float, temperature: float, prompt_text: str, repetition_penalty: float):
    manual_mode = st.toggle('Manual mode',
                            help='Define your tools in YAML format. You need to supply tool call results manually.'
                            )

    if manual_mode:
        with st.expander('Tools'):
            tools = st.text_area(
                'Define your tools in YAML format here:',
                yaml.safe_dump([EXAMPLE_TOOL], sort_keys=False),
                height=400,
            )
            tools = yaml_to_dict(tools)

            if not tools:
                st.error('YAML format error in tools definition')
    else:
        tools = get_tools()

    if 'tool_history' not in st.session_state:
        st.session_state.tool_history = []
    if 'calling_tool' not in st.session_state:
        st.session_state.calling_tool = False

    history: list[Conversation] = st.session_state.tool_history

    for conversation in history:
        conversation.show()

    if prompt_text:
        prompt_text = prompt_text.strip()
        role = st.session_state.calling_tool and Role.OBSERVATION or Role.USER
        append_conversation(Conversation(role, prompt_text), history)
        st.session_state.calling_tool = False

        input_text = preprocess_text(
            None,
            tools,
            history,
        )
        print("=== Input:")
        print(input_text)
        print("=== History:")
        print(history)

        placeholder = st.container()
        message_placeholder = placeholder.chat_message(name="assistant", avatar="assistant")
        markdown_placeholder = message_placeholder.empty()

        for _ in range(5):
            output_text = ''
            for response in client.generate_stream(
                system=None,
                tools=tools,
                history=history,
                do_sample=True,
                max_length=MAX_LENGTH,
                temperature=temperature,
                top_p=top_p,
                stop_sequences=[str(r) for r in (Role.USER, Role.OBSERVATION)],
                repetition_penalty=repetition_penalty,
            ):
                token = response.token
                if response.token.special:
                    print("=== Output:")
                    print(output_text)

                    match token.text.strip():
                        case '<|user|>':
                            append_conversation(Conversation(
                                Role.ASSISTANT,
                                postprocess_text(output_text),
                            ), history, markdown_placeholder)
                            return
                        # Initiate tool call
                        case '<|assistant|>':
                            append_conversation(Conversation(
                                Role.ASSISTANT,
                                postprocess_text(output_text),
                            ), history, markdown_placeholder)
                            output_text = ''
                            message_placeholder = placeholder.chat_message(name="tool", avatar="assistant")
                            markdown_placeholder = message_placeholder.empty()
                            continue
                        case '<|observation|>':
                            tool, *call_args_text = output_text.strip().split('\n')
                            call_args_text = '\n'.join(call_args_text)

                            append_conversation(Conversation(
                                Role.TOOL,
                                postprocess_text(output_text),
                                tool,
                            ), history, markdown_placeholder)
                            message_placeholder = placeholder.chat_message(name="observation", avatar="user")
                            markdown_placeholder = message_placeholder.empty()

                            try:
                                code = extract_code(call_args_text)
                                args = eval(code, {'tool_call': tool_call}, {})
                            except:
                                st.error('Failed to parse tool call')
                                return

                            output_text = ''

                            if manual_mode:
                                st.info('Please provide tool call results below:')
                                return
                            else:
                                with markdown_placeholder:
                                    with st.spinner(f'Calling tool {tool}...'):
                                        observation = dispatch_tool(tool, args)

                            if len(observation) > TRUNCATE_LENGTH:
                                observation = observation[:TRUNCATE_LENGTH] + ' [TRUNCATED]'
                            append_conversation(Conversation(
                                Role.OBSERVATION, observation
                            ), history, markdown_placeholder)
                            message_placeholder = placeholder.chat_message(name="assistant", avatar="assistant")
                            markdown_placeholder = message_placeholder.empty()
                            st.session_state.calling_tool = False
                            break
                        case _:
                            st.error(f'Unexpected special token: {token.text.strip()}')
                            return
                output_text += response.token.text
                markdown_placeholder.markdown(postprocess_text(output_text + '▌'))
            else:
                append_conversation(Conversation(
                    Role.ASSISTANT,
                    postprocess_text(output_text),
                ), history, markdown_placeholder)
                return

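Editor's note: the sketch below (not part of the commit) traces how demo_tool.py turns a model tool call into a Python dict: the first line of the output names the tool, the fenced block contains a `tool_call(...)` expression, and evaluating it with `tool_call` bound to a kwargs-collector yields the arguments. The model output shown is an invented example of the expected shape.

# Editor's sketch of the tool-call parsing path in demo_tool.py.
import re

def tool_call(*args, **kwargs) -> dict:
    # Mirrors demo_tool.py: the call simply hands back its keyword arguments.
    return kwargs

def extract_code(text: str) -> str:
    return re.findall(r'```([^\n]*)\n(.*?)```', text, re.DOTALL)[-1][1]

output_text = "get_current_weather\n```python\ntool_call(location='Beijing', unit='celsius')\n```"
tool, *call_args_text = output_text.strip().split('\n')
args = eval(extract_code('\n'.join(call_args_text)), {'tool_call': tool_call}, {})
print(tool, args)  # get_current_weather {'location': 'Beijing', 'unit': 'celsius'}
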
composite_demo/main.py
ADDED
@@ -0,0 +1,63 @@
from enum import Enum
import streamlit as st

st.set_page_config(
    page_title="ChatGLM3 Demo",
    page_icon=":robot:",
    layout='centered',
    initial_sidebar_state='expanded',
)

import demo_chat, demo_ci, demo_tool

DEFAULT_SYSTEM_PROMPT = '''
You are ChatGLM3, a large language model trained by Zhipu.AI. Follow the user's instructions carefully. Respond using markdown.
'''.strip()

# Set the title of the demo
st.title("ChatGLM3 Demo")

# Add your custom text here, with smaller font size
st.markdown("<sub>智谱AI 公开在线技术文档: https://lslfd0slxc.feishu.cn/wiki/WvQbwIJ9tiPAxGk8ywDck6yfnof </sub> \n\n <sub> 更多 ChatGLM3-6B 的使用方法请参考文档。</sub>", unsafe_allow_html=True)

class Mode(str, Enum):
    CHAT, TOOL, CI = '💬 Chat', '🛠️ Tool', '🧑💻 Code Interpreter'


with st.sidebar:
    top_p = st.slider(
        'top_p', 0.0, 1.0, 0.8, step=0.01
    )
    temperature = st.slider(
        'temperature', 0.0, 1.5, 0.95, step=0.01
    )
    repetition_penalty = st.slider(
        'repetition_penalty', 0.0, 2.0, 1.2, step=0.01
    )
    system_prompt = st.text_area(
        label="System Prompt (Only for chat mode)",
        height=300,
        value=DEFAULT_SYSTEM_PROMPT,
    )

prompt_text = st.chat_input(
    'Chat with ChatGLM3!',
    key='chat_input',
)

tab = st.radio(
    'Mode',
    [mode.value for mode in Mode],
    horizontal=True,
    label_visibility='hidden',
)

match tab:
    case Mode.CHAT:
        demo_chat.main(top_p, temperature, system_prompt, prompt_text, repetition_penalty)
    case Mode.TOOL:
        demo_tool.main(top_p, temperature, prompt_text, repetition_penalty)
    case Mode.CI:
        demo_ci.main(top_p, temperature, prompt_text, repetition_penalty)
    case _:
        st.error(f'Unexpected tab: {tab}')

composite_demo/requirements.txt
ADDED
@@ -0,0 +1,12 @@
huggingface_hub
ipykernel
ipython
jupyter_client
pillow
sentencepiece
streamlit
tokenizers
torch
transformers
pyyaml
requests

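Editor's note: a hedged sketch (not part of the commit) of the environment variables the demo reads before launch. The local model path below is an assumption about where the weights live in this repository, and the Streamlit command in the comment is simply the usual way to start a Streamlit app.

# Editor's sketch: environment knobs read by client.py and demo_ci.py.
import os

os.environ.setdefault("MODEL_PATH", "./chatglm3-6b")   # assumed local path to the weights
os.environ.setdefault("IPYKERNEL", "chatglm3-demo")    # Jupyter kernel name for the code interpreter
# Launch afterwards with, e.g.: streamlit run composite_demo/main.py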