diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..73c8609270fc105dd7746e6515a311d36130745a 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/apple.jpeg filter=lfs diff=lfs merge=lfs -text
+assets/demo_vl.gif filter=lfs diff=lfs merge=lfs -text
+assets/mm_tutorial/Beijing.jpeg filter=lfs diff=lfs merge=lfs -text
+assets/mm_tutorial/Chongqing.jpeg filter=lfs diff=lfs merge=lfs -text
+assets/mm_tutorial/Rebecca_(1939_poster).jpeg filter=lfs diff=lfs merge=lfs -text
+assets/mm_tutorial/Shanghai_Output.jpg filter=lfs diff=lfs merge=lfs -text
+assets/touchstone_datasets.jpg filter=lfs diff=lfs merge=lfs -text
+assets/touchstone_logo.png filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..99ce8124f50e14afc55fe8ad119e5c5792e5c004
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.yaml
@@ -0,0 +1,88 @@
+name: 🐞 Bug
+description: 提交错误报告 | File a bug/issue
+title: "[BUG]
+
+
+
+
+
+ Qwen-VL 🤖 | 🤗  | Qwen-VL-Chat 🤖 | 🤗  |  Demo  |  Report   |   Discord + +
++ 中文  |   English +
+
+
+
+
+
+We release two models of the Qwen-VL series:
+- Qwen-VL: The pre-trained LVLM. It uses Qwen-7B to initialize the LLM and [Openclip ViT-bigG](https://github.com/mlfoundations/open_clip) to initialize the visual encoder, and connects the two with a randomly initialized cross-attention layer (a conceptual sketch of this wiring follows the list).
+- Qwen-VL-Chat: A multimodal LLM-based AI assistant trained with alignment techniques. Qwen-VL-Chat supports more flexible interaction, such as multi-image inputs, multi-round question answering, and creative tasks.
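+
+A highly simplified, conceptual sketch of this wiring (not the actual Qwen-VL implementation; module names, sizes, and the use of learned queries are illustrative placeholders):
+
+```python
+import torch
+import torch.nn as nn
+
+class VisionLanguageAdapter(nn.Module):
+    """Randomly initialized cross-attention bridge: learned queries attend over ViT patch features."""
+    def __init__(self, vit_dim=1664, llm_dim=4096, num_queries=256, num_heads=16):
+        super().__init__()
+        self.queries = nn.Parameter(torch.randn(num_queries, llm_dim))
+        self.cross_attn = nn.MultiheadAttention(llm_dim, num_heads, kdim=vit_dim, vdim=vit_dim, batch_first=True)
+
+    def forward(self, patch_feats):                  # (B, num_patches, vit_dim) from the ViT encoder
+        q = self.queries.unsqueeze(0).expand(patch_feats.size(0), -1, -1)
+        visual_tokens, _ = self.cross_attn(q, patch_feats, patch_feats)
+        return visual_tokens                         # (B, num_queries, llm_dim), fed to the LLM as visual tokens
+```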
+
+
+## Evaluation
+
+We evaluated the model's abilities from two perspectives:
+1. **Standard Benchmarks**: We evaluate the model's basic task capabilities on four major categories of multimodal tasks:
+ - Zero-shot Captioning: Evaluate the model's zero-shot image captioning ability on unseen datasets;
+ - General VQA: Evaluate the model's general question-answering ability on images, such as yes/no judgments, colors, counts, object categories, etc.;
+ - Text-based VQA: Evaluate the model's ability to recognize and answer questions about text in images, such as document QA, chart QA, etc.;
+ - Referring Expression Comprehension: Evaluate the ability to localize a target object in an image described by a referring expression.
+
+2. **TouchStone**: To evaluate the overall text-image dialogue capability and alignment with human preferences, we construct a benchmark called TouchStone, which uses GPT-4-based scoring to evaluate LVLMs.
+ - The TouchStone benchmark covers a total of 300+ images, 800+ questions, and 27 categories, such as attribute-based Q&A, celebrity recognition, writing poetry, summarizing multiple images, product comparison, math problem solving, etc.;
+ - To work around GPT-4's current inability to take images as direct input, TouchStone provides fine-grained, human-labeled image annotations. These annotations, together with the questions and the model's outputs, are then given to GPT-4 for scoring.
+ - The benchmark includes both English and Chinese versions.
+
+The results of the evaluation are as follows:
+
+Qwen-VL outperforms current SOTA generalist models on multiple VL tasks and covers a broader range of capabilities.
+
+
+
+
+### Zero-shot Captioning & General VQA
+
+| Model type | Model | NoCaps | Flickr30K | VQAv2 (dev) | OK-VQA | GQA | SciQA-Img (0-shot) | VizWiz (0-shot) |
+|---|---|---|---|---|---|---|---|---|
+| Generalist Models | Flamingo-9B | - | 61.5 | 51.8 | 44.7 | - | - | 28.8 |
+| | Flamingo-80B | - | 67.2 | 56.3 | 50.6 | - | - | 31.6 |
+| | Unified-IO-XL | 100.0 | - | 77.9 | 54.0 | - | - | - |
+| | Kosmos-1 | - | 67.1 | 51.0 | - | - | - | 29.2 |
+| | Kosmos-2 | - | 66.7 | 45.6 | - | - | - | - |
+| | BLIP-2 (Vicuna-13B) | 103.9 | 71.6 | 65.0 | 45.9 | 32.3 | 61.0 | 19.6 |
+| | InstructBLIP (Vicuna-13B) | 121.9 | 82.8 | - | - | 49.5 | 63.1 | 33.4 |
+| | Shikra (Vicuna-13B) | - | 73.9 | 77.36 | 47.16 | - | - | - |
+| | Qwen-VL (Qwen-7B) | 121.4 | 85.8 | 78.8 | 58.6 | 59.3 | 67.1 | 35.2 |
+| | Qwen-VL-Chat | 120.2 | 81.0 | 78.2 | 56.6 | 57.5 | 68.2 | 38.9 |
+| Previous SOTA (Per Task Fine-tuning) | - | 127.0 (PALI-17B) | 84.5 (InstructBLIP-FlanT5-XL) | 86.1 (PALI-X-55B) | 66.1 (PALI-X-55B) | 72.1 (CFR) | 92.53 (LLaVa+GPT-4) | 70.9 (PALI-X-55B) |
+
+### Text-based VQA
+
+| Model type | Model | TextVQA | DocVQA | ChartQA | AI2D | OCR-VQA |
+|---|---|---|---|---|---|---|
+| Generalist Models | BLIP-2 (Vicuna-13B) | 42.4 | - | - | - | - |
+| | InstructBLIP (Vicuna-13B) | 50.7 | - | - | - | - |
+| | mPLUG-DocOwl (LLaMA-7B) | 52.6 | 62.2 | 57.4 | - | - |
+| | Pic2Struct-Large (1.3B) | - | 76.6 | 58.6 | 42.1 | 71.3 |
+| | Qwen-VL (Qwen-7B) | 63.8 | 65.1 | 65.7 | 62.3 | 75.7 |
+| Specialist SOTAs (Specialist/Finetuned) | PALI-X-55B (Single-task FT, without OCR pipeline) | 71.44 | 80.0 | 70.0 | 81.2 | 75.0 |
+
+### Referring Expression Comprehension
+
+| Model type | Model | RefCOCO val | RefCOCO test-A | RefCOCO test-B | RefCOCO+ val | RefCOCO+ test-A | RefCOCO+ test-B | RefCOCOg val-u | RefCOCOg test-u | GRIT refexp |
+|---|---|---|---|---|---|---|---|---|---|---|
+| Generalist Models | GPV-2 | - | - | - | - | - | - | - | - | 51.50 |
+| | OFA-L* | 79.96 | 83.67 | 76.39 | 68.29 | 76.00 | 61.75 | 67.57 | 67.58 | 61.70 |
+| | Unified-IO | - | - | - | - | - | - | - | - | 78.61 |
+| | VisionLLM-H | - | 86.70 | - | - | - | - | - | - | - |
+| | Shikra-7B | 87.01 | 90.61 | 80.24 | 81.60 | 87.36 | 72.12 | 82.27 | 82.19 | 69.34 |
+| | Shikra-13B | 87.83 | 91.11 | 81.81 | 82.89 | 87.79 | 74.41 | 82.64 | 83.16 | 69.03 |
+| | Qwen-VL-7B | 89.36 | 92.26 | 85.34 | 83.12 | 88.25 | 77.21 | 85.58 | 85.48 | 78.22 |
+| | Qwen-VL-7B-Chat | 88.55 | 92.27 | 84.51 | 82.82 | 88.59 | 76.79 | 85.96 | 86.32 | - |
+| Specialist SOTAs (Specialist/Finetuned) | G-DINO-L | 90.56 | 93.19 | 88.24 | 82.75 | 88.95 | 75.92 | 86.13 | 87.02 | - |
+| | UNINEXT-H | 92.64 | 94.33 | 91.46 | 85.24 | 89.63 | 79.79 | 88.73 | 89.37 | - |
+| | ONE-PEACE | 92.58 | 94.18 | 89.26 | 88.77 | 92.21 | 83.23 | 89.22 | 89.27 | - |
+
+
+
+
+
+
+#### Running Qwen-VL
+
+Running the Qwen-VL pretrained base model is also simple.
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.generation import GenerationConfig
+import torch
+torch.manual_seed(1234)
+
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL", trust_remote_code=True)
+
+# use bf16
+# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL", device_map="auto", trust_remote_code=True, bf16=True).eval()
+# use fp16
+# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL", device_map="auto", trust_remote_code=True, fp16=True).eval()
+# use cpu only
+# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL", device_map="cpu", trust_remote_code=True).eval()
+# use cuda device
+model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL", device_map="cuda", trust_remote_code=True).eval()
+
+# Specify hyperparameters for generation
+model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-VL", trust_remote_code=True)
+
+query = tokenizer.from_list_format([
+ {'image': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'}, # Either a local path or a URL
+ {'text': 'Generate the caption in English with grounding:'},
+])
+inputs = tokenizer(query, return_tensors='pt')
+inputs = inputs.to(model.device)
+pred = model.generate(**inputs)
+response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)
+print(response)
+# <img>https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg</img>Generate the caption in English with grounding: Woman
+```
+
+Multi-turn dialogue with Qwen-VL-Chat goes through `model.chat` (load `Qwen/Qwen-VL-Chat` in the same way as above); the image is passed either as a local path or a URL wrapped in `<img></img>` tags:
+
+```python
+# 1st dialogue turn
+image_path = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'
+response, history = model.chat(tokenizer, query=f'<img>{image_path}</img>这是什么', history=None)
+print(response)
+# 图中是一名年轻女子在沙滩上和她的狗玩耍,狗的品种是拉布拉多。她们坐在沙滩上,狗的前腿抬起来,与人互动。
+
+# 2nd dialogue turn
+response, history = model.chat(tokenizer, '输出击掌的检测框', history=history)
+print(response)
+# "击掌"
+```
+
+
+
+## Demo
+
+### Web UI
+
+We provide code for users to build a web UI demo. Before you start, make sure you install the following packages:
+
+```
+pip install -r requirements_web_demo.txt
+```
+
+Then run the command below and click on the generated link:
+
+```
+python web_demo_mm.py
+```
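+
+For reference, here is a minimal sketch (not the actual `web_demo_mm.py`) of how such a demo can wire `model.chat` into a Gradio interface; the checkpoint name and the `<img></img>` prompt format follow the examples above, everything else is illustrative:
+
+```python
+import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat", trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
+    "Qwen/Qwen-VL-Chat", device_map="cuda", trust_remote_code=True).eval()
+
+def answer(image_path, question, history):
+    # Wrap the image (local path or URL) in <img></img> tags and continue the dialogue.
+    query = f'<img>{image_path}</img>{question}' if image_path else question
+    response, history = model.chat(tokenizer, query=query, history=history or None)
+    return response, history
+
+with gr.Blocks() as demo:
+    state = gr.State(None)                        # dialogue history carried across turns
+    image = gr.Textbox(label="Image path or URL")
+    question = gr.Textbox(label="Question")
+    output = gr.Textbox(label="Answer")
+    gr.Button("Submit").click(answer, [image, question, state], [output, state])
+
+demo.launch()
+```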
+
+## FAQ
+
+If you run into problems, please check the [FAQ](FAQ.md) and existing issues for a solution before opening a new issue.
+
+
+## License Agreement
+
+Researchers and developers are free to use the code and model weights of both Qwen-VL and Qwen-VL-Chat, including for commercial use. See [LICENSE](LICENSE) for details.
+
+## Contact Us
+
+If you would like to leave a message for our research or product team, feel free to send an email to qianwen_opensource@alibabacloud.com.
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
diff --git a/README_CN.md b/README_CN.md
new file mode 100644
index 0000000000000000000000000000000000000000..ddbf06ea7e2f15493a63f2d8f914b40bf3a9205b
--- /dev/null
+++ b/README_CN.md
@@ -0,0 +1,666 @@
+
+
+
+
+
+
+
+
+ Qwen-VL 🤖 | 🤗  | Qwen-VL-Chat 🤖 | 🤗  |  Demo  |  Report   |   Discord +
++ 中文  |  English +
+
+
+
+
+
+目前,我们提供了 Qwen-VL 系列的两个模型:
+- Qwen-VL: Qwen-VL 以 Qwen-7B 的预训练模型作为语言模型的初始化,并以 [Openclip ViT-bigG](https://github.com/mlfoundations/open_clip) 作为视觉编码器的初始化,中间加入单层随机初始化的 cross-attention,经过约1.5B的图文数据训练得到。最终图像输入分辨率为448。
+- Qwen-VL-Chat: 在 Qwen-VL 的基础上,我们使用对齐机制打造了基于大语言模型的视觉AI助手Qwen-VL-Chat,它支持更灵活的交互方式,包括多图、多轮问答、创作等能力。
+
+
+## 评测
+
+我们从两个角度评测了两个模型的能力:
+1. 在**英文标准 Benchmark** 上评测模型的基础任务能力。目前评测了四大类多模态任务:
+ - Zero-shot Captioning: 评测模型在未见过数据集上的零样本图片描述能力;
+ - General VQA: 评测模型的通用问答能力,例如判断题、颜色、个数、类目等问答能力;
+ - Text-based VQA:评测模型对于图片中文字相关的识别/问答能力,例如文档问答、图表问答、文字问答等;
+ - Referring Expression Comprehension:评测模型给定物体描述画检测框的能力;
+
+2. **试金石 (TouchStone)**:为了评测模型整体的图文对话能力和人类对齐水平。我们为此构建了一个基于 GPT4 打分来评测 LVLM 模型的 Benchmark:TouchStone。在 TouchStone-v0.1 中:
+ - 评测基准总计涵盖 300+张图片、800+道题目、27个类别。包括基础属性问答、人物地标问答、影视作品问答、视觉推理、反事实推理、诗歌创作、故事写作,商品比较、图片解题等**尽可能广泛的类别**。
+ - 为了弥补目前 GPT4 无法直接读取图片的缺陷,我们给所有的带评测图片提供了**人工标注的充分详细描述**,并且将图片的详细描述、问题和模型的输出结果一起交给 GPT4 打分。
+ - 评测同时包含英文版本和中文版本。
+
+评测结果如下:
+
+Qwen-VL在多个VL任务上相比目前SOTA的Generalist Models都有明显优势,并且在能力范围也覆盖更加全面。
+
+
+
+
+### 零样本图像描述生成(Zero-shot Image Caption) 及 通用视觉问答(General VQA)
+
+| Model type | Model | NoCaps | Flickr30K | VQAv2 (dev) | OK-VQA | GQA | SciQA-Img (0-shot) | VizWiz (0-shot) |
+|---|---|---|---|---|---|---|---|---|
+| Generalist Models | Flamingo-9B | - | 61.5 | 51.8 | 44.7 | - | - | 28.8 |
+| | Flamingo-80B | - | 67.2 | 56.3 | 50.6 | - | - | 31.6 |
+| | Unified-IO-XL | 100.0 | - | 77.9 | 54.0 | - | - | - |
+| | Kosmos-1 | - | 67.1 | 51.0 | - | - | - | 29.2 |
+| | Kosmos-2 | - | 66.7 | 45.6 | - | - | - | - |
+| | BLIP-2 (Vicuna-13B) | 103.9 | 71.6 | 65.0 | 45.9 | 32.3 | 61.0 | 19.6 |
+| | InstructBLIP (Vicuna-13B) | 121.9 | 82.8 | - | - | 49.5 | 63.1 | 33.4 |
+| | Shikra (Vicuna-13B) | - | 73.9 | 77.36 | 47.16 | - | - | - |
+| | Qwen-VL (Qwen-7B) | 121.4 | 85.8 | 78.8 | 58.6 | 59.3 | 67.1 | 35.2 |
+| | Qwen-VL-Chat | 120.2 | 81.0 | 78.2 | 56.6 | 57.5 | 68.2 | 38.9 |
+| Previous SOTA (Per Task Fine-tuning) | - | 127.0 (PALI-17B) | 84.5 (InstructBLIP-FlanT5-XL) | 86.1 (PALI-X-55B) | 66.1 (PALI-X-55B) | 72.1 (CFR) | 92.53 (LLaVa+GPT-4) | 70.9 (PALI-X-55B) |
+
+### 文字相关视觉问答(Text-based VQA)
+
+| Model type | Model | TextVQA | DocVQA | ChartQA | AI2D | OCR-VQA |
+|---|---|---|---|---|---|---|
+| Generalist Models | BLIP-2 (Vicuna-13B) | 42.4 | - | - | - | - |
+| | InstructBLIP (Vicuna-13B) | 50.7 | - | - | - | - |
+| | mPLUG-DocOwl (LLaMA-7B) | 52.6 | 62.2 | 57.4 | - | - |
+| | Pic2Struct-Large (1.3B) | - | 76.6 | 58.6 | 42.1 | 71.3 |
+| | Qwen-VL (Qwen-7B) | 63.8 | 65.1 | 65.7 | 62.3 | 75.7 |
+| Specialist SOTAs (Specialist/Finetuned) | PALI-X-55B (Single-task FT, without OCR pipeline) | 71.44 | 80.0 | 70.0 | 81.2 | 75.0 |
+
+### 细粒度视觉定位(Referring Expression Comprehension)
+
+| Model type | Model | RefCOCO val | RefCOCO test-A | RefCOCO test-B | RefCOCO+ val | RefCOCO+ test-A | RefCOCO+ test-B | RefCOCOg val-u | RefCOCOg test-u | GRIT refexp |
+|---|---|---|---|---|---|---|---|---|---|---|
+| Generalist Models | GPV-2 | - | - | - | - | - | - | - | - | 51.50 |
+| | OFA-L* | 79.96 | 83.67 | 76.39 | 68.29 | 76.00 | 61.75 | 67.57 | 67.58 | 61.70 |
+| | Unified-IO | - | - | - | - | - | - | - | - | 78.61 |
+| | VisionLLM-H | - | 86.70 | - | - | - | - | - | - | - |
+| | Shikra-7B | 87.01 | 90.61 | 80.24 | 81.60 | 87.36 | 72.12 | 82.27 | 82.19 | 69.34 |
+| | Shikra-13B | 87.83 | 91.11 | 81.81 | 82.89 | 87.79 | 74.41 | 82.64 | 83.16 | 69.03 |
+| | Qwen-VL-7B | 89.36 | 92.26 | 85.34 | 83.12 | 88.25 | 77.21 | 85.58 | 85.48 | 78.22 |
+| | Qwen-VL-7B-Chat | 88.55 | 92.27 | 84.51 | 82.82 | 88.59 | 76.79 | 85.96 | 86.32 | - |
+| Specialist SOTAs (Specialist/Finetuned) | G-DINO-L | 90.56 | 93.19 | 88.24 | 82.75 | 88.95 | 75.92 | 86.13 | 87.02 | - |
+| | UNINEXT-H | 92.64 | 94.33 | 91.46 | 85.24 | 89.63 | 79.79 | 88.73 | 89.37 | - |
+| | ONE-PEACE | 92.58 | 94.18 | 89.26 | 88.77 | 92.21 | 83.23 | 89.22 | 89.27 | - |
+
+
+
+运行Qwen-VL同样非常简单。
+
+推理代码与英文版 README 中的示例相同,预期输出形如:
+
+```
+<img>https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg</img>Generate the caption in English with grounding: Woman
+```
+
+
+
+
+#### 🤖 ModelScope
+
+魔搭(ModelScope)是开源的模型即服务共享平台,为泛AI开发者提供灵活、易用、低成本的一站式模型服务产品。使用ModelScope同样非常简单,代码如下所示:
+
+```python
+from modelscope import (
+ snapshot_download, AutoModelForCausalLM, AutoTokenizer, GenerationConfig
+)
+import torch
+model_id = 'qwen/Qwen-VL-Chat'
+revision = 'v1.0.0'
+
+model_dir = snapshot_download(model_id, revision=revision)
+torch.manual_seed(1234)
+
+tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
+if not hasattr(tokenizer, 'model_dir'):
+ tokenizer.model_dir = model_dir
+# use bf16
+# model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, bf16=True).eval()
+# use fp16
+model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, fp16=True).eval()
+# use cpu
+# model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="cpu", trust_remote_code=True).eval()
+# use auto
+# model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True).eval()
+
+# Specify hyperparameters for generation
+model.generation_config = GenerationConfig.from_pretrained(model_dir, trust_remote_code=True)
+
+# 1st dialogue turn
+# Either a local path or a URL between <img></img> tags
+image_path = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'
+response, history = model.chat(tokenizer, query=f'<img>{image_path}</img>这是什么', history=None)
+print(response)
+# 图中是一名年轻女子在沙滩上和她的狗玩耍,狗的品种是拉布拉多。她们坐在沙滩上,狗的前腿抬起来,与人互动。
+```
diff --git a/touchstone/README.md b/touchstone/README.md
new file mode 100644
--- /dev/null
+++ b/touchstone/README.md
+
+ 中文  |  English
+
+
+
+We comprehensively evaluate the model's abilities along five dimensions. As shown in the figure above, examples of the 27 subtasks are given. From perception to cognition to creativity, the requirements on models rise as the difficulty increases; LVLM capabilities are currently still at an early stage. Our dataset contains 800+ questions across 27 categories.
+
+## Methods
+
+
+We apply a powerful LLM as a judge to enable automated evaluation. To effectively comprehend the contents of an image, we manually substitute the actual image input with fine-grained textual annotations. By inputting these annotations and corresponding questions to a powerful LLM like GPT4, we obtain reference answers.
+
+For the evaluation of the LVLMs, we provide actual images and questions as input and obtain their respective answers. Finally, we employ GPT4 to score the answers generated by the LVLMs based on the fine-grained annotations and questions. The scoring instructions require the model to assess the usefulness, relevance, and accuracy of the answers, considering the annotations as the content of the images. To ensure fairness in the evaluation, each model's answer is compared against a consistent reference answer from GPT4. The average score of the model in all questions is taken as the final score.
+
+To eliminate the influence of answer position, we perform a second scoring round by swapping the positions of the answers and then compute the average of the two scores obtained. This approach aims to mitigate any bias introduced by the placement of the answers.
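+
+A minimal sketch of this swap-and-average scoring (not the official TouchStone code; `query_judge` is a placeholder for a call to the judge LLM, e.g. GPT-4, that returns a pair of scores for the two answers shown to it, given the human annotation and the question):
+
+```python
+def position_debiased_score(annotation, question, reference_answer, model_answer, query_judge):
+    # Round 1: the model answer is shown first, the GPT-4 reference answer second.
+    model_first, _ = query_judge(annotation, question, model_answer, reference_answer)
+    # Round 2: positions swapped, so any preference of the judge for a slot cancels out.
+    _, model_second = query_judge(annotation, question, reference_answer, model_answer)
+    return (model_first + model_second) / 2
+```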
+
+
+
+
+### Evaluation
+
+#### Evaluation in English-based Multimodal Dialogue
+
+| Model | Score |
+|---------------|-------|
+| PandaGPT | 488.5 |
+| MiniGPT4 | 531.7 |
+| InstructBLIP | 552.4 |
+| LLaMA-AdapterV2 | 590.1 |
+| mPLUG-Owl | 605.4 |
+| LLaVA | 602.7 |
+| Qwen-VL-Chat | 645.2 |
+
+#### Evaluation in Chinese-based Multimodal Dialogue
+
+| Model | Score |
+|---------------|-------|
+| VisualGLM | 247.1 |
+| Qwen-VL-Chat | 401.2 |
+
diff --git a/touchstone/README_CN.md b/touchstone/README_CN.md
new file mode 100644
index 0000000000000000000000000000000000000000..66ce2a09f5e87503ef59832df6b8c2091edd0a86
--- /dev/null
+++ b/touchstone/README_CN.md
@@ -0,0 +1,68 @@
+
+
+
+ 中文  |  English
+
+
+
+我们从五个维度综合评估了模型的能力。 如上图所示,给出了27个子任务的示例。 从感知到认知,再到创造力,随着难度的增加,对模型的要求也越来越高。 目前,LVLM的能力还处于早期阶段。 我们的数据集包含800+道题目、27个类别。
+
+## 测评方式
+
+我们应用SOTA的LLM进行自动化评估。 为了有效地理解图像的内容,我们人工用细粒度的文本注释替换实际的图像输入。 通过将这些注释和相应的问题输入到像GPT4这样强大的LLM中,我们可以获得参考答案。
+
+对于待测评的LVLM,我们提供实际图像和问题作为输入并获得各自的答案。 最后,我们使用GPT4根据细粒度注释和问题对LVLM生成的答案进行评分。 评分指令要求模型评估答案的有用性、相关性和准确性,并将人工注解视为图像的内容。 为了确保评估的公平性,每个模型的答案都会与 GPT4生成的参考答案进行比较。 模型在所有问题上的平均得分作为最终得分。
+
+为了消除答案位置的影响,我们通过交换答案的位置来进行第二轮评分,然后计算获得的两次分数的平均值。
+
+
+
+
+
+## 测评结果
+
+#### 英文版本测评
+
+| Model | Score |
+|---------------|-------|
+| PandaGPT | 488.5 |
+| MiniGPT4 | 531.7 |
+| InstructBLIP | 552.4 |
+| LLaMA-AdapterV2 | 590.1 |
+| mPLUG-Owl | 605.4 |
+| LLaVA | 602.7 |
+| Qwen-VL-Chat | 645.2 |
+
+#### 中文版本测评
+
+| Model | Score |
+|---------------|-------|
+| VisualGLM | 247.1 |
+| Qwen-VL-Chat | 401.2 |
+
diff --git a/web_demo_mm.py b/web_demo_mm.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad17d954c2ddf22ceaaed3973a96c888962e6536
--- /dev/null
+++ b/web_demo_mm.py
@@ -0,0 +1,234 @@
+# Copyright (c) Alibaba Cloud.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""A simple web interactive chat demo based on gradio."""
+
+from argparse import ArgumentParser
+from pathlib import Path
+
+import copy
+import gradio as gr
+import os
+import re
+import secrets
+import tempfile
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.generation import GenerationConfig
+
+DEFAULT_CKPT_PATH = 'Qwen/Qwen-VL-Chat'
+BOX_TAG_PATTERN = r"<box>([\s\S]*?)</box>"
diff --git a/eval_mm/evaluate_caption.py b/eval_mm/evaluate_caption.py
new file mode 100644
--- /dev/null
+++ b/eval_mm/evaluate_caption.py
+    prompt = '<img>{}</img>Describe the image in English:'
+
+ model = AutoModelForCausalLM.from_pretrained(
+ args.checkpoint, device_map='cuda', trust_remote_code=True).eval()
+
+ tokenizer = AutoTokenizer.from_pretrained(args.checkpoint,
+ trust_remote_code=True)
+
+ random.seed(args.seed)
+ dataset = CaptionDataset(
+ train=ds_collections[args.dataset]['train'],
+ test=ds_collections[args.dataset]['test'],
+ tokenizer=tokenizer,
+ prompt=prompt,
+ few_shot=args.few_shot,
+ )
+ coco_karpathy_test_loader = torch.utils.data.DataLoader(
+ dataset=dataset,
+ sampler=InferenceSampler(len(dataset)),
+ batch_size=args.batch_size,
+ num_workers=args.num_workers,
+ pin_memory=True,
+ drop_last=False,
+ collate_fn=partial(collate_fn, tokenizer=tokenizer),
+ )
+
+ image_ids = []
+ captions = []
+ for _, (ids, input_ids,
+ attention_mask) in tqdm(enumerate(coco_karpathy_test_loader)):
+ pred = model.generate(
+ input_ids=input_ids.cuda(),
+ attention_mask=attention_mask.cuda(),
+ do_sample=False,
+ num_beams=1,
+ max_new_tokens=30,
+ min_new_tokens=8,
+ length_penalty=0,
+ num_return_sequences=1,
+ use_cache=True,
+ pad_token_id=tokenizer.eod_id,
+ eos_token_id=tokenizer.eod_id,
+ )
+ image_ids.extend(ids)
+ captions.extend([
+ tokenizer.decode(_[input_ids.size(1):].cpu(),
+ skip_special_tokens=True).strip() for _ in pred
+ ])
+
+ torch.distributed.barrier()
+
+ world_size = torch.distributed.get_world_size()
+ merged_ids = [None for _ in range(world_size)]
+ merged_captions = [None for _ in range(world_size)]
+ torch.distributed.all_gather_object(merged_ids, image_ids)
+ torch.distributed.all_gather_object(merged_captions, captions)
+
+ merged_ids = [_ for _ in itertools.chain.from_iterable(merged_ids)]
+ merged_captions = [
+ _ for _ in itertools.chain.from_iterable(merged_captions)
+ ]
+
+ if torch.distributed.get_rank() == 0:
+ results = []
+ for image_id, caption in zip(merged_ids, merged_captions):
+ results.append({
+ 'image_id': int(image_id),
+ 'caption': caption,
+ })
+ time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
+ results_file = f'{args.dataset}_{time_prefix}.json'
+ json.dump(results, open(results_file, 'w'))
+
+ coco = COCO(ds_collections[args.dataset]['test'])
+ coco_result = coco.loadRes(results_file)
+ coco_eval = COCOEvalCap(coco, coco_result)
+ coco_eval.evaluate()
+
+ print(coco_eval.eval.items())
+ torch.distributed.barrier()
diff --git a/eval_mm/evaluate_grounding.py b/eval_mm/evaluate_grounding.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a76223cc4e51a0381ab7cafdf3a7abe570eae99
--- /dev/null
+++ b/eval_mm/evaluate_grounding.py
@@ -0,0 +1,213 @@
+import argparse
+import itertools
+import json
+import os
+import re
+from functools import partial
+
+import torch
+from torchvision.ops.boxes import box_area
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ds_collections = {
+ 'refcoco_val': 'data/refcoco/refcoco_val.jsonl',
+ 'refcoco_testA': 'data/refcoco/refcoco_testA.jsonl',
+ 'refcoco_testB': 'data/refcoco/refcoco_testB.jsonl',
+ 'refcoco+_val': 'data/refcoco+/refcoco+_val.jsonl',
+ 'refcoco+_testA': 'data/refcoco+/refcoco+_testA.jsonl',
+ 'refcoco+_testB': 'data/refcoco+/refcoco+_testB.jsonl',
+ 'refcocog_val': 'data/refcocog/refcocog_val.jsonl',
+ 'refcocog_test': 'data/refcocog/refcocog_test.jsonl',
+}
+
+
+def box_iou(boxes1, boxes2):
+ area1 = box_area(boxes1)
+ area2 = box_area(boxes2)
+
+ lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
+ rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
+
+ wh = (rb - lt).clamp(min=0) # [N,M,2]
+ inter = wh[:, :, 0] * wh[:, :, 1] # [N,M]
+
+ union = area1[:, None] + area2 - inter
+
+ iou = inter / union
+ return iou, union
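+
+# Usage note (illustrative, not part of the original script): on RefCOCO-style benchmarks a
+# predicted box is typically counted as correct when its IoU with the ground truth is >= 0.5, e.g.
+#   iou, _ = box_iou(torch.tensor([[0., 0., 10., 10.]]), torch.tensor([[0., 0., 10., 5.]]))
+#   iou.item()  # 0.5 -> counted as correct at the 0.5 threshold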
+
+
+def collate_fn(batches, tokenizer):
+
+ texts = [_['text'] for _ in batches]
+ bboxes = [_['bbox'] for _ in batches]
+ hws = [_['hw'] for _ in batches]
+
+ input_ids = tokenizer(texts, return_tensors='pt', padding='longest')
+
+ return input_ids.input_ids, input_ids.attention_mask, bboxes, hws
+
+
+class RefCOCODataset(torch.utils.data.Dataset):
+
+ def __init__(self, test, tokenizer, prompt):
+ self.datas = open(test).readlines()
+ self.tokenizer = tokenizer
+ self.prompt = prompt
+
+ def __len__(self):
+ return len(self.datas)
+
+ def __getitem__(self, idx):
+ data = json.loads(self.datas[idx].strip())
+ image = data['image']
+ text = data['sent']
+ bbox = data['bbox']
+
+ w, h = data['width'], data['height']
+
+ return {
+ 'text': self.prompt.format(image, text),
+ 'bbox': bbox,
+ 'hw': (h, w),
+ }
+
+
+class InferenceSampler(torch.utils.data.sampler.Sampler):
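+    """Shards dataset indices into contiguous, near-equal, non-overlapping per-rank ranges so
+    that each distributed process evaluates a disjoint slice; per-rank outputs are merged later
+    via torch.distributed.all_gather_object."""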
+
+ def __init__(self, size):
+ self._size = int(size)
+ assert size > 0
+ self._rank = torch.distributed.get_rank()
+ self._world_size = torch.distributed.get_world_size()
+ self._local_indices = self._get_local_indices(size, self._world_size,
+ self._rank)
+
+ @staticmethod
+ def _get_local_indices(total_size, world_size, rank):
+ shard_size = total_size // world_size
+ left = total_size % world_size
+ shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
+
+ begin = sum(shard_sizes[:rank])
+ end = min(sum(shard_sizes[:rank + 1]), total_size)
+ return range(begin, end)
+
+ def __iter__(self):
+ yield from self._local_indices
+
+ def __len__(self):
+ return len(self._local_indices)
+
+
+if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--checkpoint', type=str, default='')
+ parser.add_argument('--dataset', type=str, default='')
+ parser.add_argument('--batch-size', type=int, default=1)
+ parser.add_argument('--num-workers', type=int, default=1)
+ args = parser.parse_args()
+
+ torch.distributed.init_process_group(
+ backend='nccl',
+ world_size=int(os.getenv('WORLD_SIZE', '1')),
+ rank=int(os.getenv('RANK', '0')),
+ )
+
+ torch.cuda.set_device(torch.distributed.get_rank())
+
+ model = AutoModelForCausalLM.from_pretrained(
+ args.checkpoint, device_map='cuda', trust_remote_code=True).eval()
+
+ tokenizer = AutoTokenizer.from_pretrained(args.checkpoint,
+ trust_remote_code=True)
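+    # Pad on the left so that, in a batch, generation starts right after each prompt's last token;
+    # the tokenizer has no dedicated pad token, so the end-of-document id is reused for padding.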
+ tokenizer.padding_side = 'left'
+ tokenizer.pad_token_id = tokenizer.eod_id
+
+    prompt = '<img>{}</img><ref>{}</ref><box>'
diff --git a/eval_mm/evaluate_multiple_choice.py b/eval_mm/evaluate_multiple_choice.py
new file mode 100644
--- /dev/null
+++ b/eval_mm/evaluate_multiple_choice.py
+    prompt = '<img>{}</img>Context: {}\nQuestion: {}\nOptions: {}\nAnswer:'
+
+ dataset = MultipleChoiceDataste(test=ds_collections[args.dataset]['test'],
+ prompt=prompt,
+ tokenizer=tokenizer)
+ dataloader = torch.utils.data.DataLoader(
+ dataset=dataset,
+ sampler=InferenceSampler(len(dataset)),
+ batch_size=args.batch_size,
+ num_workers=args.num_workers,
+ pin_memory=True,
+ drop_last=False,
+ collate_fn=partial(collate_fn, pad_token_id=tokenizer.eod_id),
+ )
+
+ results = []
+ with torch.no_grad():
+ for _, (input_tokens, attention_mask, target_lengths, answer,
+ chunk_sizes) in tqdm(enumerate(dataloader)):
+
+ outputs = model(
+ input_ids=input_tokens[:, :-1].cuda(),
+ attention_mask=attention_mask[:, :-1].cuda(),
+ return_dict=True,
+ )
+ losses = torch.nn.functional.cross_entropy(outputs.logits.permute(
+ 0, 2, 1),
+ input_tokens[:,
+ 1:].cuda(),
+ reduction='none')
+
+ losses = losses.split(chunk_sizes, dim=0)
+
+ for loss, target_length, answer in zip(losses, target_lengths,
+ answer):
+
+ target_loss = loss.mean(-1)
+ for _ in range(len(target_length)):
+ target_loss[_] = loss[_, -target_length[_]:].mean()
+ pred = target_loss.argmin().item()
+ if pred == answer:
+ results.append(1)
+ else:
+ results.append(0)
+
+ torch.distributed.barrier()
+
+ world_size = torch.distributed.get_world_size()
+ merged_results = [None for _ in range(world_size)]
+ torch.distributed.all_gather_object(merged_results, results)
+
+ merged_results = [_ for _ in itertools.chain.from_iterable(merged_results)]
+
+ if torch.distributed.get_rank() == 0:
+ print(f'Acc@1: {sum(merged_results) / len(merged_results)}')
+
+ torch.distributed.barrier()
diff --git a/eval_mm/evaluate_vizwiz_testdev.py b/eval_mm/evaluate_vizwiz_testdev.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f40422b12809493b886fa08844cc17e26005467
--- /dev/null
+++ b/eval_mm/evaluate_vizwiz_testdev.py
@@ -0,0 +1,167 @@
+import argparse
+import itertools
+import json
+import os
+import random
+import time
+from functools import partial
+
+import torch
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+def collate_fn(batches, tokenizer):
+
+ images = [_['image'] for _ in batches]
+ questions = [_['question'] for _ in batches]
+
+ input_ids = tokenizer(questions, return_tensors='pt', padding='longest')
+
+ return images, input_ids.input_ids, input_ids.attention_mask
+
+
+class VQADataset(torch.utils.data.Dataset):
+
+ def __init__(self, train, test, prompt, few_shot):
+ self.test = json.load(open(test))
+ self.prompt = prompt
+
+ self.few_shot = few_shot
+ if few_shot > 0:
+ self.train = open(train).readlines()
+
+ def __len__(self):
+ return len(self.test)
+
+ def __getitem__(self, idx):
+ data = self.test[idx]
+ image, question = data['image'], data['question']
+
+ few_shot_prompt = ''
+ if self.few_shot > 0:
+ few_shot_samples = random.sample(self.train, self.few_shot)
+ for sample in few_shot_samples:
+ sample = json.loads(sample.strip())
+ few_shot_prompt += self.prompt.format(
+ sample['image'],
+ sample['question']) + f" {sample['answer']}"
+
+ return {
+ 'image': data['image'],
+ 'question': few_shot_prompt + self.prompt.format(image, question),
+ }
+
+
+class InferenceSampler(torch.utils.data.sampler.Sampler):
+
+ def __init__(self, size):
+ self._size = int(size)
+ assert size > 0
+ self._rank = torch.distributed.get_rank()
+ self._world_size = torch.distributed.get_world_size()
+ self._local_indices = self._get_local_indices(size, self._world_size,
+ self._rank)
+
+ @staticmethod
+ def _get_local_indices(total_size, world_size, rank):
+ shard_size = total_size // world_size
+ left = total_size % world_size
+ shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
+
+ begin = sum(shard_sizes[:rank])
+ end = min(sum(shard_sizes[:rank + 1]), total_size)
+ return range(begin, end)
+
+ def __iter__(self):
+ yield from self._local_indices
+
+ def __len__(self):
+ return len(self._local_indices)
+
+
+if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--checkpoint', type=str, default='')
+ parser.add_argument('--batch-size', type=int, default=1)
+ parser.add_argument('--num-workers', type=int, default=1)
+ parser.add_argument('--few-shot', type=int, default=0)
+ parser.add_argument('--seed', type=int, default=0)
+ args = parser.parse_args()
+
+ torch.distributed.init_process_group(
+ backend='nccl',
+ world_size=int(os.getenv('WORLD_SIZE', '1')),
+ rank=int(os.getenv('RANK', '0')),
+ )
+
+ torch.cuda.set_device(torch.distributed.get_rank())
+
+ model = AutoModelForCausalLM.from_pretrained(
+ args.checkpoint, device_map='cuda', trust_remote_code=True).eval()
+
+ tokenizer = AutoTokenizer.from_pretrained(args.checkpoint,
+ trust_remote_code=True)
+ tokenizer.padding_side = 'left'
+ tokenizer.pad_token_id = tokenizer.eod_id
+
+    prompt = '<img>data/vizwiz/test/{}</img>{} Answer:'
+
+ random.seed(args.seed)
+ dataset = VQADataset(
+ train='data/vizwiz/vizwiz_train.jsonl',
+ test='data/vizwiz/test.json',
+ prompt=prompt,
+ few_shot=args.few_shot,
+ )
+
+ dataloader = torch.utils.data.DataLoader(
+ dataset=dataset,
+ sampler=InferenceSampler(len(dataset)),
+ batch_size=args.batch_size,
+ num_workers=args.num_workers,
+ pin_memory=True,
+ drop_last=False,
+ collate_fn=partial(collate_fn, tokenizer=tokenizer),
+ )
+
+ outputs = []
+ for _, (images, input_ids, attention_mask) in tqdm(enumerate(dataloader)):
+ pred = model.generate(
+ input_ids=input_ids.cuda(),
+ attention_mask=attention_mask.cuda(),
+ do_sample=False,
+ num_beams=1,
+ max_new_tokens=10,
+ min_new_tokens=1,
+ length_penalty=1,
+ num_return_sequences=1,
+ output_hidden_states=True,
+ use_cache=True,
+ pad_token_id=tokenizer.eod_id,
+ eos_token_id=tokenizer.eod_id,
+ )
+ answers = [
+ tokenizer.decode(_[input_ids.size(1):].cpu(),
+ skip_special_tokens=True).strip() for _ in pred
+ ]
+
+ for image, answer in zip(images, answers):
+ outputs.append({'image': image, 'answer': answer})
+
+ torch.distributed.barrier()
+
+ world_size = torch.distributed.get_world_size()
+ merged_outputs = [None for _ in range(world_size)]
+ torch.distributed.all_gather_object(merged_outputs, outputs)
+
+ merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
+
+ if torch.distributed.get_rank() == 0:
+ time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
+ results_file = f'vizwiz_testdev_{time_prefix}_fs{args.few_shot}_s{args.seed}.json'
+ json.dump(merged_outputs, open(results_file, 'w'),
+ ensure_ascii=False) # save to results
+
+ torch.distributed.barrier()
diff --git a/eval_mm/evaluate_vqa.py b/eval_mm/evaluate_vqa.py
new file mode 100644
index 0000000000000000000000000000000000000000..2accb43745652dace9189cda1852e610eb171987
--- /dev/null
+++ b/eval_mm/evaluate_vqa.py
@@ -0,0 +1,357 @@
+import argparse
+import itertools
+import json
+import os
+import random
+import time
+from functools import partial
+from typing import Optional
+
+import torch
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from vqa import VQA
+from vqa_eval import VQAEval
+
+ds_collections = {
+ 'vqav2_val': {
+ 'train': 'data/vqav2/vqav2_train.jsonl',
+ 'test': 'data/vqav2/vqav2_val.jsonl',
+ 'question': 'data/vqav2/v2_OpenEnded_mscoco_val2014_questions.json',
+ 'annotation': 'data/vqav2/v2_mscoco_val2014_annotations.json',
+ 'metric': 'vqa_score',
+ 'max_new_tokens': 10,
+ },
+ 'okvqa_val': {
+ 'train': 'data/okvqa/okvqa_train.jsonl',
+ 'test': 'data/okvqa/okvqa_val.jsonl',
+ 'question': 'data/okvqa/OpenEnded_mscoco_val2014_questions.json',
+ 'annotation': 'data/okvqa/mscoco_val2014_annotations.json',
+ 'metric': 'vqa_score',
+ 'max_new_tokens': 10,
+ },
+ 'textvqa_val': {
+ 'train': 'data/textvqa/textvqa_train.jsonl',
+ 'test': 'data/textvqa/textvqa_val.jsonl',
+ 'question': 'data/textvqa/textvqa_val_questions.json',
+ 'annotation': 'data/textvqa/textvqa_val_annotations.json',
+ 'metric': 'vqa_score',
+ 'max_new_tokens': 10,
+ },
+ 'vizwiz_val': {
+ 'train': 'data/vizwiz/vizwiz_train.jsonl',
+ 'test': 'data/vizwiz/vizwiz_val.jsonl',
+ 'question': 'data/vizwiz/vizwiz_val_questions.json',
+ 'annotation': 'data/vizwiz/vizwiz_val_annotations.json',
+ 'metric': 'vqa_score',
+ 'max_new_tokens': 10,
+ },
+ 'docvqa': {
+ 'train': 'data/DocVQA/train.jsonl',
+ 'test': 'data/DocVQA/val.jsonl',
+ # 'question': '',
+ 'annotation': './data/DocVQA/val/val_v1.0.json',
+ 'metric': 'anls',
+ 'max_new_tokens': 100,
+ },
+ 'infographicsvqa': {
+ 'train': 'data/InfographicsVQA/train.jsonl',
+ 'test': 'data/InfographicsVQA/val.jsonl',
+ # 'question': '',
+ 'annotation': './data/InfographicsVQA/infographicVQA_val_v1.0.json',
+ 'metric': 'anls',
+ 'max_new_tokens': 100,
+ },
+ 'chartqa': {
+ 'train': 'data/ChartQA/train.jsonl',
+ 'test': 'data/ChartQA/val_human.jsonl',
+ # 'question': '',
+ # 'annotation': '',
+ 'metric': 'relaxed_accuracy',
+ 'max_new_tokens': 100,
+ },
+ 'gqa': {
+ 'train': 'data/GQA/train.jsonl',
+ 'test': 'data/GQA/testdev_balanced.jsonl',
+ # 'question': '',
+ # 'annotation': '',
+ 'metric': 'accuracy',
+ 'max_new_tokens': 10,
+ },
+ 'ocrvqa': {
+ 'train': 'data/OCR-VQA/train.jsonl',
+ 'test': 'data/OCR-VQA/val.jsonl',
+ # 'question': '',
+ # 'annotation': '',
+ 'metric': 'accuracy',
+ 'max_new_tokens': 10,
+ },
+ 'ai2diagram': {
+ 'train': 'data/AI2Diagram/train.jsonl',
+ 'test': 'data/AI2Diagram/test.jsonl',
+ # 'question': '',
+ # 'annotation': '',
+ 'metric': 'accuracy',
+ 'max_new_tokens': 10,
+ }
+}
+
+# https://github.com/google-research/pix2struct/blob/main/pix2struct/metrics.py#L81
+def relaxed_correctness(target: str,
+ prediction: str,
+ max_relative_change: float = 0.05) -> bool:
+ """Calculates relaxed correctness.
+
+ The correctness tolerates certain error ratio defined by max_relative_change.
+ See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
+ “Following Methani et al. (2020), we use a relaxed accuracy measure for the
+ numeric answers to allow a minor inaccuracy that may result from the automatic
+ data extraction process. We consider an answer to be correct if it is within
+ 5% of the gold answer. For non-numeric answers, we still need an exact match
+ to consider an answer to be correct.”
+
+ Args:
+ target: Target string.
+ prediction: Predicted string.
+ max_relative_change: Maximum relative change.
+
+ Returns:
+ Whether the prediction was correct given the specified tolerance.
+ """
+
+ def _to_float(text: str) -> Optional[float]:
+ try:
+ if text.endswith("%"):
+ # Convert percentages to floats.
+ return float(text.rstrip("%")) / 100.0
+ else:
+ return float(text)
+ except ValueError:
+ return None
+
+ prediction_float = _to_float(prediction)
+ target_float = _to_float(target)
+ if prediction_float is not None and target_float:
+ relative_change = abs(
+ prediction_float - target_float) / abs(target_float)
+ return relative_change <= max_relative_change
+ else:
+ return prediction.lower() == target.lower()
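+
+# Illustrative, hand-checked cases of the 5% tolerance rule above (not part of the original script);
+# note that evaluate_relaxed_accuracy below passes the model answer as `target` and each gold
+# annotation as `prediction`, so the tolerance is relative to the model answer.
+#   relaxed_correctness('100', '96')    -> True   (|96 - 100| / 100 = 0.04 <= 0.05)
+#   relaxed_correctness('100', '94')    -> False  (|94 - 100| / 100 = 0.06 >  0.05)
+#   relaxed_correctness('0.05', '5%')   -> True   ('5%' is parsed as 0.05; exact match)
+#   relaxed_correctness('left', 'Left') -> True   (non-numeric: case-insensitive exact match)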
+
+def evaluate_relaxed_accuracy(entries):
+ scores = []
+ for elem in entries:
+ score = max([relaxed_correctness(elem['answer'].strip(), ann) for ann in elem['annotation']])
+ scores.append(score)
+ return sum(scores) / len(scores)
+
+def evaluate_exact_match_accuracy(entries):
+ scores = []
+ for elem in entries:
+ score = max([(1.0 if (elem['answer'].strip().lower() == ann.strip().lower()) else 0.0) for ann in elem['annotation']])
+ scores.append(score)
+ return sum(scores) / len(scores)
+
+
+def collate_fn(batches, tokenizer):
+
+ questions = [_['question'] for _ in batches]
+ question_ids = [_['question_id'] for _ in batches]
+ annotations = [_['annotation'] for _ in batches]
+
+ input_ids = tokenizer(questions, return_tensors='pt', padding='longest')
+
+ return question_ids, input_ids.input_ids, input_ids.attention_mask, annotations
+
+
+class VQADataset(torch.utils.data.Dataset):
+
+ def __init__(self, train, test, prompt, few_shot):
+ self.test = open(test).readlines()
+ self.prompt = prompt
+
+ self.few_shot = few_shot
+ if few_shot > 0:
+ self.train = open(train).readlines()
+
+ def __len__(self):
+ return len(self.test)
+
+ def __getitem__(self, idx):
+ data = json.loads(self.test[idx].strip())
+ image, question, question_id, annotation = data['image'], data['question'], data[
+ 'question_id'], data['answer']
+
+ few_shot_prompt = ''
+ if self.few_shot > 0:
+ few_shot_samples = random.sample(self.train, self.few_shot)
+ for sample in few_shot_samples:
+ sample = json.loads(sample.strip())
+ few_shot_prompt += self.prompt.format(
+ sample['image'],
+ sample['question']) + f" {sample['answer']}"
+
+ return {
+ 'question': few_shot_prompt + self.prompt.format(image, question),
+ 'question_id': question_id,
+ 'annotation': annotation
+ }
+
+
+class InferenceSampler(torch.utils.data.sampler.Sampler):
+
+ def __init__(self, size):
+ self._size = int(size)
+ assert size > 0
+ self._rank = torch.distributed.get_rank()
+ self._world_size = torch.distributed.get_world_size()
+ self._local_indices = self._get_local_indices(size, self._world_size,
+ self._rank)
+
+ @staticmethod
+ def _get_local_indices(total_size, world_size, rank):
+ shard_size = total_size // world_size
+ left = total_size % world_size
+ shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
+
+ begin = sum(shard_sizes[:rank])
+ end = min(sum(shard_sizes[:rank + 1]), total_size)
+ return range(begin, end)
+
+ def __iter__(self):
+ yield from self._local_indices
+
+ def __len__(self):
+ return len(self._local_indices)
+
+
+if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--checkpoint', type=str, default='')
+ parser.add_argument('--dataset', type=str, default='')
+ parser.add_argument('--batch-size', type=int, default=1)
+ parser.add_argument('--num-workers', type=int, default=1)
+ parser.add_argument('--few-shot', type=int, default=0)
+ parser.add_argument('--seed', type=int, default=0)
+ args = parser.parse_args()
+
+ torch.distributed.init_process_group(
+ backend='nccl',
+ world_size=int(os.getenv('WORLD_SIZE', '1')),
+ rank=int(os.getenv('RANK', '0')),
+ )
+
+ torch.cuda.set_device(torch.distributed.get_rank())
+
+ model = AutoModelForCausalLM.from_pretrained(
+ args.checkpoint, device_map='cuda', trust_remote_code=True).eval()
+
+ tokenizer = AutoTokenizer.from_pretrained(args.checkpoint,
+ trust_remote_code=True)
+ tokenizer.padding_side = 'left'
+ tokenizer.pad_token_id = tokenizer.eod_id
+
+    prompt = '<img>{}</img>{} Answer:'
+
+ random.seed(args.seed)
+ dataset = VQADataset(
+ train=ds_collections[args.dataset]['train'],
+ test=ds_collections[args.dataset]['test'],
+ prompt=prompt,
+ few_shot=args.few_shot,
+ )
+
+ dataloader = torch.utils.data.DataLoader(
+ dataset=dataset,
+ sampler=InferenceSampler(len(dataset)),
+ batch_size=args.batch_size,
+ num_workers=args.num_workers,
+ pin_memory=True,
+ drop_last=False,
+ collate_fn=partial(collate_fn, tokenizer=tokenizer),
+ )
+
+ outputs = []
+ for _, (question_ids, input_ids,
+ attention_mask, annotations) in tqdm(enumerate(dataloader)):
+ pred = model.generate(
+ input_ids=input_ids.cuda(),
+ attention_mask=attention_mask.cuda(),
+ do_sample=False,
+ num_beams=1,
+ max_new_tokens=ds_collections[args.dataset]['max_new_tokens'],
+ min_new_tokens=1,
+ length_penalty=1,
+ num_return_sequences=1,
+ output_hidden_states=True,
+ use_cache=True,
+ pad_token_id=tokenizer.eod_id,
+ eos_token_id=tokenizer.eod_id,
+ )
+ answers = [
+ tokenizer.decode(_[input_ids.size(1):].cpu(),
+ skip_special_tokens=True).strip() for _ in pred
+ ]
+
+ for question_id, answer, annotation in zip(question_ids, answers, annotations):
+ try:
+ outputs.append({'question_id': int(question_id), 'answer': answer, 'annotation': annotation})
+ except:
+ outputs.append({'question_id': question_id, 'answer': answer, 'annotation': annotation})
+
+ torch.distributed.barrier()
+
+ world_size = torch.distributed.get_world_size()
+ merged_outputs = [None for _ in range(world_size)]
+ torch.distributed.all_gather_object(merged_outputs, outputs)
+
+ merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
+
+ if torch.distributed.get_rank() == 0:
+ time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
+ results_file = f'{args.dataset}_{time_prefix}_fs{args.few_shot}_s{args.seed}.json'
+ json.dump(merged_outputs, open(results_file, 'w'),
+ ensure_ascii=False) # save to results
+
+ if ds_collections[args.dataset]['metric'] == 'vqa_score':
+ vqa = VQA(ds_collections[args.dataset]['annotation'],
+ ds_collections[args.dataset]['question'])
+ results = vqa.loadRes(
+ resFile=results_file,
+ quesFile=ds_collections[args.dataset]['question'])
+ vqa_scorer = VQAEval(vqa, results, n=2)
+ vqa_scorer.evaluate()
+
+ print(vqa_scorer.accuracy)
+
+ elif ds_collections[args.dataset]['metric'] == 'anls':
+ merged_outputs = [{'answer': _['answer'], 'questionId': _['question_id']} for _ in merged_outputs]
+ results_file = f'{args.dataset}_official_{time_prefix}.json'
+ json.dump(merged_outputs, open(results_file, 'w'), ensure_ascii=False)
+ print('python infographicsvqa_eval.py -g ' + ds_collections[args.dataset]['annotation'] + ' -s ' + results_file)
+ os.system('python infographicsvqa_eval.py -g ' + ds_collections[args.dataset]['annotation'] + ' -s ' + results_file)
+ elif ds_collections[args.dataset]['metric'] == 'relaxed_accuracy':
+ print({'relaxed_accuracy': evaluate_relaxed_accuracy(merged_outputs)})
+ elif ds_collections[args.dataset]['metric'] == 'accuracy':
+ if 'gqa' in args.dataset:
+ for entry in merged_outputs:
+ response = entry['answer']
+ response = response.strip().split('.')[0].split(',')[0].split('!')[0].lower()
+ if 'is ' in response:
+ response = response.split('is ')[1]
+ if 'are ' in response:
+ response = response.split('are ')[1]
+ if 'a ' in response:
+ response = response.split('a ')[1]
+ if 'an ' in response:
+ response = response.split('an ')[1]
+ if 'the ' in response:
+ response = response.split('the ')[1]
+ if ' of' in response:
+ response = response.split(' of')[0]
+ response = response.strip()
+ entry['answer'] = response
+ print({'accuracy': evaluate_exact_match_accuracy(merged_outputs)})
+
+ torch.distributed.barrier()
diff --git a/eval_mm/vqa.py b/eval_mm/vqa.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1ee18f0532a4f8ed1f4ee4a33c162f7c4375398
--- /dev/null
+++ b/eval_mm/vqa.py
@@ -0,0 +1,206 @@
+"""Copyright (c) 2022, salesforce.com, inc.
+
+All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+__author__ = 'aagrawal'
+__version__ = '0.9'
+
+# Interface for accessing the VQA dataset.
+
+# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link:
+# (https://github.com/pdollar/coco/blob/master/PythonAPI/pycocotools/coco.py).
+
+# The following functions are defined:
+# VQA - VQA class that loads VQA annotation file and prepares data structures.
+# getQuesIds - Get question ids that satisfy given filter conditions.
+# getImgIds - Get image ids that satisfy given filter conditions.
+# loadQA - Load questions and answers with the specified question ids.
+# showQA - Display the specified questions and answers.
+# loadRes - Load result file and create result object.
+
+# Help on each function can be accessed by: "help(COCO.function)"
+
+import copy
+import datetime
+import json
+
+
+class VQA:
+
+ def __init__(self, annotation_file=None, question_file=None):
+ """Constructor of VQA helper class for reading and visualizing
+ questions and answers.
+
+ :param annotation_file (str): location of VQA annotation file
+ :return:
+ """
+ # load dataset
+ self.dataset = {}
+ self.questions = {}
+ self.qa = {}
+ self.qqa = {}
+ self.imgToQA = {}
+ if not annotation_file == None and not question_file == None:
+ print('loading VQA annotations and questions into memory...')
+ time_t = datetime.datetime.utcnow()
+ dataset = json.load(open(annotation_file, 'r'))
+ questions = json.load(open(question_file, 'r'))
+ self.dataset = dataset
+ self.questions = questions
+ self.createIndex()
+
+ def createIndex(self):
+ # create index
+ print('creating index...')
+ imgToQA = {ann['image_id']: [] for ann in self.dataset['annotations']}
+ qa = {ann['question_id']: [] for ann in self.dataset['annotations']}
+ qqa = {ann['question_id']: [] for ann in self.dataset['annotations']}
+ for ann in self.dataset['annotations']:
+ imgToQA[ann['image_id']] += [ann]
+ qa[ann['question_id']] = ann
+ for ques in self.questions['questions']:
+ qqa[ques['question_id']] = ques
+ print('index created!')
+
+ # create class members
+ self.qa = qa
+ self.qqa = qqa
+ self.imgToQA = imgToQA
+
+ def info(self):
+ """Print information about the VQA annotation file.
+
+ :return:
+ """
+        for key, value in self.dataset['info'].items():
+ print('%s: %s' % (key, value))
+
+ def getQuesIds(self, imgIds=[], quesTypes=[], ansTypes=[]):
+ """Get question ids that satisfy given filter conditions. default skips
+ that filter.
+
+ :param imgIds (int array) : get question ids for given imgs
+ quesTypes (str array) : get question ids for given question types
+ ansTypes (str array) : get question ids for given answer types
+ :return: ids (int array) : integer array of question ids
+ """
+ imgIds = imgIds if type(imgIds) == list else [imgIds]
+ quesTypes = quesTypes if type(quesTypes) == list else [quesTypes]
+ ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]
+
+ if len(imgIds) == len(quesTypes) == len(ansTypes) == 0:
+ anns = self.dataset['annotations']
+ else:
+ if not len(imgIds) == 0:
+ anns = sum(
+ [
+ self.imgToQA[imgId]
+ for imgId in imgIds if imgId in self.imgToQA
+ ],
+ [],
+ )
+ else:
+ anns = self.dataset['annotations']
+ anns = (anns if len(quesTypes) == 0 else
+ [ann for ann in anns if ann['question_type'] in quesTypes])
+ anns = (anns if len(ansTypes) == 0 else
+ [ann for ann in anns if ann['answer_type'] in ansTypes])
+ ids = [ann['question_id'] for ann in anns]
+ return ids
+
+ def getImgIds(self, quesIds=[], quesTypes=[], ansTypes=[]):
+ """Get image ids that satisfy given filter conditions. default skips
+ that filter.
+
+ :param quesIds (int array) : get image ids for given question ids
+ quesTypes (str array) : get image ids for given question types
+ ansTypes (str array) : get image ids for given answer types
+ :return: ids (int array) : integer array of image ids
+ """
+ quesIds = quesIds if type(quesIds) == list else [quesIds]
+ quesTypes = quesTypes if type(quesTypes) == list else [quesTypes]
+ ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]
+
+ if len(quesIds) == len(quesTypes) == len(ansTypes) == 0:
+ anns = self.dataset['annotations']
+ else:
+ if not len(quesIds) == 0:
+ anns = sum([
+ self.qa[quesId] for quesId in quesIds if quesId in self.qa
+ ], [])
+ else:
+ anns = self.dataset['annotations']
+ anns = (anns if len(quesTypes) == 0 else
+ [ann for ann in anns if ann['question_type'] in quesTypes])
+ anns = (anns if len(ansTypes) == 0 else
+ [ann for ann in anns if ann['answer_type'] in ansTypes])
+ ids = [ann['image_id'] for ann in anns]
+ return ids
+
+ def loadQA(self, ids=[]):
+ """Load questions and answers with the specified question ids.
+
+ :param ids (int array) : integer ids specifying question ids
+ :return: qa (object array) : loaded qa objects
+ """
+ if type(ids) == list:
+ return [self.qa[id] for id in ids]
+ elif type(ids) == int:
+ return [self.qa[ids]]
+
+ def showQA(self, anns):
+ """Display the specified annotations.
+
+ :param anns (array of object): annotations to display
+ :return: None
+ """
+ if len(anns) == 0:
+ return 0
+ for ann in anns:
+ quesId = ann['question_id']
+ print('Question: %s' % (self.qqa[quesId]['question']))
+ for ans in ann['answers']:
+ print('Answer %d: %s' % (ans['answer_id'], ans['answer']))
+
+ def loadRes(self, resFile, quesFile):
+ """Load result file and return a result object.
+
+ :param resFile (str) : file name of result file
+ :return: res (obj) : result api object
+ """
+ res = VQA()
+ res.questions = json.load(open(quesFile))
+ res.dataset['info'] = copy.deepcopy(self.questions['info'])
+ res.dataset['task_type'] = copy.deepcopy(self.questions['task_type'])
+ res.dataset['data_type'] = copy.deepcopy(self.questions['data_type'])
+ res.dataset['data_subtype'] = copy.deepcopy(
+ self.questions['data_subtype'])
+ res.dataset['license'] = copy.deepcopy(self.questions['license'])
+
+ print('Loading and preparing results... ')
+ time_t = datetime.datetime.utcnow()
+ anns = json.load(open(resFile))
+ assert type(anns) == list, 'results is not an array of objects'
+ annsQuesIds = [ann['question_id'] for ann in anns]
+ assert set(annsQuesIds) == set(
+ self.getQuesIds()
+        ), 'Results do not correspond to current VQA set. Either the results do not have predictions for all question ids in annotation file or there is at least one question id that does not belong to the question ids in the annotation file.'
+ for ann in anns:
+ quesId = ann['question_id']
+ if res.dataset['task_type'] == 'Multiple Choice':
+ assert (
+ ann['answer'] in self.qqa[quesId]['multiple_choices']
+ ), 'predicted answer is not one of the multiple choices'
+ qaAnn = self.qa[quesId]
+ ann['image_id'] = qaAnn['image_id']
+ ann['question_type'] = qaAnn['question_type']
+ ann['answer_type'] = qaAnn['answer_type']
+ print('DONE (t=%0.2fs)' %
+ ((datetime.datetime.utcnow() - time_t).total_seconds()))
+
+ res.dataset['annotations'] = anns
+ res.createIndex()
+ return res
diff --git a/eval_mm/vqa_eval.py b/eval_mm/vqa_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..1329ae13cd7f3857a839c95462118738e61b0d6d
--- /dev/null
+++ b/eval_mm/vqa_eval.py
@@ -0,0 +1,330 @@
+"""Copyright (c) 2022, salesforce.com, inc.
+
+All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+# coding=utf-8
+
+__author__ = 'aagrawal'
+
+import re
+# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link:
+# (https://github.com/tylin/coco-caption/blob/master/pycocoevalcap/eval.py).
+import sys
+
+
+class VQAEval:
+
+ def __init__(self, vqa=None, vqaRes=None, n=2):
+ self.n = n
+ self.accuracy = {}
+ self.evalQA = {}
+ self.evalQuesType = {}
+ self.evalAnsType = {}
+ self.vqa = vqa
+ self.vqaRes = vqaRes
+ if vqa is not None:
+ self.params = {'question_id': vqa.getQuesIds()}
+ self.contractions = {
+ 'aint': "ain't",
+ 'arent': "aren't",
+ 'cant': "can't",
+ 'couldve': "could've",
+ 'couldnt': "couldn't",
+ "couldn'tve": "couldn't've",
+ "couldnt've": "couldn't've",
+ 'didnt': "didn't",
+ 'doesnt': "doesn't",
+ 'dont': "don't",
+ 'hadnt': "hadn't",
+ "hadnt've": "hadn't've",
+ "hadn'tve": "hadn't've",
+ 'hasnt': "hasn't",
+ 'havent': "haven't",
+ 'hed': "he'd",
+ "hed've": "he'd've",
+ "he'dve": "he'd've",
+ 'hes': "he's",
+ 'howd': "how'd",
+ 'howll': "how'll",
+ 'hows': "how's",
+ "Id've": "I'd've",
+ "I'dve": "I'd've",
+ 'Im': "I'm",
+ 'Ive': "I've",
+ 'isnt': "isn't",
+ 'itd': "it'd",
+ "itd've": "it'd've",
+ "it'dve": "it'd've",
+ 'itll': "it'll",
+ "let's": "let's",
+ 'maam': "ma'am",
+ 'mightnt': "mightn't",
+ "mightnt've": "mightn't've",
+ "mightn'tve": "mightn't've",
+ 'mightve': "might've",
+ 'mustnt': "mustn't",
+ 'mustve': "must've",
+ 'neednt': "needn't",
+ 'notve': "not've",
+ 'oclock': "o'clock",
+ 'oughtnt': "oughtn't",
+ "ow's'at": "'ow's'at",
+ "'ows'at": "'ow's'at",
+ "'ow'sat": "'ow's'at",
+ 'shant': "shan't",
+ "shed've": "she'd've",
+ "she'dve": "she'd've",
+ "she's": "she's",
+ 'shouldve': "should've",
+ 'shouldnt': "shouldn't",
+ "shouldnt've": "shouldn't've",
+ "shouldn'tve": "shouldn't've",
+ "somebody'd": 'somebodyd',
+ "somebodyd've": "somebody'd've",
+ "somebody'dve": "somebody'd've",
+ 'somebodyll': "somebody'll",
+ 'somebodys': "somebody's",
+ 'someoned': "someone'd",
+ "someoned've": "someone'd've",
+ "someone'dve": "someone'd've",
+ 'someonell': "someone'll",
+ 'someones': "someone's",
+ 'somethingd': "something'd",
+ "somethingd've": "something'd've",
+ "something'dve": "something'd've",
+ 'somethingll': "something'll",
+ 'thats': "that's",
+ 'thered': "there'd",
+ "thered've": "there'd've",
+ "there'dve": "there'd've",
+ 'therere': "there're",
+ 'theres': "there's",
+ 'theyd': "they'd",
+ "theyd've": "they'd've",
+ "they'dve": "they'd've",
+ 'theyll': "they'll",
+ 'theyre': "they're",
+ 'theyve': "they've",
+ 'twas': "'twas",
+ 'wasnt': "wasn't",
+ "wed've": "we'd've",
+ "we'dve": "we'd've",
+ 'weve': "we've",
+ 'werent': "weren't",
+ 'whatll': "what'll",
+ 'whatre': "what're",
+ 'whats': "what's",
+ 'whatve': "what've",
+ 'whens': "when's",
+ 'whered': "where'd",
+ 'wheres': "where's",
+ 'whereve': "where've",
+ 'whod': "who'd",
+ "whod've": "who'd've",
+ "who'dve": "who'd've",
+ 'wholl': "who'll",
+ 'whos': "who's",
+ 'whove': "who've",
+ 'whyll': "why'll",
+ 'whyre': "why're",
+ 'whys': "why's",
+ 'wont': "won't",
+ 'wouldve': "would've",
+ 'wouldnt': "wouldn't",
+ "wouldnt've": "wouldn't've",
+ "wouldn'tve": "wouldn't've",
+ 'yall': "y'all",
+ "yall'll": "y'all'll",
+ "y'allll": "y'all'll",
+ "yall'd've": "y'all'd've",
+ "y'alld've": "y'all'd've",
+ "y'all'dve": "y'all'd've",
+ 'youd': "you'd",
+ "youd've": "you'd've",
+ "you'dve": "you'd've",
+ 'youll': "you'll",
+ 'youre': "you're",
+ 'youve': "you've",
+ }
+ self.manualMap = {
+ 'none': '0',
+ 'zero': '0',
+ 'one': '1',
+ 'two': '2',
+ 'three': '3',
+ 'four': '4',
+ 'five': '5',
+ 'six': '6',
+ 'seven': '7',
+ 'eight': '8',
+ 'nine': '9',
+ 'ten': '10',
+ }
+ self.articles = ['a', 'an', 'the']
+
+ self.periodStrip = re.compile('(?!<=\d)(\.)(?!\d)')
+ self.commaStrip = re.compile('(\d)(,)(\d)')
+ self.punct = [
+ ';',
+ r'/',
+ '[',
+ ']',
+ '"',
+ '{',
+ '}',
+ '(',
+ ')',
+ '=',
+ '+',
+ '\\',
+ '_',
+ '-',
+ '>',
+ '<',
+ '@',
+ '`',
+ ',',
+ '?',
+ '!',
+ ]
+
+ def evaluate(self, quesIds=None):
+ if quesIds == None:
+ quesIds = [quesId for quesId in self.params['question_id']]
+ gts = {}
+ res = {}
+ for quesId in quesIds:
+ gts[quesId] = self.vqa.qa[quesId]
+ res[quesId] = self.vqaRes.qa[quesId]
+
+ # =================================================
+ # Compute accuracy
+ # =================================================
+ accQA = []
+ accQuesType = {}
+ accAnsType = {}
+ print('computing accuracy')
+ step = 0
+ for quesId in quesIds:
+ resAns = res[quesId]['answer']
+ resAns = resAns.replace('\n', ' ')
+ resAns = resAns.replace('\t', ' ')
+ resAns = resAns.strip()
+ resAns = self.processPunctuation(resAns)
+ resAns = self.processDigitArticle(resAns)
+ gtAcc = []
+ gtAnswers = [ans['answer'] for ans in gts[quesId]['answers']]
+ if len(set(gtAnswers)) > 1:
+ for ansDic in gts[quesId]['answers']:
+ ansDic['answer'] = self.processPunctuation(
+ ansDic['answer'])
+ for gtAnsDatum in gts[quesId]['answers']:
+ otherGTAns = [
+ item for item in gts[quesId]['answers']
+ if item != gtAnsDatum
+ ]
+ matchingAns = [
+ item for item in otherGTAns if item['answer'] == resAns
+ ]
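+                # Standard VQA accuracy: an answer is credited min(#matching human answers / 3, 1),
+                # averaged over the leave-one-out subsets of the 10 ground-truth answers.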
+ acc = min(1, float(len(matchingAns)) / 3)
+ gtAcc.append(acc)
+ quesType = gts[quesId]['question_type']
+ ansType = gts[quesId]['answer_type']
+ avgGTAcc = float(sum(gtAcc)) / len(gtAcc)
+ accQA.append(avgGTAcc)
+ if quesType not in accQuesType:
+ accQuesType[quesType] = []
+ accQuesType[quesType].append(avgGTAcc)
+ if ansType not in accAnsType:
+ accAnsType[ansType] = []
+ accAnsType[ansType].append(avgGTAcc)
+ self.setEvalQA(quesId, avgGTAcc)
+ self.setEvalQuesType(quesId, quesType, avgGTAcc)
+ self.setEvalAnsType(quesId, ansType, avgGTAcc)
+ if step % 100 == 0:
+ self.updateProgress(step / float(len(quesIds)))
+ step = step + 1
+
+ self.setAccuracy(accQA, accQuesType, accAnsType)
+ print('Done computing accuracy')
+
+ def processPunctuation(self, inText):
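+        # Punctuation normalization from the official VQA evaluation script: depending on
+        # context each punctuation mark is deleted or replaced with a space, and periods
+        # not followed by a digit are stripped afterwards.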
+ outText = inText
+ for p in self.punct:
+            if (p + ' ' in inText or ' ' + p in inText) \
+                    or (re.search(self.commaStrip, inText) is not None):
+ outText = outText.replace(p, '')
+ else:
+ outText = outText.replace(p, ' ')
+        outText = self.periodStrip.sub('', outText)
+ return outText
+
+ def processDigitArticle(self, inText):
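+        # Map spelled-out numbers to digits ("two" -> "2"), drop articles ("a", "an",
+        # "the"), and restore apostrophes in common contractions ("dont" -> "don't").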
+ outText = []
+ tempText = inText.lower().split()
+ for word in tempText:
+ word = self.manualMap.setdefault(word, word)
+ if word not in self.articles:
+ outText.append(word)
+ else:
+ pass
+ for wordId, word in enumerate(outText):
+ if word in self.contractions:
+ outText[wordId] = self.contractions[word]
+ outText = ' '.join(outText)
+ return outText
+
+ def setAccuracy(self, accQA, accQuesType, accAnsType):
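+        # Aggregate overall, per-question-type, and per-answer-type accuracies as
+        # percentages rounded to self.n decimal places.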
+ self.accuracy['overall'] = round(100 * float(sum(accQA)) / len(accQA),
+ self.n)
+ self.accuracy['perQuestionType'] = {
+ quesType: round(
+ 100 * float(sum(accQuesType[quesType])) /
+ len(accQuesType[quesType]),
+ self.n,
+ )
+ for quesType in accQuesType
+ }
+ self.accuracy['perAnswerType'] = {
+ ansType: round(
+ 100 * float(sum(accAnsType[ansType])) /
+ len(accAnsType[ansType]), self.n)
+ for ansType in accAnsType
+ }
+
+ def setEvalQA(self, quesId, acc):
+ self.evalQA[quesId] = round(100 * acc, self.n)
+
+ def setEvalQuesType(self, quesId, quesType, acc):
+ if quesType not in self.evalQuesType:
+ self.evalQuesType[quesType] = {}
+ self.evalQuesType[quesType][quesId] = round(100 * acc, self.n)
+
+ def setEvalAnsType(self, quesId, ansType, acc):
+ if ansType not in self.evalAnsType:
+ self.evalAnsType[ansType] = {}
+ self.evalAnsType[ansType][quesId] = round(100 * acc, self.n)
+
+ def updateProgress(self, progress):
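+        # Render a simple 20-character text progress bar on stdout.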
+ barLength = 20
+ status = ''
+ if isinstance(progress, int):
+ progress = float(progress)
+ if not isinstance(progress, float):
+ progress = 0
+ status = 'error: progress var must be float\r\n'
+ if progress < 0:
+ progress = 0
+ status = 'Halt...\r\n'
+ if progress >= 1:
+ progress = 1
+ status = 'Done...\r\n'
+ block = int(round(barLength * progress))
+        text = '\rFinished Percent: [{0}] {1}% {2}'.format(
+ '#' * block + '-' * (barLength - block), int(progress * 100),
+ status)
+ sys.stdout.write(text)
+ sys.stdout.flush()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7f70a086db405864f510df8823e8cf250a9b2919
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,10 @@
+transformers==4.31.0
+accelerate
+tiktoken
+einops
+transformers_stream_generator==0.0.4
+scipy
+torchvision
+pillow
+tensorboard
+matplotlib
diff --git a/requirements_web_demo.txt b/requirements_web_demo.txt
new file mode 100644
index 0000000000000000000000000000000000000000..25aceddaba2623925a4c9f20f2bb00c4282b4db7
--- /dev/null
+++ b/requirements_web_demo.txt
@@ -0,0 +1 @@
+gradio
diff --git a/test.py b/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..00ca9a06bc9ef7923ea4da7375fd282cf08892bd
--- /dev/null
+++ b/test.py
@@ -0,0 +1,38 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.generation import GenerationConfig
+import torch
+torch.manual_seed(1234)
+
+# Note: The default behavior now has injection attack prevention off.
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat", trust_remote_code=True)
+
+# use bf16
+# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL-Chat", device_map="auto", trust_remote_code=True, bf16=True).eval()
+# use fp16
+# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL-Chat", device_map="auto", trust_remote_code=True, fp16=True).eval()
+# use cpu only
+# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL-Chat", device_map="cpu", trust_remote_code=True).eval()
+# use cuda device
+model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL-Chat", device_map="cuda", trust_remote_code=True).eval()
+
+# Specify hyperparameters for generation
+model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-VL-Chat", trust_remote_code=True)
+
+# 1st dialogue turn
+query = tokenizer.from_list_format([
+    {'image': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'},  # Either a local path or a URL
+    {'text': '这是什么?'},  # "What is this?"
+])
+response, history = model.chat(tokenizer, query=query, history=None)
+print(response)
+# 图中是一名女子在沙滩上和狗玩耍,旁边是一只拉布拉多犬,它们处于沙滩上。
+# ("The image shows a woman playing with a dog on the beach; the dog beside her is a Labrador, and they are on the beach.")
+
+# 2nd dialogue turn
+response, history = model.chat(tokenizer, '框出图中击掌的位置', history=history)  # "Box the high-five in the image"
+print(response)
+# 击掌 ("high five")
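+
+# Optionally visualize the grounding result. draw_bbox_on_latest_picture is the same
+# tokenizer helper used by the web demo; it returns None when no box is found in the response.
+image = tokenizer.draw_bbox_on_latest_picture(response, history)
+if image is not None:
+    image.save('1.jpg')
+else:
+    print("no box")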
+
+
+
+
+
+
+**TOUCHSTONE** is a comprehensive assessment of multimodal language models, covering not only basic recognition and comprehension but also literary creation. By converting multimodal information into text via human annotation, TouchStone automates the evaluation of dialogue quality, leveraging the power of advanced language models as judges without the need for manual scoring.
+
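+As a rough, hypothetical sketch of this idea (not the actual TouchStone prompts or scripts), the text handed to the judge model for a single item could be assembled like this:
+
+```python
+# Hypothetical sketch: the judge model never sees the image itself; a fine-grained
+# human annotation of the image stands in for it. Field names and wording here are
+# illustrative, not the real TouchStone prompt.
+def build_judge_prompt(annotation: str, question: str, answer_a: str, answer_b: str) -> str:
+    return (
+        "You are judging two assistants answering a question about an image.\n"
+        f"[Human image annotation] {annotation}\n"
+        f"[Question] {question}\n"
+        f"[Assistant A] {answer_a}\n"
+        f"[Assistant B] {answer_b}\n"
+        "Score each assistant from 1 to 10 and briefly justify the scores."
+    )
+```
+
+Because everything the judge sees is plain text, any sufficiently strong text-only language model can act as the grader.
+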
+## DATASET
+
+To evaluate the abilities of LVLMs, we construct a diverse and comprehensive dataset that covers five key dimensions: basic descriptive ability, visual recognition ability, visual comprehension ability, visual storytelling ability, and multi-image analysis ability.
+
+- **Basic Descriptive Ability** Image description involves the ability of a model to describe the information contained in an image, including simple and detailed descriptions. Simple descriptions are typically short phrases that describe the main subject and action of the image, while detailed descriptions provide more in-depth information about the image scene, the objects' attributes, and their relationships.
+
+- **Visual Recognition Ability** Image recognition is the task of recognizing objects or scenes within an image and inferring relevant information. This area can be further divided into several sub-tasks, including attribute QA, movie/TV recognition, art recognition, landmark recognition, celebrity recognition, emotion recognition, text recognition, object recognition, and structured content recognition.
+
+- **Visual Comprehension Ability** Image understanding involves the ability of a model to understand the meaning of an image and associated tasks. This area encompasses several sub-tasks, such as style appreciation, abstract image understanding, meme understanding, image analysis, chart analysis, general problem-solving, and reasoning QA.
+
+- **Visual Storytelling Ability** Visual storytelling is literary creation based on visual content, including writing emails, poetry, stories, ads/commodity recommendations, and brainstorming.
+
+- **Multi-Image Analysis Ability** Multi-image analysis is the task of analyzing and comparing multiple images. This area includes tasks such as comparing two/multiple images, summarizing multiple image information, comparing commodities, and step-by-step analysis of images.
+
+
+
+
+
+
+
+
+
+
+
+**TOUCHSTONE** 是一种针对多模态语言模型(LVLM)的自动化综合评估方法,评估不仅包括基本的认知和理解,还延伸到文学创作。通过人类注解将多模态信息转换为文本,我们的 TouchStone 可以利用SOTA的语言模型来自动化地完成对LVLMs的多模态对话质量评估。
+
+## 数据集
+
+为了评估 LVLMs 的能力,我们构建了一个多样化且全面的数据集,涵盖五个关键维度:基本描述能力、视觉识别能力、视觉理解能力、视觉叙事能力和多图分析能力。
+
+- **基本描述能力** 图像描述考验模型总结图片信息的能力,包括简单描述和详细描述。 简单描述通常是描述图像的主要内容和关系的简短短语,而详细描述则提供有关图像场景、其属性和关系的更深入的信息。
+
+- **视觉识别能力** 图像识别考察模型提取图像中内容的属性以及关联到知识库的能力。为了考察这方面能力,测试的问题包括属性QA、影视识别、艺术识别、地标识别、名人识别、情感识别、文本识别、物体识别和结构内容识别。
+
+- **视觉理解能力** 图像理解需要模型理解图像内容并完成推理进行相关任务。 这方面包含了例如风格欣赏、抽象图像理解、模因理解、图像分析、图表分析、一般问题解决和推理问答等任务。
+
+- **视觉叙事能力** 视觉叙事能力是基于视觉内容的文学创作能力,包括撰写电子邮件、诗歌、故事、广告/商品推荐、头脑风暴等。
+
+- **多图分析能力** 多图分析是分析和比较多幅图像的任务。该领域包括比较两个/多个图像、总结多个图像信息、比较商品以及逐步分析图像等任务。
+
+
+
+
"
+ else:
+ if i > 0:
+ if count % 2 == 1:
+ line = line.replace("`", r"\`")
+ line = line.replace("<", "<")
+ line = line.replace(">", ">")
+ line = line.replace(" ", " ")
+ line = line.replace("*", "*")
+ line = line.replace("_", "_")
+ line = line.replace("-", "-")
+ line = line.replace(".", ".")
+ line = line.replace("!", "!")
+ line = line.replace("(", "(")
+ line = line.replace(")", ")")
+ line = line.replace("$", "$")
+ lines[i] = "'
+ else:
+ lines[i] = f"
" + line
+ text = "".join(lines)
+ return text
+
+
+def _launch_demo(args, model, tokenizer):
+ uploaded_file_dir = os.environ.get("GRADIO_TEMP_DIR") or str(
+ Path(tempfile.gettempdir()) / "gradio"
+ )
+
+ def predict(_chatbot, task_history):
+ query = task_history[-1][0]
+ print("User: " + _parse_text(query))
+ history_cp = copy.deepcopy(task_history)
+ full_response = ""
+
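+        # Flatten the UI history into the prompt format model.chat expects:
+        # each uploaded image becomes a "Picture N: <img>path</img>" line that is
+        # prepended to the text query that follows it.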
+ history_filter = []
+ pic_idx = 1
+ pre = ""
+ for i, (q, a) in enumerate(history_cp):
+ if isinstance(q, (tuple, list)):
+                q = f'Picture {pic_idx}: <img>{q[0]}</img>'
+ pre += q + '\n'
+ pic_idx += 1
+ else:
+ pre += q
+ history_filter.append((pre, a))
+ pre = ""
+ history, message = history_filter[:-1], history_filter[-1][0]
+ response, history = model.chat(tokenizer, message, history=history)
+ image = tokenizer.draw_bbox_on_latest_picture(response, history)
+ if image is not None:
+ temp_dir = secrets.token_hex(20)
+ temp_dir = Path(uploaded_file_dir) / temp_dir
+ temp_dir.mkdir(exist_ok=True, parents=True)
+ name = f"tmp{secrets.token_hex(5)}.jpg"
+ filename = temp_dir / name
+ image.save(str(filename))
+ _chatbot[-1] = (_parse_text(query), (str(filename),))
+            chat_response = response.replace("<ref>", "")
+            chat_response = chat_response.replace(r"</ref>", "")
+ chat_response = re.sub(BOX_TAG_PATTERN, "", chat_response)
+ if chat_response != "":
+ _chatbot.append((None, chat_response))
+ else:
+ _chatbot[-1] = (_parse_text(query), response)
+ full_response = _parse_text(response)
+
+ task_history[-1] = (query, full_response)
+ print("Qwen-VL-Chat: " + _parse_text(full_response))
+ return _chatbot
+
+ def regenerate(_chatbot, task_history):
+ if not task_history:
+ return _chatbot
+ item = task_history[-1]
+ if item[1] is None:
+ return _chatbot
+ task_history[-1] = (item[0], None)
+ chatbot_item = _chatbot.pop(-1)
+ if chatbot_item[0] is None:
+ _chatbot[-1] = (_chatbot[-1][0], None)
+ else:
+ _chatbot.append((chatbot_item[0], None))
+ return predict(_chatbot, task_history)
+
+ def add_text(history, task_history, text):
+ history = history + [(_parse_text(text), None)]
+ task_history = task_history + [(text, None)]
+ return history, task_history, ""
+
+ def add_file(history, task_history, file):
+ history = history + [((file.name,), None)]
+ task_history = task_history + [((file.name,), None)]
+ return history, task_history
+
+ def reset_user_input():
+ return gr.update(value="")
+
+ def reset_state(task_history):
+ task_history.clear()
+ return []
+
+ with gr.Blocks() as demo:
+ gr.Markdown("""\
+