hanjiaming.0208 committed on
Commit
2e41e44
·
1 Parent(s): 62dccb6
README.md CHANGED
@@ -12,4 +12,35 @@ short_description: Unified MLLM with Text-Aligned Representations
12
  license: apache-2.0
13
  ---
14
 
15
- # Tar: Unifying Visual Understanding and Generation via Text-Aligned Representations
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  license: apache-2.0
13
  ---
14
 
15
+ ### Unifying Visual Understanding and Generation via Text-Aligned Representations
16
+ > [Jiaming Han](https://csuhan.com), [Hao Chen](https://haochen-rye.github.io)<sup>†</sup>, [Yang Zhao](https://scholar.google.com/citations?user=uPmTOHAAAAAJ&hl=zh-CN), [Hanyu Wang](https://hywang66.github.io), [Qi Zhao](https://kevinz8866.github.io), [Ziyan Yang](https://ziyanyang.github.io), [Hao He](https://hehao13.github.io), [Xiangyu Yue](https://xyue.io)<sup>‡</sup>, [Lu Jiang](https://www.lujiang.info)<sup>‡</sup>
17
+ >
18
+ > <sup>†</sup> Project Lead&nbsp;&nbsp;<sup>‡</sup> Corresponding Authors
19
+
20
+ <a href="https://tar.csuhan.com">
21
+ <img
22
+ src="https://img.shields.io/badge/Project-Page-0A66C2?logo=chromewebstore&logoColor=0A66C2"
23
+ alt="Project Page"
24
+ />
25
+ </a>
26
+ <a href="https://arxiv.org/abs/2506.18898">
27
+ <img
28
+ src="https://img.shields.io/badge/arXiv-Paper-red?logo=arxiv&logoColor=red"
29
+ alt="Tar Paper on arXiv"
30
+ />
31
+ </a>
32
+
33
+
34
+ ### Citation
35
+ ```
36
+ @article{han2025tar,
37
+ title={Vision as a Dialect: Unifying Visual Understanding and Generation via Text-Aligned Representations},
38
+ author={Han, Jiaming and Chen, Hao and Zhao, Yang and Wang, Hanyu and Zhao, Qi and Yang, Ziyan and He, Hao and Yue, Xiangyu and Jiang, Lu},
39
+ journal={arXiv preprint arXiv:2506.18898},
40
+ year={2025},
41
+ }
42
+ ```
43
+
44
+ ### License
45
+ This project is licensed under the Apache 2.0 License.
46
+
app.py CHANGED
@@ -1,3 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import gradio as gr
3
  from torchvision.transforms.functional import to_tensor
@@ -30,12 +44,12 @@ def generate_text(self, image: str, prompt: str) -> str:
30
 
31
  login(token=os.getenv('HF_TOKEN'))
32
  config = T2IConfig()
33
- config.model = snapshot_download("csuhan/Tar-7B-v0.1")
34
  config.ar_path = {
35
- "1024px": hf_hub_download("csuhan/TA-Tok", "ar_dtok_lp_1024px.pth"),
36
- "512px": hf_hub_download("csuhan/TA-Tok", "ar_dtok_lp_512px.pth"),
37
  }
38
- config.encoder_path = hf_hub_download("csuhan/TA-Tok", "ta_tok.pth")
39
  config.decoder_path = hf_hub_download("peizesun/llamagen_t2i", "vq_ds16_t2i.pt")
40
  inference = TextToImageInference(config)
41
 
@@ -61,7 +75,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
61
 
62
  ### Tar: Unifying Visual Understanding and Generation via Text-Aligned Representations
63
 
64
- [🕸️ Project Page](http://tar.csuhan.com) • [📄 Paper](http://arxiv.org/abs/2506.18898) • [💻 Code](https://github.com/csuhan/Tar) • [📦 Model](https://huggingface.co/collections/csuhan/tar-68538273b5537d0bee712648)
65
 
66
  </div>
67
  """,
 
1
+ # // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
2
+ # //
3
+ # // Licensed under the Apache License, Version 2.0 (the "License");
4
+ # // you may not use this file except in compliance with the License.
5
+ # // You may obtain a copy of the License at
6
+ # //
7
+ # // http://www.apache.org/licenses/LICENSE-2.0
8
+ # //
9
+ # // Unless required by applicable law or agreed to in writing, software
10
+ # // distributed under the License is distributed on an "AS IS" BASIS,
11
+ # // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # // See the License for the specific language governing permissions and
13
+ # // limitations under the License.
14
+
15
  import os
16
  import gradio as gr
17
  from torchvision.transforms.functional import to_tensor
 
44
 
45
  login(token=os.getenv('HF_TOKEN'))
46
  config = T2IConfig()
47
+ config.model = snapshot_download("ByteDance-Seed/Tar-7B")
48
  config.ar_path = {
49
+ "1024px": hf_hub_download("ByteDance-Seed/Tar-TA-Tok", "ar_dtok_lp_1024px.pth"),
50
+ "512px": hf_hub_download("ByteDance-Seed/Tar-TA-Tok", "ar_dtok_lp_512px.pth"),
51
  }
52
+ config.encoder_path = hf_hub_download("ByteDance-Seed/Tar-TA-Tok", "ta_tok.pth")
53
  config.decoder_path = hf_hub_download("peizesun/llamagen_t2i", "vq_ds16_t2i.pt")
54
  inference = TextToImageInference(config)
55
 
 
75
 
76
  ### Tar: Unifying Visual Understanding and Generation via Text-Aligned Representations
77
 
78
+ [🕸️ Project Page](http://tar.csuhan.com) • [📄 Paper](http://arxiv.org/abs/2506.18898) • [💻 Code](https://github.com/csuhan/Tar) • [📦 Model](https://huggingface.co/collections/ByteDance-Seed/tar-6864cf0d9fe59a3b91cc4260)
79
 
80
  </div>
81
  """,
t2i_inference.py CHANGED
@@ -1,3 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import re
2
  from dataclasses import dataclass
3
 
@@ -11,7 +25,7 @@ from tok.mm_autoencoder import MMAutoEncoder
11
 
12
  @dataclass
13
  class T2IConfig:
14
- model_path: str = "csuhan/Tar-1.5B"
15
  # visual tokenizer config
16
  ar_path = None
17
  encoder_path: str = 'ta_tok.pth'
 
1
+ # // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
2
+ # //
3
+ # // Licensed under the Apache License, Version 2.0 (the "License");
4
+ # // you may not use this file except in compliance with the License.
5
+ # // You may obtain a copy of the License at
6
+ # //
7
+ # // http://www.apache.org/licenses/LICENSE-2.0
8
+ # //
9
+ # // Unless required by applicable law or agreed to in writing, software
10
+ # // distributed under the License is distributed on an "AS IS" BASIS,
11
+ # // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # // See the License for the specific language governing permissions and
13
+ # // limitations under the License.
14
+
15
  import re
16
  from dataclasses import dataclass
17
 
 
25
 
26
  @dataclass
27
  class T2IConfig:
28
+ model_path: str = "ByteDance-Seed/Tar-1.5B"
29
  # visual tokenizer config
30
  ar_path = None
31
  encoder_path: str = 'ta_tok.pth'
tok/__init__.py CHANGED
@@ -1 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from .ar_dtok import *
 
1
+ # // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
2
+ # //
3
+ # // Licensed under the Apache License, Version 2.0 (the "License");
4
+ # // you may not use this file except in compliance with the License.
5
+ # // You may obtain a copy of the License at
6
+ # //
7
+ # // http://www.apache.org/licenses/LICENSE-2.0
8
+ # //
9
+ # // Unless required by applicable law or agreed to in writing, software
10
+ # // distributed under the License is distributed on an "AS IS" BASIS,
11
+ # // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # // See the License for the specific language governing permissions and
13
+ # // limitations under the License.
14
+
15
  from .ar_dtok import *
tok/ar_dtok/__init__.py CHANGED
@@ -1,2 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from .bottleneck import Bottleneck, SimVectorQuantizer
2
  from .vqvae import VQVAE
 
1
+ # // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
2
+ # //
3
+ # // Licensed under the Apache License, Version 2.0 (the "License");
4
+ # // you may not use this file except in compliance with the License.
5
+ # // You may obtain a copy of the License at
6
+ # //
7
+ # // http://www.apache.org/licenses/LICENSE-2.0
8
+ # //
9
+ # // Unless required by applicable law or agreed to in writing, software
10
+ # // distributed under the License is distributed on an "AS IS" BASIS,
11
+ # // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # // See the License for the specific language governing permissions and
13
+ # // limitations under the License.
14
+
15
  from .bottleneck import Bottleneck, SimVectorQuantizer
16
  from .vqvae import VQVAE
tok/ar_dtok/ar_model.py CHANGED
@@ -1,3 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  from contextlib import contextmanager
3
  from dataclasses import dataclass
 
1
+ # // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
2
+ # //
3
+ # // Licensed under the Apache License, Version 2.0 (the "License");
4
+ # // you may not use this file except in compliance with the License.
5
+ # // You may obtain a copy of the License at
6
+ # //
7
+ # // http://www.apache.org/licenses/LICENSE-2.0
8
+ # //
9
+ # // Unless required by applicable law or agreed to in writing, software
10
+ # // distributed under the License is distributed on an "AS IS" BASIS,
11
+ # // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # // See the License for the specific language governing permissions and
13
+ # // limitations under the License.
14
+
15
  import os
16
  from contextlib import contextmanager
17
  from dataclasses import dataclass
tok/ar_dtok/bottleneck.py CHANGED
@@ -1,3 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import torch
2
  import torch.nn as nn
3
  import torch.nn.functional as F
 
1
+ # // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
2
+ # //
3
+ # // Licensed under the Apache License, Version 2.0 (the "License");
4
+ # // you may not use this file except in compliance with the License.
5
+ # // You may obtain a copy of the License at
6
+ # //
7
+ # // http://www.apache.org/licenses/LICENSE-2.0
8
+ # //
9
+ # // Unless required by applicable law or agreed to in writing, software
10
+ # // distributed under the License is distributed on an "AS IS" BASIS,
11
+ # // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # // See the License for the specific language governing permissions and
13
+ # // limitations under the License.
14
+
15
  import torch
16
  import torch.nn as nn
17
  import torch.nn.functional as F
tok/ar_dtok/generate.py CHANGED
@@ -1,3 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Modified from:
2
  # llamagen: https://github.com/FoundationVision/LlamaGen/blob/main/autoregressive/models/generate.py
3
  # gpt-fast: https://github.com/pytorch-labs/gpt-fast/blob/main/generate.py
 
1
+ # // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
2
+ # //
3
+ # // Licensed under the Apache License, Version 2.0 (the "License");
4
+ # // you may not use this file except in compliance with the License.
5
+ # // You may obtain a copy of the License at
6
+ # //
7
+ # // http://www.apache.org/licenses/LICENSE-2.0
8
+ # //
9
+ # // Unless required by applicable law or agreed to in writing, software
10
+ # // distributed under the License is distributed on an "AS IS" BASIS,
11
+ # // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # // See the License for the specific language governing permissions and
13
+ # // limitations under the License.
14
+
15
  # Modified from:
16
  # llamagen: https://github.com/FoundationVision/LlamaGen/blob/main/autoregressive/models/generate.py
17
  # gpt-fast: https://github.com/pytorch-labs/gpt-fast/blob/main/generate.py
tok/ar_dtok/vqvae.py CHANGED
@@ -1,3 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from dataclasses import dataclass, field
2
  from typing import List
3
 
 
1
+ # // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
2
+ # //
3
+ # // Licensed under the Apache License, Version 2.0 (the "License");
4
+ # // you may not use this file except in compliance with the License.
5
+ # // You may obtain a copy of the License at
6
+ # //
7
+ # // http://www.apache.org/licenses/LICENSE-2.0
8
+ # //
9
+ # // Unless required by applicable law or agreed to in writing, software
10
+ # // distributed under the License is distributed on an "AS IS" BASIS,
11
+ # // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # // See the License for the specific language governing permissions and
13
+ # // limitations under the License.
14
+
15
  from dataclasses import dataclass, field
16
  from typing import List
17
 
tok/mm_autoencoder.py CHANGED
@@ -1,3 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import torch
2
  import torch.nn as nn
3
 
 
1
+ # // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
2
+ # //
3
+ # // Licensed under the Apache License, Version 2.0 (the "License");
4
+ # // you may not use this file except in compliance with the License.
5
+ # // You may obtain a copy of the License at
6
+ # //
7
+ # // http://www.apache.org/licenses/LICENSE-2.0
8
+ # //
9
+ # // Unless required by applicable law or agreed to in writing, software
10
+ # // distributed under the License is distributed on an "AS IS" BASIS,
11
+ # // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # // See the License for the specific language governing permissions and
13
+ # // limitations under the License.
14
+
15
  import torch
16
  import torch.nn as nn
17
 
tok/models.py CHANGED
@@ -1,3 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import copy
2
  import inspect
3
 
 
1
+ # // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
2
+ # //
3
+ # // Licensed under the Apache License, Version 2.0 (the "License");
4
+ # // you may not use this file except in compliance with the License.
5
+ # // You may obtain a copy of the License at
6
+ # //
7
+ # // http://www.apache.org/licenses/LICENSE-2.0
8
+ # //
9
+ # // Unless required by applicable law or agreed to in writing, software
10
+ # // distributed under the License is distributed on an "AS IS" BASIS,
11
+ # // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # // See the License for the specific language governing permissions and
13
+ # // limitations under the License.
14
+
15
  import copy
16
  import inspect
17
 
tok/ta_tok.py CHANGED
@@ -1,3 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import torch
2
  import torch.nn as nn
3
  import torch.nn.functional as F
 
1
+ # // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
2
+ # //
3
+ # // Licensed under the Apache License, Version 2.0 (the "License");
4
+ # // you may not use this file except in compliance with the License.
5
+ # // You may obtain a copy of the License at
6
+ # //
7
+ # // http://www.apache.org/licenses/LICENSE-2.0
8
+ # //
9
+ # // Unless required by applicable law or agreed to in writing, software
10
+ # // distributed under the License is distributed on an "AS IS" BASIS,
11
+ # // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # // See the License for the specific language governing permissions and
13
+ # // limitations under the License.
14
+
15
  import torch
16
  import torch.nn as nn
17
  import torch.nn.functional as F
tok/utils.py CHANGED
@@ -1,3 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import torch
2
  import torch.nn as nn
3
 
 
1
+ # // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
2
+ # //
3
+ # // Licensed under the Apache License, Version 2.0 (the "License");
4
+ # // you may not use this file except in compliance with the License.
5
+ # // You may obtain a copy of the License at
6
+ # //
7
+ # // http://www.apache.org/licenses/LICENSE-2.0
8
+ # //
9
+ # // Unless required by applicable law or agreed to in writing, software
10
+ # // distributed under the License is distributed on an "AS IS" BASIS,
11
+ # // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # // See the License for the specific language governing permissions and
13
+ # // limitations under the License.
14
+
15
  import torch
16
  import torch.nn as nn
17