Spaces:
Running
on
Zero
Running
on
Zero
hanjiaming.0208
committed on
Commit
·
2e41e44
1
Parent(s):
62dccb6
init
Browse files- README.md +32 -1
- app.py +19 -5
- t2i_inference.py +15 -1
- tok/__init__.py +14 -0
- tok/ar_dtok/__init__.py +14 -0
- tok/ar_dtok/ar_model.py +14 -0
- tok/ar_dtok/bottleneck.py +14 -0
- tok/ar_dtok/generate.py +14 -0
- tok/ar_dtok/vqvae.py +14 -0
- tok/mm_autoencoder.py +14 -0
- tok/models.py +14 -0
- tok/ta_tok.py +14 -0
- tok/utils.py +14 -0
README.md
CHANGED
@@ -12,4 +12,35 @@ short_description: Unified MLLM with Text-Aligned Representations
|
|
12 |
license: apache-2.0
|
13 |
---
|
14 |
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
license: apache-2.0
|
13 |
---
|
14 |
|
15 |
+
### Unifying Visual Understanding and Generation via Text-Aligned Representations
|
16 |
+
> [Jiaming Han](https://csuhan.com), [Hao Chen](https://haochen-rye.github.io)<sup>†</sup>, [Yang Zhao](https://scholar.google.com/citations?user=uPmTOHAAAAAJ&hl=zh-CN), [Hanyu Wang](https://hywang66.github.io), [Qi Zhao](https://kevinz8866.github.io), [Ziyan Yang](https://ziyanyang.github.io), [Hao He](https://hehao13.github.io), [Xiangyu Yue](https://xyue.io)<sup>‡</sup>, [Lu Jiang](https://www.lujiang.info)<sup>‡</sup>
|
17 |
+
>
|
18 |
+
> <sup>†</sup> Project Lead <sup>‡</sup> Corresponding Authors
|
19 |
+
|
20 |
+
<a href="https://tar.csuhan.com">
|
21 |
+
<img
|
22 |
+
src="https://img.shields.io/badge/Project-Page-0A66C2?logo=chromewebstore&logoColor=0A66C2"
|
23 |
+
alt="Project Page"
|
24 |
+
/>
|
25 |
+
</a>
|
26 |
+
<a href="http://arxiv.org/abs/2506.18898">
|
27 |
+
<img
|
28 |
+
src="https://img.shields.io/badge/arXiv-Paper-red?logo=arxiv&logoColor=red"
|
29 |
+
alt="Tar Paper on arXiv"
|
30 |
+
/>
|
31 |
+
</a>
|
32 |
+
|
33 |
+
|
34 |
+
### Citation
|
35 |
+
```
|
36 |
+
@article{han2025tar,
|
37 |
+
title={Vision as a Dialect: Unifying Visual Understanding and Generation via Text-Aligned Representations},
|
38 |
+
author={Han, Jiaming and Chen, Hao and Zhao, Yang and Wang, Hanyu and Zhao, Qi and Yang, Ziyan and He, Hao and Yue, Xiangyu and Jiang, Lu},
|
39 |
+
journal={arXiv preprint arXiv:2506.18898},
|
40 |
+
year={2025},
|
41 |
+
}
|
42 |
+
```
|
43 |
+
|
44 |
+
### License
|
45 |
+
This project is licensed under the Apache 2.0 License.
|
46 |
+
|
app.py
CHANGED
@@ -1,3 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
import gradio as gr
|
3 |
from torchvision.transforms.functional import to_tensor
|
@@ -30,12 +44,12 @@ def generate_text(self, image: str, prompt: str) -> str:
|
|
30 |
|
31 |
login(token=os.getenv('HF_TOKEN'))
|
32 |
config = T2IConfig()
|
33 |
-
config.model = snapshot_download("
|
34 |
config.ar_path = {
|
35 |
-
"1024px": hf_hub_download("
|
36 |
-
"512px": hf_hub_download("
|
37 |
}
|
38 |
-
config.encoder_path = hf_hub_download("
|
39 |
config.decoder_path = hf_hub_download("peizesun/llamagen_t2i", "vq_ds16_t2i.pt")
|
40 |
inference = TextToImageInference(config)
|
41 |
|
@@ -61,7 +75,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
61 |
|
62 |
### Tar: Unifying Visual Understanding and Generation via Text-Aligned Representations
|
63 |
|
64 |
-
[🕸️ Project Page](http://tar.csuhan.com) • [📄 Paper](http://arxiv.org/abs/2506.18898) • [💻 Code](https://github.com/csuhan/Tar) • [📦 Model](https://huggingface.co/collections/
|
65 |
|
66 |
</div>
|
67 |
""",
|
|
|
1 |
+
# // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
2 |
+
# //
|
3 |
+
# // Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# // you may not use this file except in compliance with the License.
|
5 |
+
# // You may obtain a copy of the License at
|
6 |
+
# //
|
7 |
+
# // http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
# //
|
9 |
+
# // Unless required by applicable law or agreed to in writing, software
|
10 |
+
# // distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# // See the License for the specific language governing permissions and
|
13 |
+
# // limitations under the License.
|
14 |
+
|
15 |
import os
|
16 |
import gradio as gr
|
17 |
from torchvision.transforms.functional import to_tensor
|
|
|
44 |
|
45 |
login(token=os.getenv('HF_TOKEN'))
|
46 |
config = T2IConfig()
|
47 |
+
config.model = snapshot_download("ByteDance-Seed/Tar-7B")
|
48 |
config.ar_path = {
|
49 |
+
"1024px": hf_hub_download("ByteDance-Seed/Tar-TA-Tok", "ar_dtok_lp_1024px.pth"),
|
50 |
+
"512px": hf_hub_download("ByteDance-Seed/Tar-TA-Tok", "ar_dtok_lp_512px.pth"),
|
51 |
}
|
52 |
+
config.encoder_path = hf_hub_download("ByteDance-Seed/Tar-TA-Tok", "ta_tok.pth")
|
53 |
config.decoder_path = hf_hub_download("peizesun/llamagen_t2i", "vq_ds16_t2i.pt")
|
54 |
inference = TextToImageInference(config)
|
55 |
|
|
|
75 |
|
76 |
### Tar: Unifying Visual Understanding and Generation via Text-Aligned Representations
|
77 |
|
78 |
+
[🕸️ Project Page](http://tar.csuhan.com) • [📄 Paper](http://arxiv.org/abs/2506.18898) • [💻 Code](https://github.com/csuhan/Tar) • [📦 Model](https://huggingface.co/collections/ByteDance-Seed/tar-6864cf0d9fe59a3b91cc4260)
|
79 |
|
80 |
</div>
|
81 |
""",
|
t2i_inference.py
CHANGED
@@ -1,3 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import re
|
2 |
from dataclasses import dataclass
|
3 |
|
@@ -11,7 +25,7 @@ from tok.mm_autoencoder import MMAutoEncoder
|
|
11 |
|
12 |
@dataclass
|
13 |
class T2IConfig:
|
14 |
-
model_path: str = "
|
15 |
# visual tokenizer config
|
16 |
ar_path = None
|
17 |
encoder_path: str = 'ta_tok.pth'
|
|
|
1 |
+
# // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
2 |
+
# //
|
3 |
+
# // Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# // you may not use this file except in compliance with the License.
|
5 |
+
# // You may obtain a copy of the License at
|
6 |
+
# //
|
7 |
+
# // http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
# //
|
9 |
+
# // Unless required by applicable law or agreed to in writing, software
|
10 |
+
# // distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# // See the License for the specific language governing permissions and
|
13 |
+
# // limitations under the License.
|
14 |
+
|
15 |
import re
|
16 |
from dataclasses import dataclass
|
17 |
|
|
|
25 |
|
26 |
@dataclass
|
27 |
class T2IConfig:
|
28 |
+
model_path: str = "ByteDance-Seed/Tar-1.5B"
|
29 |
# visual tokenizer config
|
30 |
ar_path = None
|
31 |
encoder_path: str = 'ta_tok.pth'
|
tok/__init__.py
CHANGED
@@ -1 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from .ar_dtok import *
|
|
|
1 |
+
# // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
2 |
+
# //
|
3 |
+
# // Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# // you may not use this file except in compliance with the License.
|
5 |
+
# // You may obtain a copy of the License at
|
6 |
+
# //
|
7 |
+
# // http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
# //
|
9 |
+
# // Unless required by applicable law or agreed to in writing, software
|
10 |
+
# // distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# // See the License for the specific language governing permissions and
|
13 |
+
# // limitations under the License.
|
14 |
+
|
15 |
from .ar_dtok import *
|
tok/ar_dtok/__init__.py
CHANGED
@@ -1,2 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from .bottleneck import Bottleneck, SimVectorQuantizer
|
2 |
from .vqvae import VQVAE
|
|
|
1 |
+
# // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
2 |
+
# //
|
3 |
+
# // Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# // you may not use this file except in compliance with the License.
|
5 |
+
# // You may obtain a copy of the License at
|
6 |
+
# //
|
7 |
+
# // http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
# //
|
9 |
+
# // Unless required by applicable law or agreed to in writing, software
|
10 |
+
# // distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# // See the License for the specific language governing permissions and
|
13 |
+
# // limitations under the License.
|
14 |
+
|
15 |
from .bottleneck import Bottleneck, SimVectorQuantizer
|
16 |
from .vqvae import VQVAE
|
tok/ar_dtok/ar_model.py
CHANGED
@@ -1,3 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
from contextlib import contextmanager
|
3 |
from dataclasses import dataclass
|
|
|
1 |
+
# // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
2 |
+
# //
|
3 |
+
# // Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# // you may not use this file except in compliance with the License.
|
5 |
+
# // You may obtain a copy of the License at
|
6 |
+
# //
|
7 |
+
# // http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
# //
|
9 |
+
# // Unless required by applicable law or agreed to in writing, software
|
10 |
+
# // distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# // See the License for the specific language governing permissions and
|
13 |
+
# // limitations under the License.
|
14 |
+
|
15 |
import os
|
16 |
from contextlib import contextmanager
|
17 |
from dataclasses import dataclass
|
tok/ar_dtok/bottleneck.py
CHANGED
@@ -1,3 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import torch
|
2 |
import torch.nn as nn
|
3 |
import torch.nn.functional as F
|
|
|
1 |
+
# // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
2 |
+
# //
|
3 |
+
# // Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# // you may not use this file except in compliance with the License.
|
5 |
+
# // You may obtain a copy of the License at
|
6 |
+
# //
|
7 |
+
# // http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
# //
|
9 |
+
# // Unless required by applicable law or agreed to in writing, software
|
10 |
+
# // distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# // See the License for the specific language governing permissions and
|
13 |
+
# // limitations under the License.
|
14 |
+
|
15 |
import torch
|
16 |
import torch.nn as nn
|
17 |
import torch.nn.functional as F
|
tok/ar_dtok/generate.py
CHANGED
@@ -1,3 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# Modified from:
|
2 |
# llamagen: https://github.com/FoundationVision/LlamaGen/blob/main/autoregressive/models/generate.py
|
3 |
# gpt-fast: https://github.com/pytorch-labs/gpt-fast/blob/main/generate.py
|
|
|
1 |
+
# // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
2 |
+
# //
|
3 |
+
# // Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# // you may not use this file except in compliance with the License.
|
5 |
+
# // You may obtain a copy of the License at
|
6 |
+
# //
|
7 |
+
# // http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
# //
|
9 |
+
# // Unless required by applicable law or agreed to in writing, software
|
10 |
+
# // distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# // See the License for the specific language governing permissions and
|
13 |
+
# // limitations under the License.
|
14 |
+
|
15 |
# Modified from:
|
16 |
# llamagen: https://github.com/FoundationVision/LlamaGen/blob/main/autoregressive/models/generate.py
|
17 |
# gpt-fast: https://github.com/pytorch-labs/gpt-fast/blob/main/generate.py
|
tok/ar_dtok/vqvae.py
CHANGED
@@ -1,3 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from dataclasses import dataclass, field
|
2 |
from typing import List
|
3 |
|
|
|
1 |
+
# // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
2 |
+
# //
|
3 |
+
# // Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# // you may not use this file except in compliance with the License.
|
5 |
+
# // You may obtain a copy of the License at
|
6 |
+
# //
|
7 |
+
# // http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
# //
|
9 |
+
# // Unless required by applicable law or agreed to in writing, software
|
10 |
+
# // distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# // See the License for the specific language governing permissions and
|
13 |
+
# // limitations under the License.
|
14 |
+
|
15 |
from dataclasses import dataclass, field
|
16 |
from typing import List
|
17 |
|
tok/mm_autoencoder.py
CHANGED
@@ -1,3 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import torch
|
2 |
import torch.nn as nn
|
3 |
|
|
|
1 |
+
# // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
2 |
+
# //
|
3 |
+
# // Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# // you may not use this file except in compliance with the License.
|
5 |
+
# // You may obtain a copy of the License at
|
6 |
+
# //
|
7 |
+
# // http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
# //
|
9 |
+
# // Unless required by applicable law or agreed to in writing, software
|
10 |
+
# // distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# // See the License for the specific language governing permissions and
|
13 |
+
# // limitations under the License.
|
14 |
+
|
15 |
import torch
|
16 |
import torch.nn as nn
|
17 |
|
tok/models.py
CHANGED
@@ -1,3 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import copy
|
2 |
import inspect
|
3 |
|
|
|
1 |
+
# // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
2 |
+
# //
|
3 |
+
# // Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# // you may not use this file except in compliance with the License.
|
5 |
+
# // You may obtain a copy of the License at
|
6 |
+
# //
|
7 |
+
# // http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
# //
|
9 |
+
# // Unless required by applicable law or agreed to in writing, software
|
10 |
+
# // distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# // See the License for the specific language governing permissions and
|
13 |
+
# // limitations under the License.
|
14 |
+
|
15 |
import copy
|
16 |
import inspect
|
17 |
|
tok/ta_tok.py
CHANGED
@@ -1,3 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import torch
|
2 |
import torch.nn as nn
|
3 |
import torch.nn.functional as F
|
|
|
1 |
+
# // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
2 |
+
# //
|
3 |
+
# // Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# // you may not use this file except in compliance with the License.
|
5 |
+
# // You may obtain a copy of the License at
|
6 |
+
# //
|
7 |
+
# // http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
# //
|
9 |
+
# // Unless required by applicable law or agreed to in writing, software
|
10 |
+
# // distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# // See the License for the specific language governing permissions and
|
13 |
+
# // limitations under the License.
|
14 |
+
|
15 |
import torch
|
16 |
import torch.nn as nn
|
17 |
import torch.nn.functional as F
|
tok/utils.py
CHANGED
@@ -1,3 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import torch
|
2 |
import torch.nn as nn
|
3 |
|
|
|
1 |
+
# // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
2 |
+
# //
|
3 |
+
# // Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# // you may not use this file except in compliance with the License.
|
5 |
+
# // You may obtain a copy of the License at
|
6 |
+
# //
|
7 |
+
# // http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
# //
|
9 |
+
# // Unless required by applicable law or agreed to in writing, software
|
10 |
+
# // distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# // See the License for the specific language governing permissions and
|
13 |
+
# // limitations under the License.
|
14 |
+
|
15 |
import torch
|
16 |
import torch.nn as nn
|
17 |
|