Updating README and adding first couple of Group adapters
Browse files- Group1/adapter_config.json +34 -0
- Group1/adapter_model.safetensors +3 -0
- Group2/adapter_config.json +32 -0
- Group2/adapter_model.safetensors +3 -0
- Group6/adapter_config.json +32 -0
- Group6/adapter_model.safetensors +3 -0
- README.md +153 -3
Group1/adapter_config.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "haoranxu/XALMA-13B-Pretrain",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layer_replication": null,
|
10 |
+
"layers_pattern": null,
|
11 |
+
"layers_to_transform": null,
|
12 |
+
"loftq_config": {},
|
13 |
+
"lora_alpha": 1024,
|
14 |
+
"lora_dropout": 0.05,
|
15 |
+
"megatron_config": null,
|
16 |
+
"megatron_core": "megatron.core",
|
17 |
+
"modules_to_save": null,
|
18 |
+
"peft_type": "LORA",
|
19 |
+
"r": 512,
|
20 |
+
"rank_pattern": {},
|
21 |
+
"revision": null,
|
22 |
+
"target_modules": [
|
23 |
+
"o_proj",
|
24 |
+
"gate_proj",
|
25 |
+
"q_proj",
|
26 |
+
"down_proj",
|
27 |
+
"v_proj",
|
28 |
+
"k_proj",
|
29 |
+
"up_proj"
|
30 |
+
],
|
31 |
+
"task_type": "CAUSAL_LM",
|
32 |
+
"use_dora": false,
|
33 |
+
"use_rslora": false
|
34 |
+
}
|
Group1/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6aac72e5c4cd7b3b88c583d8a66eafb27d254727884124ed0d72b365796c1306
|
3 |
+
size 4005638112
|
Group2/adapter_config.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "haoranxu/XALMA-13B-Pretrain",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layers_pattern": null,
|
10 |
+
"layers_to_transform": null,
|
11 |
+
"loftq_config": {},
|
12 |
+
"lora_alpha": 1024,
|
13 |
+
"lora_dropout": 0.05,
|
14 |
+
"megatron_config": null,
|
15 |
+
"megatron_core": "megatron.core",
|
16 |
+
"modules_to_save": null,
|
17 |
+
"peft_type": "LORA",
|
18 |
+
"r": 512,
|
19 |
+
"rank_pattern": {},
|
20 |
+
"revision": null,
|
21 |
+
"target_modules": [
|
22 |
+
"down_proj",
|
23 |
+
"q_proj",
|
24 |
+
"up_proj",
|
25 |
+
"v_proj",
|
26 |
+
"gate_proj",
|
27 |
+
"k_proj",
|
28 |
+
"o_proj"
|
29 |
+
],
|
30 |
+
"task_type": "CAUSAL_LM",
|
31 |
+
"use_rslora": false
|
32 |
+
}
|
Group2/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:944b698b544ee2fbd1c1ee624b454851399e25b36b72f5a5a1ce6f2002373830
|
3 |
+
size 4005638112
|
Group6/adapter_config.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "haoranxu/XALMA-13B-Pretrain",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layers_pattern": null,
|
10 |
+
"layers_to_transform": null,
|
11 |
+
"loftq_config": {},
|
12 |
+
"lora_alpha": 1024,
|
13 |
+
"lora_dropout": 0.05,
|
14 |
+
"megatron_config": null,
|
15 |
+
"megatron_core": "megatron.core",
|
16 |
+
"modules_to_save": null,
|
17 |
+
"peft_type": "LORA",
|
18 |
+
"r": 512,
|
19 |
+
"rank_pattern": {},
|
20 |
+
"revision": null,
|
21 |
+
"target_modules": [
|
22 |
+
"down_proj",
|
23 |
+
"q_proj",
|
24 |
+
"o_proj",
|
25 |
+
"up_proj",
|
26 |
+
"v_proj",
|
27 |
+
"k_proj",
|
28 |
+
"gate_proj"
|
29 |
+
],
|
30 |
+
"task_type": "CAUSAL_LM",
|
31 |
+
"use_rslora": false
|
32 |
+
}
|
Group6/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0f611a6cbe80879cbaada7530c6f1b4cb12fb5799c1dc6366d05ec93ff498a8e
|
3 |
+
size 4005638112
|
README.md
CHANGED
@@ -1,3 +1,153 @@
|
|
1 |
-
---
|
2 |
-
license: mit
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: mit
|
3 |
+
datasets:
|
4 |
+
- oscar-corpus/OSCAR-2301
|
5 |
+
- allenai/nllb
|
6 |
+
- Helsinki-NLP/opus-100
|
7 |
+
language:
|
8 |
+
- en
|
9 |
+
- da
|
10 |
+
- nl
|
11 |
+
- de
|
12 |
+
- is
|
13 |
+
- 'no'
|
14 |
+
- sv
|
15 |
+
- af
|
16 |
+
- ca
|
17 |
+
- ro
|
18 |
+
- gl
|
19 |
+
- it
|
20 |
+
- pt
|
21 |
+
- es
|
22 |
+
- bg
|
23 |
+
- mk
|
24 |
+
- sr
|
25 |
+
- uk
|
26 |
+
- ru
|
27 |
+
- id
|
28 |
+
- ms
|
29 |
+
- th
|
30 |
+
- vi
|
31 |
+
- mg
|
32 |
+
- fr
|
33 |
+
- hu
|
34 |
+
- el
|
35 |
+
- cs
|
36 |
+
- pl
|
37 |
+
- lt
|
38 |
+
- lv
|
39 |
+
- ka
|
40 |
+
- zh
|
41 |
+
- ja
|
42 |
+
- ko
|
43 |
+
- fi
|
44 |
+
- et
|
45 |
+
- gu
|
46 |
+
- hi
|
47 |
+
- mr
|
48 |
+
- ne
|
49 |
+
- ur
|
50 |
+
- az
|
51 |
+
- kk
|
52 |
+
- ky
|
53 |
+
- tr
|
54 |
+
- uz
|
55 |
+
- ar
|
56 |
+
- he
|
57 |
+
- fa
|
58 |
+
base_model:
|
59 |
+
- haoranxu/X-ALMA-13B-Pretrain
|
60 |
+
---
|
61 |
+
|
62 |
+
This is a collection of the LoRA adapters for the different X-ALMA language groups. The original adapters were included in each of the separate X-ALMA-13B-Group{i} repositories. However, those individual group repositories all also contained the merged version of the pretrained base model + group LoRA adapter.
|
63 |
+
|
64 |
+
|
65 |
+
Original Model Card Information
|
66 |
+
-----
|
67 |
+
|
68 |
+
[X-ALMA](https://arxiv.org/pdf/2410.03115) builds upon [ALMA-R](https://arxiv.org/pdf/2401.08417) by expanding support from 6 to 50 languages. It utilizes a plug-and-play architecture with language-specific modules, complemented by a carefully designed training recipe. This release includes the **X-ALMA pre-trained base model**.
|
69 |
+
```
|
70 |
+
@misc{xu2024xalmaplugplay,
|
71 |
+
title={X-ALMA: Plug & Play Modules and Adaptive Rejection for Quality Translation at Scale},
|
72 |
+
author={Haoran Xu and Kenton Murray and Philipp Koehn and Hieu Hoang and Akiko Eriguchi and Huda Khayrallah},
|
73 |
+
year={2024},
|
74 |
+
eprint={2410.03115},
|
75 |
+
archivePrefix={arXiv},
|
76 |
+
primaryClass={cs.CL},
|
77 |
+
url={https://arxiv.org/abs/2410.03115},
|
78 |
+
}
|
79 |
+
```
|
80 |
+
X-ALMA-13B-Pretrain is pre-trained on 50 languages: en,da,nl,de,is,no,sv,af,ca,ro,gl,it,pt,es,bg,mk,sr,uk,ru,id,ms,th,vi,mg,fr,hu,el,cs,pl,lt,lv,ka,zh,ja,ko,fi,et,gu,hi,mr,ne,ur,az,kk,ky,tr,uz,ar,he,fa.
|
81 |
+
|
82 |
+
All X-ALMA checkpoints are released on Hugging Face:
|
83 |
+
| Models | Model Link | Description |
|
84 |
+
|:-------------:|:---------------:|:---------------:|
|
85 |
+
| X-ALMA | [haoranxu/X-ALMA](https://huggingface.co/haoranxu/X-ALMA) | X-ALMA model with all its modules |
|
86 |
+
| X-ALMA-13B-Pretrain | [haoranxu/X-ALMA-13B-Pretrain](https://huggingface.co/haoranxu/X-ALMA-13B-Pretrain) | X-ALMA 13B multilingual pre-trained base model |
|
87 |
+
| X-ALMA-Group1 | [haoranxu/X-ALMA-13B-Group1](https://huggingface.co/haoranxu/X-ALMA-13B-Group1) | X-ALMA group1 specific module and the merged model |
|
88 |
+
| X-ALMA-Group2 | [haoranxu/X-ALMA-13B-Group2](https://huggingface.co/haoranxu/X-ALMA-13B-Group2) | X-ALMA group2 specific module and the merged model |
|
89 |
+
| X-ALMA-Group3 | [haoranxu/X-ALMA-13B-Group3](https://huggingface.co/haoranxu/X-ALMA-13B-Group3) | X-ALMA group3 specific module and the merged model |
|
90 |
+
| X-ALMA-Group4 | [haoranxu/X-ALMA-13B-Group4](https://huggingface.co/haoranxu/X-ALMA-13B-Group4) | X-ALMA group4 specific module and the merged model |
|
91 |
+
| X-ALMA-Group5 | [haoranxu/X-ALMA-13B-Group5](https://huggingface.co/haoranxu/X-ALMA-13B-Group5) | X-ALMA group5 specific module and the merged model |
|
92 |
+
| X-ALMA-Group6 | [haoranxu/X-ALMA-13B-Group6](https://huggingface.co/haoranxu/X-ALMA-13B-Group6) | X-ALMA group6 specific module and the merged model |
|
93 |
+
| X-ALMA-Group7 | [haoranxu/X-ALMA-13B-Group7](https://huggingface.co/haoranxu/X-ALMA-13B-Group7) | X-ALMA group7 specific module and the merged model |
|
94 |
+
| X-ALMA-Group8 | [haoranxu/X-ALMA-13B-Group8](https://huggingface.co/haoranxu/X-ALMA-13B-Group8) | X-ALMA group8 specific module and the merged model |
|
95 |
+
|
96 |
+
## A quick start:
|
97 |
+
There are three ways to load X-ALMA for translation. Below is an example of translating "我爱机器翻译。" into English (X-ALMA should also be able to do multilingual open-ended QA).
|
98 |
+
|
99 |
+
**The first way**: loading the merged model where the language-specific module has been merged into the base model **(Recommended)**:
|
100 |
+
```
|
101 |
+
import torch
|
102 |
+
from transformers import AutoModelForCausalLM
|
103 |
+
from transformers import AutoTokenizer
|
104 |
+
from peft import PeftModel
|
105 |
+
|
106 |
+
GROUP2LANG = {
|
107 |
+
1: ["da", "nl", "de", "is", "no", "sv", "af"],
|
108 |
+
2: ["ca", "ro", "gl", "it", "pt", "es"],
|
109 |
+
3: ["bg", "mk", "sr", "uk", "ru"],
|
110 |
+
4: ["id", "ms", "th", "vi", "mg", "fr"],
|
111 |
+
5: ["hu", "el", "cs", "pl", "lt", "lv"],
|
112 |
+
6: ["ka", "zh", "ja", "ko", "fi", "et"],
|
113 |
+
7: ["gu", "hi", "mr", "ne", "ur"],
|
114 |
+
8: ["az", "kk", "ky", "tr", "uz", "ar", "he", "fa"],
|
115 |
+
}
|
116 |
+
LANG2GROUP = {lang: str(group) for group, langs in GROUP2LANG.items() for lang in langs}
|
117 |
+
group_id = LANG2GROUP["zh"]
|
118 |
+
|
119 |
+
model = AutoModelForCausalLM.from_pretrained(f"haoranxu/X-ALMA-13B-Group{group_id}", torch_dtype=torch.float16, device_map="auto")
|
120 |
+
tokenizer = AutoTokenizer.from_pretrained(f"haoranxu/X-ALMA-13B-Group{group_id}", padding_side='left')
|
121 |
+
|
122 |
+
# Add the source sentence into the prompt template
|
123 |
+
prompt="Translate this from Chinese to English:\nChinese: 我爱机器翻译。\nEnglish:"
|
124 |
+
|
125 |
+
# X-ALMA needs chat template but ALMA and ALMA-R don't need it.
|
126 |
+
chat_style_prompt = [{"role": "user", "content": prompt}]
|
127 |
+
prompt = tokenizer.apply_chat_template(chat_style_prompt, tokenize=False, add_generation_prompt=True)
|
128 |
+
|
129 |
+
input_ids = tokenizer(prompt, return_tensors="pt", padding=True, max_length=40, truncation=True).input_ids.cuda()
|
130 |
+
|
131 |
+
# Translation
|
132 |
+
with torch.no_grad():
|
133 |
+
generated_ids = model.generate(input_ids=input_ids, num_beams=5, max_new_tokens=20, do_sample=True, temperature=0.6, top_p=0.9)
|
134 |
+
outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
|
135 |
+
print(outputs)
|
136 |
+
```
|
137 |
+
|
138 |
+
**The second way**: loading the base model and language-specific module **(Recommended)**:
|
139 |
+
```
|
140 |
+
model = AutoModelForCausalLM.from_pretrained("haoranxu/X-ALMA-13B-Pretrain", torch_dtype=torch.float16, device_map="auto")
|
141 |
+
model = PeftModel.from_pretrained(model, f"haoranxu/X-ALMA-13B-Group{group_id}")
|
142 |
+
tokenizer = AutoTokenizer.from_pretrained(f"haoranxu/X-ALMA-13B-Group{group_id}", padding_side='left')
|
143 |
+
```
|
144 |
+
|
145 |
+
**The third way**: loading the base model with all language-specific modules, like MoE (requires large GPU memory):
|
146 |
+
```
|
147 |
+
from modeling_xalma import XALMAForCausalLM
|
148 |
+
model = XALMAForCausalLM.from_pretrained("haoranxu/X-ALMA", torch_dtype=torch.float16, device_map="auto")
|
149 |
+
tokenizer = AutoTokenizer.from_pretrained("haoranxu/X-ALMA", padding_side='left')
|
150 |
+
|
151 |
+
# Add `lang="zh"`: specify the language to instruct the model on which group to use for the third loading method during generation.
|
152 |
+
generated_ids = model.generate(input_ids=input_ids, num_beams=5, max_new_tokens=20, do_sample=True, temperature=0.6, top_p=0.9, lang="zh")
|
153 |
+
```
|