henok3878 committed on
Commit
a1f27d5
·
1 Parent(s): a2ddf9a

setup project and add configs and tokenizers

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizers/*.json filter=lfs diff=lfs merge=lfs -text
configs/config.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ src_vocab_size: 37000
3
+ tgt_vocab_size: 37000
4
+ d_model: 512
5
+ num_heads: 8
6
+ d_ff: 2048
7
+ num_encoder_layers: 6
8
+ num_decoder_layers: 6
9
+ dropout: 0.1
10
+ src_max_len: 128
11
+ tgt_max_len: 128
12
+
13
+ training:
14
+ seed: 42
15
+ batch_size: 144
16
+ epochs: 24
17
+ lr_factor: 1.0
18
+ num_workers: 8
19
+ quick_val_size: 1024 # 1024 examples for quick eval
20
+ quick_eval_every: 1000 # steps
21
+ full_eval_every: 10000 # steps
22
+ warmup_steps: 4000
23
+ weight_decay: 0.01
24
+ adam_eps: 1.0e-9 # decimal point required so YAML 1.1 loaders (e.g. PyYAML) parse this as a float, not a string
25
+ adam_beta1: 0.9
26
+ adam_beta2: 0.98
27
+ label_smoothing: 0.1
28
+ max_grad_norm: 1.0
29
+
30
+ experiment:
31
+ base_dir: "experiments"
32
+ checkpoint_dir: "checkpoints"
33
+ save_every_steps: 10000 # steps
34
+ keep_last_n: 10 # keep last n step checkpoints
35
+ log_every: 100 # log every N batches
36
+ log_dir: "logs"
37
+
38
+ data:
39
+ dataset_name: "wmt14"
40
+ subset: "de-en"
41
+ lang_src: "en"
42
+ lang_tgt: "de"
43
+ tokenization_strategy: "joint" # "joint" or "separate"
44
+ validation_fraction: 0.05
environment.yml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ name: transformer-demo
2
+ channels:
3
+ - conda-forge
4
+ - pytorch
5
+ dependencies:
6
+ - python=3.12
7
+ - pip
8
+ - pip:
9
+ - -r requirements.txt
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ torch
2
+ tokenizers
3
+ gradio
4
+ pyyaml
5
+ transformer-from-scratch @ git+https://github.com/henok3878/transformer-from-scratch.git
tokenizers/tokenizer-joint-de-en-vocab37000.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67f348d8c6dfb4903cbb17cdeff7f4223da6278e5e110674afa86093f6ef5378
3
+ size 2685044