eubinecto committed · Commit 539e83f · 1 Parent(s): d2dce47

infer logic added

Browse files:
- config.yaml +4 -4
- idiomify/fetchers.py +21 -9
- idiomify/models.py +12 -12
- idiomify/tensors.py +1 -1
- main_infer.py +34 -0
- main_train.py +4 -5
config.yaml
CHANGED
@@ -7,7 +7,7 @@ alpha:
     idiom2def_ver: c
     k: 11
     lr: 0.00001
-    max_epochs:
+    max_epochs: 10
     batch_size: 64
     shuffle: true
   kor2eng:
@@ -18,7 +18,7 @@ alpha:
     idiom2def_ver: d
     k: 11
     lr: 0.00001
-    max_epochs:
+    max_epochs: 20
     batch_size: 64
     num_workers: 4
     shuffle: true
@@ -30,7 +30,7 @@ gamma:
     idiom2def_ver: c
     k: 11
     lr: 0.00001
-    max_epochs:
+    max_epochs: 10
     batch_size: 64
     shuffle: true
   kor2eng:
@@ -40,7 +40,7 @@ gamma:
     idiom2def_ver: d
     k: 11
     lr: 0.00001
-    max_epochs:
+    max_epochs: 20
     batch_size: 64
     num_workers: 4
     shuffle: true
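For context, the scripts in this commit read config.yaml through fetch_config() and index it as config[model][ver] (see main_infer.py below), so the blocks above correspond to the alpha/gamma models and their per-version hyperparameters. A minimal sketch of that lookup; the exact nesting and the "eng2eng" key are inferred from the defaults used elsewhere in this commit, not spelled out here:

# sketch only: assumes config.yaml nests hyperparameters as <model> -> <ver> -> params
import yaml

with open("config.yaml") as fh:
    config = yaml.safe_load(fh)

hparams = config["alpha"]["eng2eng"]   # "eng2eng" is the default --ver used by the scripts
print(hparams["max_epochs"])           # 10 after this change
print(hparams["lr"], hparams["k"])     # 1e-05, 11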
idiomify/fetchers.py
CHANGED
@@ -2,8 +2,10 @@ import csv
 import yaml
 import wandb
 from typing import Tuple, List
-from
-from idiomify.
+from transformers import AutoModelForMaskedLM, AutoConfig, BertTokenizer
+from idiomify.models import Alpha, Gamma, RD
+from idiomify.paths import idiom2def_dir, CONFIG_YAML, idioms_dir, alpha_dir
+from idiomify import tensors as T


 # dataset
@@ -35,13 +37,23 @@ def fetch_idioms(ver: str) -> List[str]:
     ]


-
-
-
-
-
-
-
+def fetch_rd(model: str, ver: str) -> RD:
+    artifact = wandb.Api().artifact(f"eubinecto/idiomify-demo/{model}:{ver}", type="model")
+    config = artifact.metadata
+    artifact_path = alpha_dir(ver)
+    artifact.download(root=str(artifact_path))
+    mlm = AutoModelForMaskedLM.from_config(AutoConfig.from_pretrained(config['bert']))
+    ckpt_path = artifact_path / "rd.ckpt"
+    idioms = fetch_idioms(config['idioms_ver'])
+    tokenizer = BertTokenizer.from_pretrained(config['bert'])
+    idiom2subwords = T.idiom2subwords(idioms, tokenizer, config['k'])
+    if model == Alpha.name():
+        rd = Alpha.load_from_checkpoint(str(ckpt_path), mlm=mlm, idiom2subwords=idiom2subwords)
+    elif model == Gamma.name():
+        rd = Gamma.load_from_checkpoint(str(ckpt_path), mlm=mlm, idiom2subwords=idiom2subwords)
+    else:
+        raise ValueError
+    return rd


 def fetch_config() -> dict:
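A hedged usage sketch of the new fetch_rd: it downloads the named W&B model artifact, rebuilds the bare MLM from the artifact's metadata, and restores the Lightning checkpoint together with the idiom2subwords buffer. The model and version names below are just the defaults used elsewhere in this commit; a matching artifact must already have been logged under eubinecto/idiomify-demo.

# usage sketch; assumes an "alpha:eng2eng" model artifact exists on W&B
from idiomify.fetchers import fetch_rd

rd = fetch_rd("alpha", "eng2eng")  # downloads rd.ckpt and rebuilds the Alpha model
rd.eval()                          # inference only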
idiomify/models.py
CHANGED
@@ -29,17 +29,17 @@ class RD(pl.LightningModule):
     def predict_dataloader(self):
         pass

-    def __init__(self, mlm: BertForMaskedLM,
+    def __init__(self, mlm: BertForMaskedLM, idiom2subwords: torch.Tensor, k: int, lr: float):  # noqa
         """
         :param mlm: a bert model for masked language modeling
-        :param
+        :param idiom2subwords: (|W|, K)
         :return: (N, K, |V|); (num samples, k, the size of the vocabulary of subwords)
         """
         super().__init__()
         # -- hyper params --- #
         # should be saved to self.hparams
         # https://github.com/PyTorchLightning/pytorch-lightning/issues/4390#issue-730493746
-        self.save_hyperparameters(ignore=["mlm", "
+        self.save_hyperparameters(ignore=["mlm", "idiom2subwords"])
         # -- the only neural network we need -- #
         self.mlm = mlm
         # --- to be used for getting H_k --- #
@@ -47,7 +47,7 @@ class RD(pl.LightningModule):
         # --- to be used for getting H_desc --- #
         self.desc_mask: Optional[torch.Tensor] = None  # (N, L)
         # -- constant tensors -- #
-        self.register_buffer("
+        self.register_buffer("idiom2subwords", idiom2subwords)  # (|W|, K)

     def forward(self, X: torch.Tensor) -> torch.Tensor:
         """
@@ -94,7 +94,7 @@ class RD(pl.LightningModule):
         :return: S_wisdom_literal (N, |W|)
         """
         S_vocab = self.mlm.cls(H_k)  # bmm; (N, K, H) * (H, |V|) -> (N, K, |V|)
-        indices = self.
+        indices = self.idiom2subwords.T.repeat(S_vocab.shape[0], 1, 1)  # (|W|, K) -> (N, K, |W|)
         S_wisdom_literal = S_vocab.gather(dim=-1, index=indices)  # (N, K, |V|) -> (N, K, |W|)
         S_wisdom_literal = S_wisdom_literal.sum(dim=1)  # (N, K, |W|) -> (N, |W|)
         return S_wisdom_literal
@@ -194,9 +194,9 @@ class Gamma(RD):
     but the way we get S_wisdom_figurative is much simplified, compared with RDBeta.
     """

-    def __init__(self, mlm: BertForMaskedLM,
+    def __init__(self, mlm: BertForMaskedLM, idiom2subwords: torch.Tensor, k: int, lr: float):
-        super().__init__(mlm,
+        super().__init__(mlm, idiom2subwords, k, lr)
-        # a pooler is a multilayer perceptron that pools wisdom_embeddings from
+        # a pooler is a multilayer perceptron that pools wisdom_embeddings from idiom2subwords_embeddings
         self.pooler = BiLSTMPooler(self.mlm.config.hidden_size)
         # --- to be used to compute attentions --- #
         self.attention_mask: Optional[torch.Tensor] = None
@@ -232,11 +232,11 @@ class Gamma(RD):
         return S_wisdom, S_wisdom_literal, S_wisdom_figurative

     def S_wisdom_figurative(self, H_all: torch.Tensor) -> torch.Tensor:
-        # --- draw the embeddings for wisdoms from the embeddings of
+        # --- draw the embeddings for wisdoms from the embeddings of idiom2subwords -- #
         # this is to use as less of newly initialised weights as possible
-
-            .embeddings.word_embeddings(self.
-        wisdom_embeddings = self.pooler(
+        idiom2subwords_embeddings = self.mlm.bert \
+            .embeddings.word_embeddings(self.idiom2subwords)  # (W, K) -> (W, K, H)
+        wisdom_embeddings = self.pooler(idiom2subwords_embeddings).squeeze()  # (W, H, K) -> (W, H, 1) -> (W, H)
         # --- draw H_wisdom from H_desc with attention --- #
         H_cls = H_all[:, 0]  # (N, L, H) -> (N, H)
         H_desc = self.H_desc(H_all)  # (N, L, H) -> (N, D, H)
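The gather in S_wisdom_literal above is the crux of the renamed buffer: per-position vocabulary scores are reduced to per-idiom scores by picking out each idiom's K subword logits and summing over the K positions. A toy shape check of that indexing (all sizes here are made up):

# toy shape check for the S_wisdom_literal gather; sizes are illustrative only
import torch

N, K, V, W = 2, 3, 10, 4                      # batch, masked slots, subword vocab, idioms
S_vocab = torch.randn(N, K, V)                # (N, K, |V|): logits over all subwords
idiom2subwords = torch.randint(0, V, (W, K))  # (|W|, K): each idiom's K subword ids

indices = idiom2subwords.T.repeat(N, 1, 1)                           # -> (N, K, |W|)
S_wisdom_literal = S_vocab.gather(dim=-1, index=indices).sum(dim=1)  # -> (N, |W|)
print(S_wisdom_literal.shape)                 # torch.Size([2, 4])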
idiomify/tensors.py
CHANGED
@@ -7,7 +7,7 @@ from typing import List
 from transformers import BertTokenizer


-def
+def idiom2subwords(idioms: List[str], tokenizer: BertTokenizer, k: int) -> torch.Tensor:
     mask_id = tokenizer.mask_token_id
     pad_id = tokenizer.pad_token_id
     # temporarily disable single-token status of the wisdoms
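Only the renamed signature of idiom2subwords is visible in this hunk, but its contract (as used by fetch_rd, main_train.py, and the (|W|, K) buffer in models.py) is a LongTensor of each idiom's subword ids padded to length k. A rough approximation of that contract, not the actual implementation:

# rough sketch of the contract only; the real function also toggles the
# single-token status of the idioms and uses mask_id, which is not shown here
import torch
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # hypothetical 'bert' choice
idioms = ["beat around the bush", "piece of cake"]              # illustrative idioms
k = 11

rows = [tokenizer.encode(idiom, add_special_tokens=False)[:k] for idiom in idioms]
idiom2subwords = torch.LongTensor(
    [row + [tokenizer.pad_token_id] * (k - len(row)) for row in rows]
)
print(idiom2subwords.shape)  # torch.Size([2, 11]) -> (|W|, K)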
main_infer.py
CHANGED
@@ -0,0 +1,34 @@
+import argparse
+from idiomify.fetchers import fetch_config, fetch_idioms, fetch_rd
+from idiomify import tensors as T
+from transformers import BertTokenizer
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", type=str,
+                        default="alpha")
+    parser.add_argument("--ver", type=str,
+                        default="eng2eng")
+    parser.add_argument("--sent", type=str,
+                        default="avoid getting to the point")
+    args = parser.parse_args()
+    config = fetch_config()[args.model][args.ver]
+    config.update(vars(args))
+    tokenizer = BertTokenizer.from_pretrained(config['bert'])
+    idioms = fetch_idioms(config['idioms_ver'])
+    X = T.inputs([config['sent']], tokenizer, config['k'])
+    rd = fetch_rd(config['model'], config['ver'])
+    probs = rd.P_wisdom(X).squeeze().tolist()
+    wisdom2prob = [
+        (wisdom, prob)
+        for wisdom, prob in zip(idioms, probs)
+    ]
+    # sort and append
+    res = list(sorted(wisdom2prob, key=lambda x: x[1], reverse=True))
+    for idx, (idiom, prob) in enumerate(res):
+        print(idx, idiom, prob)
+
+
+if __name__ == '__main__':
+    main()
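Putting it together, the new script ranks every idiom by P_wisdom for the given sentence. A hedged invocation example (it assumes the W&B artifact and the config entries referenced above exist):

python3 main_infer.py --model alpha --ver eng2eng --sent "avoid getting to the point"

Each output line is "<rank> <idiom> <probability>", sorted from most to least probable.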
main_train.py
CHANGED
@@ -15,7 +15,6 @@ from idiomify import tensors as T

 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument("entity", type=str)
     parser.add_argument("--model", type=str, default="alpha")
     parser.add_argument("--ver", type=str, default="eng2eng")
     parser.add_argument("--num_workers", type=int, default=os.cpu_count())
@@ -32,18 +31,18 @@ def main():
     mlm = BertForMaskedLM.from_pretrained(config['bert'])
     tokenizer = BertTokenizer.from_pretrained(config['bert'])
     idioms = fetch_idioms(config['idioms_ver'])
-
+    idiom2subwords = T.idiom2subwords(idioms, tokenizer, config['k'])
     # choose the model to train
     if config['model'] == Alpha.name():
-        rd = Alpha(mlm,
+        rd = Alpha(mlm, idiom2subwords, config['k'], config['lr'])
     elif config['model'] == Gamma.name():
-        rd = Gamma(mlm,
+        rd = Gamma(mlm, idiom2subwords, config['k'], config['lr'])
     else:
         raise ValueError
     # prepare datamodule
     datamodule = IdiomifyDataModule(config, tokenizer, idioms)

-    with wandb.init(entity=
+    with wandb.init(entity="eubinecto", project="idiomify-demo", config=config) as run:
         logger = WandbLogger(log_model=False)
         trainer = pl.Trainer(max_epochs=config['max_epochs'],
                              fast_dev_run=config['fast_dev_run'],
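With the positional entity argument dropped, the W&B entity and project are now fixed inside wandb.init, so training is launched with only the optional flags; for example (assuming W&B credentials are already configured locally):

python3 main_train.py --model alpha --ver eng2eng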