Spaces:

skar0
/

shakespeare-demo

Runtime error

App Files Community

skar0 committed on Feb 22, 2023

Commit

4e46e20

1 Parent(s): 24dc1da

Removed dataclasses from requirements

Browse files

Files changed (3) hide show

attention_replication.py +2 -2
requirements.txt +147 -154
transformer_replication.py +16 -13

attention_replication.py CHANGED Viewed

@@ -1,9 +1,9 @@
 # %%
 import torch as t
 import torch.nn as nn
-from typing import Union, List
 from fancy_einsum import einsum
-from einops import repeat, rearrange, reduce
 import numpy as np
 #%%
 def single_head_attention(Q: t.Tensor, K: t.Tensor, V: t.Tensor) -> t.Tensor:

 # %%
 import torch as t
 import torch.nn as nn
+from typing import Union
 from fancy_einsum import einsum
+from einops import repeat, rearrange
 import numpy as np
 #%%
 def single_head_attention(Q: t.Tensor, K: t.Tensor, V: t.Tensor) -> t.Tensor:

requirements.txt CHANGED Viewed

@@ -1,168 +1,161 @@
-aiofiles @ file:///home/conda/feedstock_root/build_artifacts/aiofiles_1676402724025/work
-aiohttp @ file:///home/conda/feedstock_root/build_artifacts/aiohttp_1676292661248/work
-aiosignal @ file:///home/conda/feedstock_root/build_artifacts/aiosignal_1667935791922/work
-altair @ file:///home/conda/feedstock_root/build_artifacts/altair_1675180856922/work
-anyio @ file:///home/conda/feedstock_root/build_artifacts/anyio_1666191106763/work/dist
-argon2-cffi @ file:///home/conda/feedstock_root/build_artifacts/argon2-cffi_1640817743617/work
-argon2-cffi-bindings @ file:///home/conda/feedstock_root/build_artifacts/argon2-cffi-bindings_1666850768662/work
-asttokens @ file:///home/conda/feedstock_root/build_artifacts/asttokens_1670263926556/work
-async-timeout @ file:///home/conda/feedstock_root/build_artifacts/async-timeout_1640026696943/work
-attrs @ file:///home/conda/feedstock_root/build_artifacts/attrs_1671632566681/work
-backcall @ file:///home/conda/feedstock_root/build_artifacts/backcall_1592338393461/work
-backports.functools-lru-cache @ file:///home/conda/feedstock_root/build_artifacts/backports.functools_lru_cache_1618230623929/work
-beautifulsoup4 @ file:///home/conda/feedstock_root/build_artifacts/beautifulsoup4_1675252249248/work
-bleach @ file:///home/conda/feedstock_root/build_artifacts/bleach_1674535352125/work
-brotlipy @ file:///home/conda/feedstock_root/build_artifacts/brotlipy_1666764671472/work
 certifi==2022.12.7
-cffi @ file:///home/conda/feedstock_root/build_artifacts/cffi_1671179353105/work
-charset-normalizer @ file:///home/conda/feedstock_root/build_artifacts/charset-normalizer_1661170624537/work
-click @ file:///home/conda/feedstock_root/build_artifacts/click_1666798198223/work
-colorama @ file:///home/conda/feedstock_root/build_artifacts/colorama_1666700638685/work
-comm @ file:///home/conda/feedstock_root/build_artifacts/comm_1670575068857/work
-contourpy @ file:///home/conda/feedstock_root/build_artifacts/contourpy_1673633665736/work
-cryptography @ file:///home/conda/feedstock_root/build_artifacts/cryptography-split_1675828607645/work
-cycler @ file:///home/conda/feedstock_root/build_artifacts/cycler_1635519461629/work
-dataclasses @ file:///home/conda/feedstock_root/build_artifacts/dataclasses_1628958434797/work
-datasets @ file:///home/conda/feedstock_root/build_artifacts/datasets_1674838636692/work
-debugpy @ file:///home/conda/feedstock_root/build_artifacts/debugpy_1674522362098/work
-decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1641555617451/work
-defusedxml @ file:///home/conda/feedstock_root/build_artifacts/defusedxml_1615232257335/work
-dill @ file:///home/conda/feedstock_root/build_artifacts/dill_1666603105584/work
-einops @ file:///home/conda/feedstock_root/build_artifacts/einops_1670600230829/work
-entrypoints @ file:///home/conda/feedstock_root/build_artifacts/entrypoints_1643888246732/work
-executing @ file:///home/conda/feedstock_root/build_artifacts/executing_1667317341051/work
 fancy-einsum==0.0.3
-fastapi @ file:///home/conda/feedstock_root/build_artifacts/fastapi_1676407540585/work
-fastjsonschema @ file:///home/conda/feedstock_root/build_artifacts/python-fastjsonschema_1663619548554/work/dist
-ffmpy @ file:///home/conda/feedstock_root/build_artifacts/ffmpy_1659474992694/work
-filelock @ file:///home/conda/feedstock_root/build_artifacts/filelock_1672354931606/work
-Flask @ file:///home/conda/feedstock_root/build_artifacts/flask_1676592993069/work
-flit_core @ file:///home/conda/feedstock_root/build_artifacts/flit-core_1667734568827/work/source/flit_core
-fonttools @ file:///home/conda/feedstock_root/build_artifacts/fonttools_1666827107856/work
-frozenlist @ file:///home/conda/feedstock_root/build_artifacts/frozenlist_1667935435842/work
-fsspec @ file:///home/conda/feedstock_root/build_artifacts/fsspec_1674184942191/work
-gradio @ file:///home/conda/feedstock_root/build_artifacts/gradio_1676897693557/work
-h11 @ file:///home/conda/feedstock_root/build_artifacts/h11_1664132893548/work
-h2 @ file:///home/conda/feedstock_root/build_artifacts/h2_1634280454336/work
 hpack==4.0.0
-httpcore @ file:///home/conda/feedstock_root/build_artifacts/httpcore_1671551055614/work
-httpx @ file:///home/conda/feedstock_root/build_artifacts/httpx_1672850625594/work
-huggingface-hub @ file:///home/conda/feedstock_root/build_artifacts/huggingface_hub_1676642337813/work
-hyperframe @ file:///home/conda/feedstock_root/build_artifacts/hyperframe_1619110129307/work
-idna @ file:///home/conda/feedstock_root/build_artifacts/idna_1663625384323/work
-importlib-metadata @ file:///home/conda/feedstock_root/build_artifacts/importlib-metadata_1672612343532/work
-importlib-resources @ file:///home/conda/feedstock_root/build_artifacts/importlib_resources_1676919000169/work
-ipykernel @ file:///home/conda/feedstock_root/build_artifacts/ipykernel_1676322140253/work
-ipython @ file:///home/conda/feedstock_root/build_artifacts/ipython_1676047456691/work
 ipython-genutils==0.2.0
-ipywidgets @ file:///home/conda/feedstock_root/build_artifacts/ipywidgets_1671720089366/work
-itsdangerous @ file:///home/conda/feedstock_root/build_artifacts/itsdangerous_1648147185463/work
-jedi @ file:///home/conda/feedstock_root/build_artifacts/jedi_1669134318875/work
-Jinja2 @ file:///home/conda/feedstock_root/build_artifacts/jinja2_1654302431367/work
-joblib @ file:///home/conda/feedstock_root/build_artifacts/joblib_1663332044897/work
-jsonschema @ file:///home/conda/feedstock_root/build_artifacts/jsonschema-meta_1669810440410/work
-jupyter @ file:///home/conda/feedstock_root/build_artifacts/jupyter_1670249595582/work
-jupyter-console @ file:///home/conda/feedstock_root/build_artifacts/jupyter_console_1676328545892/work
-jupyter-events @ file:///home/conda/feedstock_root/build_artifacts/jupyter_events_1673559782596/work
-jupyter_client @ file:///home/conda/feedstock_root/build_artifacts/jupyter_client_1676579893731/work
-jupyter_core @ file:///home/conda/feedstock_root/build_artifacts/jupyter_core_1675109846004/work
-jupyter_server @ file:///home/conda/feedstock_root/build_artifacts/jupyter_server_1676476189852/work
-jupyter_server_terminals @ file:///home/conda/feedstock_root/build_artifacts/jupyter_server_terminals_1673491454549/work
-jupyterlab-pygments @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_pygments_1649936611996/work
-jupyterlab-widgets @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_widgets_1671722028097/work
-kiwisolver @ file:///home/conda/feedstock_root/build_artifacts/kiwisolver_1666805701884/work
-linkify-it-py @ file:///home/conda/feedstock_root/build_artifacts/linkify-it-py_1651923627081/work
-markdown-it-py @ file:///home/conda/feedstock_root/build_artifacts/markdown-it-py_1650305363826/work
-MarkupSafe @ file:///home/conda/feedstock_root/build_artifacts/markupsafe_1674135787083/work
-matplotlib @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-suite_1676406361850/work
-matplotlib-inline @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-inline_1660814786464/work
-mdit-py-plugins @ file:///home/conda/feedstock_root/build_artifacts/mdit-py-plugins_1670348296204/work
-mdurl @ file:///home/conda/feedstock_root/build_artifacts/mdurl_1639515908913/work
-mistune @ file:///home/conda/feedstock_root/build_artifacts/mistune_1675771498296/work
-multidict @ file:///home/conda/feedstock_root/build_artifacts/multidict_1672339403932/work
-multiprocess @ file:///home/conda/feedstock_root/build_artifacts/multiprocess_1666932878376/work
 munkres==1.1.4
-nbclassic @ file:///home/conda/feedstock_root/build_artifacts/nbclassic_1676729186918/work
-nbclient @ file:///home/conda/feedstock_root/build_artifacts/nbclient_1669795076334/work
-nbconvert @ file:///home/conda/feedstock_root/build_artifacts/nbconvert-meta_1674590374792/work
-nbformat @ file:///home/conda/feedstock_root/build_artifacts/nbformat_1673560067442/work
-nest-asyncio @ file:///home/conda/feedstock_root/build_artifacts/nest-asyncio_1664684991461/work
-notebook @ file:///home/conda/feedstock_root/build_artifacts/notebook_1667565639349/work
-notebook_shim @ file:///home/conda/feedstock_root/build_artifacts/notebook-shim_1667478401171/work
-numpy @ file:///home/conda/feedstock_root/build_artifacts/numpy_1675642512762/work
-orjson @ file:///home/conda/feedstock_root/build_artifacts/orjson_1673484660945/work/target/wheels/orjson-3.8.5-cp310-cp310-linux_x86_64.whl
-packaging @ file:///home/conda/feedstock_root/build_artifacts/packaging_1673482170163/work
 pandas==1.5.3
-pandocfilters @ file:///home/conda/feedstock_root/build_artifacts/pandocfilters_1631603243851/work
-parso @ file:///home/conda/feedstock_root/build_artifacts/parso_1638334955874/work
-pexpect @ file:///home/conda/feedstock_root/build_artifacts/pexpect_1667297516076/work
-pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1602536217715/work
-Pillow @ file:///home/conda/feedstock_root/build_artifacts/pillow_1675487172403/work
-pkgutil_resolve_name @ file:///home/conda/feedstock_root/build_artifacts/pkgutil-resolve-name_1633981968097/work
-platformdirs @ file:///home/conda/feedstock_root/build_artifacts/platformdirs_1675735718929/work
 ply==3.11
-prometheus-client @ file:///home/conda/feedstock_root/build_artifacts/prometheus_client_1674535637125/work
-prompt-toolkit @ file:///home/conda/feedstock_root/build_artifacts/prompt-toolkit_1670414775770/work
-psutil @ file:///home/conda/feedstock_root/build_artifacts/psutil_1667885877572/work
-ptyprocess @ file:///home/conda/feedstock_root/build_artifacts/ptyprocess_1609419310487/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl
-pure-eval @ file:///home/conda/feedstock_root/build_artifacts/pure_eval_1642875951954/work
-pyarrow==11.0.0
-pycparser @ file:///home/conda/feedstock_root/build_artifacts/pycparser_1636257122734/work
-pycryptodome @ file:///home/conda/feedstock_root/build_artifacts/pycryptodome_1669581639515/work
-pydantic @ file:///home/conda/feedstock_root/build_artifacts/pydantic_1676531650626/work
-pydub @ file:///home/conda/feedstock_root/build_artifacts/pydub_1615612442567/work
-Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1672682006896/work
-pyOpenSSL @ file:///home/conda/feedstock_root/build_artifacts/pyopenssl_1672659226110/work
-pyparsing @ file:///home/conda/feedstock_root/build_artifacts/pyparsing_1652235407899/work
 PyQt5==5.15.7
 PyQt5-sip==12.11.0
-pyrsistent @ file:///home/conda/feedstock_root/build_artifacts/pyrsistent_1672681463845/work
-PySocks @ file:///home/conda/feedstock_root/build_artifacts/pysocks_1661604839144/work
-python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/python-dateutil_1626286286081/work
-python-json-logger @ file:///home/conda/feedstock_root/build_artifacts/python-json-logger_1676401516590/work
 python-multipart==0.0.5
-pytz @ file:///home/conda/feedstock_root/build_artifacts/pytz_1673864280276/work
-PyYAML @ file:///home/conda/feedstock_root/build_artifacts/pyyaml_1666772395347/work
-pyzmq @ file:///home/conda/feedstock_root/build_artifacts/pyzmq_1673612669255/work
-qtconsole @ file:///home/conda/feedstock_root/build_artifacts/qtconsole-base_1667404144336/work
-QtPy @ file:///home/conda/feedstock_root/build_artifacts/qtpy_1667873092748/work
-regex @ file:///home/conda/feedstock_root/build_artifacts/regex_1667265033016/work
-requests @ file:///home/conda/feedstock_root/build_artifacts/requests_1673863902341/work
-responses @ file:///home/conda/feedstock_root/build_artifacts/responses_1643839609465/work
-rfc3339-validator @ file:///home/conda/feedstock_root/build_artifacts/rfc3339-validator_1638811747357/work
-rfc3986 @ file:///home/conda/feedstock_root/build_artifacts/rfc3986_1620442452971/work
-rfc3986-validator @ file:///home/conda/feedstock_root/build_artifacts/rfc3986-validator_1598024191506/work
-sacremoses @ file:///home/conda/feedstock_root/build_artifacts/sacremoses_1651557636210/work
-Send2Trash @ file:///home/conda/feedstock_root/build_artifacts/send2trash_1628511208346/work
-sip @ file:///home/conda/feedstock_root/build_artifacts/sip_1675696581052/work
-six @ file:///home/conda/feedstock_root/build_artifacts/six_1620240208055/work
-sniffio @ file:///home/conda/feedstock_root/build_artifacts/sniffio_1662051266223/work
-soupsieve @ file:///home/conda/feedstock_root/build_artifacts/soupsieve_1658207591808/work
-stack-data @ file:///home/conda/feedstock_root/build_artifacts/stack_data_1669632077133/work
-starlette @ file:///home/conda/feedstock_root/build_artifacts/starlette-recipe_1676402644778/work
-terminado @ file:///home/conda/feedstock_root/build_artifacts/terminado_1670253674810/work
-tinycss2 @ file:///home/conda/feedstock_root/build_artifacts/tinycss2_1666100256010/work
-tokenizers @ file:///home/conda/feedstock_root/build_artifacts/tokenizers_1674690844352/work/bindings/python
-toml @ file:///home/conda/feedstock_root/build_artifacts/toml_1604308577558/work
-toolz @ file:///home/conda/feedstock_root/build_artifacts/toolz_1657485559105/work
 torch==1.13.1
 torchaudio==0.13.1
 torchvision==0.14.1
-tornado @ file:///home/conda/feedstock_root/build_artifacts/tornado_1666788589303/work
-tqdm @ file:///home/conda/feedstock_root/build_artifacts/tqdm_1662214488106/work
-traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1675110562325/work
-transformers @ file:///home/conda/feedstock_root/build_artifacts/transformers_1676091074773/work
-typing_extensions @ file:///home/conda/feedstock_root/build_artifacts/typing_extensions_1665144421445/work
-uc-micro-py @ file:///home/conda/feedstock_root/build_artifacts/uc-micro-py_1608058642472/work
-unicodedata2 @ file:///home/conda/feedstock_root/build_artifacts/unicodedata2_1667239886688/work
-urllib3 @ file:///home/conda/feedstock_root/build_artifacts/urllib3_1673452138552/work
-uvicorn @ file:///home/conda/feedstock_root/build_artifacts/uvicorn-split_1669234664979/work
-wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1673864653149/work
 webencodings==0.5.1
-websocket-client @ file:///home/conda/feedstock_root/build_artifacts/websocket-client_1675567828044/work
-websockets @ file:///home/conda/feedstock_root/build_artifacts/websockets_1666806213473/work
-Werkzeug @ file:///home/conda/feedstock_root/build_artifacts/werkzeug_1676411946679/work
-widgetsnbextension @ file:///home/conda/feedstock_root/build_artifacts/widgetsnbextension_1672066693230/work
-xxhash @ file:///home/conda/feedstock_root/build_artifacts/python-xxhash_1672695020159/work
-yarl @ file:///home/conda/feedstock_root/build_artifacts/yarl_1672340954791/work
-zipp @ file:///home/conda/feedstock_root/build_artifacts/zipp_1676708471276/work

+aiofiles==23.1.0
+aiohttp==3.8.4
+aiosignal==1.3.1
+altair==4.2.2
+anyio==3.6.2
+argon2-cffi==21.3.0
+argon2-cffi-bindings==21.2.0
+asttokens==2.2.1
+async-timeout==4.0.2
+attrs==22.2.0
+backcall==0.2.0
+backports.functools-lru-cache==1.6.4
+beautifulsoup4==4.11.2
+bleach==6.0.0
+brotlipy==0.7.0
 certifi==2022.12.7
+cffi==1.15.1
+charset-normalizer==2.1.1
+click==8.1.3
+comm==0.1.2
+contourpy==1.0.7
+cryptography==39.0.1
+cycler==0.11.0
+debugpy==1.6.6
+decorator==5.1.1
+defusedxml==0.7.1
+einops==0.6.0
+entrypoints==0.4
+executing==1.2.0
 fancy-einsum==0.0.3
+fastapi==0.92.0
+fastjsonschema==2.16.2
+ffmpy==0.3.0
+filelock==3.9.0
+Flask==2.2.3
+flit_core==3.8.0
+fonttools==4.38.0
+frozenlist==1.3.3
+fsspec==2023.1.0
+gradio==3.19.1
+h11==0.14.0
+h2==4.1.0
 hpack==4.0.0
+httpcore==0.16.3
+httpx==0.23.3
+huggingface-hub==0.12.1
+hyperframe==6.0.1
+idna==3.4
+importlib-metadata==6.0.0
+importlib-resources==5.12.0
+ipykernel==6.21.2
+ipython==8.10.0
 ipython-genutils==0.2.0
+ipywidgets==8.0.4
+itsdangerous==2.1.2
+jedi==0.18.2
+Jinja2==3.1.2
+jsonschema==4.17.3
+jupyter==1.0.0
+jupyter_client==8.0.3
+jupyter-console==6.5.1
+jupyter_core==5.2.0
+jupyter-events==0.6.3
+jupyter_server==2.3.0
+jupyter_server_terminals==0.4.4
+jupyterlab-pygments==0.2.2
+jupyterlab-widgets==3.0.5
+kiwisolver==1.4.4
+linkify-it-py==2.0.0
+markdown-it-py==2.1.0
+MarkupSafe==2.1.2
+matplotlib==3.7.0
+matplotlib-inline==0.1.6
+mdit-py-plugins==0.3.3
+mdurl==0.1.0
+mistune==2.0.5
+multidict==6.0.4
 munkres==1.1.4
+nbclassic==0.5.2
+nbclient==0.7.2
+nbconvert==7.2.9
+nbformat==5.7.3
+nest-asyncio==1.5.6
+notebook==6.5.2
+notebook_shim==0.2.2
+numpy==1.24.2
+orjson==3.8.5
+packaging==23.0
 pandas==1.5.3
+pandocfilters==1.5.0
+parso==0.8.3
+pexpect==4.8.0
+pickleshare==0.7.5
+Pillow==9.4.0
+pip==23.0.1
+pkgutil_resolve_name==1.3.10
+platformdirs==3.0.0
 ply==3.11
+prometheus-client==0.16.0
+prompt-toolkit==3.0.36
+psutil==5.9.4
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pycparser==2.21
+pycryptodome==3.16.0
+pydantic==1.10.5
+pydub==0.25.1
+Pygments==2.14.0
+pyOpenSSL==23.0.0
+pyparsing==3.0.9
 PyQt5==5.15.7
 PyQt5-sip==12.11.0
+pyrsistent==0.19.3
+PySocks==1.7.1
+python-dateutil==2.8.2
+python-json-logger==2.0.6
 python-multipart==0.0.5
+pytz==2022.7.1
+PyYAML==6.0
+pyzmq==25.0.0
+qtconsole==5.4.0
+QtPy==2.3.0
+regex==2022.10.31
+requests==2.28.2
+rfc3339-validator==0.1.4
+rfc3986==1.5.0
+rfc3986-validator==0.1.1
+Send2Trash==1.8.0
+setuptools==67.3.2
+sip==6.7.7
+six==1.16.0
+sniffio==1.3.0
+soupsieve==2.3.2.post1
+stack-data==0.6.2
+starlette==0.25.0
+terminado==0.17.1
+tinycss2==1.2.1
+tokenizers==0.13.2
+toml==0.10.2
+toolz==0.12.0
 torch==1.13.1
 torchaudio==0.13.1
 torchvision==0.14.1
+tornado==6.2
+tqdm==4.64.1
+traitlets==5.9.0
+transformers==4.26.1
+typing_extensions==4.4.0
+uc-micro-py==1.0.1
+unicodedata2==15.0.0
+urllib3==1.26.14
+uvicorn==0.20.0
+wcwidth==0.2.6
 webencodings==0.5.1
+websocket-client==1.5.1
+websockets==10.4
+Werkzeug==2.2.3
+wheel==0.38.4
+widgetsnbextension==4.0.5
+yarl==1.8.2
+zipp==3.14.0

transformer_replication.py CHANGED Viewed

@@ -5,13 +5,7 @@ import torch.nn as nn
 from typing import Union, List
 from fancy_einsum import einsum
 import torch as t
-from torch import nn
-from torchvision import datasets, transforms
-from torch.utils.data import DataLoader
-from typing import Union, Optional, Callable, Tuple
-import numpy as np
-from einops import rearrange
-import time
 # %%
 tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
 if __name__ == "__main__":
@@ -90,9 +84,6 @@ class LayerNorm(nn.Module):
         pass
 # %%
-from dataclasses import dataclass
-@dataclass(frozen=True)
 class TransformerConfig:
     '''Constants used throughout your decoder-only transformer model.'''
@@ -101,10 +92,22 @@ class TransformerConfig:
     vocab_size: int
     hidden_size: int
     max_seq_len: int
-    dropout: float = 0.1
-    layer_norm_epsilon: float = 1e-05
 # %%
-import attention_replication
 class BertMLP(nn.Module):
     def __init__(self, config: TransformerConfig):

 from typing import Union, List
 from fancy_einsum import einsum
 import torch as t
+import attention_replication
 # %%
 tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
 if __name__ == "__main__":
         pass
 # %%
 class TransformerConfig:
     '''Constants used throughout your decoder-only transformer model.'''
     vocab_size: int
     hidden_size: int
     max_seq_len: int
+    dropout: float
+    layer_norm_epsilon: float
+    def __init__(
+        self, num_layers, num_heads, vocab_size, hidden_size, max_seq_len,
+        dropout=0.1, layer_norm_epsilon=1e-5,
+    ) -> None:
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.max_seq_len = max_seq_len
+        self.dropout = dropout
+        self.layer_norm_epsilon = layer_norm_epsilon
 # %%
 class BertMLP(nn.Module):
     def __init__(self, config: TransformerConfig):