skar0 committed
Commit 4c2c4e8 · 1 Parent(s): 8c79c01

Initial commit

Files changed (11)
  1. 100-0.txt +0 -0
  2. Dockerfile +11 -0
  3. Procfile +1 -0
  4. app.py +13 -0
  5. attention_replication.py +156 -0
  6. config.yaml +61 -0
  7. env.yaml +406 -0
  8. sampling.py +239 -0
  9. shakespeare_demo.py +105 -0
  10. transformer_replication.py +183 -0
  11. word_data.py +100 -0
100-0.txt ADDED
The diff for this file is too large to render. See raw diff
 
Dockerfile ADDED
@@ -0,0 +1,11 @@
+ # Create environment
+ FROM mambaorg/micromamba:1.3.1
+ COPY --chown=$MAMBA_USER:$MAMBA_USER env.yaml /tmp/env.yaml
+ RUN micromamba install --yes --file /tmp/env.yaml && \
+     micromamba clean --all --yes
+
+ # Run app (CMD rather than RUN, so the server starts at container runtime instead of blocking the build)
+ COPY . /app/
+ WORKDIR /app/
+ ARG MAMBA_DOCKERFILE_ACTIVATE=1
+ CMD ["python", "app.py"]
Procfile ADDED
@@ -0,0 +1 @@
+ web: gunicorn app:app
app.py ADDED
@@ -0,0 +1,13 @@
+ from flask import Flask
+ import os
+ from shakespeare_demo import make_demo
+
+ app = Flask(__name__)
+
+ @app.route("/")
+ def hello_world():
+     return make_demo()
+
+ if __name__ == "__main__":
+     port = int(os.environ.get('PORT', 5999))
+     app.run(debug=True, host='0.0.0.0', port=port)
attention_replication.py ADDED
@@ -0,0 +1,156 @@
+ # %%
+ import torch as t
+ import torch.nn as nn
+ from fancy_einsum import einsum
+ from einops import repeat, rearrange
+ import numpy as np
+ # %%
+ def single_head_attention(Q: t.Tensor, K: t.Tensor, V: t.Tensor) -> t.Tensor:
+     '''
+     Should return the results of self-attention (see the "Self-Attention in Detail" section of the Illustrated Transformer).
+
+     With this function, you can ignore masking.
+
+     Q: shape (batches x seq_Q x head_size)
+     K: shape (batches x seq_K x head_size)
+     V: shape (batches x seq_K x head_size)
+
+     Return: shape (batches x seq_Q x head_size)
+     '''
+     attention_scores = einsum('batches seq_Q head_size, batches seq_K head_size -> batches seq_Q seq_K', Q, K)
+     # No masking here; scale by sqrt(head_size) before the softmax
+     attention_probabilities = nn.functional.softmax(attention_scores / np.sqrt(Q.shape[-1]), dim=2)
+     attention_values = einsum('batches seq_Q seq_K, batches seq_K head_size -> batches seq_Q head_size', attention_probabilities, V)
+     return attention_values
+
+ def test_single_head_attention_shape(single_head_attention):
+     Q = t.randn(1, 3, 2)
+     K = t.randn(1, 5, 2)
+     V = t.randn(1, 5, 2)
+     attention_values = single_head_attention(Q, K, V)
+     assert Q.shape == attention_values.shape
+     print("All tests in `test_single_head_attention_shape` passed.")
+
+ def test_single_head_attention(single_head_attention):
+     Q = t.tensor([[[7, 4, 1], [6, 3, 0], [5, 2, 1]]])
+     K = t.tensor([[[1, 3, 5], [2, 4, 6]]])
+     V = t.tensor([[[1, 0, 1], [0, 1, 0]]])
+     attention_values = single_head_attention(Q.float(), K.float(), V.float())
+     t.testing.assert_close(attention_values, t.tensor([[[9.7880e-04, 9.9902e-01, 9.7880e-04], [5.5073e-03, 9.9449e-01, 5.5073e-03], [9.7682e-03, 9.9023e-01, 9.7682e-03]]]), rtol=0.01, atol=0.001)
+     print("All tests in `test_single_head_attention` passed.")
+
+ if __name__ == "__main__":
+     test_single_head_attention_shape(single_head_attention)
+     test_single_head_attention(single_head_attention)
+ # %%
+ def single_head_masked_attention(Q: t.Tensor, K: t.Tensor, V: t.Tensor) -> t.Tensor:
+     '''
+     Should return the results of masked self-attention.
+
+     See "The Decoder Side" section of the Illustrated Transformer for an explanation of masking.
+
+     Q: shape (batches x seq_Q x head_size)
+     K: shape (batches x seq_K x head_size)
+     V: shape (batches x seq_K x head_size)
+
+     Return: shape (batches x seq_Q x head_size)
+     '''
+     attention_scores = einsum('batches seq_Q head_size, batches seq_K head_size -> batches seq_Q seq_K', Q, K)
+     batches, seq_Q, head_size = Q.shape
+     batches, seq_K, head_size = K.shape
+
+     # Causal mask: query position q may only attend to key positions k <= q
+     q_index = repeat(t.arange(0, seq_Q), 'q -> b q k', b=batches, k=seq_K)
+     k_index = repeat(t.arange(0, seq_K), 'k -> b q k', b=batches, q=seq_Q)
+     mask = k_index <= q_index
+     attention_scores = t.where(mask, attention_scores, -t.inf)
+     attention_probabilities = nn.functional.softmax(attention_scores / np.sqrt(Q.shape[-1]), dim=2)
+     attention_values = einsum('batches seq_Q seq_K, batches seq_K head_size -> batches seq_Q head_size', attention_probabilities, V)
+     return attention_values
+
+ def test_single_head_masked_attention(single_head_masked_attention):
+     Q = t.tensor([[[7, 4, 1], [6, 3, 0], [5, 2, 1]]])
+     K = t.tensor([[[1, 3, 5], [2, 4, 6]]])
+     V = t.tensor([[[1, 0, 1], [0, 1, 0]]])
+     attention_values = single_head_masked_attention(Q.float(), K.float(), V.float())
+     t.testing.assert_close(attention_values, t.tensor([[[1, 0, 1], [5.5073e-03, 9.9449e-01, 5.5073e-03], [9.7682e-03, 9.9023e-01, 9.7682e-03]]]), rtol=0.01, atol=0.001)
+     print("All tests in `test_single_head_masked_attention` passed.")
+
+ if __name__ == "__main__":
+     test_single_head_attention_shape(single_head_masked_attention)
+     test_single_head_masked_attention(single_head_masked_attention)
+ # %%
+ def multihead_masked_attention(Q: t.Tensor, K: t.Tensor, V: t.Tensor, num_heads: int):
+     '''
+     Implements multihead masked attention on the matrices Q, K and V.
+
+     Q: shape (batch, seq, nheads*headsize)
+     K: shape (batch, seq, nheads*headsize)
+     V: shape (batch, seq, nheads*headsize)
+
+     returns: shape (batch, seq, nheads*headsize)
+     '''
+     # Split the last dimension into separate heads
+     new_Q = rearrange(Q, 'batch seq (nheads headsize) -> batch nheads seq headsize', nheads=num_heads)
+     new_K = rearrange(K, 'batch seq (nheads headsize) -> batch nheads seq headsize', nheads=num_heads)
+     new_V = rearrange(V, 'batch seq (nheads headsize) -> batch nheads seq headsize', nheads=num_heads)
+
+     attention_scores = einsum('batches nheads seq_Q head_size, batches nheads seq_K head_size -> batches nheads seq_Q seq_K', new_Q, new_K)
+     batches, _, seq_Q, head_size = new_Q.shape
+     batches, _, seq_K, head_size = new_K.shape
+     q_index = repeat(t.arange(0, seq_Q), 'seq_Q -> batches nheads seq_Q seq_K', batches=batches, seq_K=seq_K, nheads=num_heads)
+     k_index = repeat(t.arange(0, seq_K), 'seq_K -> batches nheads seq_Q seq_K', batches=batches, seq_Q=seq_Q, nheads=num_heads)
+     mask = k_index <= q_index
+     # Move the mask and the -inf fill value to the same device as the inputs
+     device_inf = t.tensor(-np.inf).to(Q.device)
+     device_mask = mask.to(Q.device)
+     masked_attention_scores = t.where(device_mask, attention_scores, device_inf)
+     attention_probabilities = nn.functional.softmax(masked_attention_scores / np.sqrt(head_size), dim=-1)
+     attention_values = einsum('batches nheads seq_Q seq_K, batches nheads seq_K head_size -> batches seq_Q nheads head_size', attention_probabilities, new_V)
+     # Concatenate the heads back together
+     return rearrange(attention_values, 'batches seq_Q nheads head_size -> batches seq_Q (nheads head_size)')
+
+ def test_multihead_masked_attention(multihead_masked_attention):
+     Q = t.tensor([[[7, 4, 1], [6, 3, 0], [5, 2, 1]]])
+     K = t.tensor([[[1, 3, 5], [2, 4, 6]]])
+     V = t.tensor([[[1, 0, 1], [0, 1, 0]]])
+     attention_values = multihead_masked_attention(Q.float(), K.float(), V.float(), num_heads=1)
+     t.testing.assert_close(attention_values, t.tensor([[[1, 0, 1], [5.5073e-03, 9.9449e-01, 5.5073e-03], [9.7682e-03, 9.9023e-01, 9.7682e-03]]]), rtol=0.01, atol=0.001)
+     print("All tests in `test_multihead_masked_attention` passed.")
+
+ if __name__ == "__main__":
+     test_multihead_masked_attention(multihead_masked_attention)
+ # %%
+ class MultiheadMaskedAttention(nn.Module):
+     W_QKV: nn.Linear
+     W_O: nn.Linear
+
+     def __init__(self, hidden_size: int, num_heads: int):
+         super().__init__()
+         self.hidden_size = hidden_size
+         self.num_heads = num_heads
+         assert self.hidden_size % self.num_heads == 0
+         self.W_QKV = nn.Linear(hidden_size, 3 * hidden_size)
+         self.W_O = nn.Linear(hidden_size, hidden_size)
+
+     def forward(self, x: t.Tensor) -> t.Tensor:
+         '''
+         x: shape (batch, seq, hidden_size)
+
+         Return: shape (batch, seq, hidden_size)
+         '''
+         # A single linear layer produces Q, K and V concatenated along the last dimension
+         QKV = self.W_QKV(x)
+         Q = QKV[..., :self.hidden_size]
+         K = QKV[..., self.hidden_size:-self.hidden_size]
+         V = QKV[..., -self.hidden_size:]
+         attention_values = multihead_masked_attention(Q, K, V, self.num_heads)
+         return self.W_O(attention_values)
+ # %%
+ def test_MultiheadMaskedAttention_shape(MultiheadMaskedAttention):
+     mma = MultiheadMaskedAttention(1, 1)
+     x = t.randn(2, 7, 1)
+     output = mma.forward(x)
+     assert x.shape == output.shape
+     print("All tests in `test_MultiheadMaskedAttention_shape` passed.")
+
+ if __name__ == "__main__":
+     test_MultiheadMaskedAttention_shape(MultiheadMaskedAttention)
+ # %%
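+ # Illustrative extra check (a sketch, not part of the original commit's tests):
+ # the shape test above only uses num_heads=1, so this exercises the head
+ # split/merge path with num_heads > 1, assuming hidden_size is divisible by
+ # num_heads as asserted in __init__.
+ if __name__ == "__main__":
+     mma = MultiheadMaskedAttention(hidden_size=8, num_heads=2)
+     x = t.randn(3, 5, 8)
+     assert mma(x).shape == x.shape
+     print("Multi-head shape check passed.")
+ # %%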
config.yaml ADDED
@@ -0,0 +1,61 @@
+ wandb_version: 1
+
+ _wandb:
+   desc: null
+   value:
+     cli_version: 0.13.5
+     framework: huggingface
+     huggingface_version: 4.24.0
+     is_jupyter_run: true
+     is_kaggle_kernel: false
+     python_version: 3.10.6
+     start_time: 1668083783.928274
+     t:
+       1:
+       - 1
+       - 11
+       - 41
+       - 49
+       - 55
+       2:
+       - 1
+       - 11
+       - 41
+       - 49
+       - 55
+       3:
+       - 1
+       - 2
+       - 3
+       - 23
+       - 37
+       4: 3.10.6
+       5: 0.13.5
+       6: 4.24.0
+       8:
+       - 1
+       - 5
+ batch_size:
+   desc: null
+   value: 64
+ dropout:
+   desc: null
+   value: 0.1
+ epochs:
+   desc: null
+   value: 2
+ hidden_size:
+   desc: null
+   value: 512
+ lr:
+   desc: null
+   value: 0.001
+ max_seq_len:
+   desc: null
+   value: 60
+ num_heads:
+   desc: null
+   value: 8
+ num_layers:
+   desc: null
+   value: 6
env.yaml ADDED
@@ -0,0 +1,406 @@
+ name: base
+ channels:
+ - pytorch
+ - nvidia
+ - conda-forge
+ dependencies:
+ - _libgcc_mutex=0.1=conda_forge
+ - _openmp_mutex=4.5=2_kmp_llvm
+ - aiofiles=23.1.0=pyhd8ed1ab_0
+ - aiohttp=3.8.4=py310h1fa729e_0
+ - aiosignal=1.3.1=pyhd8ed1ab_0
+ - alsa-lib=1.2.8=h166bdaf_0
+ - altair=4.2.2=pyhd8ed1ab_0
+ - anyio=3.6.2=pyhd8ed1ab_0
+ - argon2-cffi=21.3.0=pyhd8ed1ab_0
+ - argon2-cffi-bindings=21.2.0=py310h5764c6d_3
+ - arrow-cpp=11.0.0=ha770c72_4_cpu
+ - asttokens=2.2.1=pyhd8ed1ab_0
+ - async-timeout=4.0.2=pyhd8ed1ab_0
+ - attr=2.5.1=h166bdaf_1
+ - attrs=22.2.0=pyh71513ae_0
+ - aws-c-auth=0.6.24=h565b4ff_2
+ - aws-c-cal=0.5.20=h679401e_5
+ - aws-c-common=0.8.10=h0b41bf4_0
+ - aws-c-compression=0.2.16=hbe6ad0c_2
+ - aws-c-event-stream=0.2.18=h489b7ba_4
+ - aws-c-http=0.7.4=hb2c4a47_0
+ - aws-c-io=0.13.15=head7655_1
+ - aws-c-mqtt=0.8.6=haf0be06_3
+ - aws-c-s3=0.2.4=h05be983_0
+ - aws-c-sdkutils=0.1.7=hbe6ad0c_2
+ - aws-checksums=0.1.14=hbe6ad0c_2
+ - aws-crt-cpp=0.19.7=h9b63b7c_3
+ - aws-sdk-cpp=1.10.57=hd557813_3
+ - backcall=0.2.0=pyh9f0ad1d_0
+ - backports=1.0=pyhd8ed1ab_3
+ - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0
+ - beautifulsoup4=4.11.2=pyha770c72_0
+ - blas=2.116=mkl
+ - blas-devel=3.9.0=16_linux64_mkl
+ - bleach=6.0.0=pyhd8ed1ab_0
+ - brotli=1.0.9=h166bdaf_8
+ - brotli-bin=1.0.9=h166bdaf_8
+ - brotlipy=0.7.0=py310h5764c6d_1005
+ - bzip2=1.0.8=h7f98852_4
+ - c-ares=1.18.1=h7f98852_0
+ - ca-certificates=2022.12.7=ha878542_0
+ - cairo=1.16.0=ha61ee94_1014
+ - certifi=2022.12.7=pyhd8ed1ab_0
+ - cffi=1.15.1=py310h255011f_3
+ - charset-normalizer=2.1.1=pyhd8ed1ab_0
+ - click=8.1.3=unix_pyhd8ed1ab_2
+ - colorama=0.4.6=pyhd8ed1ab_0
+ - comm=0.1.2=pyhd8ed1ab_0
+ - contourpy=1.0.7=py310hdf3cbec_0
+ - cryptography=39.0.1=py310h34c0648_0
+ - cuda=11.6.1=0
+ - cuda-cccl=11.6.55=hf6102b2_0
+ - cuda-command-line-tools=11.6.2=0
+ - cuda-compiler=11.6.2=0
+ - cuda-cudart=11.6.55=he381448_0
+ - cuda-cudart-dev=11.6.55=h42ad0f4_0
+ - cuda-cuobjdump=11.6.124=h2eeebcb_0
+ - cuda-cupti=11.6.124=h86345e5_0
+ - cuda-cuxxfilt=11.6.124=hecbf4f6_0
+ - cuda-driver-dev=11.6.55=0
+ - cuda-gdb=12.0.140=0
+ - cuda-libraries=11.6.1=0
+ - cuda-libraries-dev=11.6.1=0
+ - cuda-memcheck=11.8.86=0
+ - cuda-nsight=12.0.140=0
+ - cuda-nsight-compute=12.0.1=0
+ - cuda-nvcc=11.6.124=hbba6d2d_0
+ - cuda-nvdisasm=12.0.140=0
+ - cuda-nvml-dev=11.6.55=haa9ef22_0
+ - cuda-nvprof=12.0.146=0
+ - cuda-nvprune=11.6.124=he22ec0a_0
+ - cuda-nvrtc=11.6.124=h020bade_0
+ - cuda-nvrtc-dev=11.6.124=h249d397_0
+ - cuda-nvtx=11.6.124=h0630a44_0
+ - cuda-nvvp=12.0.146=0
+ - cuda-runtime=11.6.1=0
+ - cuda-samples=11.6.101=h8efea70_0
+ - cuda-sanitizer-api=12.0.140=0
+ - cuda-toolkit=11.6.1=0
+ - cuda-tools=11.6.1=0
+ - cuda-visual-tools=11.6.1=0
+ - cycler=0.11.0=pyhd8ed1ab_0
+ - dataclasses=0.8=pyhc8e2a94_3
+ - datasets=2.9.0=pyhd8ed1ab_0
+ - dbus=1.13.6=h5008d03_3
+ - debugpy=1.6.6=py310heca2aa9_0
+ - decorator=5.1.1=pyhd8ed1ab_0
+ - defusedxml=0.7.1=pyhd8ed1ab_0
+ - dill=0.3.6=pyhd8ed1ab_1
+ - einops=0.6.0=pyhd8ed1ab_0
+ - entrypoints=0.4=pyhd8ed1ab_0
+ - executing=1.2.0=pyhd8ed1ab_0
+ - expat=2.5.0=h27087fc_0
+ - fastapi=0.92.0=pyhd8ed1ab_0
+ - ffmpeg=4.3=hf484d3e_0
+ - ffmpy=0.3.0=pyhb6f538c_0
+ - fftw=3.3.10=nompi_hf0379b8_106
+ - filelock=3.9.0=pyhd8ed1ab_0
+ - flask=2.2.3=pyhd8ed1ab_0
+ - flit-core=3.8.0=pyhd8ed1ab_0
+ - font-ttf-dejavu-sans-mono=2.37=hab24e00_0
+ - font-ttf-inconsolata=3.000=h77eed37_0
+ - font-ttf-source-code-pro=2.038=h77eed37_0
+ - font-ttf-ubuntu=0.83=hab24e00_0
+ - fontconfig=2.14.2=h14ed4e7_0
+ - fonts-conda-ecosystem=1=0
+ - fonts-conda-forge=1=0
+ - fonttools=4.38.0=py310h5764c6d_1
+ - freetype=2.12.1=hca18f0e_1
+ - frozenlist=1.3.3=py310h5764c6d_0
+ - fsspec=2023.1.0=pyhd8ed1ab_0
+ - gds-tools=1.5.1.14=0
+ - gettext=0.21.1=h27087fc_0
+ - gflags=2.2.2=he1b5a44_1004
+ - glib=2.74.1=h6239696_1
+ - glib-tools=2.74.1=h6239696_1
+ - glog=0.6.0=h6f12383_0
+ - gmp=6.2.1=h58526e2_0
+ - gnutls=3.6.13=h85f3911_1
+ - gradio=3.19.1=pyhd8ed1ab_0
+ - graphite2=1.3.13=h58526e2_1001
+ - gst-plugins-base=1.22.0=h4243ec0_0
+ - gstreamer=1.22.0=h25f0c4b_0
+ - gstreamer-orc=0.4.33=h166bdaf_0
+ - h11=0.14.0=pyhd8ed1ab_0
+ - h2=4.1.0=pyhd8ed1ab_0
+ - harfbuzz=6.0.0=h8e241bc_0
+ - hpack=4.0.0=pyh9f0ad1d_0
+ - httpcore=0.16.3=pyhd8ed1ab_0
+ - httpx=0.23.3=pyhd8ed1ab_0
+ - huggingface_hub=0.12.1=pyhd8ed1ab_0
+ - hyperframe=6.0.1=pyhd8ed1ab_0
+ - icu=70.1=h27087fc_0
+ - idna=3.4=pyhd8ed1ab_0
+ - importlib-metadata=6.0.0=pyha770c72_0
+ - importlib_metadata=6.0.0=hd8ed1ab_0
+ - importlib_resources=5.12.0=pyhd8ed1ab_0
+ - ipykernel=6.21.2=pyh210e3f2_0
+ - ipython=8.10.0=pyh41d4057_0
+ - ipython_genutils=0.2.0=py_1
+ - ipywidgets=8.0.4=pyhd8ed1ab_0
+ - itsdangerous=2.1.2=pyhd8ed1ab_0
+ - jack=1.9.22=h11f4161_0
+ - jedi=0.18.2=pyhd8ed1ab_0
+ - jinja2=3.1.2=pyhd8ed1ab_1
+ - joblib=1.2.0=pyhd8ed1ab_0
+ - jpeg=9e=h0b41bf4_3
+ - jsonschema=4.17.3=pyhd8ed1ab_0
+ - jupyter=1.0.0=py310hff52083_8
+ - jupyter_client=8.0.3=pyhd8ed1ab_0
+ - jupyter_console=6.5.1=pyhd8ed1ab_0
+ - jupyter_core=5.2.0=py310hff52083_0
+ - jupyter_events=0.6.3=pyhd8ed1ab_0
+ - jupyter_server=2.3.0=pyhd8ed1ab_0
+ - jupyter_server_terminals=0.4.4=pyhd8ed1ab_1
+ - jupyterlab_pygments=0.2.2=pyhd8ed1ab_0
+ - jupyterlab_widgets=3.0.5=pyhd8ed1ab_0
+ - keyutils=1.6.1=h166bdaf_0
+ - kiwisolver=1.4.4=py310hbf28c38_1
+ - krb5=1.20.1=h81ceb04_0
+ - lame=3.100=h166bdaf_1003
+ - lcms2=2.14=hfd0df8a_1
+ - ld_impl_linux-64=2.40=h41732ed_0
+ - lerc=4.0.0=h27087fc_0
+ - libabseil=20220623.0=cxx17_h05df665_6
+ - libarrow=11.0.0=hc42cb68_4_cpu
+ - libblas=3.9.0=16_linux64_mkl
+ - libbrotlicommon=1.0.9=h166bdaf_8
+ - libbrotlidec=1.0.9=h166bdaf_8
+ - libbrotlienc=1.0.9=h166bdaf_8
+ - libcap=2.66=ha37c62d_0
+ - libcblas=3.9.0=16_linux64_mkl
+ - libclang=15.0.7=default_had23c3d_1
+ - libclang13=15.0.7=default_h3e3d535_1
+ - libcrc32c=1.1.2=h9c3ff4c_0
+ - libcublas=11.9.2.110=h5e84587_0
+ - libcublas-dev=11.9.2.110=h5c901ab_0
+ - libcufft=10.7.1.112=hf425ae0_0
+ - libcufft-dev=10.7.1.112=ha5ce4c0_0
+ - libcufile=1.5.1.14=0
+ - libcufile-dev=1.5.1.14=0
+ - libcups=2.3.3=h36d4200_3
+ - libcurand=10.3.1.124=0
+ - libcurand-dev=10.3.1.124=0
+ - libcurl=7.88.1=hdc1c0ab_0
+ - libcusolver=11.3.4.124=h33c3c4e_0
+ - libcusparse=11.7.2.124=h7538f96_0
+ - libcusparse-dev=11.7.2.124=hbbe9722_0
+ - libdb=6.2.32=h9c3ff4c_0
+ - libdeflate=1.17=h0b41bf4_0
+ - libedit=3.1.20191231=he28a2e2_2
+ - libev=4.33=h516909a_1
+ - libevent=2.1.10=h28343ad_4
+ - libffi=3.4.2=h7f98852_5
+ - libflac=1.4.2=h27087fc_0
+ - libgcc-ng=12.2.0=h65d4601_19
+ - libgcrypt=1.10.1=h166bdaf_0
+ - libgfortran-ng=12.2.0=h69a702a_19
+ - libgfortran5=12.2.0=h337968e_19
+ - libglib=2.74.1=h606061b_1
+ - libgoogle-cloud=2.7.0=h21dfe5b_1
+ - libgpg-error=1.46=h620e276_0
+ - libgrpc=1.51.1=h4fad500_1
+ - libhwloc=2.8.0=h32351e8_1
+ - libiconv=1.17=h166bdaf_0
+ - liblapack=3.9.0=16_linux64_mkl
+ - liblapacke=3.9.0=16_linux64_mkl
+ - libllvm15=15.0.7=hadd5161_0
+ - libnghttp2=1.51.0=hff17c54_0
+ - libnpp=11.6.3.124=hd2722f0_0
+ - libnpp-dev=11.6.3.124=h3c42840_0
+ - libnsl=2.0.0=h7f98852_0
+ - libnvjpeg=11.6.2.124=hd473ad6_0
+ - libnvjpeg-dev=11.6.2.124=hb5906b9_0
+ - libogg=1.3.4=h7f98852_1
+ - libopus=1.3.1=h7f98852_1
+ - libpng=1.6.39=h753d276_0
+ - libpq=15.2=hb675445_0
+ - libprotobuf=3.21.12=h3eb15da_0
+ - libsndfile=1.2.0=hb75c966_0
+ - libsodium=1.0.18=h36c2ea0_1
+ - libsqlite=3.40.0=h753d276_0
+ - libssh2=1.10.0=hf14f497_3
+ - libstdcxx-ng=12.2.0=h46fd767_19
+ - libsystemd0=252=h2a991cd_0
+ - libthrift=0.16.0=he500d00_2
+ - libtiff=4.5.0=h6adf6a1_2
+ - libtool=2.4.7=h27087fc_0
+ - libudev1=252=h166bdaf_0
+ - libutf8proc=2.8.0=h166bdaf_0
+ - libuuid=2.32.1=h7f98852_1000
+ - libvorbis=1.3.7=h9c3ff4c_0
+ - libwebp-base=1.2.4=h166bdaf_0
+ - libxcb=1.13=h7f98852_1004
+ - libxkbcommon=1.5.0=h79f4944_0
+ - libxml2=2.10.3=h7463322_0
+ - libzlib=1.2.13=h166bdaf_4
+ - linkify-it-py=2.0.0=pyhd8ed1ab_0
+ - llvm-openmp=15.0.7=h0cdce71_0
+ - lz4-c=1.9.4=hcb278e6_0
+ - markdown-it-py=2.1.0=pyhd8ed1ab_0
+ - markupsafe=2.1.2=py310h1fa729e_0
+ - matplotlib-base=3.7.0=py310he60537e_0
+ - matplotlib-inline=0.1.6=pyhd8ed1ab_0
+ - mdit-py-plugins=0.3.3=pyhd8ed1ab_0
+ - mdurl=0.1.0=pyhd8ed1ab_0
+ - mistune=2.0.5=pyhd8ed1ab_0
+ - mkl=2022.1.0=h84fe81f_915
+ - mkl-devel=2022.1.0=ha770c72_916
+ - mkl-include=2022.1.0=h84fe81f_915
+ - mpg123=1.31.2=hcb278e6_0
+ - multidict=6.0.4=py310h1fa729e_0
+ - multiprocess=0.70.14=py310h5764c6d_3
+ - munkres=1.1.4=pyh9f0ad1d_0
+ - mysql-common=8.0.32=ha901b37_0
+ - mysql-libs=8.0.32=hd7da12d_0
+ - nbclassic=0.5.2=pyhd8ed1ab_0
+ - nbclient=0.7.2=pyhd8ed1ab_0
+ - nbconvert=7.2.9=pyhd8ed1ab_0
+ - nbconvert-core=7.2.9=pyhd8ed1ab_0
+ - nbconvert-pandoc=7.2.9=pyhd8ed1ab_0
+ - nbformat=5.7.3=pyhd8ed1ab_0
+ - ncurses=6.3=h27087fc_1
+ - nest-asyncio=1.5.6=pyhd8ed1ab_0
+ - nettle=3.6=he412f7d_0
+ - notebook=6.5.2=pyha770c72_1
+ - notebook-shim=0.2.2=pyhd8ed1ab_0
+ - nsight-compute=2022.4.1.6=0
+ - nspr=4.35=h27087fc_0
+ - nss=3.88=he45b914_0
+ - numpy=1.24.2=py310h8deb116_0
+ - openh264=2.1.1=h780b84a_0
+ - openjpeg=2.5.0=hfec8fc6_2
+ - openssl=3.0.8=h0b41bf4_0
+ - orc=1.8.2=hfdbbad2_2
+ - orjson=3.8.5=py310h38b9cce_1
+ - packaging=23.0=pyhd8ed1ab_0
+ - pandas=1.5.3=py310h9b08913_0
+ - pandoc=2.19.2=h32600fe_1
+ - pandocfilters=1.5.0=pyhd8ed1ab_0
+ - parquet-cpp=1.5.1=2
+ - parso=0.8.3=pyhd8ed1ab_0
+ - pcre2=10.40=hc3806b6_0
+ - pexpect=4.8.0=pyh1a96a4e_2
+ - pickleshare=0.7.5=py_1003
+ - pillow=9.4.0=py310h023d228_1
+ - pip=23.0.1=pyhd8ed1ab_0
+ - pixman=0.40.0=h36c2ea0_0
+ - pkgutil-resolve-name=1.3.10=pyhd8ed1ab_0
+ - platformdirs=3.0.0=pyhd8ed1ab_0
+ - ply=3.11=py_1
+ - prometheus_client=0.16.0=pyhd8ed1ab_0
+ - prompt-toolkit=3.0.36=pyha770c72_0
+ - prompt_toolkit=3.0.36=hd8ed1ab_0
+ - psutil=5.9.4=py310h5764c6d_0
+ - pthread-stubs=0.4=h36c2ea0_1001
+ - ptyprocess=0.7.0=pyhd3deb0d_0
+ - pulseaudio=16.1=ha8d29e2_1
+ - pure_eval=0.2.2=pyhd8ed1ab_0
+ - pyarrow=11.0.0=py310h633f555_4_cpu
+ - pycparser=2.21=pyhd8ed1ab_0
+ - pycryptodome=3.16.0=py310h1419917_0
+ - pydantic=1.10.5=py310h1fa729e_0
+ - pydub=0.25.1=pyhd8ed1ab_0
+ - pygments=2.14.0=pyhd8ed1ab_0
+ - pyopenssl=23.0.0=pyhd8ed1ab_0
+ - pyparsing=3.0.9=pyhd8ed1ab_0
+ - pyqt=5.15.7=py310hab646b1_3
+ - pyqt5-sip=12.11.0=py310heca2aa9_3
+ - pyrsistent=0.19.3=py310h1fa729e_0
+ - pysocks=1.7.1=pyha2e5f31_6
+ - python=3.10.9=he550d4f_0_cpython
+ - python-dateutil=2.8.2=pyhd8ed1ab_0
+ - python-fastjsonschema=2.16.2=pyhd8ed1ab_0
+ - python-json-logger=2.0.6=pyhd8ed1ab_0
+ - python-multipart=0.0.5=py_0
+ - python-xxhash=3.2.0=py310h1fa729e_0
+ - python_abi=3.10=3_cp310
+ - pytorch=1.13.1=py3.10_cuda11.6_cudnn8.3.2_0
+ - pytorch-cuda=11.6=h867d48c_1
+ - pytorch-mutex=1.0=cuda
+ - pytz=2022.7.1=pyhd8ed1ab_0
+ - pyyaml=6.0=py310h5764c6d_5
+ - pyzmq=25.0.0=py310h059b190_0
+ - qt-main=5.15.8=h5d23da1_6
+ - qtconsole=5.4.0=pyhd8ed1ab_0
+ - qtconsole-base=5.4.0=pyha770c72_0
+ - qtpy=2.3.0=pyhd8ed1ab_0
+ - re2=2023.02.01=hcb278e6_0
+ - readline=8.1.2=h0f457ee_0
+ - regex=2022.10.31=py310h5764c6d_0
+ - requests=2.28.2=pyhd8ed1ab_0
+ - responses=0.18.0=pyhd8ed1ab_0
+ - rfc3339-validator=0.1.4=pyhd8ed1ab_0
+ - rfc3986=1.5.0=pyhd8ed1ab_0
+ - rfc3986-validator=0.1.1=pyh9f0ad1d_0
+ - s2n=1.3.35=h3358134_0
+ - sacremoses=0.0.53=pyhd8ed1ab_0
+ - send2trash=1.8.0=pyhd8ed1ab_0
+ - setuptools=67.3.2=pyhd8ed1ab_0
+ - sip=6.7.7=py310heca2aa9_0
+ - six=1.16.0=pyh6c4a22f_0
+ - snappy=1.1.9=hbd366e4_2
+ - sniffio=1.3.0=pyhd8ed1ab_0
+ - soupsieve=2.3.2.post1=pyhd8ed1ab_0
+ - stack_data=0.6.2=pyhd8ed1ab_0
+ - starlette=0.25.0=pyhd8ed1ab_0
+ - tbb=2021.7.0=h924138e_1
+ - terminado=0.17.1=pyh41d4057_0
+ - tinycss2=1.2.1=pyhd8ed1ab_0
+ - tk=8.6.12=h27826a3_0
+ - tokenizers=0.13.2=py310he1f1126_0
+ - toml=0.10.2=pyhd8ed1ab_0
+ - toolz=0.12.0=pyhd8ed1ab_0
+ - torchaudio=0.13.1=py310_cu116
+ - torchvision=0.14.1=py310_cu116
+ - tornado=6.2=py310h5764c6d_1
+ - tqdm=4.64.1=pyhd8ed1ab_0
+ - traitlets=5.9.0=pyhd8ed1ab_0
+ - transformers=4.26.1=pyhd8ed1ab_0
+ - typing-extensions=4.4.0=hd8ed1ab_0
+ - typing_extensions=4.4.0=pyha770c72_0
+ - tzdata=2022g=h191b570_0
+ - uc-micro-py=1.0.1=pyhd8ed1ab_0
+ - unicodedata2=15.0.0=py310h5764c6d_0
+ - urllib3=1.26.14=pyhd8ed1ab_0
+ - uvicorn=0.20.0=py310hff52083_1
+ - wcwidth=0.2.6=pyhd8ed1ab_0
+ - webencodings=0.5.1=py_1
+ - websocket-client=1.5.1=pyhd8ed1ab_0
+ - websockets=10.4=py310h5764c6d_1
+ - werkzeug=2.2.3=pyhd8ed1ab_0
+ - wheel=0.38.4=pyhd8ed1ab_0
+ - widgetsnbextension=4.0.5=pyhd8ed1ab_0
+ - xcb-util=0.4.0=h166bdaf_0
+ - xcb-util-image=0.4.0=h166bdaf_0
+ - xcb-util-keysyms=0.4.0=h166bdaf_0
+ - xcb-util-renderutil=0.3.9=h166bdaf_0
+ - xcb-util-wm=0.4.1=h166bdaf_0
+ - xorg-kbproto=1.0.7=h7f98852_1002
+ - xorg-libice=1.0.10=h7f98852_0
+ - xorg-libsm=1.2.3=hd9c2040_1000
+ - xorg-libx11=1.7.2=h7f98852_0
+ - xorg-libxau=1.0.9=h7f98852_0
+ - xorg-libxdmcp=1.1.3=h7f98852_0
+ - xorg-libxext=1.3.4=h7f98852_1
+ - xorg-libxrender=0.9.10=h7f98852_1003
+ - xorg-renderproto=0.11.1=h7f98852_1002
+ - xorg-xextproto=7.3.0=h7f98852_1002
+ - xorg-xproto=7.0.31=h7f98852_1007
+ - xxhash=0.8.1=h0b41bf4_0
+ - xz=5.2.6=h166bdaf_0
+ - yaml=0.2.5=h7f98852_2
+ - yarl=1.8.2=py310h5764c6d_0
+ - zeromq=4.3.4=h9c3ff4c_1
+ - zipp=3.14.0=pyhd8ed1ab_0
+ - zlib=1.2.13=h166bdaf_4
+ - zstd=1.5.2=h3eb15da_6
+ - pip:
+   - fancy-einsum==0.0.3
sampling.py ADDED
@@ -0,0 +1,239 @@
+ # %%
+ import torch as t
+ import transformers
+ import numpy as np
+
+ gpt = transformers.AutoModelForCausalLM.from_pretrained("gpt2")
+ tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
+
+ def apply_sampling_methods(
+     input_ids: t.Tensor, logits: t.Tensor, temperature=1.0, freq_penalty=0.0, top_k=0, top_p=0.0
+ ) -> int:
+     '''
+     Return the next token, sampled from the model's probability distribution with modifiers.
+
+     input_ids: shape (seq,)
+     '''
+     assert input_ids.ndim == 1, "input_ids should be a 1D sequence of token ids"
+     assert temperature >= 0, "Temperature should be non-negative"
+     assert 0 <= top_p <= 1.0, "Top-p must be a probability"
+     assert 0 <= top_k, "Top-k must be non-negative"
+     assert not (top_p != 0 and top_k != 0), "At most one of top-p and top-k supported"
+
+     if temperature == 0:
+         return greedy_search(logits)
+     if temperature != 1.0:
+         logits = apply_temperature(logits, temperature)
+     if freq_penalty != 0.0:
+         logits = apply_freq_penalty(input_ids, logits, freq_penalty)
+     if top_k > 0:
+         return sample_top_k(logits, top_k)
+     if top_p > 0:
+         return sample_top_p(logits, top_p)
+     return sample_basic(logits)
+
+ def sample_tokens(
+     model,
+     tokenizer,
+     initial_text: str,
+     max_tokens_generated: int = 30,
+     **kwargs
+ ) -> str:
+     '''
+     Sample tokens until the model outputs `tokenizer.eos_token_id` or the specified token limit is reached.
+
+     Return: the prompt and continuation concatenated
+     '''
+     model.eval()
+     input_ids: list = tokenizer.encode(initial_text)
+     generated = []
+     device = next(model.parameters()).device
+     for _ in range(max_tokens_generated):
+         new_input_ids = t.tensor(np.array(input_ids + generated), dtype=t.int64, device=device)
+         # Truncate the context to the model's maximum sequence length
+         new_input_ids_truncated = new_input_ids[-min(tokenizer.model_max_length, new_input_ids.shape[0]):].unsqueeze(0)
+         output = model(new_input_ids_truncated)
+         all_logits = output if isinstance(output, t.Tensor) else output.logits
+         logits = all_logits[0, -1]  # batch 0, final position -> shape (vocab_size,)
+         new_token = apply_sampling_methods(new_input_ids, logits, **kwargs)
+         generated.append(new_token)
+         if new_token == getattr(tokenizer, "eos_token_id", None):
+             break
+     return tokenizer.decode(input_ids + generated)
+
+ # %%
+ def greedy_search(logits: t.Tensor) -> int:
+     '''
+     logits: shape (vocab_size, )
+
+     Return: the most likely token (as an integer)
+     '''
+     return logits.argmax().item()
+
+ if __name__ == "__main__":
+     prompt = "Jingle bells, jingle bells, jingle all the way"
+     print("Greedy decoding with prompt: ", prompt)
+     output = sample_tokens(gpt, tokenizer, prompt, max_tokens_generated=8, temperature=0.0)
+     print(f"Your model said: {output}")
+     expected = "Jingle bells, jingle bells, jingle all the way up to the top of the mountain."
+     assert output == expected
+
+     print("Greedy decoding a second time (should be deterministic): ")
+     output = sample_tokens(gpt, tokenizer, prompt, max_tokens_generated=8, temperature=0.0)
+     print(f"Your model said: {output}")
+     expected = "Jingle bells, jingle bells, jingle all the way up to the top of the mountain."
+     assert output == expected
+
+     print("Tests passed!")
+ # %%
+ def sample_basic(logits: t.Tensor) -> int:
+     '''
+     logits: shape (vocab_size, ) - unnormalized log-probabilities
+
+     Return: a sampled token
+     '''
+     return t.distributions.categorical.Categorical(logits=logits).sample().item()
+
+ if __name__ == "__main__":
+     N = 20000
+     probs = t.linspace(0, 0.4, 5)
+     unnormalized_logits = probs.log() + 1.2345
+     samples = t.tensor([sample_basic(unnormalized_logits) for _ in range(N)])
+     counts = t.bincount(samples, minlength=len(probs)) / N
+     print("Checking empirical frequencies (try to increase N if this test fails): ", counts)
+     t.testing.assert_close(counts, probs, atol=0.01, rtol=0)
+     print("Tests passed!")
+ # %%
+ def apply_temperature(logits: t.Tensor, temperature: float) -> t.Tensor:
+     '''
+     logits: shape (vocab_size, )
+
+     Return: shape (vocab_size, )
+     '''
+     assert temperature > 0
+     return logits / temperature
+
+ if __name__ == '__main__':
+     logits = t.tensor([1, 2]).log()
+     cold_logits = apply_temperature(logits, 0.001)
+     print('A low temperature "sharpens" or "peaks" the distribution: ', cold_logits)
+     t.testing.assert_close(cold_logits, 1000.0 * logits)
+     hot_logits = apply_temperature(logits, 1000.0)
+     print("A high temperature flattens the distribution: ", hot_logits)
+     t.testing.assert_close(hot_logits, 0.001 * logits)
+     print("Tests passed!")
+
+ # %%
+ def apply_freq_penalty(input_ids: t.Tensor, logits: t.Tensor, freq_penalty: float) -> t.Tensor:
+     '''
+     input_ids: shape (seq, )
+     logits: shape (vocab_size, )
+
+     Return: shape (vocab_size, )
+     '''
+     # Penalise each token in proportion to how often it has already appeared
+     count = input_ids.bincount(minlength=len(logits))
+     logits -= count * freq_penalty
+     return logits
+
+ if __name__ == "__main__":
+     bieber_prompt = "And I was like Baby, baby, baby, oh Like, Baby, baby, baby, no Like, Baby, baby, baby, oh I thought you'd always be mine, mine"
+     input_ids = tokenizer.encode(bieber_prompt, return_tensors="pt").squeeze()
+     logits = t.ones(tokenizer.vocab_size)
+     penalized_logits = apply_freq_penalty(input_ids, logits, 2.0)
+     assert penalized_logits[5156].item() == -11, "Expected 6 occurrences of ' baby' with leading space"
+     assert penalized_logits[14801].item() == -5, "Expected 3 occurrences of ' Baby' with leading space"
+     print("Tests passed!")
+ # %%
+ N_RUNS = 0
+ your_prompt = "Jingle bells, jingle bells, jingle all the way"
+ cases = [
+     ("High freq penalty", dict(freq_penalty=100.0)),
+     ("Negative freq penalty", dict(freq_penalty=-1.0)),
+     ("Too hot!", dict(temperature=2.0)),
+     ("Pleasantly cool", dict(temperature=0.7)),
+     ("Pleasantly warm", dict(temperature=0.9)),
+     ("Too cold!", dict(temperature=0.01)),
+ ]
+ for (name, kwargs) in cases:
+     for i in range(N_RUNS):
+         output = sample_tokens(gpt, tokenizer, your_prompt, max_tokens_generated=24, **kwargs)
+         print(f"Sample {i} with: {name} ({kwargs}):")
+         print(f"Your model said: {repr(output)}\n")
+ # %%
+ def sample_top_k(logits: t.Tensor, top_k: int) -> int:
+     '''
+     logits: shape (vocab_size, ) - unnormalized log-probabilities
+     top_k: only consider this many of the most likely tokens for sampling
+
+     Return: a sampled token
+     '''
+     # Sample among the k largest logits, then map back to the original vocabulary index
+     values, indices = t.topk(logits, top_k)
+     return indices[sample_basic(values)].item()
+
+ if __name__ == "__main__":
+     N = 50000
+     k = 3
+     probs = t.linspace(0, 0.4, 5)
+     unnormalized_logits = probs.log() + 1.2345
+     samples = t.tensor([sample_top_k(unnormalized_logits, k) for _ in range(N)])
+     counts = t.bincount(samples, minlength=len(probs)) / N
+     expected = probs.clone()
+     expected[:-k] = 0
+     expected /= expected.sum()
+     print("Checking empirical frequencies (try to increase N if this test fails): ", counts)
+     t.testing.assert_close(counts, expected, atol=0.01, rtol=0)
+     print("Tests passed!")
+ # %%
+ if __name__ == "__main__":
+     your_prompt = "In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English."
+     output = sample_tokens(gpt, tokenizer, your_prompt, temperature=0.7, top_k=40, max_tokens_generated=64)
+     print(f"Your model said: {repr(output)}")
+ # %%
+ def sample_top_p(logits: t.Tensor, top_p: float, min_tokens_to_keep: int = 1) -> int:
+     '''
+     logits: shape (vocab_size, ) - unnormalized log-probabilities
+
+     Return: a sampled token
+     '''
+     probs = t.exp(logits.double()) / t.exp(logits.double()).sum()
+     sorted_probs, sorted_indices = probs.sort(descending=True)
+     cum_probs = sorted_probs.cumsum(-1)
+     # Keep the smallest prefix of tokens whose cumulative probability reaches top_p
+     last_index = max(min_tokens_to_keep, t.where(cum_probs >= top_p)[0][0].item() + 1)
+     masked_probs = sorted_probs[:last_index]
+     sample = t.distributions.categorical.Categorical(probs=masked_probs).sample()
+     return sorted_indices[sample].item()
+
+ if __name__ == "__main__":
+     N = 2000
+     unnormalized_logits = t.tensor([0.2, 0.3, 0.5]).log() + 2.3456
+     samples = t.tensor([sample_top_p(unnormalized_logits, 0.5) for _ in range(N)])
+     counts = t.bincount(samples, minlength=len(unnormalized_logits)) / N
+     print("top_p of 0.5 or lower should only return token 2: ", counts)
+     assert counts[0] == 0 and counts[1] == 0
+
+     N = 2000
+     unnormalized_logits = t.tensor([0.2, 0.3, 0.5]).log() + 2.3456
+     samples = t.tensor([sample_top_p(unnormalized_logits, 0.50001) for _ in range(N)])
+     counts = t.bincount(samples, minlength=len(unnormalized_logits)) / N
+     print("top_p in (0.5, 0.8] should return tokens 1 and 2: ", counts)
+     assert counts[0] == 0
+
+     N = 50000
+     top_p = 0.71
+     probs = t.linspace(0, 0.4, 5)
+     unnormalized_logits = probs.log() + 1.2345
+     samples = t.tensor([sample_top_p(unnormalized_logits, top_p) for _ in range(N)])
+     counts = t.bincount(samples, minlength=len(probs)) / N
+     expected = probs.clone()
+     expected[0:2] = 0
+     expected /= expected.sum()
+     print("Checking empirical frequencies (try to increase N if this test fails): ", counts)
+     t.testing.assert_close(counts, expected, atol=0.01, rtol=0.0)
+
+     print("All tests passed!")
+ # %%
+ if __name__ == "__main__":
+     your_prompt = "Eliezer Shlomo Yudkowsky (born September 11, 1979) is an American decision and artificial intelligence (AI) theorist and writer, best known for"
+     output = sample_tokens(gpt, tokenizer, your_prompt, temperature=0.7, top_p=0.95, max_tokens_generated=64)
+     print(f"Your model said: {repr(output)}")
+ # %%
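+ # Illustrative toy run of the sampling pipeline on a hand-made distribution
+ # (a sketch, not in the original commit; the toy values are arbitrary).
+ # It shows the order of operations in apply_sampling_methods: temperature,
+ # then frequency penalty, then top-k selection.
+ if __name__ == "__main__":
+     toy_logits = t.tensor([0.1, 0.2, 0.3, 0.4]).log()
+     toy_input_ids = t.tensor([3, 3, 3])  # token 3 already sampled three times
+     # Pass a clone: apply_freq_penalty modifies logits in place
+     token = apply_sampling_methods(
+         toy_input_ids, toy_logits.clone(), temperature=0.7, freq_penalty=2.0, top_k=2
+     )
+     print(f"Sampled token: {token}")
+ # %%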
shakespeare_demo.py ADDED
@@ -0,0 +1,105 @@
+ #%%
+ import yaml
+ import torch as t
+ import gradio as gr
+ import re
+ from word_data import WordData
+ import sampling
+ import transformer_replication
+ #%%
+ MAIN = __name__ == '__main__'
+ device = 'cuda' if t.cuda.is_available() else 'cpu'
+ #%%
+ shakespeare = WordData.from_file(
+     '100-0.txt', device=device, start="1\n", end='ALL’S WELL THAT ENDS WELL'
+ )
+ if MAIN:
+     print('Vocab size: ', len(shakespeare.vocab))
+ #%%
+ with open('config.yaml', 'r') as f:
+     yaml_cfg = yaml.safe_load(f)
+ #%%
+ state_dict = t.load('model_state_dict.pt', map_location=device)
+ #%%
+ base_config = transformer_replication.TransformerConfig(
+     num_layers=yaml_cfg['num_layers']['value'],
+     num_heads=yaml_cfg['num_heads']['value'],
+     vocab_size=len(shakespeare.vocab),
+     hidden_size=yaml_cfg['hidden_size']['value'],
+     max_seq_len=yaml_cfg['max_seq_len']['value'],
+     dropout=yaml_cfg['dropout']['value'],
+ )
+ shakespeare.model_max_length = yaml_cfg['max_seq_len']['value']
+ model = transformer_replication.DecoderOnlyTransformer(base_config)
+ model.load_state_dict(state_dict)
+ #%%
+ def generate(
+     text: str, max_tokens: int, temperature: float,
+     top_k: int,
+ ) -> str:
+     return sampling.sample_tokens(
+         model,
+         shakespeare,
+         text,
+         max_tokens_generated=max_tokens,
+         temperature=temperature,
+         top_k=top_k,
+     )
+
+ #%%
+ def safe_generate(
+     text: str, max_tokens: int = 300, temperature: float = 1.0,
+     top_k: int = 20,
+ ) -> str:
+     try:
+         raw = generate(
+             text, max_tokens=max_tokens, temperature=temperature, top_k=top_k,
+         )
+         # Trim the sample at the next line-number heading, if one appears
+         match = re.match(r"(?P<start>\D*)\d+\n", raw)
+         if match is None:
+             return raw
+         return match.group('start')
+     except KeyError as e:
+         return f"I'm sorry, {str(e)} is not in Shakespeare's vocabulary"
+ #%%
+ examples = [
+     ["I sang a beautiful song"],
+     ["To be free is to"],
+     ["How I love thee"],
+ ]
+ #%%
+ if MAIN:
+     print(safe_generate('How I love thee'))
+ #%%
+ def make_demo():
+     demo = gr.Interface(
+         fn=safe_generate,
+         inputs=[
+             gr.components.Textbox(lines=5, label="Input Text"),
+             gr.components.Slider(
+                 label='max tokens generated', minimum=1, maximum=1000,
+                 value=300, step=1,
+             ),
+             gr.components.Slider(
+                 label='temperature', minimum=0, maximum=2, value=1, step=0.1,
+             ),
+             gr.components.Slider(
+                 label='top_k', minimum=1, maximum=100, value=10, step=1,
+             ),
+         ],
+         outputs=gr.components.Textbox(label="Generated Text"),
+         examples=examples,
+     )
+     demo.launch()
+ # %%
+ '''
+ FIXME:
+ * deploy to heroku
+ * link from github home
+ '''
transformer_replication.py ADDED
@@ -0,0 +1,183 @@
+ #%%
+ import transformers
+ import torch as t
+ import torch.nn as nn
+ from typing import Union, List
+ from fancy_einsum import einsum
+ # %%
+ tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
+ if __name__ == "__main__":
+     print(tokenizer("hello meg"))
+     print(tokenizer.encode("hello meg"))
+     print(tokenizer.decode([31373, 17243]))
+     print(tokenizer.tokenize("hello meg"))
+     print(f"'{tokenizer.decode(17243)}'")
+ # %%
+ class Embedding(nn.Module):
+
+     def __init__(self, num_embeddings: int, embedding_dim: int):
+         super().__init__()
+         self.num_embeddings = num_embeddings
+         self.embedding_dim = embedding_dim
+         self.weight = nn.Parameter(t.randn((self.num_embeddings, self.embedding_dim)))
+
+     def forward(self, x: t.LongTensor) -> t.Tensor:
+         '''For each integer in the input, return that row of the embedding.'''
+         return self.weight[x]
+
+     def extra_repr(self) -> str:
+         return f"{self.num_embeddings}, {self.embedding_dim}"
+
+ # %%
+ class PositionalEncoding(nn.Module):
+
+     def __init__(self, max_seq_len: int, embedding_dim: int):
+         super().__init__()
+         # Defining our positional encoding array, with `max_seq_len` rows.
+         # This is an advantage of using sinusoidal encoding: we can easily expand to sequences of greater length without adding more learned params
+         angles = t.outer(t.arange(max_seq_len), 1 / 10000 ** (2 * t.arange(embedding_dim // 2) / embedding_dim))
+         pe = t.zeros((max_seq_len, embedding_dim))
+         pe[:, ::2] = t.sin(angles)
+         pe[:, 1::2] = t.cos(angles)
+         # Register array as a buffer, rather than parameter (we don't want it to be updated by gradient descent)
+         self.register_buffer('pe', pe)
+
+     def forward(self, x: t.Tensor) -> t.Tensor:
+         """
+         x: shape (batch, seq_len, embedding_dim)
+         """
+         batch, seq_len, embedding_dim = x.shape
+         # We slice the positional encoding, so it's the same shape as x.
+         # This is equivalent to just using an nn.Embedding, but having the input be t.arange(seq_len)
+         return x + self.pe[:seq_len, :]  # type: ignore
+
+ # %%
+ class LayerNorm(nn.Module):
+
+     def __init__(self, normalized_shape: Union[int, List[int]], eps: float = 1e-05, elementwise_affine: bool = True):
+         super().__init__()
+         self.normalized_shape = normalized_shape
+         self.eps = eps
+         self.elementwise_affine = elementwise_affine
+
+         if self.elementwise_affine:
+             self.weight = nn.Parameter(t.ones(normalized_shape))
+             self.bias = nn.Parameter(t.zeros(normalized_shape))
+
+     def forward(self, x: t.Tensor) -> t.Tensor:
+         # Normalise over the trailing `normalized_shape` dimensions
+         normalized_shape_dims = 1 if isinstance(self.normalized_shape, int) else len(self.normalized_shape)
+         norm_dims = list(range(x.dim()))[-normalized_shape_dims:]
+         x_mean = x.mean(dim=norm_dims, keepdim=True)
+         x_var = x.var(dim=norm_dims, keepdim=True, unbiased=False)
+         x_scaled = (x - x_mean) / t.sqrt(x_var + self.eps)
+         if self.elementwise_affine:
+             return x_scaled * self.weight + self.bias
+         return x_scaled
+
+     def extra_repr(self) -> str:
+         return f"{self.normalized_shape}, eps={self.eps}, elementwise_affine={self.elementwise_affine}"
+
+ # %%
+ from dataclasses import dataclass
+
+ @dataclass(frozen=True)
+ class TransformerConfig:
+     '''Constants used throughout your decoder-only transformer model.'''
+
+     num_layers: int
+     num_heads: int
+     vocab_size: int
+     hidden_size: int
+     max_seq_len: int
+     dropout: float = 0.1
+     layer_norm_epsilon: float = 1e-05
+ # %%
+ import attention_replication
+
+ class BertMLP(nn.Module):
+     def __init__(self, config: TransformerConfig):
+         super().__init__()
+         self.linear1 = nn.Linear(config.hidden_size, 4 * config.hidden_size)
+         self.gelu = nn.GELU()
+         self.linear2 = nn.Linear(4 * config.hidden_size, config.hidden_size)
+         self.dropout = nn.Dropout(config.dropout)
+
+     def forward(self, x: t.Tensor) -> t.Tensor:
+         x = self.linear1(x)
+         x = self.gelu(x)
+         x = self.linear2(x)
+         x = self.dropout(x)
+         return x
+
+ class DecoderBlock(nn.Module):
+
+     def __init__(self, config: TransformerConfig):
+         super().__init__()
+         self.attention = attention_replication.MultiheadMaskedAttention(config.hidden_size, config.num_heads)
+         self.layer_norm1 = nn.LayerNorm(config.hidden_size, config.layer_norm_epsilon)
+         self.mlp = BertMLP(config)
+         self.layer_norm2 = nn.LayerNorm(config.hidden_size, config.layer_norm_epsilon)
+
+     def forward(self, x: t.Tensor) -> t.Tensor:
+         # Attention sub-layer with residual connection
+         y = self.attention(x)
+         y = self.layer_norm1(y)
+         x = x + y
+         # MLP sub-layer with residual connection
+         z = self.mlp(x)
+         z = self.layer_norm2(z)
+         x = x + z
+         return x
+
+ class DecoderOnlyTransformer(nn.Module):
+
+     def __init__(self, config: TransformerConfig):
+         super().__init__()
+         self.token_embedding = Embedding(config.vocab_size, config.hidden_size)
+         self.positional_embedding = PositionalEncoding(config.max_seq_len, config.hidden_size)
+         self.dropout = nn.Dropout(config.dropout)
+         self.bert_blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config.num_layers)])
+         self.layer_norm = nn.LayerNorm(config.hidden_size, config.layer_norm_epsilon)
+
+     def forward(self, x: t.Tensor) -> t.Tensor:
+         x = self.token_embedding(x)
+         x = self.positional_embedding(x)
+         x = self.dropout(x)
+         for block in self.bert_blocks:
+             x = block(x)
+         x = self.layer_norm(x)
+         # Unembed with the (tied) token embedding matrix to get logits over the vocabulary
+         x = einsum('num_embeddings embedding_dim, batch seq_len embedding_dim -> batch seq_len num_embeddings', self.token_embedding.weight, x)
+         return x
+
+ # %%
+ from torch.utils.data import Dataset
+
+ class CustomTextDataset(Dataset):
+     def __init__(self, texts, labels):
+         self.labels = labels
+         self.texts = texts
+
+     @staticmethod
+     def from_config(config, samples):
+         # Toy task: each label is the input token sequence reversed
+         texts = [t.randint(high=config.vocab_size, size=(config.max_seq_len,)) for _ in range(samples)]
+         labels = [t.flip(text, (0,)) for text in texts]
+         return CustomTextDataset(texts, labels)
+
+     def __len__(self):
+         return len(self.labels)
+
+     def __getitem__(self, idx):
+         label = self.labels[idx]
+         text = self.texts[idx]
+         sample = (text, label)
+         return sample
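+ # %%
+ # Illustrative usage of the toy reversal dataset (a sketch, not in the
+ # original commit; the config values here are arbitrary).
+ if __name__ == "__main__":
+     toy_config = TransformerConfig(
+         num_layers=2, num_heads=2, vocab_size=10, hidden_size=16, max_seq_len=6
+     )
+     dataset = CustomTextDataset.from_config(toy_config, samples=4)
+     text, label = dataset[0]
+     assert t.equal(label, t.flip(text, (0,)))
+     print(text, label)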
word_data.py ADDED
@@ -0,0 +1,100 @@
+ import re
+ from typing import Optional, Union
+ import requests
+ from torch.utils.data import Dataset
+ import torch as t
+
+
+ class WordsDataset(Dataset):
+     def __init__(self, texts, labels):
+         self.texts = texts
+         self.labels = labels
+
+     def __len__(self):
+         return len(self.labels)
+
+     def __getitem__(self, idx):
+         label = self.labels[idx]
+         text = self.texts[idx]
+         sample = (text, label)
+         return sample
+
+ #%%
+ def tokenize(text):
+     # Split on word boundaries, keeping the separators as tokens
+     return re.split(r"\b", text)
+
+ def _remove_duplicates(text, string=" "):
+     if string + string in text:
+         text = text.replace(string + string, string)
+         return _remove_duplicates(text, string)
+     return text
+
+ def remove_duplicates(text):
+     text = _remove_duplicates(text, ' ')
+     text = _remove_duplicates(text, '\n')
+     return text
+
+ # %%
+ class WordData:
+     def __init__(self, text, start, end, device):
+         self.complete_text = remove_duplicates(text)
+         if start is not None and end is not None:
+             self.complete_text = self.get_excerpt(start, end)
+         self.complete_tokens = tokenize(self.complete_text)
+         self.vocab = sorted(set(self.complete_tokens))
+         self.token_to_id = dict(zip(self.vocab, list(range(len(self.vocab)))))
+         self.id_to_token = dict(zip(list(range(len(self.vocab))), self.vocab))
+         self.model_max_length = None
+         self.device = device
+
+     @staticmethod
+     def from_link(link, device, start=None, end=None):
+         return WordData(
+             requests.get(link).content.decode('utf-8'),
+             start,
+             end,
+             device=device
+         )
+
+     @staticmethod
+     def from_file(filename, device, start=None, end=None):
+         with open(filename, encoding='utf-8') as f:
+             text = f.read()
+         return WordData(text, start, end, device=device)
+
+     def get_excerpt(self, start="THE SONNETS", end="THE END", text=None):
+         if text is None:
+             text = self.complete_text
+         assert start in text, f'get_excerpt: cannot find {start} in text'
+         l_stripped = text.split(start, maxsplit=1)[1]
+         assert end in l_stripped, f'get_excerpt: cannot find {end} in text'
+         r_stripped = l_stripped.split(end, maxsplit=1)[0]
+         return r_stripped
+
+     def generate_autoregressive_dataset(self, sequence_length, text=None):
+         self.model_max_length = sequence_length
+         if text is None:
+             text = self.complete_text
+         token_ids = self.encode(text, return_tensors="pt")
+         # Each label sequence is the input sequence shifted one token to the left
+         inputs = [token_ids[i:i + sequence_length] for i in range(len(token_ids) - sequence_length)]
+         labels = [token_ids[i + 1:i + 1 + sequence_length] for i in range(len(token_ids) - sequence_length)]
+         return WordsDataset(inputs, labels)
+
+     def encode(self, initial_text: str, return_tensors: Optional[str] = None) -> Union[list, t.Tensor]:
+         '''
+         Tokenizes initial_text, then returns the token ids.
+
+         Return type is list by default, but if return_tensors="pt" then it is returned as a tensor.
+         '''
+         tokens = tokenize(initial_text)
+         token_ids = [self.token_to_id[tok] for tok in tokens]
+         if return_tensors == "pt":
+             return t.tensor(token_ids, device=self.device)
+         return token_ids
+
+     def decode(self, list_of_ids: Union[t.Tensor, list]) -> str:
+         '''
+         Converts ids to a list of tokens, then joins them into a single string.
+         '''
+         tokens = [self.id_to_token[int(i)] for i in list_of_ids]
+         return "".join(tokens)
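+ # %%
+ # Illustrative encode/decode round trip (a sketch, not in the original
+ # commit; the sample text is arbitrary). tokenize splits on word boundaries
+ # and keeps the separators, so decoding the ids reproduces the input string.
+ if __name__ == "__main__":
+     data = WordData("to be or not to be", start=None, end=None, device='cpu')
+     ids = data.encode("to be or not")
+     assert data.decode(ids) == "to be or not"
+     print("Round trip OK:", ids)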