adding application

Changed files:
- processor.py +106 -0
- requirements.txt +9 -213
- utils/__init__.py +0 -0
- utils/calibration_utils.py +288 -0
- utils/data_utils.py +214 -0
- utils/enums.py +13 -0
- utils/eval_utils.py +334 -0
- utils/file_utils.py +94 -0
- utils/logit_lens.py +304 -0
- utils/model_utils.py +320 -0
- utils/procrustes/__init__.py +0 -0
- utils/procrustes/orthogonal.py +383 -0
- utils/procrustes/utils.py +495 -0
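
This commit adds the application's retrieval-processing entry point (processor.py), replaces the frozen conda-environment export in requirements.txt with a short list of direct dependencies, and introduces the utils package: calibration, data-loading, evaluation, logit-lens, and model helpers, plus an orthogonal-Procrustes subpackage. The individual diffs follow.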
processor.py
ADDED
@@ -0,0 +1,106 @@
import random

import torch


class RetrievalProcessor:
    def __init__(self, model, tokenizer, multi_token_kind, num_tokens_to_generate,
                 add_context, model_name, whitespace_token='Ġ'):
        self.model = model
        self.tokenizer = tokenizer
        self.multi_token_kind = multi_token_kind
        self.num_tokens_to_generate = num_tokens_to_generate
        self.add_context = add_context
        self.model_name = model_name
        self.whitespace_token = whitespace_token

    def get_next_word(self, tokens, i, max_length=1000, device='cuda'):
        # Collect the multi-token word that starts at position i: a token carrying the
        # whitespace marker, followed by alphabetic continuation pieces.
        token_str = self.tokenizer.convert_ids_to_tokens(tokens[i].item())
        j = i + 1
        word_tokens = [tokens[i]]
        if token_str.startswith(self.whitespace_token):
            while j < len(tokens) and self.is_alpha_not_prefix(tokens[j]):
                word_tokens.append(tokens[j])
                j += 1
        word = self.tokenizer.decode(word_tokens)
        original_word = word
        context = self.tokenizer.decode(tokens[:i]) if self.add_context else ""
        combined_text = context + word

        tokenized_combined_text = self.tokenizer(combined_text, return_tensors='pt', truncation=True,
                                                 max_length=max_length).to(device)
        return j, word_tokens, word, context, tokenized_combined_text, combined_text, original_word

    def get_next_full_word_typo(self, tokens, i, max_length=1000, device='cuda'):
        # Same interface as get_next_word, but full words are corrupted with a random
        # typo before being re-tokenized.
        tokens_str = self.tokenizer.convert_ids_to_tokens(tokens)
        word_tokens = [tokens[i]]
        word = self.tokenizer.decode(word_tokens)
        original_word = word
        if self.is_full_word(tokens_str, i, word, word_tokens):
            word = self.introduce_typo(word)
            # drop the leading special token the tokenizer prepends
            word_tokens = self.tokenizer(word, return_tensors='pt', truncation=True,
                                         max_length=max_length).input_ids[0][1:]
        context = self.tokenizer.decode(tokens[:i]) if self.add_context else ""
        combined_text = context + word

        tokenized_combined_text = self.tokenizer(combined_text, return_tensors='pt', truncation=True,
                                                 max_length=max_length).to(device)
        # index of the word's last token within the combined text
        j = len(tokenized_combined_text.input_ids[0]) - 1 if self.add_context \
            else len(tokenized_combined_text.input_ids[0]) - 1 + i
        return j, word_tokens, word, context, tokenized_combined_text, combined_text, original_word

    def get_next_full_word_separated(self, tokens, i, max_length=1000, device='cuda'):
        # Same interface as get_next_word, but full words are split into character-level tokens.
        tokens_str = self.tokenizer.convert_ids_to_tokens(tokens)
        word_tokens = [tokens[i]]
        word = self.tokenizer.decode(word_tokens)
        original_word = word
        if self.is_full_word(tokens_str, i, word, word_tokens):
            word = torch.tensor(self.separate_word(word)).unsqueeze(0)
        else:
            word = word_tokens[0].unsqueeze(0).unsqueeze(0)
        context = self.tokenizer.decode(tokens[:i]) if self.add_context else ""
        tokenized_combined_text = self.tokenizer(context, return_tensors='pt', truncation=True,
                                                 max_length=max_length).to(device)
        tokenized_combined_text.input_ids = torch.cat(
            (tokenized_combined_text.input_ids, word.to(device)), dim=1)
        word_tokens = word
        j = i + 1
        return (j, word_tokens, word, context, tokenized_combined_text,
                self.tokenizer.decode(tokenized_combined_text.input_ids[0]), original_word)

    def is_alpha_not_prefix(self, token):
        # True for alphabetic continuation pieces that do not start a new word.
        token_str = self.tokenizer.convert_ids_to_tokens(token.item())
        return not token_str.startswith(self.whitespace_token) and token_str.isalpha()

    def introduce_typo(self, word, typo_type=None):
        # Apply one random character-level corruption, leaving the first character intact.
        letters = 'abcdefghijklmnopqrstuvwxyz'
        if typo_type is None:
            typo_type = random.choice(["substitution", "deletion", "insertion", "transposition"])

        if typo_type == "substitution":
            position = random.randint(1, len(word) - 1)
            original_char = word[position]
            typo_char = random.choice([c for c in letters if c != original_char])
            return word[:position] + typo_char + word[position + 1:]
        elif typo_type == "deletion":
            position = random.randint(1, len(word) - 1)
            return word[:position] + word[position + 1:]
        elif typo_type == "insertion":
            position = random.randint(1, len(word) - 1)
            typo_char = random.choice(letters)
            return word[:position] + typo_char + word[position:]
        elif typo_type == "transposition":
            position = random.randint(1, len(word) - 2)
            return word[:position] + word[position + 1] + word[position] + word[position + 2:]
        else:
            return word

    def separate_word(self, word):
        # Tokenize the word character by character; encoding '\n{char}' and keeping only
        # the last id isolates each character's token. The first three entries are then
        # intentionally skipped.
        character_tokens = [self.tokenizer.encode(f'\n{char}')[-1] for char in word]
        character_tokens = character_tokens[3:]
        return character_tokens

    def is_full_word(self, token_str, i, token, word_tokens):
        # A "full word": a single alphabetic token longer than 5 characters that carries
        # the whitespace marker and is not continued by another alphabetic token.
        next_token = self.tokenizer.decode(word_tokens[i + 1]) if i + 1 < len(word_tokens) else ""
        return (token[1:].isalpha() and
                len(token) > 5 and
                token_str[i].startswith(self.whitespace_token) and
                not next_token.isalpha())
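
For orientation, a minimal usage sketch of RetrievalProcessor (illustrative only, not part of the commit). It assumes a GPT-2-style BPE tokenizer, whose pieces carry the 'Ġ' whitespace marker the class expects by default; the model argument is unused by get_next_word, so None is passed here:

    import torch
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    proc = RetrievalProcessor(model=None, tokenizer=tok, multi_token_kind=None,
                              num_tokens_to_generate=1, add_context=True,
                              model_name="gpt2")
    tokens = torch.tensor(tok("The quick indefatigable fox").input_ids)
    # Position 2 is 'Ġind', the first piece of the multi-token word "indefatigable".
    j, word_tokens, word, context, enc, combined, original = proc.get_next_word(
        tokens, i=2, device="cpu")
    print(repr(word), j)  # ' indefatigable' 5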
requirements.txt
CHANGED
@@ -1,213 +1,9 @@
-asttokens @ file:///opt/conda/conda-bld/asttokens_1646925590279/work
-async-lru @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_02efro5ps8/croot/async-lru_1699554529181/work
-async-timeout==5.0.1
-attrs @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_93pjmt0git/croot/attrs_1734533120523/work
-Babel @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_00k1rl2pus/croot/babel_1671781944131/work
-backcall @ file:///home/ktietz/src/ci/backcall_1611930011877/work
-beautifulsoup4 @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_94rx5n7wo9/croot/beautifulsoup4-split_1718029832430/work
-bleach @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_faqg19k8gh/croot/bleach_1732292152791/work
-blis==1.2.0
-Brotli @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_f7i0oxypt6/croot/brotli-split_1736182464088/work
-catalogue==2.0.10
-certifi @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_d8j59rqun5/croot/certifi_1734473289913/work/certifi
-cffi @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_e4xd9yd9i2/croot/cffi_1736182819442/work
-charset-normalizer @ file:///croot/charset-normalizer_1721748349566/work
-click==8.1.8
-cloudpathlib==0.20.0
-cloudpickle==3.1.0
-comm @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_3doui0bmzb/croot/comm_1709322861485/work
-confection==0.1.5
-contourpy==1.3.0
-cycler==0.12.1
-cymem==2.0.11
-dask==2024.8.0
-dask-expr==1.1.10
-datasets==3.2.0
-debugpy @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_563_nwtkoc/croot/debugpy_1690905063850/work
-decorator @ file:///opt/conda/conda-bld/decorator_1643638310831/work
-defusedxml @ file:///tmp/build/80754af9/defusedxml_1615228127516/work
--e git+https://github.com/tokeron/diffusers.git@00769b5d64c2ea35201e0df7a082db3513619afe#egg=diffusers&subdirectory=../../../../../../diffusers
-dill==0.3.8
-distro==1.9.0
-docker-pycreds==0.4.0
-editdistance==0.8.1
-en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl#sha256=293e9547a655b25499198ab15a525b05b9407a75f10255e405e8c3854329ab63
-en_core_web_md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl#sha256=5e6329fe3fecedb1d1a02c3ea2172ee0fede6cea6e4aefb6a02d832dba78a310
-en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl#sha256=1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85
-eval_type_backport==0.2.2
-exceptiongroup @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_b2258scr33/croot/exceptiongroup_1706031391815/work
-executing @ file:///opt/conda/conda-bld/executing_1646925071911/work
-fastapi==0.115.12
-fastjsonschema @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_d1wgyi4enb/croot/python-fastjsonschema_1731939426145/work
-ffmpy==0.5.0
-filelock==3.16.1
-fonttools==4.55.3
-frozenlist==1.5.0
-fsspec==2024.9.0
-gitdb==4.0.12
-GitPython==3.1.44
-gradio==4.44.1
-gradio_client==1.3.0
-h11 @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_110bmw2coo/croot/h11_1706652289620/work
-httpcore @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_fcxiho9nv7/croot/httpcore_1706728465004/work
-httpx @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_cc4egw1482/croot/httpx_1723474826664/work
-huggingface-hub==0.27.1
-idna==3.10
-importlib_metadata @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_cc4qelzghy/croot/importlib_metadata-suite_1732633706960/work
-importlib_resources==6.5.2
-ipykernel @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_ddflobe9t3/croot/ipykernel_1728665605034/work
-ipython @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_6599f73fa7/croot/ipython_1694181355402/work
-jedi @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_38ctoinnl0/croot/jedi_1733987402850/work
-Jinja2 @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_b15nuwux5r/croot/jinja2_1730902833938/work
-jiter==0.8.2
-joblib==1.4.2
-json5 @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_b9ww6ewhv3/croot/json5_1730786813588/work
-jsonschema @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_7boelfqucq/croot/jsonschema_1728486715888/work
-jsonschema-specifications @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_d38pclgu95/croot/jsonschema-specifications_1699032390832/work
-jupyter-events @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_db0avcjzq5/croot/jupyter_events_1718738111427/work
-jupyter-lsp @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_ae9br5v37x/croot/jupyter-lsp-meta_1699978259353/work
-jupyter_client @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_58w2siozyz/croot/jupyter_client_1699455907045/work
-jupyter_core @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_73nomeum4p/croot/jupyter_core_1718818302815/work
-jupyter_server @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_d1t69bk94b/croot/jupyter_server_1718827086930/work
-jupyter_server_terminals @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_e7ryd60iuw/croot/jupyter_server_terminals_1686870731283/work
-jupyterlab @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_a2d0br6r6g/croot/jupyterlab_1725895226942/work
-jupyterlab-pygments @ file:///tmp/build/80754af9/jupyterlab_pygments_1601490720602/work
-jupyterlab_server @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_f64fg3hglz/croot/jupyterlab_server_1725865356410/work
-kiwisolver==1.4.7
-langcodes==3.5.0
-language_data==1.3.0
-locket==1.0.0
-marisa-trie==1.2.1
-markdown-it-py==3.0.0
-MarkupSafe @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_a84ni4pci8/croot/markupsafe_1704206002077/work
-matplotlib==3.9.4
-matplotlib-inline @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_f6fdc0hldi/croots/recipe/matplotlib-inline_1662014472341/work
-matplotlib-venn==1.1.2
-mdurl==0.1.2
-mistune @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_17ya6k1sbs/croots/recipe/mistune_1661496228719/work
-mpmath==1.3.0
-multidict==6.1.0
-multiprocess==0.70.16
-murmurhash==1.0.12
-nbclient @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_626hpwnurm/croot/nbclient_1698934218848/work
-nbconvert @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_f4c1s1qk1f/croot/nbconvert_1728049432295/work
-nbformat @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_2cv_qoc1gw/croot/nbformat_1728049423516/work
-nest-asyncio @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_310vb5e2a0/croot/nest-asyncio_1708532678212/work
-networkx==3.2.1
-nltk==3.9.1
-notebook @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_539v4hufo2/croot/notebook_1727199149603/work
-notebook_shim @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_d6_ze10f45/croot/notebook-shim_1699455897525/work
-numpy==2.0.2
-openai==1.59.7
-orjson==3.10.16
-overrides @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_70s80guh9g/croot/overrides_1699371144462/work
-packaging @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_a6_qk3qyg7/croot/packaging_1734472142254/work
-pandas==2.2.3
-pandocfilters @ file:///opt/conda/conda-bld/pandocfilters_1643405455980/work
-parso @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_8824a1w4md/croot/parso_1733963320105/work
-partd==1.4.2
-patsy==1.0.1
-pexpect @ file:///tmp/build/80754af9/pexpect_1605563209008/work
-pickleshare @ file:///tmp/build/80754af9/pickleshare_1606932040724/work
-pillow==10.4.0
-platformdirs @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a8u4fy8k9o/croot/platformdirs_1692205661656/work
-plotly==5.24.1
-preshed==3.0.9
-prometheus_client @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_803ymjpv2u/croot/prometheus_client_1731958793251/work
-prompt-toolkit @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_c63v4kqjzr/croot/prompt-toolkit_1704404354115/work
-propcache==0.2.1
-protobuf==5.29.2
-psutil @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_1310b568-21f4-4cb0-b0e3-2f3d31e39728k9coaga5/croots/recipe/psutil_1656431280844/work
-ptyprocess @ file:///tmp/build/80754af9/ptyprocess_1609355006118/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl
-pure-eval @ file:///opt/conda/conda-bld/pure_eval_1646925070566/work
-pyarrow==18.1.0
-pycparser @ file:///tmp/build/80754af9/pycparser_1636541352034/work
-pydantic==2.10.4
-pydantic_core==2.27.2
-pydub==0.25.1
-Pygments @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_29bs9f_dh9/croot/pygments_1684279974747/work
-pyparsing==3.2.1
-PySocks @ file:///Users/ktietz/Code/oss/ci_pkgs/pysocks_1626781349491/work
-python-box==7.3.0
-python-dateutil @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_66ud1l42_h/croot/python-dateutil_1716495741162/work
-python-json-logger @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_9bjmcmh4nm/croot/python-json-logger_1734370248301/work
-python-multipart==0.0.20
-pytz @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a4b76c83ik/croot/pytz_1713974318928/work
-PyYAML @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_faoex52hrr/croot/pyyaml_1728657970485/work
-pyzmq @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_95lsut8ymz/croot/pyzmq_1734709560733/work
-referencing @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_5cz64gsx70/croot/referencing_1699012046031/work
-regex==2024.11.6
-requests @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_ee45nsd33z/croot/requests_1730999134038/work
-rfc3339-validator @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_76ae5cu30h/croot/rfc3339-validator_1683077051957/work
-rfc3986-validator @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_d0l5zd97kt/croot/rfc3986-validator_1683058998431/work
-rich==13.9.4
-rpds-py @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_93fzmr7v9h/croot/rpds-py_1732228422522/work
-ruff==0.11.6
-safetensors==0.5.0
-scikit-learn==1.6.0
-scipy==1.13.1
-seaborn==0.13.2
-semantic-version==2.10.0
-Send2Trash @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_5b31f0zzlv/croot/send2trash_1699371144121/work
-sentencepiece==0.2.0
-sentry-sdk==2.19.2
-setproctitle==1.3.4
-shellingham==1.5.4
-six @ file:///tmp/build/80754af9/six_1644875935023/work
-smart-open==7.1.0
-smmap==5.0.2
-sniffio @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_1573pknjrg/croot/sniffio_1705431298885/work
-soupsieve @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_9798xzs_03/croot/soupsieve_1696347567192/work
-spacy==3.8.3
-spacy-legacy==3.0.12
-spacy-loggers==1.0.5
-srsly==2.5.1
-stack-data @ file:///opt/conda/conda-bld/stack_data_1646927590127/work
-starlette==0.46.2
-statsmodels==0.14.4
-swifter==1.4.0
-sympy==1.13.1
-tabulate==0.9.0
-tenacity==9.0.0
-terminado @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_fcfvyc0an2/croot/terminado_1671751835701/work
-thinc==8.3.4
-threadpoolctl==3.5.0
-tinycss2 @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_fcw5_i306t/croot/tinycss2_1668168825117/work
-together==1.4.1
-tokenizers==0.21.0
-tomli @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_d0e5ffbf-5cf1-45be-8693-c5dff8108a2awhthtjlq/croots/recipe/tomli_1657175508477/work
-tomlkit==0.12.0
-toolz==1.0.0
-torch==2.5.1
-torchvision==0.20.1
-tornado @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_0axef5a0m0/croot/tornado_1733960501260/work
-tqdm==4.67.1
-traitlets @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_500m2_1wyk/croot/traitlets_1718227071952/work
-transformers==4.47.1
-typer==0.15.1
-typing_extensions @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_0b3jpv_f79/croot/typing_extensions_1734714864260/work
-tzdata==2024.2
-urllib3 @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_06_m8gdsy6/croot/urllib3_1727769822458/work
-uvicorn==0.34.2
-wandb==0.19.1
-wasabi==1.1.3
-wcwidth @ file:///Users/ktietz/demo/mc3/conda-bld/wcwidth_1629357192024/work
-weasel==0.4.1
-webencodings==0.5.1
-websocket-client @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_d37u7gqts8/croot/websocket-client_1715878310260/work
-websockets==12.0
-wordcloud==1.9.4
-wrapt==1.17.2
-xxhash==3.5.0
-yarl==1.18.3
-zipp @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_echurpkwug/croot/zipp_1732630743967/work
+torch
+transformers
+pandas
+tqdm
+scikit-learn
+gradio
+numpy
+datasets
+accelerate
utils/__init__.py
ADDED
(empty file)
utils/calibration_utils.py
ADDED
@@ -0,0 +1,288 @@
import os

import numpy as np
import torch
import torch.optim as optim
from torch import nn
from torch.utils.data import DataLoader
from tqdm import tqdm

from accelerate import Accelerator
from transformers import default_data_collator, get_scheduler

from .data_utils import get_group_texts_func, get_tokenize_func


class EmbeddingCalibrator(nn.Module):
    """Learned residual correction applied to a block of embedding rows.

    Either a full-rank matrix (initialized to zero, so the initial transform is the
    identity) or a LoRA-style low-rank pair when lora_r is given.
    """

    def __init__(self, hidden_size, lora_r=None, lora_alpha=None, dtype=torch.bfloat16):
        super().__init__()
        self.use_lora = lora_r is not None

        if not self.use_lora:
            self.weight = nn.Parameter(torch.zeros(hidden_size, hidden_size, dtype=dtype))
        else:
            self.lora_scaling = lora_alpha / lora_r if lora_alpha is not None else 1.0
            self.lora_A = nn.Parameter(torch.randn(lora_r, hidden_size, dtype=dtype) * (1 / lora_r))
            self.lora_B = nn.Parameter(torch.zeros(hidden_size, lora_r, dtype=dtype))

    def forward(self, x):
        if not self.use_lora:
            # Full-rank residual correction: x + x W^T
            return x + torch.matmul(x, self.weight.t())
        else:
            # Low-rank (LoRA-style) correction: x + scaling * (x A^T) B^T
            lora_out = torch.matmul(x, self.lora_A.t())
            lora_out = torch.matmul(lora_out, self.lora_B.t())
            return x + self.lora_scaling * lora_out


class CalibrationModel(nn.Module):
    def __init__(
            self,
            base_model, lm_head, original_vocab_size, num_new_tokens,
            calibrate_embedding=True, calibrate_lm_head=True, empty_init=False,
            lora_alpha=None, lora_r=None,
            target_loss_weight=0.15, subsequent_loss_weight=0.15,
    ):
        super().__init__()
        self.base_model = base_model
        self.lm_head = lm_head
        self.new_tokens_start = original_vocab_size
        self.new_tokens_end = original_vocab_size + num_new_tokens

        self.calibrate_lm_head = calibrate_lm_head
        self.calibrate_embedding = calibrate_embedding
        if not empty_init:
            self.lm_head_calibrator = EmbeddingCalibrator(base_model.config.hidden_size, lora_r, lora_alpha)
            self.embedding_calibrator = EmbeddingCalibrator(base_model.config.hidden_size, lora_r, lora_alpha)

        self.loss_fct = nn.CrossEntropyLoss(reduction="none")
        self.subsequent_tokens_loss_alpha = subsequent_loss_weight
        self.new_tokens_loss_alpha = target_loss_weight
        self.original_tokens_loss_alpha = 1 - self.new_tokens_loss_alpha - self.subsequent_tokens_loss_alpha

    def forward(self, input_ids, labels, attention_mask=None):
        # Shift labels by one position for causal language modeling.
        labels = labels[:, 1:].contiguous()
        input_ids = input_ids[:, :-1].contiguous()
        if attention_mask is not None:
            # Keep the mask aligned with the shifted inputs.
            attention_mask = attention_mask[:, :-1].contiguous()

        if self.calibrate_embedding:
            # Run the calibrator over the new-token embedding rows only.
            E_weights = self.base_model.get_input_embeddings().weight.data
            E_weights = torch.cat((E_weights[:self.new_tokens_start],
                                   self.embedding_calibrator(E_weights[self.new_tokens_start:])))
            input_embeddings = E_weights[input_ids]
            if attention_mask is None:
                attention_mask = torch.ones_like(input_ids, dtype=torch.long)
            outputs = self.base_model(inputs_embeds=input_embeddings, attention_mask=attention_mask)
        else:
            with torch.no_grad():
                # Forward pass through the frozen base model
                outputs = self.base_model(input_ids, attention_mask=attention_mask)

        if self.calibrate_lm_head:
            lm_head_weights = self.lm_head.weight
            normed_weights = lm_head_weights.clone()
            normed_weights[self.new_tokens_start:self.new_tokens_end] = self.lm_head_calibrator(
                lm_head_weights[self.new_tokens_start:self.new_tokens_end])
            logits = torch.matmul(outputs['last_hidden_state'], normed_weights.T)
        else:
            if self.calibrate_embedding:
                logits = self.lm_head(outputs['last_hidden_state'])
            else:
                with torch.no_grad():
                    logits = self.lm_head(outputs['last_hidden_state'])

        # Weighted sum of losses over original tokens, new tokens, and the tokens
        # immediately following a new token.
        per_example_loss = self.loss_fct(logits.transpose(1, 2), labels)
        original_tokens_mask = labels < self.new_tokens_start
        new_tokens_mask = ~original_tokens_mask
        loss = 0.0
        if self.original_tokens_loss_alpha > 0.0:
            loss += self.original_tokens_loss_alpha * per_example_loss[original_tokens_mask].mean()
        if self.new_tokens_loss_alpha > 0.0:
            loss += self.new_tokens_loss_alpha * per_example_loss[new_tokens_mask].mean()
        if self.subsequent_tokens_loss_alpha > 0.0:
            subsequent_tokens_mask = torch.zeros_like(original_tokens_mask, dtype=torch.bool)
            subsequent_tokens_mask[:, 1:][new_tokens_mask[:, :-1]] = True
            loss += self.subsequent_tokens_loss_alpha * per_example_loss[subsequent_tokens_mask].mean()

        return {'loss': loss, 'logits': logits}

    def get_calibrators(self):
        embedding_calibrator = self.embedding_calibrator if self.calibrate_embedding else None
        lm_head_calibrator = self.lm_head_calibrator if self.calibrate_lm_head else None
        return {
            "embedding_calibrator": embedding_calibrator,
            "lm_head_calibrator": lm_head_calibrator,
            "new_tokens_start": self.new_tokens_start,
            "new_tokens_end": self.new_tokens_end,
        }

    def set_calibrators(self, embedding_calibrator=None, lm_head_calibrator=None):
        self.embedding_calibrator = embedding_calibrator
        self.lm_head_calibrator = lm_head_calibrator

    def save_calibrators(self, save_dir):
        os.makedirs(save_dir, exist_ok=True)
        if self.calibrate_embedding:
            torch.save(self.embedding_calibrator, os.path.join(save_dir, "embedding_calibrator.pt"))
        if self.calibrate_lm_head:
            torch.save(self.lm_head_calibrator, os.path.join(save_dir, "lm_head_calibrator.pt"))

    def load_calibrators(self, load_dir, fail_ok=False):
        """Loads the calibrator modules from a directory."""
        try:
            if self.calibrate_embedding:
                self.embedding_calibrator = torch.load(os.path.join(load_dir, "embedding_calibrator.pt"))
            if self.calibrate_lm_head:
                self.lm_head_calibrator = torch.load(os.path.join(load_dir, "lm_head_calibrator.pt"))
            return True
        except Exception:
            if fail_ok:
                return False
            raise FileNotFoundError(f"Loading calibrators from '{load_dir}' failed")


def get_calibration_model(model, original_vocab_size, num_new_tokens, target_loss_weight=0.15, subsequent_loss_weight=0.15):
    # Wrap a HF causal LM (one that exposes .model and .lm_head) so that only the two
    # calibrators are trainable.
    calibrated_model = CalibrationModel(model.model, model.lm_head, original_vocab_size, num_new_tokens,
                                        target_loss_weight=target_loss_weight,
                                        subsequent_loss_weight=subsequent_loss_weight)
    calibrated_model.base_model.eval()
    calibrated_model.lm_head.eval()

    for param in calibrated_model.base_model.parameters():
        param.requires_grad = False
    for param in calibrated_model.lm_head.parameters():
        param.requires_grad = False
    for param in calibrated_model.lm_head_calibrator.parameters():
        param.requires_grad = True
    for param in calibrated_model.embedding_calibrator.parameters():
        param.requires_grad = True

    return calibrated_model


def train_calibration_model(calibrated_model: CalibrationModel, tokenizer, dataset, save_dir=None, max_samples=None,
                            filter_examples_without_new_tokens=True, lr=1e-4, lr_schedule="linear", num_epochs=1,
                            batch_size=8, max_length=256, n_warmup_steps=0, text_col_name="text",
                            clip_grad_norm=1.0, mixed_precision=None):
    accelerator = Accelerator(mixed_precision=mixed_precision)
    # Optimizer over the calibrator parameters (everything else is frozen)
    optimizer = optim.AdamW(calibrated_model.parameters(), lr=lr)

    # Tokenize data
    if tokenizer.bos_token is not None and max_length:
        add_start_token = True
        # leave room for the <BOS> token to be added:
        max_tokenized_len = max_length - 1
    else:
        add_start_token = False
        max_tokenized_len = max_length

    def _add_start_token(batch):
        bos_tokens_tensor = torch.tensor(
            [[tokenizer.bos_token_id]] * batch["input_ids"].size(dim=0)).to(batch["input_ids"].device)
        batch["input_ids"] = torch.cat([bos_tokens_tensor, batch["input_ids"]], dim=1)
        batch["attention_mask"] = torch.cat(
            [torch.ones(bos_tokens_tensor.size(), dtype=torch.int64).to(batch["attention_mask"].device),
             batch["attention_mask"]], dim=1)
        return batch

    tokenize_function = get_tokenize_func(tokenizer, text_col_name)

    column_names = dataset.column_names

    with accelerator.main_process_first():
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=column_names,
            load_from_cache_file=False,
            desc="Running tokenizer on dataset",
        )
        group_texts = get_group_texts_func(block_size=max_tokenized_len)
        lm_dataset = tokenized_dataset.map(
            group_texts,
            batched=True,
        )

    if filter_examples_without_new_tokens:
        # Keep only blocks that contain at least one new-vocabulary token
        examples_w_new_token = np.arange(len(lm_dataset))[
            np.any(np.array(lm_dataset['input_ids']) >= calibrated_model.new_tokens_start, axis=1)]
        lm_dataset = lm_dataset.select(examples_w_new_token)

    if max_samples is not None:
        lm_dataset = lm_dataset.select(np.arange(max_samples))

    data_collator = default_data_collator

    # Create data loader
    dataloader = DataLoader(
        lm_dataset, collate_fn=data_collator, batch_size=batch_size, drop_last=True, shuffle=True,
    )

    # Learning rate scheduler; a float n_warmup_steps is read as a fraction of an epoch
    if isinstance(n_warmup_steps, float):
        n_warmup_steps = n_warmup_steps * len(dataloader)
    scheduler = get_scheduler(lr_schedule, optimizer=optimizer, num_warmup_steps=n_warmup_steps,
                              num_training_steps=len(dataloader) * num_epochs)

    calibrated_model, dataloader = accelerator.prepare(calibrated_model, dataloader)

    # Freeze the original lm_head weights
    for param in calibrated_model.lm_head.parameters():
        param.requires_grad = False

    calibrated_model.train()
    for epoch in tqdm(range(num_epochs), unit="epochs", desc="Fitting calibration"):
        total_loss = 0.0
        for step, batch in tqdm(enumerate(dataloader), total=len(dataloader), miniters=10, unit="batches"):
            if add_start_token:
                batch = _add_start_token(batch)
            batch["labels"] = batch["input_ids"]
            optimizer.zero_grad()
            outputs = calibrated_model(**batch)
            loss = outputs['loss']
            accelerator.backward(loss)
            torch.nn.utils.clip_grad_norm_(calibrated_model.parameters(), max_norm=clip_grad_norm)
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch + 1} completed. Average loss: {avg_loss}")

    if save_dir is not None:
        calibrated_model.save_calibrators(save_dir)

    return calibrated_model


def merge_calibrators_to_hf_model(hf_model, new_tokens_start, new_tokens_end=None,
                                  embedding_calibrator=None, lm_head_calibrator=None):
    # Fold trained calibrators back into a HF checkpoint's weight matrices.
    if embedding_calibrator is not None:
        embedding_calibrator.to(hf_model.device)
        embedding_weights = hf_model.get_input_embeddings().weight
        with torch.no_grad():
            calibrated_weights = embedding_calibrator(embedding_weights[new_tokens_start:new_tokens_end])
            hf_model.model.embed_tokens.weight.data[new_tokens_start:new_tokens_end] = calibrated_weights

    if lm_head_calibrator is not None:
        lm_head_calibrator.to(hf_model.device)
        lm_head_weights = hf_model.get_output_embeddings().weight
        with torch.no_grad():
            calibrated_weights = lm_head_calibrator(lm_head_weights[new_tokens_start:new_tokens_end])
            hf_model.lm_head.weight.data[new_tokens_start:new_tokens_end] = calibrated_weights

    return hf_model


def merge_calibration_model_to_hf_model(hf_model, calibrated_model):
    calibrated_model.to(hf_model.device)
    if calibrated_model.calibrate_lm_head:
        lm_head_weights = calibrated_model.lm_head.weight
        normed_weights = calibrated_model.lm_head_calibrator(
            lm_head_weights[calibrated_model.new_tokens_start:calibrated_model.new_tokens_end])
        with torch.no_grad():
            hf_model.lm_head.weight.data[
                calibrated_model.new_tokens_start:calibrated_model.new_tokens_end] = normed_weights
    if calibrated_model.calibrate_embedding:
        embedding_weights = calibrated_model.base_model.get_input_embeddings().weight
        normed_weights = calibrated_model.embedding_calibrator(
            embedding_weights[calibrated_model.new_tokens_start:calibrated_model.new_tokens_end])
        with torch.no_grad():
            hf_model.model.embed_tokens.weight.data[
                calibrated_model.new_tokens_start:calibrated_model.new_tokens_end] = normed_weights
    return hf_model
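
A sketch of how these pieces compose (hypothetical setup, not part of the commit): it assumes a Llama-style checkpoint that exposes .model and .lm_head, as get_calibration_model requires, and a tokenizer extended with two new whole-word tokens.

    from datasets import load_dataset
    from transformers import AutoModelForCausalLM, AutoTokenizer

    name = "my-llama-style-model"  # placeholder: any causal LM exposing .model and .lm_head
    model = AutoModelForCausalLM.from_pretrained(name)
    tokenizer = AutoTokenizer.from_pretrained(name)

    orig_vocab = len(tokenizer)
    tokenizer.add_tokens(["perplexity", "tokenization"])  # hypothetical new word tokens
    model.resize_token_embeddings(len(tokenizer))

    calib = get_calibration_model(model, orig_vocab, num_new_tokens=2)
    data = load_dataset("Salesforce/wikitext", "wikitext-2-raw-v1", split="train")
    calib = train_calibration_model(calib, tokenizer, data, num_epochs=1, batch_size=4,
                                    save_dir="calibrators")
    model = merge_calibration_model_to_hf_model(model, calib)  # fold corrections back in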
utils/data_utils.py
ADDED
@@ -0,0 +1,214 @@
import re
from collections import Counter
from itertools import chain

from datasets import load_dataset, Dataset, DatasetDict
from tqdm import tqdm

LANGUAGES_TO_DECODE_FROM_BYTES = ["he", "fr", "uk"]
STREAMING_DATASETS = ["fineweb-edu"]


def load_pg19_val_and_test():
    # Load in streaming mode, then materialize only the two small splits
    streaming_dataset = load_dataset("deepmind/pg19", split=None, streaming=True)

    test_dataset = Dataset.from_list(list(streaming_dataset["test"]))
    validation_dataset = Dataset.from_list(list(streaming_dataset["validation"]))

    return DatasetDict({"validation": validation_dataset, "test": test_dataset})


def load_pubmed(n_samples=10000):
    # Load in streaming mode and carve train/validation/test out of the stream
    streaming_dataset = load_dataset("MedRAG/pubmed", streaming=True)

    data = list(streaming_dataset["train"].take(n_samples * 4))
    train = Dataset.from_list(data[:2 * n_samples])
    validation = Dataset.from_list(data[2 * n_samples:3 * n_samples])
    test = Dataset.from_list(data[3 * n_samples:])
    dataset = DatasetDict({"train": train, "validation": validation, "test": test})
    dataset = dataset.rename_column("content", "text")
    return dataset


def load_lm_dataset(dataset_name, language="en", split=None):
    """
    Loads a pretraining or perplexity-evaluation dataset by name and language.

    Args:
        dataset_name (str): The dataset to load. Options:
            - 'wikitext' (WikiText-2, the smaller WikiText dataset)
            - 'wikitext-103' (the larger WikiText dataset)
            - 'fineweb-edu' (10BT sample of the FineWeb-Edu corpus)
            - 'cord19', 'pubmed' (scientific/biomedical text)
            - 'wikilingua', 'xsum', 'cnn' (summarization corpora, normalized to a 'text' column)
            - 'pg19' (Project Gutenberg dataset for long-context modeling)
            - 'wiki40b' (Wikipedia dataset in multiple languages)
        language (str): Language code for multilingual datasets (e.g., 'en' for English).
            Defaults to 'en'.
        split (str): Optional split to load.

    Returns:
        Dataset: Loaded Hugging Face dataset.
    """
    if dataset_name.lower() == 'wikitext':
        return load_dataset("Salesforce/wikitext", "wikitext-2-raw-v1", split=split)
    elif dataset_name.lower() == 'fineweb-edu':
        return load_dataset("HuggingFaceFW/fineweb-edu", name="sample-10BT")
    elif dataset_name.lower() == 'wikitext-103':
        return load_dataset("Salesforce/wikitext", "wikitext-103-raw-v1", split=split)
    elif dataset_name.lower() == 'cord19':
        return load_dataset("allenai/cord19", "fulltext", trust_remote_code=True)
    elif dataset_name.lower() == 'pubmed':
        return load_pubmed()
    elif dataset_name.lower() == 'wikilingua':
        dataset = load_dataset("GEM/wiki_lingua", trust_remote_code=True)
        dataset = dataset.filter(lambda ex: (ex['source_language'] == "en") & (ex['target_language'] == "en"))
        dataset = dataset.rename_column("source", "text")
        dataset = dataset.rename_column("target", "summary")
        return dataset
    elif dataset_name.lower() == 'xsum':
        dataset = load_dataset("EdinburghNLP/xsum")
        dataset = dataset.rename_column("document", "text")
        return dataset
    elif dataset_name.lower() == 'cnn':
        dataset = load_dataset("abisee/cnn_dailymail", "3.0.0")
        dataset = dataset.rename_column("article", "text")
        dataset = dataset.rename_column("highlights", "summary")
        dataset = dataset.map(lambda example: {"text": example["text"].replace("(CNN)", "")})
        return dataset
    elif dataset_name.lower() == 'pg19':
        return load_pg19_val_and_test()
    elif dataset_name.lower() == 'wiki40b':
        dataset = load_dataset("google/wiki40b", language, split=split)
        if language in LANGUAGES_TO_DECODE_FROM_BYTES:
            # wiki40b stores some languages as escaped byte strings; decode them back to text
            dataset = dataset.map(lambda x: {
                "text": bytes(x["text"][2:-1], "utf-8").decode("unicode_escape").encode("latin1").decode("utf-8").replace("_NEWLINE_", "\n")
            })
        return dataset
    else:
        raise ValueError(
            "Dataset not recognized. Available options: 'wikitext', 'wikitext-103', 'fineweb-edu', "
            "'cord19', 'pubmed', 'wikilingua', 'xsum', 'cnn', 'pg19', 'wiki40b'.")


def extract_new_words_from_dataset(
        dataset: Dataset, tokenizer, text_column: str = "text", max_samples: int = None,
        filter_func=(lambda word, token_count: True)):
    """
    Extracts the unique words in a dataset that the tokenizer splits into multiple tokens.

    Args:
        dataset (Dataset): Dataset to scan.
        tokenizer: Tokenizer used to decide whether a word is multi-token.
        text_column (str): The column in the dataset containing text. Defaults to 'text'.
        max_samples (int): Optional cap on the number of examples to scan.
        filter_func: Optional predicate over (word, token_count) for extra filtering.

    Returns:
        tuple: (list of new words, dict mapping each new word to its frequency).
    """
    if max_samples:
        dataset = dataset.select(range(max_samples))

    # Regular expression to split text into words (adjust as needed for specific languages)
    word_pattern = re.compile(r"\b\w+(?:[-']\w+)*\b")

    # Collect words and their frequencies
    all_words = list()
    for record in tqdm(dataset, total=len(dataset), miniters=10,
                       desc="Extracting all words from dataset...", unit="examples"):
        text = record.get(text_column, "")
        all_words += word_pattern.findall(text)

    word_frequencies = Counter(all_words)
    all_words = list(word_frequencies.keys())
    # A word counts as "new" only if it is multi-token both with and without a leading space
    token_counts = [len(x) for x in tokenizer(all_words, add_special_tokens=False)["input_ids"]]
    w_whitespace_token_counts = [len(x) for x in tokenizer(
        [f" {w}" for w in all_words], add_special_tokens=False)["input_ids"]]

    new_words = [word for word, count, w_whitespace_count
                 in zip(all_words, token_counts, w_whitespace_token_counts)
                 if ((count > 1) and (w_whitespace_count > 1) and filter_func(word, count))]
    new_words_freq = {word: word_frequencies[word] for word in new_words}
    return new_words, new_words_freq


def get_group_texts_func(block_size=1024):
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}

        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder; if total_length < block_size, this batch is excluded
        # and an empty dict is returned. Padding could be used instead if the model supports
        # it; customize this part to your needs.
        total_length = (total_length // block_size) * block_size
        # Split into chunks of block_size.
        result = {
            k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result
    return group_texts


def get_tokenize_func(tokenizer, text_col_name):
    def _tokenize(examples):
        output = tokenizer(
            examples[text_col_name],
            return_token_type_ids=False,
            add_special_tokens=False,
        )
        return output
    return _tokenize


def tokenize_and_prepare_dataset(
        dataset, tokenizer, accelerator=None,
        text_col_name: str = "text",
        max_length: int = 256,
        eval_max_samples: int = None,
):
    if tokenizer.bos_token is not None and max_length:
        # leave room for the <BOS> token to be added:
        max_tokenized_len = max_length - 1
    else:
        max_tokenized_len = max_length

    tokenize_function = get_tokenize_func(tokenizer, text_col_name)

    column_names = dataset.column_names

    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=column_names,
        load_from_cache_file=False,
        desc="Running tokenizer on dataset",
    )
    group_texts = get_group_texts_func(block_size=max_tokenized_len)
    lm_dataset = tokenized_dataset.map(
        group_texts,
        batched=True,
    )

    if eval_max_samples:
        lm_dataset = lm_dataset.select(range(eval_max_samples))

    return lm_dataset
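
A self-contained check of the chunking helper (illustrative only): group_texts concatenates already-tokenized examples and re-slices them into fixed-size blocks, dropping the remainder and mirroring input_ids into labels.

    group_texts = get_group_texts_func(block_size=4)
    batch = {"input_ids": [[1, 2, 3], [4, 5, 6, 7, 8]],
             "attention_mask": [[1, 1, 1], [1, 1, 1, 1, 1]]}
    out = group_texts(batch)
    print(out["input_ids"])                   # [[1, 2, 3, 4], [5, 6, 7, 8]]
    print(out["labels"] == out["input_ids"])  # True: labels mirror input_ids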
utils/enums.py
ADDED
@@ -0,0 +1,13 @@
from enum import Enum


class RetrievalTechniques(Enum):
    ReverseLogitLens = 1
    LogitLens = 2
    Patchscopes = 3


class MultiTokenKind(Enum):
    Split = 1
    Typo = 2
    Natural = 3
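
These values are consumed through RetrievalProcessor.multi_token_kind; a plausible dispatch (an assumption — the actual wiring lives in code not shown in this diff):

    def get_word_fn(processor):
        # Hypothetical mapping from MultiTokenKind to the processor's retrieval variants.
        return {
            MultiTokenKind.Natural: processor.get_next_word,
            MultiTokenKind.Typo: processor.get_next_full_word_typo,
            MultiTokenKind.Split: processor.get_next_full_word_separated,
        }[processor.multi_token_kind]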
utils/eval_utils.py
ADDED
@@ -0,0 +1,334 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torch import nn
|
3 |
+
from torch.nn import functional as F
|
4 |
+
from torch.utils.data import DataLoader
|
5 |
+
from accelerate import Accelerator
|
6 |
+
from transformers import default_data_collator
|
7 |
+
from collections import defaultdict
|
8 |
+
from tqdm import tqdm
|
9 |
+
import numpy as np
|
10 |
+
|
11 |
+
|
12 |
+
def is_not_number(s):
|
13 |
+
try:
|
14 |
+
float(s) # Try converting the string to a float
|
15 |
+
return False # If conversion is successful, it's a number
|
16 |
+
except ValueError:
|
17 |
+
return True # If conversion fails, it's not a number
|
18 |
+
|
19 |
+
|
20 |
+
def get_contexts_ending_with_word(word, dataset):
|
21 |
+
result_contexts = []
|
22 |
+
word_len = len(word)
|
23 |
+
|
24 |
+
# Iterate over the dataset
|
25 |
+
for example in dataset:
|
26 |
+
text = example["text"]
|
27 |
+
|
28 |
+
# Find all occurrences of the word in the text
|
29 |
+
start = 0
|
30 |
+
while True:
|
31 |
+
idx = text.find(word, start)
|
32 |
+
if idx == -1:
|
33 |
+
break
|
34 |
+
|
35 |
+
# Ensure that the word is isolated (not a substring of another word)
|
36 |
+
if (idx == 0 or not text[idx - 1].isalnum()) and (
|
37 |
+
idx + word_len == len(text) or not text[idx + word_len].isalnum()):
|
38 |
+
# Text ends with the word
|
39 |
+
result_contexts.append(text[:idx + word_len].strip())
|
40 |
+
start = idx + word_len
|
41 |
+
|
42 |
+
return result_contexts
|
43 |
+
|
44 |
+
|
45 |
+
def get_texts_containing_word(words, dataset):
|
46 |
+
result_texts = []
|
47 |
+
words_set = set(words)
|
48 |
+
|
49 |
+
# Iterate over the dataset
|
50 |
+
for example in dataset:
|
51 |
+
if words_set.intersection(set(example["text"].split())):
|
52 |
+
result_texts.append(example["text"])
|
53 |
+
|
54 |
+
return result_texts
|
55 |
+
|
56 |
+
|
57 |
+
def compute_topk_token_rank(logits, labels, k=1000):
|
58 |
+
# Get the top-k predicted logits and their indices
|
59 |
+
topk_logits, topk_indices = torch.topk(logits, k, dim=-1)
|
60 |
+
|
61 |
+
# Expand the labels for comparison
|
62 |
+
labels_expanded = labels.unsqueeze(-1).expand_as(topk_indices)
|
63 |
+
|
64 |
+
# Check if the label token is within the top-k predictions
|
65 |
+
rank_in_topk = (topk_indices == labels_expanded).nonzero(as_tuple=False)
|
66 |
+
|
67 |
+
# Create a rank tensor initialized with k (max rank is k)
|
68 |
+
ranks = torch.full(labels.shape, k, dtype=torch.long, device=logits.device)
|
69 |
+
|
70 |
+
# For labels in top-k, set the rank accordingly
|
71 |
+
ranks[rank_in_topk[:, 0], rank_in_topk[:, 1]] = rank_in_topk[:, 2] + 1
|
72 |
+
|
73 |
+
return ranks
|
74 |
+
|
75 |
+
|
76 |
+
def count_tokens_in_dataset(dataset, tokenizer, text_column='text'):
|
77 |
+
def tokenize_and_count(examples):
|
78 |
+
return {'num_tokens': [len(tokenizer(ex).input_ids) for ex in examples[text_column]]}
|
79 |
+
|
80 |
+
tokenized_dataset = dataset.map(tokenize_and_count, batched=True, remove_columns=dataset.column_names)
|
81 |
+
|
82 |
+
total_tokens = sum(tokenized_dataset['num_tokens'])
|
83 |
+
return total_tokens
|
84 |
+
|
85 |
+
|
86 |
+
def filter_single_token_words(array, tokenizer, add_space_prefix_for_lower=True):
|
87 |
+
def _is_multi_token(word):
|
88 |
+
if add_space_prefix_for_lower and word[0].islower():
|
89 |
+
word = " " + word
|
90 |
+
return len(tokenizer.encode(word, add_special_tokens=False))
|
91 |
+
token_counts = array.apply(_is_multi_token)
|
92 |
+
mask = token_counts > 1
|
93 |
+
return array[mask], token_counts
|
94 |
+
|
95 |
+
|
96 |
+
# TODO make clearer what's its use
|
97 |
+
def get_last_zero_in_every_seq_mask(tensor):
|
98 |
+
# Find where consecutive zeros end
|
99 |
+
zero_mask = (tensor == 0)
|
100 |
+
diff = torch.diff(zero_mask.int(), dim=1)
|
101 |
+
last_zero_mask = torch.cat([diff, torch.ones(tensor.size(0), 1, dtype=diff.dtype).to(tensor.device)], dim=1) == -1
|
102 |
+
|
103 |
+
# Create the output
|
104 |
+
output = 1 - tensor
|
105 |
+
output[zero_mask & ~last_zero_mask] = 0
|
106 |
+
return output
|
107 |
+
|
108 |
+
|
109 |
+
def get_first_zero_in_every_seq_mask(tensor):
|
110 |
+
# Identify where consecutive zeros begin
|
111 |
+
zero_mask = (tensor == 0)
|
112 |
+
diff = torch.diff(zero_mask.int(), dim=1, prepend=torch.zeros(tensor.size(0), 1, dtype=torch.int).to(tensor.device))
|
113 |
+
first_zero_mask = diff == 1 # Marks the beginning of each sequence of zeros
|
114 |
+
|
115 |
+
# Create the output
|
116 |
+
output = 1 - tensor
|
117 |
+
output[zero_mask & ~first_zero_mask] = 0
|
118 |
+
return output
|
119 |
+
|
120 |
+
|
121 |
+
def _add_start_token(batch, tokenizer):
    bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * batch["input_ids"].size(dim=0)).to(batch["input_ids"].device)
    batch["input_ids"] = torch.cat([bos_tokens_tensor, batch["input_ids"]], dim=1)
    batch["attention_mask"] = torch.cat(
        [torch.ones(bos_tokens_tensor.size(), dtype=torch.int64).to(batch["attention_mask"].device), batch["attention_mask"]], dim=1)
    return batch


def _ignore_new_words_in_attention_mask(shift_attention_mask_batch, shift_labels, new_token_ids=None, replaced_token_seqs_by_len=None):
    ignore_mask = None  # initialized so it is defined even if both arguments are None
    # Ignore token ids of new vocabulary words in shift_labels and shift_logits
    if new_token_ids is not None:
        ignore_mask = torch.isin(shift_labels, new_token_ids)
        shift_attention_mask_batch = shift_attention_mask_batch * (~ignore_mask).long()

    # Ignore multi-token sequences that were replaced with a single token
    if replaced_token_seqs_by_len is not None:
        # Create a mask that will be updated where sequences match
        ignore_mask = shift_attention_mask_batch.clone()  # Clone the attention mask to modify it
        # Loop over sequences in skip_token_seqs
        for seq_len, seqs in replaced_token_seqs_by_len.items():
            # Create a sliding window of the same size as the skip_seq and check for matches
            for i in range(shift_labels.size(1) - seq_len + 1):
                # Check if the sequence matches at position i
                window = shift_labels[:, i:i + seq_len]
                curr_mask = torch.all(window.unsqueeze(1) == seqs.unsqueeze(0), dim=-1)
                if curr_mask.any():
                    # Zero out the ignore mask for the length of the sequence
                    ignore_mask[curr_mask.any(dim=-1), i:i + seq_len] = 0
        # Apply the ignore mask to the attention mask
        shift_attention_mask_batch *= ignore_mask

    return shift_attention_mask_batch, ignore_mask


# TODO consider not aggregating results here, to enable metrics for specific words
def compute_metrics(
        logits, labels, attention_mask,
        compute_target_metrics=True, compute_subsequent_metrics=True, compute_perplexity=False,
        return_successful_targets=False,
        original_labels=None, original_logits=None,
        debug=False):
    target_results = dict()  # will hold metrics for all the new words we add or their original tokenization
    background_results = dict()  # will hold metrics for all background tokens, i.e., not the ones we add or replace
    overall_results = dict()  # will hold metrics for all tokens
    successful_targets = None  # will hold the list of target tokens successfully predicted
    if compute_subsequent_metrics:
        # prepare labels and attention masks for computing metrics only for the first tokens following the new words
        subsequent_labels = labels[:, 1:]
        subsequent_attention_mask = get_last_zero_in_every_seq_mask(attention_mask[..., :-1].contiguous())
        subsequent_attention_mask_bool = subsequent_attention_mask == 1
    attention_mask_bool = attention_mask == 1
    overall_mask_bool = attention_mask_bool

    if compute_target_metrics:
        target_mask = get_first_zero_in_every_seq_mask(attention_mask)
        target_mask_bool = target_mask == 1
        overall_mask_bool = attention_mask_bool | target_mask_bool

    if compute_perplexity:
        background_results["perplexity"] = torch.exp(
            (F.cross_entropy(logits.transpose(1, 2), labels, reduction="none") * attention_mask).sum(1)
            / attention_mask.sum(1)
        ).mean().detach().cpu().numpy()

    top1 = logits.argmax(dim=-1)
    if original_logits is not None:
        orig_top1 = original_logits.argmax(dim=-1)

    if compute_target_metrics:
        target_results["top1_acc"] = ((labels == top1)[target_mask_bool]).detach().cpu().numpy()
        if original_labels is not None:
            target_results["sum_top1_acc"] = (
                ((original_labels == top1) | (labels == top1))[target_mask_bool]).detach().cpu().numpy()
        if original_logits is not None:
            target_results["orig_top1_acc"] = (
                (original_labels == orig_top1)[target_mask_bool]).detach().cpu().numpy()

        if return_successful_targets:
            successful_targets = (labels[(labels == top1) & target_mask_bool]).detach().cpu().numpy()

    background_results["top1_acc"] = ((
        labels == top1)[attention_mask_bool]).detach().cpu().numpy()
    if compute_subsequent_metrics:
        background_results["subsequent_top1_acc"] = ((subsequent_labels == top1[:, 1:])[subsequent_attention_mask_bool]).detach().cpu().numpy()
    if original_logits is not None:
        background_results["orig_top1_acc"] = (
            (original_labels == orig_top1)[attention_mask_bool]).detach().cpu().numpy()
        if compute_subsequent_metrics:
            background_results["orig_subsequent_top1_acc"] = (
                (subsequent_labels == orig_top1[:, 1:])[subsequent_attention_mask_bool]).detach().cpu().numpy()

    overall_results["top1_acc"] = ((labels == top1))[overall_mask_bool].detach().cpu().numpy()
    if original_labels is not None:
        overall_results["sum_top1_acc"] = (
            ((original_labels == top1) | (labels == top1)))[overall_mask_bool].detach().cpu().numpy()
    if original_logits is not None:
        overall_results["orig_top1_acc"] = (
            (original_labels == orig_top1)[overall_mask_bool]).detach().cpu().numpy()

    if debug:
        import pdb; pdb.set_trace()
    return background_results, target_results, overall_results, successful_targets

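A self-contained toy call (editor's sketch; shapes and mask values are illustrative, with zeros in the attention mask marking a replaced target span):

import torch

logits = torch.randn(2, 8, 100)                           # toy batch: 2 sequences, 8 positions, vocab 100
labels = torch.randint(0, 100, (2, 8))
attention_mask = torch.tensor([[1, 1, 0, 0, 1, 1, 1, 1],  # zeros mark the target span
                               [1, 1, 1, 0, 0, 1, 1, 1]])
bg, tgt, overall, _ = compute_metrics(logits, labels, attention_mask, compute_perplexity=True)
print(bg["perplexity"], tgt["top1_acc"].mean())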
def eval_next_word_prediction(
        model, tokenizer, lm_dataset, accelerator=None,
        batch_size: int = 4,
        new_token_ids=None, replaced_token_seqs_by_len=None,
        new_token_to_original_first_token=None,
        max_length: int = 256,
        drop_last: bool = True,
        eval_max_samples: int = None,
        eval_shuffle_samples: bool = False,
        reduction="none",
):
    if accelerator is None:
        accelerator = Accelerator()
    model.eval()
    if tokenizer.bos_token is not None and max_length:
        add_start_token = True
    else:
        add_start_token = False

    data_collator = default_data_collator

    if eval_max_samples:
        eval_idx = range(min(eval_max_samples, len(lm_dataset)))  # fixed: range started at len(lm_dataset)
        if eval_shuffle_samples:
            eval_idx = np.random.choice(len(lm_dataset), min(eval_max_samples, len(lm_dataset)), replace=False)
        lm_dataset = lm_dataset.select(eval_idx)

    # Create data loaders
    eval_dataloader = DataLoader(
        lm_dataset, collate_fn=data_collator, batch_size=batch_size, drop_last=drop_last, shuffle=False,
    )
    eval_dataloader = accelerator.prepare(eval_dataloader)

    if new_token_ids is not None:
        new_token_ids = torch.tensor(new_token_ids).to(model.device)
    if replaced_token_seqs_by_len is not None:
        replaced_token_seqs_by_len = {token_length: torch.tensor(skip_token_seqs).to(model.device)
                                      for token_length, skip_token_seqs in replaced_token_seqs_by_len.items()
                                      if len(skip_token_seqs) > 0}
    if new_token_to_original_first_token is not None:
        # Convert the mapping into a tensor for efficient indexing; the mapping defaults to identity
        new_token_to_orig_first_mapping_tensor = torch.arange(len(tokenizer), device=model.device)
        new_token_to_orig_first_mapping_tensor[torch.tensor(list(new_token_to_original_first_token.keys()), device=model.device)] = \
            torch.tensor(list(new_token_to_original_first_token.values()), device=model.device)

    target_metrics = defaultdict(list)
    background_metrics = defaultdict(list)
    overall_metrics = defaultdict(list)

    # run eval and compute metrics
    for batch_i, batch in tqdm(enumerate(eval_dataloader), total=len(eval_dataloader), miniters=10, desc="Evaluating vocabulary..."):
        if add_start_token:
            batch = _add_start_token(batch, tokenizer)

        labels = batch["input_ids"]
        attn_mask = batch["attention_mask"]
        batch.pop("labels")
        with torch.no_grad():
            outputs = model(**batch)
        out_logits = outputs.logits

        shift_logits = out_logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        shift_attention_mask_batch = attn_mask[..., 1:].contiguous()

        shift_attention_mask_batch, ignore_mask = \
            _ignore_new_words_in_attention_mask(
                shift_attention_mask_batch, shift_labels, new_token_ids, replaced_token_seqs_by_len)
        original_labels = None if new_token_to_original_first_token is None \
            else new_token_to_orig_first_mapping_tensor[shift_labels]
        original_logits = None if new_token_ids is None else torch.cat(
            [shift_logits[:, :, :min(new_token_ids)], shift_logits[:, :, max(new_token_ids) + 1:]], dim=-1)

        background_results, target_results, overall_results, successful_targets = \
            compute_metrics(
                shift_logits, shift_labels, shift_attention_mask_batch,
                original_labels=original_labels, original_logits=original_logits, compute_perplexity=True)

        for metric_name, metric_value in target_results.items():
            target_metrics[metric_name].append(np.array(metric_value))
        for metric_name, metric_value in background_results.items():
            background_metrics[metric_name].append(metric_value)
        for metric_name, metric_value in overall_results.items():
            overall_metrics[metric_name].append(metric_value)

    eval_dataloader = accelerator.free_memory(eval_dataloader)

    def _concat_func(x):
        if isinstance(x, np.ndarray) and len(x.shape) > 1:
            x = np.concatenate(x)
        elif isinstance(x, (list, tuple)) and len(x) > 1:
            if isinstance(x[0], np.ndarray) and len(x[0].shape) == 0:
                x = np.array(x)
            else:
                x = np.concatenate(x)
        return x

    # apply reduction
    reduce_func = _concat_func
    if reduction == 'mean':
        reduce_func = lambda x: np.mean(_concat_func(x)).item()

    for metric_name, metric_value in target_metrics.items():
        target_metrics[metric_name] = reduce_func(metric_value)
    for metric_name, metric_value in background_metrics.items():
        background_metrics[metric_name] = reduce_func(metric_value)
    for metric_name, metric_value in overall_metrics.items():
        overall_metrics[metric_name] = reduce_func(metric_value)
    return background_metrics, target_metrics, overall_metrics

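A hedged sketch of the evaluation entry point; model, tokenizer, and lm_dataset are assumed to be prepared elsewhere in the app, with lm_dataset a tokenized datasets.Dataset providing input_ids, attention_mask, and labels columns:

# Assumed inputs: model, tokenizer, lm_dataset prepared elsewhere.
bg, tgt, overall = eval_next_word_prediction(
    model, tokenizer, lm_dataset,
    batch_size=8, eval_max_samples=1000, reduction="mean")
print(bg["perplexity"], bg["top1_acc"])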
utils/file_utils.py
ADDED
@@ -0,0 +1,94 @@
import os
import re
import pandas as pd


def save_df_to_dir(results_df, base_dir, sub_dirs, file_name_format, add_context, model_name):
    # Get the root directory of the project
    root_dir = os.path.dirname(os.path.abspath(__file__))

    # Construct the output directory path
    output_dir = os.path.join(root_dir, base_dir, *sub_dirs)
    os.makedirs(output_dir, exist_ok=True)

    # Construct the file name
    file_name = file_name_format.format(model_name=model_name,
                                        context="with_context" if add_context else "without_context")

    # Construct the full file path
    file_path = os.path.join(output_dir, file_name)

    # Save the DataFrame to CSV
    results_df.to_csv(file_path, index=False)


def merge_dfs(base_dir, exp_name, part_format="part_{i}_", output_dir=None,
              filename="patchscopes_results.parquet", output_filename="patchscopes_results.parquet"):
    """
    Merges DataFrames from directories matching the part format into a single DataFrame,
    and optionally saves the result to a file.

    Args:
        base_dir (str): The base directory containing the data.
        exp_name (str): The experiment name to look for within part directories.
        part_format (str): The general format for identifying parts (e.g., "part_{i}_").
        output_dir (str, optional): Directory to save the merged DataFrame. Default is None.
        filename (str): The filename of the Parquet file to read in each part directory.
        output_filename (str): Name of the output file if saving is enabled.

    Returns:
        tuple: The merged DataFrame and the list of per-part DataFrames.
    """
    dataframes = []
    part_regex = part_format.replace("{i}", r"\d+")

    # List all directories in base_dir
    for dir_name in os.listdir(base_dir):
        if os.path.isdir(os.path.join(base_dir, dir_name)) and re.match(part_regex, dir_name) and dir_name.endswith(exp_name):
            part_dir = os.path.join(base_dir, dir_name)
            file_path = os.path.join(part_dir, filename)

            if os.path.exists(file_path):
                # Read the DataFrame and add it to the list
                df = pd.read_parquet(file_path)
                dataframes.append(df)

    # Concatenate all DataFrames into a single DataFrame
    merged_df = pd.concat(dataframes, axis=1)

    # Save the result to file if output_dir is given
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, output_filename)
        merged_df.to_parquet(output_path, index=False)

    return merged_df, dataframes


def parse_string_list_from_file(file_path, delimiter=None):
    """
    Parses a list of strings from a file, handling various list formats.

    Args:
        file_path (str): Path to the file containing the list.
        delimiter (str, optional): Explicit delimiter to split on; "newline" is treated as "\n".
            If None, common list notations (commas, brackets, quotes) are handled automatically.

    Returns:
        list: A list of parsed strings.
    """
    with open(file_path, 'r') as file:
        content = file.read()

    if delimiter is None:
        # Remove newlines and excess whitespace
        content = re.sub(r'\s+', ' ', content.strip())

        # Handle different delimiters and list formats:
        # remove common list notations like commas, brackets, quotes, etc.
        items = re.split(r'[,\[\]\(\)\{\}"\'\s]+', content)
    else:
        if delimiter == "newline":  # TODO fix this
            delimiter = "\n"
        items = [item.strip() for item in content.split(delimiter)]

    # Filter out any empty strings from the list
    return [item for item in items if item]
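A hedged usage sketch of merge_dfs; the directory layout below is hypothetical, standing in for the per-part outputs this helper expects:

# Hypothetical layout: results/part_0_my_exp/patchscopes_results.parquet, part_1_my_exp/..., etc.
merged, parts = merge_dfs(base_dir="results", exp_name="my_exp", output_dir="results/merged")
print(merged.shape, len(parts))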
utils/logit_lens.py
ADDED
@@ -0,0 +1,304 @@
"""Provides a class for mapping transformer hidden states to logits (and vice versa).

Example:

    from utils.logit_lens import LogitLens, ReverseLogitLens

    model = AutoModelForCausalLM.from_pretrained(model_name).to(device).to(dtype)
    lens = LogitLens.from_model(model).to(device).to(dtype)
    reverse_lens = ReverseLogitLens.from_model(model).to(device).to(dtype)

    hidden_state = ...
    result = lens(hidden_state, layer_index)  # layer_index is unused; any value works
"""

import abc
import copy
from typing import Union

import torch
from torch import nn
import torch.nn.functional as F

from transformers import models
from transformers import PreTrainedModel


Model = Union[PreTrainedModel]
Norm = Union[
    nn.LayerNorm,
    models.llama.modeling_llama.LlamaRMSNorm,
    models.gemma.modeling_gemma.GemmaRMSNorm,
    models.gemma2.modeling_gemma2.Gemma2RMSNorm,
    nn.Module,
]


def get_unembedding_matrix(model: Model) -> nn.Linear:
    """The final linear transformation from the model hidden state to the output."""
    if isinstance(model, PreTrainedModel):
        unembed = model.get_output_embeddings()
        if not isinstance(unembed, nn.Linear):
            raise ValueError("We currently only support linear unembeddings")
        return unembed
    else:
        raise ValueError(f"Model class {type(model)} not recognized!")


def get_embedding_matrix(model: nn.Module) -> nn.Embedding:
    """The initial embedding matrix from the input tokens to the model hidden state."""
    if isinstance(model, PreTrainedModel):
        embed = model.get_input_embeddings()
        if not isinstance(embed, nn.Embedding):
            raise ValueError("We currently only support embedding matrices")
        return embed
    else:
        raise ValueError(f"Model class {type(model)} not recognized!")


def get_final_norm(model: Model) -> Norm:
    """Get the final norm from a model.

    This isn't standardized across models, so this will need to be updated as
    we add new models.
    """
    if not hasattr(model, "base_model"):
        raise ValueError("Model does not have a `base_model` attribute.")

    base_model = model.base_model
    if isinstance(base_model, models.opt.modeling_opt.OPTModel):
        final_layer_norm = base_model.decoder.final_layer_norm
    elif isinstance(base_model, models.gpt_neox.modeling_gpt_neox.GPTNeoXModel):
        final_layer_norm = base_model.final_layer_norm
    elif isinstance(
        base_model,
        (
            models.bloom.modeling_bloom.BloomModel,
            models.gpt2.modeling_gpt2.GPT2Model,
            models.gpt_neo.modeling_gpt_neo.GPTNeoModel,
            models.gptj.modeling_gptj.GPTJModel,
        ),
    ):
        final_layer_norm = base_model.ln_f
    elif isinstance(base_model, models.llama.modeling_llama.LlamaModel):
        final_layer_norm = base_model.norm
    elif isinstance(base_model, models.mistral.modeling_mistral.MistralModel):
        final_layer_norm = base_model.norm
    elif isinstance(base_model, models.t5.modeling_t5.T5ForConditionalGeneration):
        # For T5, use the LayerNorm from the last decoder block, before the feed-forward layer.
        final_layer_norm = base_model.decoder.block[-1].layer[1].layer_norm
    else:
        raise NotImplementedError(f"Unknown model type {type(base_model)}")

    if final_layer_norm is None:
        raise ValueError("Model does not have a final layer norm.")

    assert isinstance(final_layer_norm, Norm.__args__)  # type: ignore

    return final_layer_norm


class Unembed(nn.Module):
    """Module that maps transformer hidden states to logits."""

    final_norm: Norm
    unembedding: nn.Linear

    def __init__(
        self,
        model: Model,
    ):
        """Initialize unembed.

        Args:
            model: A HuggingFace model from which to extract the unembedding matrix.
        """
        super().__init__()
        final_norm = get_final_norm(model)
        unembedding_matrix = get_unembedding_matrix(model)

        self.final_norm = copy.deepcopy(final_norm)
        self.unembedding = copy.deepcopy(unembedding_matrix)

        # In general we don't want to finetune the unembed operation.
        self.requires_grad_(False)

    def forward(self, h: torch.Tensor) -> torch.Tensor:
        """Convert hidden states into logits."""
        return self.unembedding(self.final_norm(h))


class Reembed(nn.Module):
    """Module that scores transformer hidden states against the input embedding matrix."""

    embedding: torch.Tensor

    def __init__(
        self,
        model: Model,
        distance_metric: str = "logits",
    ):
        """Initialize reembed.

        Args:
            model: A HuggingFace model from which to extract the embedding matrix.
            distance_metric: Similarity used to score hidden states against embeddings
                ('logits', 'cosine', or 'euclidean').
        """
        super().__init__()
        embedding_matrix = get_embedding_matrix(model)

        self.embedding = copy.deepcopy(embedding_matrix.weight.data)

        self.distance_metric = distance_metric

        # In general we don't want to finetune the reembed operation.
        self.requires_grad_(False)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        """Convert hidden states into logits."""
        if self.distance_metric == 'logits':
            logits = torch.matmul(hidden_state, self.embedding.T).squeeze(0)

        elif self.distance_metric == 'cosine':
            # Normalize E and h
            E_normalized = F.normalize(self.embedding, p=2, dim=-1)
            h_normalized = F.normalize(hidden_state, p=2, dim=-1)

            # Compute cosine similarity
            logits = torch.matmul(h_normalized, E_normalized.T).squeeze(0)

        elif self.distance_metric == 'euclidean':
            # Compute Euclidean distance
            distances = torch.cdist(hidden_state, self.embedding, p=2).squeeze(0)

            # Convert distances to logits (negative distance for logits-like values)
            logits = -distances

        else:  # Compute a regular dot product as a similarity measure
            logits = torch.matmul(hidden_state, self.embedding.T).squeeze(0)
        return logits


class ReverseLens(abc.ABC, nn.Module):
    """Abstract base class for all reverse lenses."""

    reembed: Reembed

    def __init__(self, reembed: Reembed):
        """Create a ReverseLens.

        Args:
            reembed: The reembed operation to use.
        """
        super().__init__()

        self.reembed = reembed

    @abc.abstractmethod
    def forward(self, h: torch.Tensor, idx: int) -> torch.Tensor:
        """Decode hidden states into logits."""
        ...


class ReverseLogitLens(ReverseLens):
    """Reembeds the residual stream into logits."""

    reembed: Reembed

    def __init__(
        self,
        reembed: Reembed,
    ):
        """Create a Reverse Logit Lens.

        Args:
            reembed: The reembed operation to use.
        """
        super().__init__(reembed)

    @classmethod
    def from_model(
        cls,
        model: PreTrainedModel,
    ) -> "ReverseLogitLens":
        """Create a ReverseLogitLens from a pretrained model.

        Args:
            model: A pretrained model from the transformers library you wish to inspect.
        """
        reembed = Reembed(model)
        return cls(reembed)

    def forward(self, h: torch.Tensor, idx: int) -> torch.Tensor:
        """Decode a hidden state into logits.

        Args:
            h: The hidden state to decode.
            idx: the layer of the transformer these hidden states come from.
        """
        del idx
        return self.reembed.forward(h)


class Lens(abc.ABC, nn.Module):
    """Abstract base class for all lenses."""

    unembed: Unembed

    def __init__(self, unembed: Unembed):
        """Create a Lens.

        Args:
            unembed: The unembed operation to use.
        """
        super().__init__()

        self.unembed = unembed

    @abc.abstractmethod
    def forward(self, h: torch.Tensor, idx: int) -> torch.Tensor:
        """Decode hidden states into logits."""
        ...


class LogitLens(Lens):
    """Unembeds the residual stream into logits."""

    unembed: Unembed

    def __init__(
        self,
        unembed: Unembed,
    ):
        """Create a Logit Lens.

        Args:
            unembed: The unembed operation to use.
        """
        super().__init__(unembed)

    @classmethod
    def from_model(
        cls,
        model: PreTrainedModel,
    ) -> "LogitLens":
        """Create a LogitLens from a pretrained model.

        Args:
            model: A pretrained model from the transformers library you wish to inspect.
        """
        unembed = Unembed(model)
        return cls(unembed)

    def forward(self, h: torch.Tensor, idx: int) -> torch.Tensor:
        """Decode a hidden state into logits.

        Args:
            h: The hidden state to decode.
            idx: the layer of the transformer these hidden states come from.
        """
        del idx
        return self.unembed.forward(h)
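A self-contained sketch of the lens on real hidden states (editor's example; gpt2 is just a small stand-in model, not a choice made by this app):

from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("gpt2")        # small stand-in model
tokenizer = AutoTokenizer.from_pretrained("gpt2")
lens = LogitLens.from_model(model)

enc = tokenizer("The capital of France is", return_tensors="pt")
out = model(**enc, output_hidden_states=True)
h = out.hidden_states[6][:, -1, :]                   # a mid-layer hidden state at the last position
print(tokenizer.decode(lens(h, 6).argmax(dim=-1)))   # greedy readout through final norm + unembedding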
utils/model_utils.py
ADDED
@@ -0,0 +1,320 @@
from tqdm import tqdm
from typing import Iterable, List, Union
from transformers import PreTrainedModel, PreTrainedTokenizer
import torch
from torch import nn
from sklearn.linear_model import LinearRegression
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset


def extract_token_i_hidden_states(
        model: PreTrainedModel,
        tokenizer: PreTrainedTokenizer,
        inputs: Union[str, List[str]],
        token_idx_to_extract: int = -1,
        batch_size: int = 1,
        layers_to_extract: List[int] = None,
        return_dict: bool = True,
        verbose: bool = True,
) -> torch.Tensor:
    device = model.device
    model.eval()

    if isinstance(inputs, str):
        inputs = [inputs]

    if layers_to_extract is None:
        layers_to_extract = list(range(1, model.config.num_hidden_layers + 1))  # extract all but initial embeddings
    all_hidden_states = {layer: [] for layer in layers_to_extract}

    with torch.no_grad():
        for i in tqdm(range(0, len(inputs), batch_size), desc="Extracting hidden states", unit="batch", disable=not verbose):
            input_ids = tokenizer(inputs[i:i+batch_size], return_tensors="pt", return_attention_mask=False)['input_ids']
            outputs = model(input_ids.to(device), output_hidden_states=True)
            # Append the extracted position's hidden states for the whole batch, once per layer
            # (the original looped over inputs here as well, duplicating each batch).
            for layer in layers_to_extract:
                hidden_states = outputs.hidden_states[layer]
                all_hidden_states[layer].append(hidden_states[:, token_idx_to_extract, :].detach().cpu())
    for layer in all_hidden_states:
        all_hidden_states[layer] = torch.concat(all_hidden_states[layer], dim=0)

    if not return_dict:
        all_hidden_states = torch.concat([all_hidden_states[layer] for layer in layers_to_extract], dim=0)

    return all_hidden_states


def extract_vocab_hidden_states(
        model: PreTrainedModel,
        tokenizer: PreTrainedTokenizer,
        tokens_ids_to_extract: Iterable[int] = None,
        prompt: str = "{target}",
        prompt_target: str = "{target}",
        batch_size: int = 128,
        layers_to_extract: List[int] = None
) -> torch.Tensor:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    if layers_to_extract is None:
        layers_to_extract = list(range(1, model.config.num_hidden_layers + 1))  # extract all but initial embeddings
    all_hidden_states = {layer: [] for layer in layers_to_extract}
    tokens_ids_to_extract = tokens_ids_to_extract if tokens_ids_to_extract is not None else range(tokenizer.vocab_size)
    tokens_to_extract = [tokenizer.decode(tok_id) for tok_id in tokens_ids_to_extract]

    # add a pad token if necessary
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    with torch.no_grad():
        for i in tqdm(range(0, len(tokens_to_extract), batch_size), desc="Extracting hidden states", unit="batch"):
            prompts = [prompt.replace(prompt_target, target) for target in tokens_to_extract[i:i+batch_size]]
            input_ids = tokenizer(prompts, return_tensors="pt", padding=True, padding_side="left")["input_ids"]
            outputs = model(input_ids.to(device), output_hidden_states=True)
            for layer in layers_to_extract:
                hidden_states = outputs.hidden_states[layer]
                all_hidden_states[layer].append(hidden_states[:, -1, :].detach().cpu())

    for layer in all_hidden_states:
        all_hidden_states[layer] = torch.concat(all_hidden_states[layer], dim=0)

    return all_hidden_states


def get_vocab_tokens(tokenizer: PreTrainedTokenizer, min_word_len: int = None):
    vocab_size = tokenizer.vocab_size
    tokens = list(range(vocab_size))
    if min_word_len:
        tokens_str = [tokenizer.decode(i) for i in tokens]
        tokens_len = [len(x) for x in tokens_str]
        tokens = [tok for tok, tok_len in zip(tokens, tokens_len) if tok_len >= min_word_len]
    return tokens


def learn_linear_map(X: torch.Tensor, Y: torch.Tensor, fit_intercept=False):
    input_dtype = X.dtype
    linear_reg = LinearRegression(fit_intercept=fit_intercept).fit(X.cpu().to(float).numpy(), Y.cpu().to(float).numpy())
    linear_map = nn.Linear(X.size(1), Y.size(1), bias=fit_intercept)
    with torch.no_grad():
        # sklearn's coef_ already has shape (out_features, in_features), matching nn.Linear.weight,
        # so no transpose is needed (the original transposed it).
        linear_map.weight.data = torch.Tensor(linear_reg.coef_)
        if fit_intercept:
            linear_map.bias.data = torch.Tensor(linear_reg.intercept_)
    linear_map = linear_map.to(input_dtype)
    return linear_map

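A toy check of learn_linear_map (editor's sketch; shapes and tensors are illustrative):

import torch

X = torch.randn(1000, 768)        # e.g. hidden states from a source space
Y = X @ torch.randn(768, 1024)    # toy targets that are an exact linear image of X
f = learn_linear_map(X, Y)        # nn.Linear(768, 1024) fitted in closed form
print((f(X) - Y).abs().max())     # near zero for this noiseless toy problem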
def train_model(
        model,
        dataloader,
        optimizer,
        loss_func="mse",
        scheduler=None,
        num_epochs=5,
        gradient_accumulation_steps=1,
        max_grads_norm=1.0,
):
    """
    Runs the training loop for a model that maps hidden states from X to Y.

    Parameters:
        model (nn.Module): The model to train.
        dataloader (DataLoader): Yields (x, y) batches.
        optimizer (torch.optim.Optimizer): Optimizer over the model's parameters.
        loss_func (str): Loss function to use ('mse', 'huber', 'cosine'). Default is 'mse'.
        scheduler: Optional learning rate scheduler, stepped after each optimizer step.
        num_epochs (int): Number of training epochs. Default is 5.
        gradient_accumulation_steps (int): Number of steps to accumulate gradients. Default is 1.
        max_grads_norm (float): Gradient clipping norm; None disables clipping.

    Returns:
        nn.Module: The trained model (moved to CPU).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Select loss function
    if loss_func == "mse":
        criterion = nn.MSELoss()
    elif loss_func == "huber":
        criterion = nn.HuberLoss()
    elif loss_func == "cosine":
        criterion = nn.CosineEmbeddingLoss()
    else:
        raise ValueError("Unsupported loss function. Choose from 'mse', 'huber', or 'cosine'.")

    # Training loop
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for i, (x_batch, y_batch) in enumerate(dataloader):
            outputs = model(x_batch.to(device))
            if loss_func == "cosine":
                # Cosine loss requires an additional target tensor of 1s (placed on the same device)
                loss = criterion(outputs, y_batch.to(device), torch.ones(x_batch.size(0), device=device))
            else:
                loss = criterion(outputs, y_batch.to(device))

            loss = loss / gradient_accumulation_steps
            loss.backward()

            if max_grads_norm is not None:
                nn.utils.clip_grad_norm_(model.parameters(), max_grads_norm)

            if (i + 1) % gradient_accumulation_steps == 0 or (i + 1) == len(dataloader):
                optimizer.step()
                optimizer.zero_grad()
                if scheduler:
                    scheduler.step()

            epoch_loss += loss.item() * gradient_accumulation_steps

        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss / len(dataloader):.6f}")

    return model.cpu()


def learn_mlp(
        X: torch.Tensor, Y: torch.Tensor,
        activation_func=nn.SiLU,
        batch_size=128,
        lr=0.001,
        weight_decay=0.0,
        loss_func="mse",
        lr_schedule="linear",
        expansion_alpha=1.0,
        num_epochs=5,
        gradient_accumulation_steps=1,
        max_grads_norm=1.0,
):
    """
    Trains a two-layer MLP to map hidden states from X to Y.

    Parameters:
        X (torch.Tensor): Input tensor of shape (N, D).
        Y (torch.Tensor): Target tensor of shape (N, D).
        activation_func (nn.Module): Activation function for the hidden layer. Default is SiLU.
        batch_size (int): Batch size for the DataLoader. Default is 128.
        lr (float): Learning rate. Default is 0.001.
        weight_decay (float): Weight decay for the optimizer. Default is 0.0.
        loss_func (str): Loss function to use ('mse', 'huber', 'cosine'). Default is 'mse'.
        lr_schedule (str): Learning rate schedule. Default is 'linear'.
        expansion_alpha (float): Hidden-layer width as a multiple of the input width. Default is 1.0.
        num_epochs (int): Number of training epochs. Default is 5.
        gradient_accumulation_steps (int): Number of steps to accumulate gradients. Default is 1.
        max_grads_norm (float): Gradient clipping norm; None disables clipping.

    Returns:
        nn.Module: Trained MLP model.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_dim = X.shape[1]
    hidden_dim = int(input_dim * expansion_alpha)
    output_dim = Y.shape[1]
    model = nn.Sequential(
        nn.Linear(input_dim, hidden_dim),
        activation_func(),
        nn.Linear(hidden_dim, output_dim)
    ).to(device)

    # Optimizer
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    # DataLoader setup
    dataset = TensorDataset(X, Y)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Learning rate scheduler
    if lr_schedule == "linear":
        total_steps = (len(dataloader) * num_epochs) // gradient_accumulation_steps
        scheduler = optim.lr_scheduler.LambdaLR(optimizer, lambda step: 1 - step / total_steps)
    else:
        scheduler = None

    return train_model(
        model,
        dataloader,
        optimizer,
        loss_func=loss_func,
        scheduler=scheduler,
        num_epochs=num_epochs,
        gradient_accumulation_steps=gradient_accumulation_steps,
        max_grads_norm=max_grads_norm,
    )


class FFN(nn.Module):
    def __init__(self, input_dim):
        super(FFN, self).__init__()
        self.gate_proj = nn.Linear(input_dim, input_dim)
        self.activation = nn.SiLU()
        self.map_proj = nn.Linear(input_dim, input_dim)

    def forward(self, x):
        # Gated residual mapping: SiLU(gate(x)) * x plus a learned linear map of x
        return (self.activation(self.gate_proj(x)) * x) + self.map_proj(x)


def learn_ffn(
        X: torch.Tensor, Y: torch.Tensor,
        activation_func=nn.SiLU,
        batch_size=128,
        lr=0.001,
        weight_decay=0.0,
        loss_func="mse",
        lr_schedule="linear",
        num_epochs=5,
        gradient_accumulation_steps=1,
        max_grads_norm=1.0,
):
    """
    Trains a gated FFN (see the FFN class above) to map hidden states from X to Y.

    Parameters:
        X (torch.Tensor): Input tensor of shape (N, D).
        Y (torch.Tensor): Target tensor of shape (N, D).
        activation_func (nn.Module): Unused; the FFN uses SiLU internally.
        batch_size (int): Batch size for the DataLoader. Default is 128.
        lr (float): Learning rate. Default is 0.001.
        weight_decay (float): Weight decay for the optimizer. Default is 0.0.
        loss_func (str): Loss function to use ('mse', 'huber', 'cosine'). Default is 'mse'.
        lr_schedule (str): Learning rate schedule. Default is 'linear'.
        num_epochs (int): Number of training epochs. Default is 5.
        gradient_accumulation_steps (int): Number of steps to accumulate gradients. Default is 1.
        max_grads_norm (float): Gradient clipping norm; None disables clipping.

    Returns:
        nn.Module: Trained FFN model.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_dim = X.shape[1]
    model = FFN(input_dim).to(device)

    # Optimizer
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    # DataLoader setup
    dataset = TensorDataset(X, Y)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Learning rate scheduler
    if lr_schedule == "linear":
        total_steps = (len(dataloader) * num_epochs) // gradient_accumulation_steps
        scheduler = optim.lr_scheduler.LambdaLR(optimizer, lambda step: 1 - step / total_steps)
    else:
        scheduler = None

    return train_model(
        model,
        dataloader,
        optimizer,
        loss_func=loss_func,
        scheduler=scheduler,
        num_epochs=num_epochs,
        gradient_accumulation_steps=gradient_accumulation_steps,
        max_grads_norm=max_grads_norm,
    )
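The FFN above is a gated residual map in the spirit of SiLU-gated feed-forward blocks, and it requires X and Y to share a width. A toy training run (editor's sketch; shapes are illustrative):

import torch

X, Y = torch.randn(512, 64), torch.randn(512, 64)   # toy same-width spaces
ffn = learn_ffn(X, Y, num_epochs=2)                  # prints one loss line per epoch
print(ffn(X).shape)                                  # torch.Size([512, 64])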
utils/procrustes/__init__.py
ADDED
File without changes
utils/procrustes/orthogonal.py
ADDED
@@ -0,0 +1,383 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
# The Procrustes library provides a set of functions for transforming
|
3 |
+
# a matrix to make it as similar as possible to a target matrix.
|
4 |
+
#
|
5 |
+
# Copyright (C) 2017-2022 The QC-Devs Community
|
6 |
+
#
|
7 |
+
# This file is part of Procrustes.
|
8 |
+
#
|
9 |
+
# Procrustes is free software; you can redistribute it and/or
|
10 |
+
# modify it under the terms of the GNU General Public License
|
11 |
+
# as published by the Free Software Foundation; either version 3
|
12 |
+
# of the License, or (at your option) any later version.
|
13 |
+
#
|
14 |
+
# Procrustes is distributed in the hope that it will be useful,
|
15 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17 |
+
# GNU General Public License for more details.
|
18 |
+
#
|
19 |
+
# You should have received a copy of the GNU General Public License
|
20 |
+
# along with this program; if not, see <http://www.gnu.org/licenses/>
|
21 |
+
#
|
22 |
+
# --
|
23 |
+
"""Orthogonal Procrustes Module."""
|
24 |
+
|
25 |
+
# import warnings
|
26 |
+
|
27 |
+
from typing import Optional
|
28 |
+
|
29 |
+
import numpy as np
|
30 |
+
from .utils import compute_error, ProcrustesResult, setup_input_arrays
|
31 |
+
import scipy
|
32 |
+
|
33 |
+
|
34 |
+
__all__ = [
|
35 |
+
"orthogonal",
|
36 |
+
"orthogonal_2sided",
|
37 |
+
]
|
38 |
+
|
39 |
+
|
40 |
+
def orthogonal(
|
41 |
+
a: np.ndarray,
|
42 |
+
b: np.ndarray,
|
43 |
+
pad: bool = True,
|
44 |
+
translate: bool = False,
|
45 |
+
scale: bool = False,
|
46 |
+
unpad_col: bool = False,
|
47 |
+
unpad_row: bool = False,
|
48 |
+
check_finite: bool = True,
|
49 |
+
weight: Optional[np.ndarray] = None,
|
50 |
+
lapack_driver: str = "gesvd",
|
51 |
+
) -> ProcrustesResult:
|
52 |
+
r"""Perform orthogonal Procrustes.
|
53 |
+
|
54 |
+
Given a matrix :math:`\mathbf{A}_{m \times n}` and a reference matrix :math:`\mathbf{B}_{m
|
55 |
+
\times n}`, find the orthogonal transformation matrix :math:`\mathbf{Q}_{n
|
56 |
+
\times n}` that makes :math:`\mathbf{AQ}` as close as possible to :math:`\mathbf{B}`.
|
57 |
+
In other words,
|
58 |
+
|
59 |
+
.. math::
|
60 |
+
\underbrace{\min}_{\left\{\mathbf{Q} | \mathbf{Q}^{-1} = {\mathbf{Q}}^\dagger \right\}}
|
61 |
+
\|\mathbf{A}\mathbf{Q} - \mathbf{B}\|_{F}^2
|
62 |
+
|
63 |
+
This Procrustes method requires the :math:`\mathbf{A}` and :math:`\mathbf{B}` matrices to
|
64 |
+
have the same shape, which is gauranteed with the default ``pad`` argument for any given
|
65 |
+
:math:`\mathbf{A}` and :math:`\mathbf{B}` matrices. In preparing the :math:`\mathbf{A}` and
|
66 |
+
:math:`\mathbf{B}` matrices, the (optional) order of operations is: **1)** unpad zero
|
67 |
+
rows/columns, **2)** translate the matrices to the origin, **3)** weight entries of
|
68 |
+
:math:`\mathbf{A}`, **4)** scale the matrices to have unit norm, **5)** pad matrices with zero
|
69 |
+
rows/columns so they have the same shape.
|
70 |
+
|
71 |
+
Parameters
|
72 |
+
----------
|
73 |
+
a : ndarray
|
74 |
+
The 2D-array :math:`\mathbf{A}` which is going to be transformed.
|
75 |
+
b : ndarray
|
76 |
+
The 2D-array :math:`\mathbf{B}` representing the reference matrix.
|
77 |
+
pad : bool, optional
|
78 |
+
Add zero rows (at the bottom) and/or columns (to the right-hand side) of matrices
|
79 |
+
:math:`\mathbf{A}` and :math:`\mathbf{B}` so that they have the same shape.
|
80 |
+
translate : bool, optional
|
81 |
+
If True, both arrays are centered at origin (columns of the arrays will have mean zero).
|
82 |
+
scale : bool, optional
|
83 |
+
If True, both arrays are normalized with respect to the Frobenius norm, i.e.,
|
84 |
+
:math:`\text{Tr}\left[\mathbf{A}^\dagger\mathbf{A}\right] = 1` and
|
85 |
+
:math:`\text{Tr}\left[\mathbf{B}^\dagger\mathbf{B}\right] = 1`.
|
86 |
+
unpad_col : bool, optional
|
87 |
+
If True, zero columns (with values less than 1.0e-8) on the right-hand side of the intial
|
88 |
+
:math:`\mathbf{A}` and :math:`\mathbf{B}` matrices are removed.
|
89 |
+
unpad_row : bool, optional
|
90 |
+
If True, zero rows (with values less than 1.0e-8) at the bottom of the intial
|
91 |
+
:math:`\mathbf{A}` and :math:`\mathbf{B}` matrices are removed.
|
92 |
+
check_finite : bool, optional
|
93 |
+
If True, convert the input to an array, checking for NaNs or Infs.
|
94 |
+
weight : ndarray, optional
|
95 |
+
The 1D-array representing the weights of each row of :math:`\mathbf{A}`. This defines the
|
96 |
+
elements of the diagonal matrix :math:`\mathbf{W}` that is multiplied by :math:`\mathbf{A}`
|
97 |
+
matrix, i.e., :math:`\mathbf{A} \rightarrow \mathbf{WA}`.
|
98 |
+
lapack_driver : {'gesvd', 'gesdd'}, optional
|
99 |
+
Whether to use the more efficient divide-and-conquer approach ('gesdd') or the more robust
|
100 |
+
general rectangular approach ('gesvd') to compute the singular-value decomposition with
|
101 |
+
`scipy.linalg.svd`.
|
102 |
+
|
103 |
+
Returns
|
104 |
+
-------
|
105 |
+
res : ProcrustesResult
|
106 |
+
The Procrustes result represented as a class:`utils.ProcrustesResult` object.
|
107 |
+
|
108 |
+
Notes
|
109 |
+
-----
|
110 |
+
The optimal orthogonal matrix is obtained by,
|
111 |
+
|
112 |
+
.. math::
|
113 |
+
\mathbf{Q}^{\text{opt}} =
|
114 |
+
\arg \underbrace{\min}_{\left\{\mathbf{Q} \left| {\mathbf{Q}^{-1} = {\mathbf{Q}}^\dagger}
|
115 |
+
\right. \right\}} \|\mathbf{A}\mathbf{Q} - \mathbf{B}\|_{F}^2 =
|
116 |
+
\arg \underbrace{\max}_{\left\{\mathbf{Q} \left| {\mathbf{Q}^{-1} = {\mathbf{Q}}^\dagger}
|
117 |
+
\right. \right\}} \text{Tr}\left[\mathbf{Q^\dagger}\mathbf{A^\dagger}\mathbf{B}\right]
|
118 |
+
|
119 |
+
The solution is obtained using the singular value decomposition (SVD) of the
|
120 |
+
:math:`\mathbf{A}^\dagger \mathbf{B}` matrix,
|
121 |
+
|
122 |
+
.. math::
|
123 |
+
\mathbf{A}^\dagger \mathbf{B} &= \tilde{\mathbf{U}} \tilde{\mathbf{\Sigma}}
|
124 |
+
\tilde{\mathbf{V}}^{\dagger} \\
|
125 |
+
\mathbf{Q}^{\text{opt}} &= \tilde{\mathbf{U}} \tilde{\mathbf{V}}^{\dagger}
|
126 |
+
|
127 |
+
The singular values are always listed in decreasing order, with the smallest singular
|
128 |
+
value in the bottom-right-hand corner of :math:`\tilde{\mathbf{\Sigma}}`.
|
129 |
+
|
130 |
+
Examples
|
131 |
+
--------
|
132 |
+
>>> import numpy as np
|
133 |
+
>>> from scipy.stats import ortho_group
|
134 |
+
>>> from procrustes import orthogonal
|
135 |
+
>>> a = np.random.rand(5, 3) # random input matrix
|
136 |
+
>>> q = ortho_group.rvs(3) # random orthogonal transformation
|
137 |
+
>>> b = np.dot(a, q) + np.random.rand(1, 3) # random target matrix
|
138 |
+
>>> result = orthogonal(a, b, translate=True, scale=False)
|
139 |
+
>>> print(result.error) # error (should be zero)
|
140 |
+
>>> print(result.t) # transformation matrix (same as q)
|
141 |
+
>>> print(result.new_a) # translated array a
|
142 |
+
>>> print(result.new_b) # translated array b
|
143 |
+
|
144 |
+
"""
|
145 |
+
# check inputs
|
146 |
+
new_a, new_b = setup_input_arrays(
|
147 |
+
a,
|
148 |
+
b,
|
149 |
+
unpad_col,
|
150 |
+
unpad_row,
|
151 |
+
pad,
|
152 |
+
translate,
|
153 |
+
scale,
|
154 |
+
check_finite,
|
155 |
+
weight,
|
156 |
+
)
|
157 |
+
if new_a.shape != new_b.shape:
|
158 |
+
raise ValueError(
|
159 |
+
f"Shape of A and B does not match: {new_a.shape} != {new_b.shape} "
|
160 |
+
"Check pad, unpad_col, and unpad_row arguments."
|
161 |
+
)
|
162 |
+
# calculate SVD of A.T * B
|
163 |
+
u, _, vt = scipy.linalg.svd(np.dot(new_a.T, new_b), lapack_driver=lapack_driver)
|
164 |
+
# compute optimal orthogonal transformation
|
165 |
+
u_opt = np.dot(u, vt)
|
166 |
+
# compute one-sided error
|
167 |
+
error = compute_error(new_a, new_b, u_opt)
|
168 |
+
|
169 |
+
return ProcrustesResult(error=error, new_a=new_a, new_b=new_b, t=u_opt, s=None)
|
170 |
+
|
171 |
+
|
172 |
+
def orthogonal_2sided(
|
173 |
+
a: np.ndarray,
|
174 |
+
b: np.ndarray,
|
175 |
+
single: bool = True,
|
176 |
+
pad: bool = True,
|
177 |
+
translate: bool = False,
|
178 |
+
scale: bool = False,
|
179 |
+
unpad_col: bool = False,
|
180 |
+
unpad_row: bool = False,
|
181 |
+
check_finite: bool = True,
|
182 |
+
weight: Optional[np.ndarray] = None,
|
183 |
+
lapack_driver: str = "gesvd",
|
184 |
+
) -> ProcrustesResult:
|
185 |
+
r"""Perform two-sided orthogonal Procrustes with one- or two-transformations.
|
186 |
+
|
187 |
+
**Two Transformations:** Given a matrix :math:`\mathbf{A}_{m \times n}` and a reference matrix
|
188 |
+
:math:`\mathbf{B}_{m \times n}`, find two :math:`n \times n` orthogonal
|
189 |
+
transformation matrices :math:`\mathbf{Q}_1^\dagger` and :math:`\mathbf{Q}_2` that makes
|
190 |
+
:math:`\mathbf{Q}_1^\dagger\mathbf{A}\mathbf{Q}_2` as close as possible to :math:`\mathbf{B}`.
|
191 |
+
In other words,
|
192 |
+
|
193 |
+
.. math::
|
194 |
+
\underbrace{\text{min}}_{\left\{ {\mathbf{Q}_1 \atop \mathbf{Q}_2} \left|
|
195 |
+
{\mathbf{Q}_1^{-1} = \mathbf{Q}_1^\dagger \atop \mathbf{Q}_2^{-1} =
|
196 |
+
\mathbf{Q}_2^\dagger} \right. \right\}}
|
197 |
+
\|\mathbf{Q}_1^\dagger \mathbf{A} \mathbf{Q}_2 - \mathbf{B}\|_{F}^2
|
198 |
+
|
199 |
+
**Single Transformations:** Given a **symmetric** matrix :math:`\mathbf{A}_{n \times n}` and
|
200 |
+
a reference :math:`\mathbf{B}_{n \times n}`, find one orthogonal transformation
|
201 |
+
matrix :math:`\mathbf{Q}_{n \times n}` that makes :math:`\mathbf{A}` as close as possible to
|
202 |
+
:math:`\mathbf{B}`. In other words,
|
203 |
+
|
204 |
+
.. math::
|
205 |
+
\underbrace{\min}_{\left\{\mathbf{Q} | \mathbf{Q}^{-1} = {\mathbf{Q}}^\dagger \right\}}
|
206 |
+
\|\mathbf{Q}^\dagger\mathbf{A}\mathbf{Q} - \mathbf{B}\|_{F}^2
|
207 |
+
|
208 |
+
This Procrustes method requires the :math:`\mathbf{A}` and :math:`\mathbf{B}` matrices to
|
209 |
+
have the same shape, which is gauranteed with the default ``pad`` argument for any given
|
210 |
+
:math:`\mathbf{A}` and :math:`\mathbf{B}` matrices. In preparing the :math:`\mathbf{A}` and
|
211 |
+
:math:`\mathbf{B}` matrices, the (optional) order of operations is: **1)** unpad zero
|
212 |
+
rows/columns, **2)** translate the matrices to the origin, **3)** weight entries of
|
213 |
+
:math:`\mathbf{A}`, **4)** scale the matrices to have unit norm, **5)** pad matrices with zero
|
214 |
+
rows/columns so they have the same shape.
|
215 |
+
|
216 |
+
Parameters
|
217 |
+
----------
|
218 |
+
a : ndarray
|
219 |
+
The 2D-array :math:`\mathbf{A}` which is going to be transformed.
|
220 |
+
b : ndarray
|
221 |
+
The 2D-array :math:`\mathbf{B}` representing the reference matrix.
|
222 |
+
single : bool, optional
|
223 |
+
If True, single transformation is used (i.e., :math:`\mathbf{Q}_1=\mathbf{Q}_2=\mathbf{Q}`),
|
224 |
+
otherwise, two transformations are used.
|
225 |
+
pad : bool, optional
|
226 |
+
Add zero rows (at the bottom) and/or columns (to the right-hand side) of matrices
|
227 |
+
:math:`\mathbf{A}` and :math:`\mathbf{B}` so that they have the same shape.
|
228 |
+
translate : bool, optional
|
229 |
+
If True, both arrays are centered at origin (columns of the arrays will have mean zero).
|
230 |
+
scale : bool, optional
|
231 |
+
If True, both arrays are normalized with respect to the Frobenius norm, i.e.,
|
232 |
+
:math:`\text{Tr}\left[\mathbf{A}^\dagger\mathbf{A}\right] = 1` and
|
233 |
+
    :math:`\text{Tr}\left[\mathbf{B}^\dagger\mathbf{B}\right] = 1`.
    unpad_col : bool, optional
        If True, zero columns (with values less than 1.0e-8) on the right-hand side of the
        initial :math:`\mathbf{A}` and :math:`\mathbf{B}` matrices are removed.
    unpad_row : bool, optional
        If True, zero rows (with values less than 1.0e-8) at the bottom of the initial
        :math:`\mathbf{A}` and :math:`\mathbf{B}` matrices are removed.
    check_finite : bool, optional
        If True, convert the input to an array, checking for NaNs or Infs.
    weight : ndarray, optional
        The 1D-array representing the weights of each row of :math:`\mathbf{A}`. This defines
        the elements of the diagonal matrix :math:`\mathbf{W}` that is multiplied by the
        :math:`\mathbf{A}` matrix, i.e., :math:`\mathbf{A} \rightarrow \mathbf{WA}`.
    lapack_driver : {"gesvd", "gesdd"}, optional
        Used in the singular value decomposition function from SciPy. Only two options are
        allowed: "gesvd" is less efficient than "gesdd" but more robust. Default is "gesvd".

    Returns
    -------
    res : ProcrustesResult
        The Procrustes result represented as a :class:`utils.ProcrustesResult` object.

    Notes
    -----
    **Two-Sided Orthogonal Procrustes with Two Transformations:**
    The optimal orthogonal transformations are obtained by:

    .. math::
       \mathbf{Q}_{1}^{\text{opt}}, \mathbf{Q}_{2}^{\text{opt}} = \arg
       \underbrace{\text{min}}_{\left\{ {\mathbf{Q}_1 \atop \mathbf{Q}_2} \left|
       {\mathbf{Q}_1^{-1} = \mathbf{Q}_1^\dagger \atop \mathbf{Q}_2^{-1} =
       \mathbf{Q}_2^\dagger} \right. \right\}}
       \|\mathbf{Q}_1^\dagger \mathbf{A} \mathbf{Q}_2 - \mathbf{B}\|_{F}^2 = \arg
       \underbrace{\text{max}}_{\left\{ {\mathbf{Q}_1 \atop \mathbf{Q}_2} \left|
       {\mathbf{Q}_1^{-1} = \mathbf{Q}_1^\dagger \atop \mathbf{Q}_2^{-1} =
       \mathbf{Q}_2^\dagger} \right. \right\}}
       \text{Tr}\left[\mathbf{Q}_2^\dagger\mathbf{A}^\dagger\mathbf{Q}_1\mathbf{B}\right]

    This is solved by taking the singular value decomposition (SVD) of :math:`\mathbf{A}` and
    :math:`\mathbf{B}`,

    .. math::
       \mathbf{A} = \mathbf{U}_A \mathbf{\Sigma}_A \mathbf{V}_A^\dagger \\
       \mathbf{B} = \mathbf{U}_B \mathbf{\Sigma}_B \mathbf{V}_B^\dagger

    Then the two optimal orthogonal matrices are given by,

    .. math::
       \mathbf{Q}_1^{\text{opt}} = \mathbf{U}_A \mathbf{U}_B^\dagger \\
       \mathbf{Q}_2^{\text{opt}} = \mathbf{V}_A \mathbf{V}_B^\dagger

    **Two-Sided Orthogonal Procrustes with Single-Transformation:**
    The optimal orthogonal transformation is obtained by:

    .. math::
       \mathbf{Q}^{\text{opt}} = \arg
       \underbrace{\min}_{\left\{\mathbf{Q} | \mathbf{Q}^{-1} = {\mathbf{Q}}^\dagger \right\}}
       \|\mathbf{Q}^\dagger\mathbf{A}\mathbf{Q} - \mathbf{B}\|_{F}^2 = \arg
       \underbrace{\text{max}}_{\left\{\mathbf{Q} | \mathbf{Q}^{-1} = {\mathbf{Q}}^\dagger\right\}}
       \text{Tr}\left[\mathbf{Q}^\dagger\mathbf{A}^\dagger\mathbf{Q}\mathbf{B}\right]

    Using the eigenvalue decompositions of the symmetric matrices :math:`\mathbf{A}` and
    :math:`\mathbf{B}`,

    .. math::
       \mathbf{A} = \mathbf{U}_A \mathbf{\Lambda}_A \mathbf{U}_A^\dagger \\
       \mathbf{B} = \mathbf{U}_B \mathbf{\Lambda}_B \mathbf{U}_B^\dagger

    the optimal orthogonal matrix :math:`\mathbf{Q}^\text{opt}` is obtained through,

    .. math::
       \mathbf{Q}^\text{opt} = \mathbf{U}_A \mathbf{S} \mathbf{U}_B^\dagger

    where :math:`\mathbf{S}` is a diagonal matrix with :math:`\pm{1}` elements,

    .. math::
       \mathbf{S} =
       \begin{bmatrix}
           \pm 1  & 0      & \cdots & 0      \\
           0      & \pm 1  & \ddots & \vdots \\
           \vdots & \ddots & \ddots & 0      \\
           0      & \cdots & 0      & \pm 1
       \end{bmatrix}

    Here, the matrix :math:`\mathbf{S}` is chosen to be the identity matrix.

    Examples
    --------
    >>> import numpy as np
    >>> a = np.array([[30, 33, 20], [33, 53, 43], [20, 43, 46]])
    >>> b = np.array([[ 22.78131838,  -0.58896768, -43.00635291, 0., 0.],
    ...               [ -0.58896768,  16.77132475,   0.24289990, 0., 0.],
    ...               [-43.00635291,   0.24289990,  89.44735687, 0., 0.],
    ...               [  0.        ,   0.        ,   0.        , 0., 0.]])
    >>> res = orthogonal_2sided(a, b, single=True, pad=True, unpad_col=True)
    >>> res.t
    array([[ 0.25116633,  0.76371527,  0.59468855],
           [-0.95144277,  0.08183302,  0.29674906],
           [ 0.17796663, -0.64034549,  0.74718507]])
    >>> res.error
    1.9646186414076689e-26

    """
    # if translate:
    #     warnings.warn(
    #         "The translation matrix was not well defined. "
    #         "Two-sided rotation and translation don't commute.",
    #         stacklevel=2,
    #     )

    # check inputs
    new_a, new_b = setup_input_arrays(
        a,
        b,
        unpad_col,
        unpad_row,
        pad,
        translate,
        scale,
        check_finite,
        weight,
    )

    # check symmetry if single=True
    if single:
        if not np.allclose(new_a.T, new_a):
            raise ValueError(
                f"Array A with {new_a.shape} shape is not symmetric. "
                "Check pad, unpad_col, and unpad_row arguments."
            )
        if not np.allclose(new_b.T, new_b):
            raise ValueError(
                f"Array B with {new_b.shape} shape is not symmetric. "
                "Check pad, unpad_col, and unpad_row arguments."
            )

    # two-sided orthogonal Procrustes with a single transformation
    if single:
        _, ua = np.linalg.eigh(new_a)
        _, ub = np.linalg.eigh(new_b)
        u_opt = np.dot(ua, ub.T)
        # compute the one-sided error
        error = compute_error(new_a, new_b, u_opt, u_opt.T)
        return ProcrustesResult(error=error, new_a=new_a, new_b=new_b, t=u_opt, s=u_opt.T)

    # two-sided orthogonal Procrustes with two transformations
    ua, _, vta = scipy.linalg.svd(new_a, lapack_driver=lapack_driver)
    ub, _, vtb = scipy.linalg.svd(new_b, lapack_driver=lapack_driver)
    u_opt1 = np.dot(ua, ub.T)
    u_opt2 = np.dot(vta.T, vtb)
    error = compute_error(new_a, new_b, u_opt2, u_opt1.T)
    return ProcrustesResult(error=error, new_a=new_a, new_b=new_b, t=u_opt2, s=u_opt1.T)
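To make the behavior of `orthogonal_2sided` concrete, here is a minimal usage sketch. It is not part of the committed file; the import path mirroring this repo's layout (`utils/procrustes/orthogonal.py`) and the randomly constructed test matrices are assumptions for illustration.

```python
# Usage sketch (illustrative, not part of the commit). The import path is an
# assumption based on this repo's directory layout.
import numpy as np
from utils.procrustes.orthogonal import orthogonal_2sided

rng = np.random.default_rng(0)

# single-transformation mode: A and B must be symmetric; here B = Q^T A Q
q, _ = np.linalg.qr(rng.normal(size=(4, 4)))   # a random orthogonal matrix
a = np.diag([1.0, 2.0, 3.0, 4.0])
b = q.T @ a @ q
res = orthogonal_2sided(a, b, single=True)
print(res.error)  # ~0: A and B share eigenvalues, so the rotation is recovered

# two-transformation mode: generic matrices of the same shape
s_mat, _ = np.linalg.qr(rng.normal(size=(5, 5)))
t_mat, _ = np.linalg.qr(rng.normal(size=(3, 3)))
a2 = rng.normal(size=(5, 3))
b2 = s_mat.T @ a2 @ t_mat
res2 = orthogonal_2sided(a2, b2, single=False)
print(res2.error)  # ~0: A and B share singular values
```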
utils/procrustes/utils.py
ADDED
@@ -0,0 +1,495 @@
# -*- coding: utf-8 -*-
# The Procrustes library provides a set of functions for transforming
# a matrix to make it as similar as possible to a target matrix.
#
# Copyright (C) 2017-2022 The QC-Devs Community
#
# This file is part of Procrustes.
#
# Procrustes is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# Procrustes is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>
#
# --
"""Utility Module."""
from typing import List, Optional, Tuple

import numpy as np

__all__ = [
    "compute_error",
    "setup_input_arrays",
    "ProcrustesResult",
]


def _zero_padding(
    array_a: np.ndarray, array_b: np.ndarray, pad_mode: str = "row-col"
) -> Tuple[np.ndarray, np.ndarray]:
    r"""
    Return arrays padded with rows and/or columns of zeros.

    Parameters
    ----------
    array_a : ndarray
        The 2D-array :math:`\mathbf{A}_{n_a \times m_a}`.
    array_b : ndarray
        The 2D-array :math:`\mathbf{B}_{n_b \times m_b}`.
    pad_mode : str
        Specifying how to pad the arrays. Should be one of
        - "row"
            The array with fewer rows is padded with zero rows so that both have the same
            number of rows.
        - "col"
            The array with fewer columns is padded with zero columns so that both have the
            same number of columns.
        - "row-col"
            The array with fewer rows is padded with zero rows, and the array with fewer
            columns is padded with zero columns, so that both have the same dimensions.
            This does not necessarily result in square arrays.
        - "square"
            The arrays are padded with zero rows and zero columns so that they are both
            square arrays. The dimension of the square arrays is specified based on the
            highest dimension, i.e. :math:`\text{max}(n_a, m_a, n_b, m_b)`.

    Returns
    -------
    padded_a : ndarray
        Padded array_a.
    padded_b : ndarray
        Padded array_b.

    """
    # sanity checks
    if not isinstance(array_a, np.ndarray) or not isinstance(array_b, np.ndarray):
        raise ValueError("Arguments array_a & array_b should be numpy arrays.")
    if array_a.ndim != 2 or array_b.ndim != 2:
        raise ValueError("Arguments array_a & array_b should be 2D arrays.")

    if array_a.shape == array_b.shape and array_a.shape[0] == array_a.shape[1]:
        # special case of equal-shape square arrays; pad_mode is set to None so that
        # array_a & array_b are returned unchanged.
        pad_mode = None

    if pad_mode == "square":
        # calculate the desired dimension of the square arrays
        (a_n1, a_m1), (a_n2, a_m2) = array_a.shape, array_b.shape
        dim = max(a_n1, a_n2, a_m1, a_m2)
        # pad rows so that both arrays have dim rows
        if a_n1 < dim:
            array_a = np.pad(array_a, [[0, dim - a_n1], [0, 0]], "constant", constant_values=0)
        if a_n2 < dim:
            array_b = np.pad(array_b, [[0, dim - a_n2], [0, 0]], "constant", constant_values=0)
        # pad columns so that both arrays have dim columns
        if a_m1 < dim:
            array_a = np.pad(array_a, [[0, 0], [0, dim - a_m1]], "constant", constant_values=0)
        if a_m2 < dim:
            array_b = np.pad(array_b, [[0, 0], [0, dim - a_m2]], "constant", constant_values=0)

    if pad_mode in ["row", "row-col"]:
        # pad rows so that both arrays have the same number of rows
        diff = array_a.shape[0] - array_b.shape[0]
        if diff < 0:
            array_a = np.pad(array_a, [[0, -diff], [0, 0]], "constant", constant_values=0)
        else:
            array_b = np.pad(array_b, [[0, diff], [0, 0]], "constant", constant_values=0)

    if pad_mode in ["col", "row-col"]:
        # pad columns so that both arrays have the same number of columns
        diff = array_a.shape[1] - array_b.shape[1]
        if diff < 0:
            array_a = np.pad(array_a, [[0, 0], [0, -diff]], "constant", constant_values=0)
        else:
            array_b = np.pad(array_b, [[0, 0], [0, diff]], "constant", constant_values=0)

    return array_a, array_b
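A small sketch of the pad modes described above (illustrative only; `_zero_padding` is a private helper, imported here purely for demonstration):

```python
# Illustrative sketch of the pad modes (not part of the commit).
import numpy as np
from utils.procrustes.utils import _zero_padding  # private helper, demo only

a = np.ones((2, 3))
b = np.ones((4, 2))

pa, pb = _zero_padding(a, b, pad_mode="row-col")
print(pa.shape, pb.shape)  # (4, 3) (4, 3): rows and columns padded independently

pa, pb = _zero_padding(a, b, pad_mode="square")
print(pa.shape, pb.shape)  # (4, 4) (4, 4): padded up to max(n_a, m_a, n_b, m_b)
```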
def _translate_array(
    array_a: np.ndarray, array_b: Optional[np.ndarray] = None, weight: Optional[np.ndarray] = None
) -> Tuple[np.ndarray, float]:
    """
    Return translated array_a and translation vector.

    Columns of both arrays will have mean zero.

    Parameters
    ----------
    array_a : ndarray
        The 2D-array to translate.
    array_b : ndarray, optional
        The 2D-array on which to base the translation of array_a.
    weight : ndarray, optional
        The weight vector.

    Returns
    -------
    array_a : ndarray
        If array_b is None, array_a is translated to the origin using its centroid.
        If array_b is given, array_a is translated so that its centroid coincides with
        the centroid of array_b.
    centroid : float
        If array_b is given, the centroid is returned.

    """
    # The mean is strongly affected by outliers and is not a robust estimator for central location
    # see https://docs.python.org/3.6/library/statistics.html?highlight=mean#statistics.mean
    if weight is not None:
        if weight.ndim != 1:
            raise ValueError("The weight should be a 1d row vector.")
        if not (weight >= 0).all():
            raise ValueError("The elements of the weight should be non-negative.")

    centroid_a = np.average(array_a, axis=0, weights=weight)
    if array_b is not None:
        # translation vector to b centroid
        centroid_a -= np.average(array_b, axis=0, weights=weight)
    return array_a - centroid_a, -1 * centroid_a


def _scale_array(array_a, array_b=None) -> Tuple[np.ndarray, float]:
    """
    Return scaled/normalized array_a and the scaling factor.

    Parameters
    ----------
    array_a : ndarray
        The 2D-array to scale.
    array_b : ndarray, default=None
        The 2D-array on which to base the scaling of array_a.

    Returns
    -------
    scaled_a : ndarray
        If array_b is None, array_a is normalized using the Frobenius norm.
        If array_b is given, array_a is scaled to match array_b's norm (the norm of array_a
        will be equal to the norm of array_b).
    scale : float
        The scaling factor used to match the norm of array_b.

    """
    # scaling factor to match unit sphere
    scale = 1.0 / np.linalg.norm(array_a)
    if array_b is not None:
        # scaling factor to match array_b norm
        scale *= np.linalg.norm(array_b)
    return array_a * scale, scale
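A short sketch tying the two helpers together (illustrative; both are private functions):

```python
# Sketch: center then normalize a matrix (not part of the commit).
import numpy as np
from utils.procrustes.utils import _scale_array, _translate_array

a = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])

centered, shift = _translate_array(a)
print(centered.mean(axis=0))       # ~[0. 0.]: each column now has zero mean

normalized, factor = _scale_array(centered)
print(np.linalg.norm(normalized))  # ~1.0: unit Frobenius norm, i.e. Tr[A^T A] = 1
```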
def _hide_zero_padding(
    array_a: np.ndarray,
    remove_zero_col: bool = True,
    remove_zero_row: bool = True,
    tol: float = 1.0e-8,
) -> np.ndarray:
    r"""
    Return array with zero-padded rows (bottom) and columns (right) removed.

    Parameters
    ----------
    array_a : ndarray
        The initial array.
    remove_zero_col : bool, optional
        If True, zero columns (with values less than 1.0e-8) on the right side are removed.
    remove_zero_row : bool, optional
        If True, zero rows (with values less than 1.0e-8) at the bottom are removed.
    tol : float, optional
        Tolerance value.

    Returns
    -------
    new_A : ndarray
        The array with the near-zero columns and/or rows removed.

    """
    # Input checking
    if array_a.ndim > 2:
        raise TypeError("Matrix inputs must be 1- or 2-dimensional arrays")
    if remove_zero_row:
        # Check zero rows from bottom to top
        num_row = array_a.shape[0]
        tmp_a = array_a[..., np.newaxis] if array_a.ndim == 1 else array_a
        for array_v in tmp_a[::-1]:
            if any(abs(i) > tol for i in array_v):
                break
            num_row -= 1
        # Cut off zero rows
        array_a = array_a[:num_row]
    if remove_zero_col:
        if array_a.ndim == 2:
            # Check zero columns from right to left
            col_m = array_a.shape[1]
            for array_v in array_a.T[::-1]:
                if any(abs(i) > tol for i in array_v):
                    break
                col_m -= 1
            # Cut off zero columns
            array_a = array_a[:, :col_m]
    return array_a
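And the inverse direction, stripping padding back off (again a private-helper sketch, not part of the committed file):

```python
# Sketch: _hide_zero_padding trims trailing zero rows/columns (illustrative).
import numpy as np
from utils.procrustes.utils import _hide_zero_padding

padded = np.array([[1.0, 2.0, 0.0],
                   [3.0, 4.0, 0.0],
                   [0.0, 0.0, 0.0]])
print(_hide_zero_padding(padded).shape)  # (2, 2): zero row and zero column removed
```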
def compute_error(
    a: np.ndarray, b: np.ndarray, t: np.ndarray, s: Optional[np.ndarray] = None
) -> float:
    r"""Return the one- or two-sided Procrustes (squared Frobenius norm) error.

    The double-sided Procrustes error is defined as

    .. math::
       \|\mathbf{S}\mathbf{A}\mathbf{T} - \mathbf{B}\|_{F}^2 =
       \text{Tr}\left[
            \left(\mathbf{S}\mathbf{A}\mathbf{T} - \mathbf{B}\right)^\dagger
            \left(\mathbf{S}\mathbf{A}\mathbf{T} - \mathbf{B}\right)\right]

    When :math:`\mathbf{S}` is the identity matrix :math:`\mathbf{I}`, this is called the
    one-sided Procrustes error.

    Parameters
    ----------
    a : ndarray
        The 2D-array :math:`\mathbf{A}_{m \times n}` which is going to be transformed.
    b : ndarray
        The 2D-array :math:`\mathbf{B}_{m \times n}` representing the reference matrix.
    t : ndarray
        The 2D-array :math:`\mathbf{T}_{n \times n}` representing the right-hand-side
        transformation matrix.
    s : ndarray, optional
        The 2D-array :math:`\mathbf{S}_{m \times m}` representing the left-hand-side
        transformation matrix. If set to `None`, the one-sided Procrustes error is computed.

    Returns
    -------
    error : float
        The squared Frobenius norm of the difference between the transformed array,
        :math:`\mathbf{S}\mathbf{A}\mathbf{T}`, and the reference array, :math:`\mathbf{B}`.

    """
    # transform matrix A to either AT or SAT
    a_trans = np.dot(a, t) if s is None else np.dot(np.dot(s, a), t)
    # subtract matrix B and compute the squared Frobenius norm
    return np.linalg.norm(a_trans - b, ord=None) ** 2
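The error above is just the squared Frobenius norm; a quick check (not part of the commit):

```python
# Sketch: compute_error matches the squared Frobenius norm written out by hand.
import numpy as np
from utils.procrustes.utils import compute_error

a = np.array([[1.0, 2.0], [3.0, 4.0]])
b = np.array([[0.0, 1.0], [1.0, 0.0]])
t = np.eye(2)

err = compute_error(a, b, t)             # one-sided error: s defaults to None
manual = np.linalg.norm(a @ t - b) ** 2  # Tr[(AT - B)^T (AT - B)]
assert np.isclose(err, manual)
```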
def setup_input_arrays(
    array_a: np.ndarray,
    array_b: np.ndarray,
    remove_zero_col: bool,
    remove_zero_row: bool,
    pad: bool,
    translate: bool,
    scale: bool,
    check_finite: bool,
    weight: Optional[np.ndarray] = None,
) -> Tuple[np.ndarray, np.ndarray]:
    r"""
    Check and process array inputs for the Procrustes transformation routines.

    Usually, this is the precursor step for all Procrustes methods.

    Parameters
    ----------
    array_a : ndarray
        The 2D array :math:`\mathbf{A}` being transformed.
    array_b : ndarray
        The 2D reference array :math:`\mathbf{B}`.
    remove_zero_col : bool
        If True, zero columns (with values less than 1.0e-8) on the right side are removed.
    remove_zero_row : bool
        If True, zero rows (with values less than 1.0e-8) at the bottom are removed.
    pad : bool
        If True, add zero rows (at the bottom) and/or columns (on the right-hand side) to the
        matrices :math:`\mathbf{A}` and :math:`\mathbf{B}` so that they have the same shape.
    translate : bool
        If True, both arrays :math:`\mathbf{A}` and :math:`\mathbf{B}` are translated to the
        origin, i.e., the columns of the arrays will have mean zero.
    scale : bool
        If True, both arrays are normalized to one with respect to the Frobenius norm, i.e.,
        :math:`\text{Tr}\left[\mathbf{A}^\dagger\mathbf{A}\right] = 1`.
    check_finite : bool
        If True, checks that both arrays are two-dimensional numpy arrays with finite elements.
    weight : ndarray or list of ndarray, optional
        A list of weight arrays or a single numpy array. When only one numpy array is provided,
        it is assumed that the two arrays :math:`\mathbf{A}` and :math:`\mathbf{B}` share the
        same weight matrix.

    Returns
    -------
    (ndarray, ndarray) :
        The processed arrays, padded so that they have the same matrix dimensions.

    """
    array_a = _setup_input_array_lower(
        array_a, None, remove_zero_col, remove_zero_row, translate, scale, check_finite, weight
    )
    array_b = _setup_input_array_lower(
        array_b, None, remove_zero_col, remove_zero_row, translate, scale, check_finite, weight
    )
    if pad:
        array_a, array_b = _zero_padding(array_a, array_b, pad_mode="row-col")
    return array_a, array_b
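A sketch of the preprocessing call that each Procrustes routine makes first (the argument values here are illustrative):

```python
# Sketch: typical preprocessing before a Procrustes fit (not part of the commit).
import numpy as np
from utils.procrustes.utils import setup_input_arrays

rng = np.random.default_rng(1)
a = rng.normal(size=(3, 2))
b = rng.normal(size=(4, 4))

new_a, new_b = setup_input_arrays(
    a, b,
    remove_zero_col=False, remove_zero_row=False,
    pad=True, translate=True, scale=True, check_finite=True,
)
print(new_a.shape == new_b.shape)  # True: both zero-padded to a common (4, 4) shape
```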
def setup_input_arrays_multi(
    array_list: List[np.ndarray],
    array_ref: np.ndarray,
    remove_zero_col: bool,
    remove_zero_row: bool,
    pad_mode: str,
    translate: bool,
    scale: bool,
    check_finite: bool,
    weight: Optional[np.ndarray] = None,
) -> List[np.ndarray]:
    r"""
    Check and process a list of array inputs for the Procrustes transformation routines.

    Parameters
    ----------
    array_list : List
        A list of 2D arrays to be transformed.
    array_ref : ndarray
        The 2D reference array :math:`\mathbf{B}`.
    remove_zero_col : bool
        If True, zero columns (with values less than 1.0e-8) on the right side are removed.
    remove_zero_row : bool
        If True, zero rows (with values less than 1.0e-8) at the bottom are removed.
    pad_mode : str
        Specifying how to pad the arrays. Should be one of
        - "row"
            The array with fewer rows is padded with zero rows so that both have the same
            number of rows.
        - "col"
            The array with fewer columns is padded with zero columns so that both have the
            same number of columns.
        - "row-col"
            The array with fewer rows is padded with zero rows, and the array with fewer
            columns is padded with zero columns, so that both have the same dimensions.
            This does not necessarily result in square arrays.
        - "square"
            The arrays are padded with zero rows and zero columns so that they are both
            square arrays. The dimension of the square arrays is specified based on the
            highest dimension, i.e. :math:`\text{max}(n_a, m_a, n_b, m_b)`.
    translate : bool
        If True, the arrays are translated to the origin, i.e., the columns of the arrays
        will have mean zero.
    scale : bool
        If True, the arrays are normalized to one with respect to the Frobenius norm, i.e.,
        :math:`\text{Tr}\left[\mathbf{A}^\dagger\mathbf{A}\right] = 1`.
    check_finite : bool
        If True, checks that the arrays are two-dimensional numpy arrays with finite elements.
    weight : ndarray or list of ndarray, optional
        A list of weight arrays or a single numpy array. When only one numpy array is
        provided, it is assumed that all arrays share the same weight matrix.

    Returns
    -------
    List of arrays :
        The processed arrays, padded so that they have the same matrix dimensions.

    """
    array_list_new = [
        _setup_input_array_lower(
            array_a=arr,
            array_ref=array_ref,
            remove_zero_col=remove_zero_col,
            remove_zero_row=remove_zero_row,
            translate=translate,
            scale=scale,
            check_finite=check_finite,
            weight=weight,
        )
        for arr in array_list
    ]
    arr_shape = np.array([arr.shape for arr in array_list_new])
    array_b = np.ones(np.max(arr_shape, axis=0), dtype=int)
    array_list_new = [_zero_padding(arr, array_b, pad_mode=pad_mode) for arr in array_list_new]
    return array_list_new


def _setup_input_array_lower(
    array_a: np.ndarray,
    array_ref: Optional[np.ndarray],
    remove_zero_col: bool,
    remove_zero_row: bool,
    translate: bool,
    scale: bool,
    check_finite: bool,
    weight: Optional[np.ndarray] = None,
) -> np.ndarray:
    """Pre-process a single matrix with translation and scaling."""
    _check_arraytypes(array_a)
    if check_finite:
        array_a = np.asarray_chkfinite(array_a)
    # Sometimes arrays already have zero padding that messes up zero padding below.
    array_a = _hide_zero_padding(array_a, remove_zero_col, remove_zero_row)
    if translate:
        array_a, _ = _translate_array(array_a, array_ref, weight)
    # apply the weight matrix when translate is False but a weight is given
    else:
        if weight is not None:
            array_a = np.dot(np.diag(weight), array_a)

    if scale:
        array_a, _ = _scale_array(array_a, array_ref)
    return array_a


def _check_arraytypes(*args) -> None:
    r"""Check array input types to Procrustes transformation routines."""
    if any(not isinstance(arr_x, np.ndarray) for arr_x in args):
        raise TypeError("Matrix inputs must be NumPy arrays")
    if any(x.ndim != 2 for x in args):
        raise TypeError("Matrix inputs must be 2-dimensional arrays")


class ProcrustesResult(dict):
    r"""Represents the Procrustes analysis result.

    Attributes
    ----------
    error : float
        The Procrustes (squared Frobenius norm) error.
    new_a : ndarray
        The translated/scaled numpy ndarray :math:`\mathbf{A}`.
    new_b : ndarray
        The translated/scaled numpy ndarray :math:`\mathbf{B}`.
    t : ndarray
        The 2D-array :math:`\mathbf{T}` representing the right-hand-side transformation matrix.
    s : ndarray
        The 2D-array :math:`\mathbf{S}` representing the left-hand-side transformation
        matrix. If set to `None`, the one-sided Procrustes was performed.

    """

    # modification on https://github.com/scipy/scipy/blob/v1.4.1/scipy/optimize/optimize.py#L77-L132
    def __getattr__(self, name: str):
        """Deal with attributes that the class does not explicitly manage."""
        try:
            return self[name]
        # Not using "raise from" makes the traceback inaccurate, because the message implies
        # there is a bug in the exception-handling code itself, which is a separate situation
        # from wrapping an exception
        # W0707 from http://pylint.pycqa.org/en/latest/technical_reference/features.html
        except KeyError as ke_info:
            raise AttributeError(name) from ke_info

    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

    def __repr__(self):
        """Return a human-friendly representation."""
        if self.keys():
            max_len = max(map(len, list(self.keys()))) + 1
            return "\n".join([k.rjust(max_len) + ": " + repr(v) for k, v in sorted(self.items())])
        return self.__class__.__name__ + "()"

    def __dir__(self):
        """Provide basic customization of module attribute access with a list."""
        return list(self.keys())
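Finally, a sketch of how the result object behaves; it is a plain `dict` subclass in the style of SciPy's `OptimizeResult` (illustrative, not part of the committed file):

```python
# Sketch: ProcrustesResult supports both dict and attribute access.
from utils.procrustes.utils import ProcrustesResult

res = ProcrustesResult(error=0.0, t=None, s=None)
print(res.error)     # attribute access goes through __getattr__ ...
print(res["error"])  # ... and plain dict access works as usual
print(res)           # __repr__ lists the sorted key/value pairs
```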