byroneverson commited on
Commit
3a95083
Β·
verified Β·
1 Parent(s): ba0d2f8

Upload abliterate-internlm-2-5-8b-chat.ipynb

Browse files
abliterate-internlm-2-5-8b-chat.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"gpu","dataSources":[],"dockerImageVersionId":30747,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# Download original abliterator script for harmful and harmless instructions txt files\nCredit: https://github.com/Sumandora/remove-refusals-with-transformers","metadata":{}},{"cell_type":"code","source":"%cd /kaggle/working\n!git clone https://github.com/Sumandora/remove-refusals-with-transformers.git","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2024-08-18T16:09:50.289357Z","iopub.execute_input":"2024-08-18T16:09:50.290084Z","iopub.status.idle":"2024-08-18T16:09:51.864366Z","shell.execute_reply.started":"2024-08-18T16:09:50.290049Z","shell.execute_reply":"2024-08-18T16:09:51.863145Z"},"trusted":true},"execution_count":3,"outputs":[{"name":"stdout","text":"Cloning into 'remove-refusals-with-transformers'...\nremote: Enumerating objects: 33, done.\u001b[K\nremote: Counting objects: 100% (33/33), done.\u001b[K\nremote: Compressing objects: 100% (23/23), done.\u001b[K\nremote: Total 33 (delta 14), reused 23 (delta 9), pack-reused 0 (from 0)\u001b[K\nUnpacking objects: 100% (33/33), 656.80 KiB | 7.55 MiB/s, done.\n","output_type":"stream"}]},{"cell_type":"markdown","source":"# Install requirements for the entire process","metadata":{}},{"cell_type":"code","source":"%cd /kaggle/working/remove-refusals-with-transformers\n!pip install -r requirements.txt","metadata":{"execution":{"iopub.status.busy":"2024-08-19T07:07:35.096136Z","iopub.execute_input":"2024-08-19T07:07:35.096496Z","iopub.status.idle":"2024-08-19T07:07:53.414825Z","shell.execute_reply.started":"2024-08-19T07:07:35.096467Z","shell.execute_reply":"2024-08-19T07:07:53.413930Z"},"trusted":true},"execution_count":1,"outputs":[{"name":"stdout","text":"/kaggle/working\n/kaggle/working/remove-refusals-with-transformers\nCollecting jaxtyping (from -r requirements.txt (line 1))\n Downloading jaxtyping-0.2.33-py3-none-any.whl.metadata (6.4 kB)\nRequirement already satisfied: transformers in /opt/conda/lib/python3.10/site-packages (from -r requirements.txt (line 2)) (4.42.3)\nRequirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from -r requirements.txt (line 3)) (4.66.4)\nCollecting einops (from -r requirements.txt (line 4))\n Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)\nRequirement already satisfied: torch in /opt/conda/lib/python3.10/site-packages (from -r requirements.txt (line 5)) (2.1.2)\nCollecting bitsandbytes (from -r requirements.txt (line 6))\n Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)\nRequirement already satisfied: accelerate in /opt/conda/lib/python3.10/site-packages (from -r requirements.txt (line 7)) (0.32.1)\nCollecting typeguard==2.13.3 (from jaxtyping->-r requirements.txt (line 1))\n Downloading typeguard-2.13.3-py3-none-any.whl.metadata (3.6 kB)\nRequirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from transformers->-r requirements.txt (line 2)) (3.13.1)\nRequirement already satisfied: huggingface-hub<1.0,>=0.23.2 in /opt/conda/lib/python3.10/site-packages (from transformers->-r requirements.txt (line 2)) (0.23.4)\nRequirement already satisfied: numpy<2.0,>=1.17 in /opt/conda/lib/python3.10/site-packages (from transformers->-r requirements.txt (line 2)) (1.26.4)\nRequirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from transformers->-r requirements.txt (line 2)) (21.3)\nRequirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from transformers->-r requirements.txt (line 2)) (6.0.1)\nRequirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.10/site-packages (from transformers->-r requirements.txt (line 2)) (2023.12.25)\nRequirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from transformers->-r requirements.txt (line 2)) (2.32.3)\nRequirement already satisfied: safetensors>=0.4.1 in /opt/conda/lib/python3.10/site-packages (from transformers->-r requirements.txt (line 2)) (0.4.3)\nRequirement already satisfied: tokenizers<0.20,>=0.19 in /opt/conda/lib/python3.10/site-packages (from transformers->-r requirements.txt (line 2)) (0.19.1)\nRequirement already satisfied: typing-extensions in /opt/conda/lib/python3.10/site-packages (from torch->-r requirements.txt (line 5)) (4.9.0)\nRequirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from torch->-r requirements.txt (line 5)) (1.13.0)\nRequirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch->-r requirements.txt (line 5)) (3.2.1)\nRequirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from torch->-r requirements.txt (line 5)) (3.1.2)\nRequirement already satisfied: fsspec in /opt/conda/lib/python3.10/site-packages (from torch->-r requirements.txt (line 5)) (2024.5.0)\nRequirement already satisfied: psutil in /opt/conda/lib/python3.10/site-packages (from accelerate->-r requirements.txt (line 7)) (5.9.3)\nRequirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.10/site-packages (from packaging>=20.0->transformers->-r requirements.txt (line 2)) (3.1.1)\nRequirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->torch->-r requirements.txt (line 5)) (2.1.3)\nRequirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->transformers->-r requirements.txt (line 2)) (3.3.2)\nRequirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->transformers->-r requirements.txt (line 2)) (3.6)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->transformers->-r requirements.txt (line 2)) (1.26.18)\nRequirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->transformers->-r requirements.txt (line 2)) (2024.7.4)\nRequirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/conda/lib/python3.10/site-packages (from sympy->torch->-r requirements.txt (line 5)) (1.3.0)\nDownloading jaxtyping-0.2.33-py3-none-any.whl (42 kB)\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.4/42.4 kB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[?25hDownloading typeguard-2.13.3-py3-none-any.whl (17 kB)\nDownloading einops-0.8.0-py3-none-any.whl (43 kB)\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.2/43.2 kB\u001b[0m \u001b[31m2.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[?25hDownloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m137.5/137.5 MB\u001b[0m \u001b[31m11.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n\u001b[?25hInstalling collected packages: typeguard, einops, jaxtyping, bitsandbytes\n Attempting uninstall: typeguard\n Found existing installation: typeguard 4.1.5\n Uninstalling typeguard-4.1.5:\n Successfully uninstalled typeguard-4.1.5\n\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\nydata-profiling 4.6.4 requires numpy<1.26,>=1.16.0, but you have numpy 1.26.4 which is incompatible.\nydata-profiling 4.6.4 requires typeguard<5,>=4.1.2, but you have typeguard 2.13.3 which is incompatible.\u001b[0m\u001b[31m\n\u001b[0mSuccessfully installed bitsandbytes-0.43.3 einops-0.8.0 jaxtyping-0.2.33 typeguard-2.13.3\n","output_type":"stream"}]},{"cell_type":"markdown","source":"# Download InternLM 2.5 7B chat model locally","metadata":{}},{"cell_type":"code","source":"%cd /kaggle/working\n\nfrom huggingface_hub import snapshot_download\nsnapshot_download(repo_id=\"internlm/internlm2_5-7b-chat\", local_dir=\"./internlm2_5-7b-chat\")\n","metadata":{"execution":{"iopub.status.busy":"2024-08-19T09:14:29.832992Z","iopub.execute_input":"2024-08-19T09:14:29.833931Z","iopub.status.idle":"2024-08-19T09:15:41.650782Z","shell.execute_reply.started":"2024-08-19T09:14:29.833872Z","shell.execute_reply":"2024-08-19T09:15:41.649727Z"},"trusted":true},"execution_count":84,"outputs":[{"name":"stdout","text":"/kaggle/working\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"18ab180b1719435c9e943830368349fc"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"configuration_internlm2.py: 0%| | 0.00/8.84k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"579ecdc39c404c36bdd0871640b614b1"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"README.md: 0%| | 0.00/18.9k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"cb1ec35a3a9a48829f271ac435d9bb0e"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"config.json: 0%| | 0.00/916 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"651e51d8bf9343f4a45e29d4f0e7cada"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"generation_config.json: 0%| | 0.00/123 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"63935ce52aa741bab7da3440623faf6f"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":".gitattributes: 0%| | 0.00/1.52k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"233ae209095947f6b8d9608564f49d00"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"model-00008-of-00008.safetensors: 0%| | 0.00/1.75G [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"15b1b382b0904bd1b1c6ed32ff7ba3c6"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"model-00003-of-00008.safetensors: 0%| | 0.00/1.98G [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"b000c2d67d3f450893e5e9a8a64a0a71"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"model-00007-of-00008.safetensors: 0%| | 0.00/1.98G [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"d610d42d416a4135aff6893ac650e4cd"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"model-00006-of-00008.safetensors: 0%| | 0.00/1.95G [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"e0eb5c6f5642421e9824296bf32d648e"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"model-00005-of-00008.safetensors: 0%| | 0.00/1.98G [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"cb6f0d27c48841a8b8a0a35b8def086d"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"model-00004-of-00008.safetensors: 0%| | 0.00/1.95G [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"b6100a81a73d4516a0def87b6f3e5d2e"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"model-00001-of-00008.safetensors: 0%| | 0.00/1.95G [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"caabdc8e10244e9893f0ad0e9d404d45"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"model-00002-of-00008.safetensors: 0%| | 0.00/1.95G [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"b34ba4ad882f412bbffd19fc48e90c80"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"model.safetensors.index.json: 0%| | 0.00/18.2k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"de805d507c2a4371bb7e5be94f3c2cb3"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"modeling_internlm2.py: 0%| | 0.00/80.7k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"47964f123c6b4ea8b9524edbfa61a5fb"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"special_tokens_map.json: 0%| | 0.00/713 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"8657d6e5d51c4057a5c70756b83dd611"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"tokenization_internlm2.py: 0%| | 0.00/8.81k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"6e6c0c4f7e2b410ab1c99b6a97c1aec5"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"tokenization_internlm2_fast.py: 0%| | 0.00/7.80k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"36c1b6cc01b6495a849057296fdcd167"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"tokenizer_config.json: 0%| | 0.00/2.51k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"044fb5206d8941b98f44bb7d4275cac1"}},"metadata":{}},{"execution_count":84,"output_type":"execute_result","data":{"text/plain":"'/kaggle/working/internlm2_5-7b-chat'"},"metadata":{}}]},{"cell_type":"markdown","source":"# Obtain estimated refusal direction vector","metadata":{}},{"cell_type":"code","source":"%cd /kaggle/working/remove-refusals-with-transformers\n\nimport jaxtyping\n\nimport random\n\nimport torch\nfrom transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n\nimport einops\n\nfrom tqdm import tqdm\n\ntorch.cuda.empty_cache()\n\ntorch.inference_mode()\n\nlocal_repo_dir = \"/kaggle/working/internlm2_5-7b-chat\"\n\nmodel = AutoModelForCausalLM.from_pretrained(local_repo_dir, local_files_only=True, trust_remote_code=True, torch_dtype=torch.float16, quantization_config=BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16))\ntokenizer = AutoTokenizer.from_pretrained(local_repo_dir, local_files_only=True, trust_remote_code=True)\n\n# settings:\n# I have used 128 and 256 with success but may as well use the max for a better estimation\ninstructions = 512\nlayer_idx = int(len(model.model.layers) * 0.6)\npos = -1\n\nprint(\"Instruction count: \" + str(instructions))\nprint(\"Using layer index: \" + str(layer_idx))\n\nwith open(\"harmful.txt\", \"r\") as f:\n harmful = f.readlines()\n\nwith open(\"harmless.txt\", \"r\") as f:\n harmless = f.readlines()\n\nharmful_instructions = random.sample(harmful, instructions)\nharmless_instructions = random.sample(harmless, instructions)\n\nharmful_toks = [\n tokenizer.apply_chat_template(conversation=[{\"role\": \"user\", \"content\": insn}], add_generation_prompt=True,\n return_tensors=\"pt\") for insn in harmful_instructions]\nharmless_toks = [\n tokenizer.apply_chat_template(conversation=[{\"role\": \"user\", \"content\": insn}], add_generation_prompt=True,\n return_tensors=\"pt\") for insn in harmless_instructions]\n\nmax_its = instructions * 2\nbar = tqdm(total=max_its)\n\ndef generate(toks):\n bar.update(n=1)\n return model.generate(toks.to(model.device), use_cache=False, max_new_tokens=1, return_dict_in_generate=True, output_hidden_states=True)\n\nharmful_outputs = [generate(toks) for toks in harmful_toks]\nharmless_outputs = [generate(toks) for toks in harmless_toks]\n\nbar.close()\n\nharmful_hidden = [output.hidden_states[0][layer_idx][:, pos, :] for output in harmful_outputs]\nharmless_hidden = [output.hidden_states[0][layer_idx][:, pos, :] for output in harmless_outputs]\n\n#print(harmful_hidden)\n\nharmful_mean = torch.stack(harmful_hidden).mean(dim=0)\nharmless_mean = torch.stack(harmless_hidden).mean(dim=0)\n\n#print(harmful_mean)\n\nrefusal_dir = harmful_mean - harmless_mean\nrefusal_dir = refusal_dir / refusal_dir.norm()\n\nprint(refusal_dir)\n\ntorch.save(refusal_dir, local_repo_dir + \"_refusal_dir.pt\")\n\nmodel = []\nharmful_instructions = []\nharmless_instructions = []\nharmful_toks = []\nharmless_toks = []\nharmful_outputs = []\nharmless_outputs = []\nharmful_hidden = []\nharmless_hidden = []\n\nimport gc\ngc.collect()\ntorch.cuda.empty_cache()","metadata":{"execution":{"iopub.status.busy":"2024-08-19T07:08:38.213983Z","iopub.execute_input":"2024-08-19T07:08:38.214317Z","iopub.status.idle":"2024-08-19T07:12:35.223271Z","shell.execute_reply.started":"2024-08-19T07:08:38.214290Z","shell.execute_reply":"2024-08-19T07:12:35.222308Z"},"trusted":true},"execution_count":2,"outputs":[{"name":"stdout","text":"/kaggle/working/remove-refusals-with-transformers\n","output_type":"stream"},{"name":"stderr","text":"`low_cpu_mem_usage` was None, now set to True since model is quantized.\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Loading checkpoint shards: 0%| | 0/8 [00:00<?, ?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"c881df3504ea49de815f00f9980d6864"}},"metadata":{}},{"name":"stdout","text":"Instruction count: 512\nUsing layer index: 19\n","output_type":"stream"},{"name":"stderr","text":" 0%| | 0/1024 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n2024-08-19 07:09:15.454511: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n2024-08-19 07:09:15.454602: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n2024-08-19 07:09:15.578923: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1024/1024 [03:22<00:00, 5.06it/s]\n","output_type":"stream"},{"name":"stdout","text":"tensor([[ 0.0073, -0.0051, 0.0119, ..., -0.0010, 0.0158, 0.0123]],\n device='cuda:0', dtype=torch.float16)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"# Apply our method to quant model and test via chat\nModified weights will not be quantized (tok_embeddings, attention.wo, feed_forward.w2)\n\nCredits:\n\nhttps://www.lesswrong.com/posts/jGuXSZgv6qfdhMCuJ/refusal-in-llms-is-mediated-by-a-single-direction\n\nhttps://github.com/FailSpy/abliterator\n\nhttps://gemini.google.com/","metadata":{}},{"cell_type":"code","source":"%cd /kaggle/working/remove-refusals-with-transformers\n\nfrom typing import Optional, Tuple\n\nimport einops\nimport jaxtyping\nimport torch\nimport torch.nn as nn\nfrom transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n\ntorch.cuda.empty_cache()\n\ntorch.inference_mode()\n\ntorch.set_default_device(\"cpu\")\n\nlocal_repo_dir = \"/kaggle/working/internlm2_5-7b-chat\"\n\nmodel = AutoModelForCausalLM.from_pretrained(local_repo_dir, local_files_only=True, trust_remote_code=True, device_map=\"cuda\", quantization_config=BitsAndBytesConfig(load_in_4bit=True, llm_int8_skip_modules=[\"wo\", \"w2\"], bnb_4bit_compute_dtype=torch.float16))\ntokenizer = AutoTokenizer.from_pretrained(local_repo_dir, local_files_only=True, trust_remote_code=True)\nconfig = AutoConfig.from_pretrained(local_repo_dir, local_files_only=True, trust_remote_code=True)\nprint(config)\n\nrefusal_dir = torch.load(local_repo_dir + \"_refusal_dir.pt\")\n\ndef orthogonalize_matrix(matrix: jaxtyping.Float[torch.Tensor, \"... d\"], \n direction: jaxtyping.Float[torch.Tensor, \"d\"]) -> jaxtyping.Float[torch.Tensor, \"... d\"]:\n proj = einops.einsum(matrix, direction.view(-1, 1), \"... d, d single -> ... single\") * direction\n return matrix - proj\n\n# Orthogonalize tok_embeddings\ndevice = model.model.tok_embeddings.weight.device\nmodel.model.tok_embeddings.weight.data.copy_(orthogonalize_matrix(model.model.tok_embeddings.weight, refusal_dir.to(device)))\n\n# Orthogonalize layers\n# Skip first 8 and last 8 layers, this should be tailored to the model, more layers skipped = better quality but less effective abliteration\nskip_count = 8\nstart_idx = skip_count\nend_idx = len(model.model.layers) - skip_count\nfor idx in reversed(range(start_idx, end_idx)): # for qwen 1 this needs to be changed to model.transformer.h\n # wo must be rearranged for orthogonalization and reversed when complete\n device = model.model.layers[idx].attention.wo.weight.device\n wo_rearranged = einops.rearrange(model.model.layers[idx].attention.wo.weight, \"m (n h) -> n h m\", n=config.num_attention_heads).to(device)\n wo_orthogonalized = orthogonalize_matrix(wo_rearranged, refusal_dir.to(device))\n model.model.layers[idx].attention.wo.weight.data.copy_(einops.rearrange(wo_orthogonalized, \"n h m -> m (n h)\", n=config.num_attention_heads).to(device))\n wo_rearranged = []\n wo_orthogonalized = []\n \n # w2 must be transposed for orthogonalization and reversed when complete\n device = model.model.layers[idx].feed_forward.w2.weight.device\n w2_transposed = model.model.layers[idx].feed_forward.w2.weight.T.to(device)\n w2_orthogonalized = orthogonalize_matrix(w2_transposed, refusal_dir.to(device))\n model.model.layers[idx].feed_forward.w2.weight.data.copy_(w2_orthogonalized.T.to(device))\n w2_transposed = []\n w2_orthogonalized = []\n\nimport gc\ngc.collect()\ntorch.cuda.empty_cache()\n\nconversation = []\n\nstreamer = TextStreamer(tokenizer)\n\nprint(f\"Chat:\")\nwhile True:\n prompt = input()\n conversation.append({\"role\": \"user\", \"content\": prompt})\n toks = tokenizer.apply_chat_template(conversation=conversation,\n add_generation_prompt=True, return_tensors=\"pt\")\n\n gen = model.generate(toks.to(model.device), streamer=streamer, max_new_tokens=1337)\n\n decoded = tokenizer.batch_decode(gen[0][len(toks[0]):], skip_special_tokens=True)\n conversation.append({\"role\": \"assistant\", \"content\": \"\".join(decoded)})","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Modify individual safetensors files separately to save memory","metadata":{}},{"cell_type":"code","source":"%cd /kaggle/working/internlm2_5-7b-chat\n\nmodel = []\n\nimport gc\ngc.collect()\ntorch.cuda.empty_cache()\n\nfrom safetensors import safe_open\nfrom safetensors.torch import save_file\nfrom typing import Optional, Tuple\n\nimport einops\nimport jaxtyping\nimport torch\n\nlocal_repo_dir = \"/kaggle/working/internlm2_5-7b-chat\"\n\nconfig = AutoConfig.from_pretrained(local_repo_dir, local_files_only=True, trust_remote_code=True)\nrefusal_direction = torch.load(local_repo_dir + \"_refusal_dir.pt\").to(torch.float32)\n\ndef orthogonalize_matrix(matrix: jaxtyping.Float[torch.Tensor, \"... d\"], \n direction: jaxtyping.Float[torch.Tensor, \"d\"]) -> jaxtyping.Float[torch.Tensor, \"... d\"]:\n proj = einops.einsum(matrix, direction.view(-1, 1), \"... d, d single -> ... single\") * direction\n return matrix - proj\n\ndef load_safetensors_file(file_path):\n \"\"\"Loads a single safetensors file into a dictionary of tensors.\n Args:\n file_path (str): Path to the safetensors file.\n Returns:\n dict: A dictionary containing the loaded tensors.\n \"\"\"\n tensors = {}\n with safe_open(file_path, framework=\"pt\", device=\"cpu\") as f:\n #print(f.metadata())\n for key in f.keys():\n tensors[key] = f.get_tensor(key)\n return tensors\n\ndevice = refusal_direction.device\nsafetensors_count = 8\nfor idx in range(safetensors_count):\n filename = \"model-\" + str(idx + 1).zfill(5) + \"-of-\" + str(safetensors_count).zfill(5) + \".safetensors\"\n print(filename)\n file_path = local_repo_dir + \"/\" + filename\n tensors = load_safetensors_file(file_path)\n \n for tensor in tensors:\n # tok_embeddings\n if \".tok_embeddings.weight\" in tensor:\n print(\"β€’ \" + tensor)\n dtype = tensors[tensor].dtype\n t = tensors[tensor].to(torch.float32).to(device)\n tensors[tensor].copy_(orthogonalize_matrix(t, refusal_direction).to(dtype))\n t = []\n \n # attention.wo\n if \".attention.wo.weight\" in tensor:\n print(\"β€’ \" + tensor)\n dtype = tensors[tensor].dtype\n t = tensors[tensor].to(torch.float32).to(device)\n t_rearranged = einops.rearrange(t, \"m (n h) -> n h m\", n=config.num_attention_heads).to(device)\n t_orthogonalized = orthogonalize_matrix(t_rearranged, refusal_direction)\n tensors[tensor].copy_(einops.rearrange(t_orthogonalized, \"n h m -> m (n h)\", n=config.num_attention_heads).to(dtype))\n t = []\n t_rearranged = []\n t_orthogonalized = []\n \n # feed_forward.w2\n if \".feed_forward.w2.weight\" in tensor:\n print(\"β€’ \" + tensor)\n dtype = tensors[tensor].dtype\n t = tensors[tensor].to(torch.float32).to(device)\n t_transposed = t.T.to(device)\n t_orthogonalized = orthogonalize_matrix(t_transposed, refusal_direction)\n tensors[tensor].copy_(t_orthogonalized.T.to(dtype))\n t = []\n t_transposed = []\n t_orthogonalized = []\n \n # Save file\n save_file(tensors, file_path, metadata={'format': 'pt'})\n\n# Patching done\nprint(\"done!\")\n","metadata":{"execution":{"iopub.status.busy":"2024-08-19T09:54:45.179906Z","iopub.execute_input":"2024-08-19T09:54:45.180601Z","iopub.status.idle":"2024-08-19T09:57:29.208426Z","shell.execute_reply.started":"2024-08-19T09:54:45.180568Z","shell.execute_reply":"2024-08-19T09:57:29.207231Z"},"trusted":true},"execution_count":94,"outputs":[{"name":"stdout","text":"/kaggle/working/internlm2_5-7b-chat\nmodel-00001-of-00008.safetensors\n{'format': 'pt'}\nβ€’ model.layers.0.attention.wo.weight\nβ€’ model.layers.0.feed_forward.w2.weight\nβ€’ model.layers.1.attention.wo.weight\nβ€’ model.layers.1.feed_forward.w2.weight\nβ€’ model.layers.2.attention.wo.weight\nβ€’ model.tok_embeddings.weight\nmodel-00002-of-00008.safetensors\n{'format': 'pt'}\nβ€’ model.layers.2.feed_forward.w2.weight\nβ€’ model.layers.3.attention.wo.weight\nβ€’ model.layers.3.feed_forward.w2.weight\nβ€’ model.layers.4.attention.wo.weight\nβ€’ model.layers.4.feed_forward.w2.weight\nβ€’ model.layers.5.attention.wo.weight\nβ€’ model.layers.5.feed_forward.w2.weight\nβ€’ model.layers.6.attention.wo.weight\nβ€’ model.layers.6.feed_forward.w2.weight\nβ€’ model.layers.7.attention.wo.weight\nmodel-00003-of-00008.safetensors\n{'format': 'pt'}\nβ€’ model.layers.10.attention.wo.weight\nβ€’ model.layers.10.feed_forward.w2.weight\nβ€’ model.layers.11.attention.wo.weight\nβ€’ model.layers.7.feed_forward.w2.weight\nβ€’ model.layers.8.attention.wo.weight\nβ€’ model.layers.8.feed_forward.w2.weight\nβ€’ model.layers.9.attention.wo.weight\nβ€’ model.layers.9.feed_forward.w2.weight\nmodel-00004-of-00008.safetensors\n{'format': 'pt'}\nβ€’ model.layers.11.feed_forward.w2.weight\nβ€’ model.layers.12.attention.wo.weight\nβ€’ model.layers.12.feed_forward.w2.weight\nβ€’ model.layers.13.attention.wo.weight\nβ€’ model.layers.13.feed_forward.w2.weight\nβ€’ model.layers.14.attention.wo.weight\nβ€’ model.layers.14.feed_forward.w2.weight\nβ€’ model.layers.15.attention.wo.weight\nβ€’ model.layers.15.feed_forward.w2.weight\nβ€’ model.layers.16.attention.wo.weight\nmodel-00005-of-00008.safetensors\n{'format': 'pt'}\nβ€’ model.layers.16.feed_forward.w2.weight\nβ€’ model.layers.17.attention.wo.weight\nβ€’ model.layers.17.feed_forward.w2.weight\nβ€’ model.layers.18.attention.wo.weight\nβ€’ model.layers.18.feed_forward.w2.weight\nβ€’ model.layers.19.attention.wo.weight\nβ€’ model.layers.19.feed_forward.w2.weight\nβ€’ model.layers.20.attention.wo.weight\nmodel-00006-of-00008.safetensors\n{'format': 'pt'}\nβ€’ model.layers.20.feed_forward.w2.weight\nβ€’ model.layers.21.attention.wo.weight\nβ€’ model.layers.21.feed_forward.w2.weight\nβ€’ model.layers.22.attention.wo.weight\nβ€’ model.layers.22.feed_forward.w2.weight\nβ€’ model.layers.23.attention.wo.weight\nβ€’ model.layers.23.feed_forward.w2.weight\nβ€’ model.layers.24.attention.wo.weight\nβ€’ model.layers.24.feed_forward.w2.weight\nβ€’ model.layers.25.attention.wo.weight\nmodel-00007-of-00008.safetensors\n{'format': 'pt'}\nβ€’ model.layers.25.feed_forward.w2.weight\nβ€’ model.layers.26.attention.wo.weight\nβ€’ model.layers.26.feed_forward.w2.weight\nβ€’ model.layers.27.attention.wo.weight\nβ€’ model.layers.27.feed_forward.w2.weight\nβ€’ model.layers.28.attention.wo.weight\nβ€’ model.layers.28.feed_forward.w2.weight\nβ€’ model.layers.29.attention.wo.weight\nmodel-00008-of-00008.safetensors\n{'format': 'pt'}\nβ€’ model.layers.29.feed_forward.w2.weight\nβ€’ model.layers.30.attention.wo.weight\nβ€’ model.layers.30.feed_forward.w2.weight\nβ€’ model.layers.31.attention.wo.weight\nβ€’ model.layers.31.feed_forward.w2.weight\ndone!\n","output_type":"stream"}]},{"cell_type":"markdown","source":"# Test chat with abliterated model","metadata":{}},{"cell_type":"code","source":"%cd /kaggle/working/internlm2_5-7b-chat\n\nfrom typing import Optional, Tuple\n\nimport einops\nimport jaxtyping\nimport torch\nimport torch.nn as nn\nfrom transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n\nimport gc\n\nmodel = []\ngc.collect()\ntorch.cuda.empty_cache()\n\ntorch.inference_mode()\n\ntorch.set_default_device(\"cpu\")\n\nlocal_repo_dir = \"/kaggle/working/internlm2_5-7b-chat\"\n\nmodel = AutoModelForCausalLM.from_pretrained(local_repo_dir, local_files_only=True, trust_remote_code=True, device_map=\"cuda\", quantization_config=BitsAndBytesConfig(load_in_4bit=True, llm_int8_skip_modules=[\"wo\", \"w2\"], bnb_4bit_compute_dtype=torch.float16))\ntokenizer = AutoTokenizer.from_pretrained(local_repo_dir, local_files_only=True, trust_remote_code=True)\nconfig = AutoConfig.from_pretrained(local_repo_dir, local_files_only=True, trust_remote_code=True)\nprint(config)\n\ngc.collect()\ntorch.cuda.empty_cache()\n\nconversation = []\n\nstreamer = TextStreamer(tokenizer)\n\nprint(f\"Chat:\")\nwhile True:\n prompt = input()\n conversation.append({\"role\": \"user\", \"content\": prompt})\n toks = tokenizer.apply_chat_template(conversation=conversation,\n add_generation_prompt=True, return_tensors=\"pt\")\n\n gen = model.generate(toks.to(model.device), streamer=streamer, max_new_tokens=1337)\n\n decoded = tokenizer.batch_decode(gen[0][len(toks[0]):], skip_special_tokens=True)\n conversation.append({\"role\": \"assistant\", \"content\": \"\".join(decoded)})","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Upload to huggingface","metadata":{}},{"cell_type":"code","source":"%cd /kaggle/working/\n\nimport gc\n\nmodel = []\ngc.collect()\ntorch.cuda.empty_cache()\n\nlocal_repo_dir = \"internlm2_5-7b-chat\"\nrepo_id = \"byroneverson/internlm2_5-7b-chat-abliterated\"\n\nfrom kaggle_secrets import UserSecretsClient\ntoken = UserSecretsClient().get_secret(\"hf_write\") \n\nfrom huggingface_hub import upload_folder\n\nupload_folder(folder_path=local_repo_dir, repo_id=repo_id, token=token)","metadata":{"execution":{"iopub.status.busy":"2024-08-19T10:22:26.699851Z","iopub.execute_input":"2024-08-19T10:22:26.700219Z","iopub.status.idle":"2024-08-19T10:24:43.445230Z","shell.execute_reply.started":"2024-08-19T10:22:26.700193Z","shell.execute_reply":"2024-08-19T10:24:43.444266Z"},"trusted":true},"execution_count":111,"outputs":[{"name":"stdout","text":"/kaggle/working\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Upload 9 LFS files: 0%| | 0/9 [00:00<?, ?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"6905f9a23c0a4b8388d9534a07217e80"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"model-00005-of-00008.safetensors: 0%| | 0.00/1.98G [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"a41f6b5a57654b88891a7581559e9774"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"model-00002-of-00008.safetensors: 0%| | 0.00/1.95G [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"78736d24423f4071b8b670ab89a7a441"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"model-00003-of-00008.safetensors: 0%| | 0.00/1.98G [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"317fc0df9ebb44c89ef0e09ca9d9a855"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"model-00001-of-00008.safetensors: 0%| | 0.00/1.95G [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"177d45dfb6944e59b48d88cac181e793"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"model-00004-of-00008.safetensors: 0%| | 0.00/1.95G [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"a8b30f95b9754d55b1f57a50dc5b415f"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"model-00006-of-00008.safetensors: 0%| | 0.00/1.95G [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"fa224cf4efda4d42b38ba440969b0901"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"model-00007-of-00008.safetensors: 0%| | 0.00/1.98G [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"766834642b2e40ffae85822b81f3aaf7"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"model-00008-of-00008.safetensors: 0%| | 0.00/1.75G [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"a4325a7e0f4449bf90252209b0119e56"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"tokenizer.model: 0%| | 0.00/1.48M [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"2ea4f606362e49dc9e20cab8ba274019"}},"metadata":{}},{"execution_count":111,"output_type":"execute_result","data":{"text/plain":"CommitInfo(commit_url='https://huggingface.co/byroneverson/internlm2_5-7b-chat/commit/5047e9334ee01881340c2aef2ecd6732d53146e8', commit_message='Upload folder using huggingface_hub', commit_description='', oid='5047e9334ee01881340c2aef2ecd6732d53146e8', pr_url=None, pr_revision=None, pr_num=None)"},"metadata":{}}]}]}