Refactor; fix model/lora loading/reloading in inference. Fixes #10, #6
Files changed:
- .gitignore +2 -1
- Inference.ipynb +174 -0
- main.py +90 -117
.gitignore
CHANGED
@@ -6,4 +6,5 @@ lora-*
 checkpoint**
 minimal-llama**
 upload.py
-models/
+models/
+.ipynb_checkpoints/
Inference.ipynb
ADDED
@@ -0,0 +1,174 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "26eca0b2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "===================================BUG REPORT===================================\n",
+      "Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
+      "================================================================================\n",
+      "CUDA SETUP: CUDA runtime path found: /root/miniconda3/envs/llama/lib/libcudart.so\n",
+      "CUDA SETUP: Highest compute capability among GPUs detected: 8.6\n",
+      "CUDA SETUP: Detected CUDA version 117\n",
+      "CUDA SETUP: Loading binary /root/miniconda3/envs/llama/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "import transformers\n",
+    "import peft"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "3c2f7268",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a9779bdda9d54ce8adcfc3cf3c61b6ef",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards: 0%| | 0/33 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "model = transformers.LlamaForCausalLM.from_pretrained(\n",
+    "    'decapoda-research/llama-7b-hf', \n",
+    "    load_in_8bit=True,\n",
+    "    torch_dtype=torch.float16,\n",
+    "    device_map='auto'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "e8a19a75",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \n",
+      "The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. \n",
+      "The class this function is called from is 'LlamaTokenizer'.\n"
+     ]
+    }
+   ],
+   "source": [
+    "tokenizer = transformers.LlamaTokenizer.from_pretrained('decapoda-research/llama-7b-hf')\n",
+    "tokenizer.pad_token_id = 0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "240a9c8f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = peft.PeftModel.from_pretrained(\n",
+    "    model,\n",
+    "    'lora-assistant',\n",
+    "    torch_dtype=torch.float16\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "4f944f46",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      " Human: What does the fox say?\n",
+      "Assistant: The Fox says \\\"la la la\\\"!Human: That's not what it means. It is a song by Ylvis, and they are saying that this particular animal makes noises like these words when trying to communicate with humans in\n"
+     ]
+    }
+   ],
+   "source": [
+    "inputs = tokenizer(\"Human: What does the fox say?\\nAssistant:\", return_tensors=\"pt\")\n",
+    "input_ids = inputs[\"input_ids\"].to('cuda')\n",
+    "\n",
+    "generation_config = transformers.GenerationConfig(\n",
+    "    do_sample = True,\n",
+    "    temperature = 0.3,\n",
+    "    top_p = 0.1,\n",
+    "    top_k = 50,\n",
+    "    repetition_penalty = 1.5,\n",
+    "    max_new_tokens = 50\n",
+    ")\n",
+    "\n",
+    "with torch.no_grad():\n",
+    "    generation_output = model.generate(\n",
+    "        input_ids=input_ids,\n",
+    "        attention_mask=torch.ones_like(input_ids),\n",
+    "        generation_config=generation_config,\n",
+    "    )\n",
+    "    \n",
+    "output_text = tokenizer.decode(generation_output[0].cuda())\n",
+    "print(output_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "5fc13b1a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "del model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c5f19b3a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
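For convenience, the flow the notebook walks through, condensed into a standalone script. This is a minimal sketch, not part of the commit itself: it assumes a CUDA GPU with bitsandbytes set up and a local 'lora-assistant' adapter directory produced by the finetuner.

# Sketch of the notebook's inference flow as a plain script.
# Assumes: CUDA GPU, bitsandbytes installed, local 'lora-assistant' adapter dir.
import torch
import transformers
import peft

base = 'decapoda-research/llama-7b-hf'

tokenizer = transformers.LlamaTokenizer.from_pretrained(base)
tokenizer.pad_token_id = 0

# Base model in 8-bit to fit a 7B model on a single consumer GPU
model = transformers.LlamaForCausalLM.from_pretrained(
    base,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map='auto',
)

# Wrap the base model with the LoRA adapter weights
model = peft.PeftModel.from_pretrained(model, 'lora-assistant', torch_dtype=torch.float16)

prompt = "Human: What does the fox say?\nAssistant:"
input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(model.device)

generation_config = transformers.GenerationConfig(
    do_sample=True,
    temperature=0.3,
    top_p=0.1,
    top_k=50,
    repetition_penalty=1.5,
    max_new_tokens=50,
)

with torch.no_grad():
    output = model.generate(
        input_ids=input_ids,
        attention_mask=torch.ones_like(input_ids),
        generation_config=generation_config,
    )

print(tokenizer.decode(output[0], skip_special_tokens=True))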
main.py
CHANGED
@@ -2,134 +2,106 @@ import os
 import argparse
 import random
 import torch
-import gradio as gr
 import transformers
-
-
-
-from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model, PeftModel
+import peft
+import datasets
+import gradio as gr

 model = None
 tokenizer = None
-
-
-def random_hyphenated_word():
-    word_list = ['apple', 'banana', 'cherry', 'date', 'elderberry', 'fig']
-    word1 = random.choice(word_list)
-    word2 = random.choice(word_list)
-    return word1 + '-' + word2
+current_peft_model = None

-def
+def load_base_model():
     global model
-
+    print('Loading base model...')
+    model = transformers.LlamaForCausalLM.from_pretrained(
+        'decapoda-research/llama-7b-hf',
+        load_in_8bit=True,
+        torch_dtype=torch.float16,
+        device_map='auto'
+    )

-
-
-
-
-
-
-    )
+def load_tokenizer():
+    global tokenizer
+    print('Loading tokenizer...')
+    tokenizer = transformers.LlamaTokenizer.from_pretrained(
+        'decapoda-research/llama-7b-hf',
+    )

-
-
-
-
+def load_peft_model(model_name):
+    global model
+    print('Loading peft model ' + model_name + '...')
+    model = peft.PeftModel.from_pretrained(
+        model, model_name,
+        torch_dtype=torch.float16
+    )

-def reset_models():
+def reset_model():
     global model
     global tokenizer
+    global current_peft_model

     del model
     del tokenizer

     model = None
     tokenizer = None
+    current_peft_model = None

 def generate_text(
-
+    peft_model,
     text,
     temperature,
     top_p,
     top_k,
-
+    repetition_penalty,
     max_new_tokens,
     progress=gr.Progress(track_tqdm=True)
 ):
     global model
     global tokenizer
+    global current_peft_model

-
+    if (peft_model == 'None'): peft_model = None

-
+    if (current_peft_model != peft_model):
+        if (current_peft_model is None):
+            if (model is None): load_base_model()
+        else:
+            reset_model()
+            load_base_model()
+            load_tokenizer()

-
-
-
-
-
+        current_peft_model = peft_model
+        if (peft_model is not None):
+            load_peft_model(peft_model)
+
+    if (model is None): load_base_model()
+    if (tokenizer is None): load_tokenizer()
+
+    assert model is not None
+    assert tokenizer is not None

     inputs = tokenizer(text, return_tensors="pt")
     input_ids = inputs["input_ids"].to(model.device)

-
-
-
-
-
-
+    generation_config = transformers.GenerationConfig(
+        max_new_tokens=max_new_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        top_k=top_k,
+        repetition_penalty=repetition_penalty,
         do_sample=True,
-
-        # Controls the 'temperature' of the softmax distribution during sampling.
-        # Higher values (e.g., 1.0) make the model generate more diverse and random outputs,
-        # while lower values (e.g., 0.1) make it more deterministic and
-        # focused on the highest probability tokens.
-        temperature=temperature,
-
-        # Sets the nucleus sampling threshold. In nucleus sampling,
-        # only the tokens whose cumulative probability exceeds 'top_p' are considered
-        # for sampling. This technique helps to reduce the number of low probability
-        # tokens considered during sampling, which can lead to more diverse and coherent outputs.
-        top_p=top_p,
-
-        # Sets the number of top tokens to consider during sampling.
-        # In top-k sampling, only the 'top_k' tokens with the highest probabilities
-        # are considered for sampling. This method can lead to more focused and coherent
-        # outputs by reducing the impact of low probability tokens.
-        top_k=top_k,
-
-        # Applies a penalty to the probability of tokens that have already been generated,
-        # discouraging the model from repeating the same words or phrases. The penalty is
-        # applied by dividing the token probability by a factor based on the number of times
-        # the token has appeared in the generated text.
-        repeat_penalty=repeat_penalty,
-
-        # Limits the maximum number of tokens generated in a single iteration.
-        # This can be useful to control the length of generated text, especially in tasks
-        # like text summarization or translation, where the output should not be excessively long.
-        max_new_tokens=max_new_tokens,
-
-        # typical_p=1,
-        # stopping_criteria=stopping_criteria_list,
-        # eos_token_id=llama_config.eos_token_id,
-        # pad_token_id=llama_config.eos_token_id
+        num_beams=1,
     )

-
-    with torch.no_grad():
-        generation_output = model.generate(
-            input_ids=input_ids,
-            attention_mask=torch.ones_like(input_ids),
-            generation_config=generation_config,
-            # return_dict_in_generate=True,
-            # output_scores=True,
-            # eos_token_id=[tokenizer.eos_token_id],
-            use_cache=True,
-        )[0].cuda()
+    output = model.generate(  # type: ignore
+        input_ids=input_ids,
+        attention_mask=torch.ones_like(input_ids),
+        generation_config=generation_config
+    )[0].cuda()

-
-    output_text = tokenizer.decode(generation_output)
-    return output_text.strip()
+    return tokenizer.decode(output, skip_special_tokens=True).strip()

 def tokenize_and_train(
     training_text,
@@ -147,8 +119,11 @@ def tokenize_and_train(
     global model
     global tokenizer

-
-
+    if (model is None): load_base_model()
+    if (tokenizer is None): load_tokenizer()
+
+    assert model is not None
+    assert tokenizer is not None

     tokenizer.pad_token_id = 0

@@ -156,6 +131,7 @@ def tokenize_and_train(
     print("Number of samples: " + str(len(paragraphs)))

     def tokenize(item):
+        assert tokenizer is not None
         result = tokenizer(
             item["text"],
             truncation=True,
@@ -171,12 +147,12 @@ def tokenize_and_train(
         return {"text": text}

     paragraphs = [to_dict(x) for x in paragraphs]
-    data = Dataset.from_list(paragraphs)
+    data = datasets.Dataset.from_list(paragraphs)
     data = data.shuffle().map(lambda x: tokenize(x))

-    model = prepare_model_for_int8_training(model)
+    model = peft.prepare_model_for_int8_training(model)

-    model = get_peft_model(model, LoraConfig(
+    model = peft.get_peft_model(model, peft.LoraConfig(
         r=lora_r,
         lora_alpha=lora_alpha,
         target_modules=["q_proj", "v_proj"],
@@ -261,22 +237,22 @@ def tokenize_and_train(
     )

     result = trainer.train(resume_from_checkpoint=False)
-
     model.save_pretrained(output_dir)
-
-    reset_models()
+    reset_model()

     return result

+def random_hyphenated_word():
+    word_list = ['apple', 'banana', 'cherry', 'date', 'elderberry', 'fig']
+    word1 = random.choice(word_list)
+    word2 = random.choice(word_list)
+    return word1 + '-' + word2

-with gr.Blocks(
-    css="#refresh-button { max-width: 32px }",
-    title="Simple LLaMA Finetuner") as demo:
-
+def training_tab():
     with gr.Tab("Finetuning"):

         with gr.Column():
-            training_text = gr.Textbox(lines=12, label="Training Data", info="Each sequence must be separated by
+            training_text = gr.Textbox(lines=12, label="Training Data", info="Each sequence must be separated by 2 blank lines")

             max_seq_length = gr.Slider(
                 minimum=1, maximum=4096, value=512,
@@ -363,6 +339,7 @@ with gr.Blocks(

         abort_button.click(None, None, None, cancels=[train_progress])

+def inference_tab():
     with gr.Tab("Inference"):
         with gr.Row():
             with gr.Column():
@@ -380,13 +357,13 @@ with gr.Blocks(
             with gr.Column():
                 # temperature, top_p, top_k, repeat_penalty, max_new_tokens
                 temperature = gr.Slider(
-                    minimum=0, maximum=1.99, value=0.
+                    minimum=0, maximum=1.99, value=0.4, step=0.01,
                     label="Temperature",
                     info="Controls the 'temperature' of the softmax distribution during sampling. Higher values (e.g., 1.0) make the model generate more diverse and random outputs, while lower values (e.g., 0.1) make it more deterministic and focused on the highest probability tokens."
                 )

                 top_p = gr.Slider(
-                    minimum=0, maximum=1, value=0.
+                    minimum=0, maximum=1, value=0.3, step=0.01,
                     label="Top P",
                     info="Sets the nucleus sampling threshold. In nucleus sampling, only the tokens whose cumulative probability exceeds 'top_p' are considered for sampling. This technique helps to reduce the number of low probability tokens considered during sampling, which can lead to more diverse and coherent outputs."
                 )
@@ -398,7 +375,7 @@ with gr.Blocks(
                 )

                 repeat_penalty = gr.Slider(
-                    minimum=0, maximum=
+                    minimum=0, maximum=2.5, value=1.0, step=0.01,
                     label="Repeat Penalty",
                     info="Applies a penalty to the probability of tokens that have already been generated, discouraging the model from repeating the same words or phrases. The penalty is applied by dividing the token probability by a factor based on the number of times the token has appeared in the generated text."
                 )
@@ -413,12 +390,8 @@ with gr.Blocks(
                 generate_btn = gr.Button(
                     "Generate", variant="primary", label="Generate",
                 )
-
-                inference_abort_button = gr.Button(
-                    "Abort", label="Abort",
-                )

-
+                generate_btn.click(
                     fn=generate_text,
                     inputs=[
                         lora_model,
@@ -432,10 +405,6 @@ with gr.Blocks(
                     outputs=inference_output,
                 )

-                lora_model.change(
-                    fn=reset_models
-                )
-
                 def update_models_list():
                     return gr.Dropdown.update(choices=["None"] + [
                         d for d in os.listdir() if os.path.isdir(d) and d.startswith('lora-')
@@ -447,11 +416,15 @@ with gr.Blocks(
                     outputs=lora_model,
                 )

-
+with gr.Blocks(
+    css="#refresh-button { max-width: 32px }",
+    title="Simple LLaMA Finetuner") as demo:
+    training_tab()
+    inference_tab()

-if __name__ ==
+if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Simple LLaMA Finetuner")
     parser.add_argument("-s", "--share", action="store_true", help="Enable sharing of the Gradio interface")
     args = parser.parse_args()

-    demo.queue().launch(share=args.share)
+    demo.queue().launch(share=args.share)
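The slider 'info' strings above describe temperature, top-p, top-k, and the repetition penalty in prose. The sketch below applies the same ideas by hand to a toy logits vector, purely for illustration; in the app the actual sampling is done by transformers' GenerationConfig and model.generate(), and the numbers here are arbitrary.

# Illustrative only: how the sampling knobs exposed by the sliders act on a
# toy logits vector. Not how transformers implements them internally.
import torch

logits = torch.tensor([2.0, 1.0, 0.5, -1.0])  # unnormalized scores for 4 tokens
generated = torch.tensor([2])                 # pretend token 2 was already generated

# Repetition penalty: make already-generated tokens less likely by scaling
# their scores down (divide positive scores, multiply negative ones).
penalty = 1.5
penalized = logits.clone()
penalized[generated] = torch.where(
    penalized[generated] > 0,
    penalized[generated] / penalty,
    penalized[generated] * penalty,
)

# Temperature: divide logits before softmax; <1.0 sharpens, >1.0 flattens.
temperature = 0.4
probs = torch.softmax(penalized / temperature, dim=-1)

# Top-k: restrict sampling to the k highest-probability tokens.
top_k = 3
topk_probs, topk_indices = probs.topk(top_k)

# Top-p (nucleus): within that set, keep the smallest prefix whose cumulative
# probability reaches p, renormalize, and sample the next token from it.
top_p = 0.9
keep = topk_probs.cumsum(dim=-1) <= top_p
keep[0] = True  # always keep at least the most likely token
nucleus = topk_probs[keep] / topk_probs[keep].sum()
next_token = topk_indices[keep][torch.multinomial(nucleus, 1)]
print("sampled token id:", next_token.item())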