Spaces:

admincybers2
/

CyberCode

Build error

App Files Files Community

admincybers2 committed on Aug 22, 2024

Commit

635b1f2

verified ·

1 Parent(s): eda60d7

Create app.py

Browse files

Files changed (1) hide show

app.py +156 -0

app.py ADDED Viewed

	@@ -0,0 +1,156 @@

+import os
+import torch
+from unsloth import FastLanguageModel, is_bfloat16_supported
+from trl import SFTTrainer
+from transformers import TrainingArguments
+from datasets import load_dataset
+import gradio as gr
+import json
+from huggingface_hub import HfApi
+# Run configuration shared by the loading, training and upload steps.
+# Sequence length limit, passed to both the model loader and the trainer.
+max_seq_length = 4096
+# Forwarded unchanged to the model loader as `dtype`.
+dtype = None
+# Forwarded to the model loader as `load_in_4bit`.
+load_in_4bit = True
+# Hugging Face token used for the model download, the Hub upload and the API client.
+hf_token = os.getenv("HF_TOKEN")
+# Stage counter read from the environment; it is incremented after training.
+current_num = os.getenv("NUM")
+# Fail fast: NUM is only parsed after training finishes, so a missing or
+# non-numeric value would otherwise crash the run before the model is saved.
+try:
+    int(current_num)
+except (TypeError, ValueError) as err:
+    raise RuntimeError(
+        f"Environment variable NUM must be set to an integer, got {current_num!r}"
+    ) from err
+# Python f-strings interpolate with "{...}"; a leading "$" would be printed literally.
+print(f"stage {current_num}")
+api = HfApi(token=hf_token)
+# Hub id of the base checkpoint to fine-tune.
+models = "unsloth/Meta-Llama-3.1-70B-bnb-4bit"
+print("Starting model and tokenizer loading...")
+# Load the base model and its tokenizer.
+# The checkpoint id is bound to `models` in the run configuration; no
+# `model_base` name exists in this script, so referencing it raised NameError.
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name=models,
+    max_seq_length=max_seq_length,
+    dtype=dtype,
+    load_in_4bit=load_in_4bit,
+    token=hf_token,
+)
+print("Model and tokenizer loaded successfully.")
+print("Configuring PEFT model...")
+# Adapter settings collected in one mapping and unpacked into the call below.
+lora_settings = {
+    "r": 16,
+    "target_modules": [
+        "q_proj",
+        "k_proj",
+        "v_proj",
+        "o_proj",
+        "gate_proj",
+        "up_proj",
+        "down_proj",
+    ],
+    "lora_alpha": 16,
+    "lora_dropout": 0,
+    "bias": "none",
+    "use_gradient_checkpointing": "unsloth",
+    "random_state": 3407,
+    "use_rslora": False,
+    "loftq_config": None,
+}
+# Rebind `model` to the adapter-wrapped model returned by unsloth.
+model = FastLanguageModel.get_peft_model(model, **lora_settings)
+print("PEFT model configured.")
+# Prompt templates keyed by prompt type. Each template has two "{}" slots:
+# the instruction text first, the expected answer second.
+alpaca_prompt = {
+    "learning_from": "\n".join((
+        "Below is a CVE definition.",
+        "### CVE definition:",
+        "{}",
+        "### detail CVE:",
+        "{}",
+    )),
+    "definition": "\n".join((
+        "Below is a definition about software vulnerability. Explain it.",
+        "### Definition:",
+        "{}",
+        "### Explanation:",
+        "{}",
+    )),
+    "code_vulnerability": "\n".join((
+        "Below is a code snippet. Identify the line of code that is vulnerable and describe the type of software vulnerability.",
+        "### Code Snippet:",
+        "{}",
+        "### Vulnerability solution:",
+        "{}",
+    )),
+}
+# End-of-sequence token of the loaded tokenizer; appended to every formatted training example.
+EOS_TOKEN = tokenizer.eos_token
+def detect_prompt_type(instruction):
+    """Classify an instruction by its leading phrase.
+
+    Args:
+        instruction: Instruction text of one dataset example.
+
+    Returns:
+        "code_vulnerability", "learning_from" or "definition" for the first
+        matching prefix, or "unknown" when no prefix matches.
+    """
+    # Order matters: the code-vulnerability phrase also begins with "what is",
+    # so the more specific prefix has to be tested first.
+    prefix_table = (
+        ("what is code vulnerable of this code:", "code_vulnerability"),
+        ("Learning from", "learning_from"),
+        ("what is", "definition"),
+    )
+    for prefix, prompt_type in prefix_table:
+        if instruction.startswith(prefix):
+            return prompt_type
+    return "unknown"
+def formatting_prompts_func(examples):
+    """Build the training text for a batch of examples.
+
+    Args:
+        examples: Batch mapping with parallel "instruction" and "output"
+            sequences.
+
+    Returns:
+        A dict with a single "text" key holding one formatted string per
+        example, each terminated by EOS_TOKEN.
+    """
+    def _render(instruction, output):
+        # Use the template registered for the detected type; fall back to a
+        # plain "instruction, blank line, output" layout for unknown types.
+        template = alpaca_prompt.get(detect_prompt_type(instruction))
+        if template is None:
+            return instruction + "\n\n" + output + EOS_TOKEN
+        return template.format(instruction, output) + EOS_TOKEN
+    pairs = zip(examples["instruction"], examples["output"])
+    return {"text": [_render(instruction, output) for instruction, output in pairs]}
+print("Loading dataset...")
+# Load the "train" split of admincybers2/DSV.
+raw_dataset = load_dataset("admincybers2/DSV", split="train")
+print("Dataset loaded successfully.")
+print("Applying formatting function to the dataset...")
+# Batched map: the formatter receives whole columns and returns the new "text" column.
+dataset = raw_dataset.map(formatting_prompts_func, batched=True)
+print("Formatting function applied.")
+print("Initializing trainer...")
+# Precision flags: bf16 when unsloth reports support, otherwise fp16.
+use_bf16 = is_bfloat16_supported()
+training_args = TrainingArguments(
+    output_dir="outputs",
+    seed=3407,
+    per_device_train_batch_size=1,
+    gradient_accumulation_steps=1,
+    learning_rate=2e-4,
+    lr_scheduler_type="linear",
+    warmup_steps=5,
+    weight_decay=0.01,
+    optim="adamw_8bit",
+    logging_steps=10,
+    fp16=not use_bf16,
+    bf16=use_bf16,
+)
+# Supervised fine-tuning over the formatted "text" column.
+trainer = SFTTrainer(
+    model=model,
+    tokenizer=tokenizer,
+    train_dataset=dataset,
+    dataset_text_field="text",
+    max_seq_length=max_seq_length,
+    dataset_num_proc=2,
+    packing=False,
+    args=training_args,
+)
+print("Trainer initialized.")
+print("Starting training...")
+trainer_stats = trainer.train()
+print("Training completed.")
+# Next stage number: the NUM environment value plus one.
+num = int(current_num) + 1
+# Versioned name derived from the stage number (not referenced elsewhere in this script).
+uploads_models = "cybersentinal-2.0-{}".format(num)
+# Name passed to push_to_hub_merged as the upload target.
+up = "sentinal-3.1-70B"
+print("Saving the trained model...")
+# The local save and the Hub push share the same save method.
+merge_method = "merged_16bit"
+model.save_pretrained_merged("model", tokenizer, save_method=merge_method)
+print("Model saved successfully.")
+print("Pushing the model to the hub...")
+model.push_to_hub_merged(up, tokenizer, save_method=merge_method, token=hf_token)
+print("Model pushed to hub successfully.")
+# Store the incremented stage counter as the NUM variable of the
+# admincybers2/CyberController Space.
+# add_space_variable creates or updates the variable, so no prior delete is
+# needed; skipping the delete also avoids leaving NUM unset if the second
+# request were to fail.
+api.add_space_variable(repo_id="admincybers2/CyberController", key="NUM", value=str(num))