import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
    logging,
)
from peft import LoraConfig
# Fine-tuned model to load for inference
model_name = "blizet/Llama-Phishing-Finetune"
################################################################################
# LoRA Configuration Parameters
################################################################################
lora_r = 64 # LoRA attention dimension
lora_alpha = 16 # Alpha parameter for LoRA scaling
lora_dropout = 0.1 # Dropout probability for LoRA layers
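# These LoRA values are only consumed during fine-tuning, not at inference time.
# A minimal sketch of how they would typically be assembled with peft's
# LoraConfig (bias and task_type are assumed defaults, not from the original):
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",            # assumption: no bias parameters are trained
    task_type="CAUSAL_LM",  # assumption: causal language modeling task
)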
################################################################################
# BitsAndBytes Configuration Parameters
################################################################################
use_4bit = True # Activate 4-bit precision for model loading
bnb_4bit_compute_dtype = "float16" # Compute dtype for 4-bit models
bnb_4bit_quant_type = "nf4" # Quantization type (either fp4 or nf4)
use_nested_quant = False # Nested quantization (double quantization)
################################################################################
# TrainingArguments Configuration Parameters
################################################################################
output_dir = "/content/drive/MyDrive/llm_finetune_results" # Output directory
num_train_epochs = 1 # Number of epochs for fine-tuning
fp16 = False # Disable fp16 training (set True if using A100 GPUs)
bf16 = False # Set bf16 to True for A100 GPUs
per_device_train_batch_size = 8 # Training batch size
per_device_eval_batch_size = 4 # Evaluation batch size
gradient_accumulation_steps = 1 # Gradient accumulation steps
gradient_checkpointing = True # Enable gradient checkpointing
max_grad_norm = 0.3 # Max gradient norm for clipping
learning_rate = 2e-4 # Initial learning rate
weight_decay = 0.001 # Weight decay parameter
optim = "paged_adamw_32bit" # Optimizer to use
lr_scheduler_type = "cosine" # Learning rate scheduler
max_steps = 2000 # Number of training steps (use max steps if you want to limit training)
warmup_ratio = 0.03 # Warmup ratio for learning rate
save_steps = 0 # Save checkpoint every X steps (0 disables checkpoint saving)
logging_steps = 28 # Log progress every X steps
# Ensure that max_steps is at least 1
max_steps = max(max_steps, 1)
print(f"Max steps set to {max_steps} for 5% of the dataset")
################################################################################
# SFT Configuration Parameters
################################################################################
max_seq_length = 256 # Limit sequence length for saving memory
group_by_length = True # Group sequences by length for efficiency
device_map = {"": "cpu"} # Ensure model is loaded on CPU
################################################################################
# Step 1: Load Model and Tokenizer with Adjusted Configuration
################################################################################
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=getattr(torch, bnb_4bit_compute_dtype),  # resolves "float16" -> torch.float16
    bnb_4bit_use_double_quant=use_nested_quant,
)
# Load the model on CPU via `device_map`. Note: bitsandbytes 4-bit quantization
# generally expects a CUDA device, so on a CPU-only host this load may fail or
# fall back to an unquantized model.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,  # {"": "cpu"} defined above
)
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
# Initialize the text generation pipeline with device=-1 for CPU
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=1000, device=-1)
################################################################################
# Phishing Detection Function
################################################################################
def analyze_email(email_content):
prompt = f"""
You are an advanced email analyzer tasked with evaluating the email for phishing and legitimacy. Analyze the email based on the following parameters:
1. Is the email a phishing attempt? (Yes/No)
2. Does the email create a sense of urgency? (Yes/No)
3. Are there grammatical errors? (List the errors or respond with "None.")
4. Does the email show intention to deceive or steal sensitive information? (Yes/No)
5. Is the greeting customized or generalized? (Customized/Generalized)
6. Does the email include a valid customer ID or reference number? (Yes/No)
7. Is the sender’s email address suspicious or spoofed? (Yes/No, and explain if suspicious)
8. Is the email requesting sensitive information such as passwords, bank details, or verification codes? (Yes/No)
9. Is there a hyperlink in the email? If yes, does the link appear suspicious (e.g., misspelled domain, shortened URL)? (Yes/No, and explain)
10. Does the email contain attachments? If yes, are they suspicious (e.g., unexpected file types like .exe, .zip)? (Yes/No)
11. Is the tone of the email overly formal or informal, and does it match the sender’s usual style? (Formal/Informal, and Yes/No for match)
12. Does the email refer to any specific past transaction or interaction? (Yes/No)
13. Are there any noticeable inconsistencies, such as mismatched branding, incorrect logos, or unusual formatting? (List issues or respond with "None.")
14. Does the email encourage you to bypass standard security procedures? (Yes/No)
15. Is the email well-structured and professional in appearance? (Yes/No)
Email:
{email_content}
"""
    # Generate a response using the model
    response = pipe(f"<s>[INST] {prompt} [/INST]")[0]["generated_text"]
    return response.strip()
################################################################################
# Example Email to Test the Function
################################################################################
email_sample = """
Dear Candidate,
Greetings from Amazon!
We are reaching out to you to seek your interest for 6-Months (Jan – June 2025) SDE internship at Amazon.
If you are interested, kindly follow the next steps:
Step 1: Complete the Hiring Interest Form by 16th October 2024 6:00PM IST. Click Here!
Step 2: Watch out for the next steps email that you’ll receive on 22nd October 2024.
Please note that Amazon internships require full-time commitment during the duration of the internship. During the course of the internship, interns should not have any conflicts including but not limited to academic projects, classes, or other internships/employment. Any exam-related details must be shared with the hiring manager to plan for absence during those days. Specific team norms around working hours will be communicated by the hiring/reporting manager at the time of commencement of internship. Candidates receiving an internship will be required to submit a declaration of their availability to complete the entire duration of the internship, duly signed by a competent authority at their University. The internship offer will be subject to successful submission of the declaration.
Looking forward to your response!
Thanks & Regards,
Amazon University Talent Acquisition | APAC
"""
# Run the email analysis
result = analyze_email(email_sample)
# Display the results
print("Phishing Detection Analysis:")
print(result)