"""Translate a password dictionary using a finetuned model."""

from unsloth import FastLanguageModel  # keep unsloth first so its patches apply before torch loads

import argparse
import re

import torch
from tqdm import tqdm

# Shared model-loading settings.
max_seq_length = 2048
dtype = torch.float16
load_in_4bit = True


def extract_response(text):
    """Return the text that follows "### Response:" in a decoded generation."""
    match = re.search(r"### Response:\n(.*?)$", text, re.DOTALL)
    if match:
        response = match.group(1)
        response = response.replace("<|end_of_text|>", "")
        # Keep one translated entry per line in the output file.
        if not response.endswith("\n"):
            response = response + "\n"
        return response
    else:
        raise ValueError("No response found in the text.")
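
# For example, decoded output ending in "### Response:\nSonnenblume1<|end_of_text|>"
# yields "Sonnenblume1\n" (an illustrative value, not actual model output).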

# Load the finetuned weights ("lora_model" is the adapter directory saved by
# training) and switch to Unsloth's faster inference mode.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "lora_model",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model)
# Left padding so every prompt in a batch ends right where generation begins.
tokenizer.padding_side = "left"

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


def process_batch(batch):
    inputs = []
    chunk_size = 10
    # Pack 10 passwords into each prompt so one generate() call translates
    # several entries at once.
    for i in range(0, len(batch), chunk_size):
        chunk = ''.join(batch[i:i+chunk_size])
        inputs.append(alpaca_prompt.format(
            # Keep this instruction text identical to the one used at finetuning time.
            "Translate this passwords while keeping the original format.",
            chunk,
            "",
        ))

    input_tokens = tokenizer(inputs, return_tensors = "pt", padding = True).to("cuda")
    outputs = model.generate(**input_tokens, max_new_tokens = 64, use_cache = True)
    return [extract_response(response) for response in tokenizer.batch_decode(outputs)]
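
# With chunk_size = 10 above, each slice of BATCH_SIZE lines becomes
# BATCH_SIZE / 10 = 100 prompts that are padded and generated together.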
BATCH_SIZE = 1000


def process_file(infile, outfile):
    try:
        with open(infile, 'r', encoding='latin1') as file:
            lines = file.readlines()

        translated_lines = []

        # Translate the dictionary batch by batch, with a progress bar.
        for i in tqdm(range(0, len(lines), BATCH_SIZE)):
            translated_batch = process_batch(lines[i:i+BATCH_SIZE])
            translated_lines.extend(translated_batch)

        with open(outfile, 'w', encoding='latin1') as file:
            file.writelines(translated_lines)

    except FileNotFoundError:
        print("The input file was not found.")


def main():
    parser = argparse.ArgumentParser(
        description="Translate a password dictionary to German using the finetuned model."
    )
    parser.add_argument("-i", "--input_file", required=True, help="Path to the input text file")
    parser.add_argument("-o", "--output_file", required=True, help="Path to the output text file where translated text will be saved")

    args = parser.parse_args()

    process_file(args.input_file, args.output_file)


if __name__ == "__main__":
    main()
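
# Example invocation (script and file names are illustrative):
#   python translate_passwords.py -i passwords.txt -o passwords_de.txt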