initial commit
sudoaza committed · Commit 00892f8 · 0 parent(s)

Files changed:
- README.md +5 -0
- build_dataset.py +115 -0
- password_translation_instructions.csv +0 -0
- requirements.txt +5 -0
- split_translations.py +62 -0
- translate_final.py +114 -0
- translate_oai.py +82 -0
- translate_ollama.py +79 -0
README.md
ADDED
@@ -0,0 +1,5 @@
# Rockdich

Toolkit for translating password dictionaries into German, and potentially other languages. It includes scripts to generate the training dataset with a local Ollama instance or the OpenAI API, to fine-tune a Llama 3 model on the translation task using Unsloth, and to translate a password dictionary with the finetuned model. The repo also ships rockdich.txt, a translation of rockyou.txt into German. Each script has a description comment at the top of its file.

See the Hugging Face repo for the model: https://huggingface.co/sudoaza/rockdich
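A plausible end-to-end run, pieced together from the file names hard-coded in the scripts (the exact sequence is not spelled out in the repo, and this commit contains no fine-tuning script):

python translate_ollama.py -i orig_4k.txt -o de_4k.txt    # or translate_oai.py: draft translations
python split_translations.py                              # split lines into translated / untranslated
python build_dataset.py                                   # augment and write password_translation_instructions.csv
# fine-tune llama3 on the CSV with Unsloth and save it as lora_model
python translate_final.py -i rockyou.txt -o rockdich.txt  # translate the full dictionary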
build_dataset.py
ADDED
@@ -0,0 +1,115 @@
"""Augment the translated/untranslated passwords and build the training dataset for the password translation task."""

import random

import pandas as pd

N_SAMPLES = 10000

def mutate_password_pair(pair):
    # 20% of the time, capitalize the first letter
    if random.random() < 0.2:
        pair = (pair[0].capitalize(), pair[1].capitalize())
    # 20% of the time, append a digit
    if random.random() < 0.2:
        number = random.randint(0, 9)
        pair = (pair[0] + str(number), pair[1] + str(number))
    # 20% of the time, append a symbol
    if random.random() < 0.2:
        symbol = random.choice(['!', '@', '#', '$', '%', '&', '*'])
        pair = (pair[0] + symbol, pair[1] + symbol)
    # 20% of the time, replace one letter with a look-alike digit (leetspeak)
    if random.random() < 0.2:
        substitutions = [('e', '3'), ('E', '3'), ('i', '1'), ('I', '1'),
                         ('o', '0'), ('O', '0'), ('a', '4'), ('A', '4'),
                         ('t', '7'), ('T', '7')]
        for letter, number in substitutions:
            if letter in pair[0]:
                # replace only the first occurrence, on both sides of the pair
                pair = (pair[0].replace(letter, number, 1),
                        pair[1].replace(letter, number, 1))
                break
    return pair

def create_dataframes():
    # Read the source files
    with open('original_train.txt', 'r', encoding='latin1') as file:
        original = file.readlines()
    with open('translated_train.txt', 'r', encoding='utf-8') as file:
        translated = file.readlines()
    with open('untranslated.txt', 'r', encoding='latin1') as file:
        untranslated = file.readlines()

    # Pair each original password with its translation
    df_translated = pd.DataFrame({
        'original': [line.strip() for line in original],
        'translated': [line.strip() for line in translated]
    })

    # Passwords that stay identical in German
    untranslated_list = [line.strip() for line in untranslated]

    # Generate N_SAMPLES instruction rows (an arbitrary but substantial sample)
    rows = []
    for _ in range(N_SAMPLES):
        # Randomly pick 8 translated pairs
        sampled_translated = df_translated.sample(8)
        original_samples = sampled_translated['original'].tolist()
        translated_samples = sampled_translated['translated'].tolist()

        # Randomly pick 2 untranslated passwords (identical on both sides)
        untranslated_samples = random.sample(untranslated_list, 2)

        # Combine, shuffle and mutate while maintaining the pairing
        total_input = original_samples + untranslated_samples
        total_output = translated_samples + untranslated_samples

        combined_list = list(zip(total_input, total_output))
        random.shuffle(combined_list)
        combined_list = [mutate_password_pair(pair) for pair in combined_list]
        shuffled_input, shuffled_output = zip(*combined_list)

        rows.append({
            # (sic) kept verbatim: the same instruction string is used at inference time
            'instruction': 'Translate this passwords while keeping the original format.',
            'input': "\n".join(shuffled_input),
            'output': "\n".join(shuffled_output)
        })

    # Building from a list of rows avoids calling the private DataFrame._append in a loop
    return pd.DataFrame(rows, columns=['instruction', 'input', 'output'])

# Generate the dataframe
df_instructions = create_dataframes()

# Output to check
print(df_instructions.head())

# Save the dataset to CSV
df_instructions.to_csv('password_translation_instructions.csv', index=False)
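With mutate_password_pair defined as above, a minimal standalone exercise of the augmentation (the pair and the seed are made up; each mutation fires independently with probability 0.2 and always hits both sides of the pair):

import random
random.seed(7)  # hypothetical seed, for reproducibility only
for _ in range(3):
    print(mutate_password_pair(("iloveyou", "ichliebedich")))
# A run might print e.g. ("Iloveyou1", "Ichliebedich1") when the
# capitalization and digit mutations both fire.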
password_translation_instructions.csv
ADDED
The diff for this file is too large to render.
See raw diff
requirements.txt
ADDED
@@ -0,0 +1,5 @@
xformers
trl
peft
accelerate
bitsandbytes
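Note that this listing covers only the fine-tuning stack; the scripts in this commit additionally import unsloth, pandas, torch, tqdm, openai, and ollama, which presumably have to be installed separately.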
split_translations.py
ADDED
@@ -0,0 +1,62 @@
"""Split passwords into translated and untranslated."""

def read_passwords(file_path):
    """Read passwords from a file and return them as a list."""
    with open(file_path, 'r', encoding='latin1') as file:
        return file.read().splitlines()

def compare_passwords(file_path1, file_path2):
    """Compare passwords from two parallel files line by line, preserving order.

    Lines that are identical in both files were left untranslated by the model;
    lines that differ form an (original, translation) pair."""
    passwords1 = read_passwords(file_path1)
    passwords2 = read_passwords(file_path2)
    unique_passwords_1 = []
    unique_passwords_2 = []
    common_passwords = []
    # zip stops at the shorter file, so a truncated translation
    # cannot raise an IndexError as positional indexing would
    for p1, p2 in zip(passwords1, passwords2):
        if p1 == p2:
            common_passwords.append(p1)
        else:
            unique_passwords_1.append(p1)
            unique_passwords_2.append(p2)

    return common_passwords, unique_passwords_1, unique_passwords_2

def save_passwords(file_path, password_list):
    """Save the list of passwords to a file, one per line."""
    with open(file_path, 'w', encoding='latin1') as file:
        for password in password_list:
            file.write(password + '\n')

def main():
    # Paths to the input and output files.
    # 1st run
    file_path1 = 'orig_4k.txt'
    file_path2 = 'de_4k.txt'
    untranslated_file = 'untranslated.txt'
    orig_translated_file = 'orig_translated.txt'
    trans_translated_file = 'trans_translated.txt'

    # 2nd run
    # file_path1 = 'untranslated.txt'
    # file_path2 = 're_translated.txt'
    # untranslated_file = 'untranslated2.txt'
    # orig_translated_file = 'orig_translated2.txt'
    # trans_translated_file = 'trans_translated2.txt'

    # 3rd run
    # file_path1 = 'untranslated2.txt'
    # file_path2 = 're_translated2.txt'
    # untranslated_file = 'untranslated3.txt'
    # orig_translated_file = 'orig_translated3.txt'
    # trans_translated_file = 'trans_translated3.txt'

    # Compare passwords and get the three lists
    common_passwords, unique_passwords_1, unique_passwords_2 = compare_passwords(file_path1, file_path2)

    # Save the resulting lists: unchanged lines are the untranslated ones
    save_passwords(untranslated_file, common_passwords)
    save_passwords(orig_translated_file, unique_passwords_1)
    save_passwords(trans_translated_file, unique_passwords_2)

if __name__ == "__main__":
    main()
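A minimal sketch of what the comparison does, reusing the example passwords from the translation prompts (standalone lists, no files involved):

originals  = ["iloveyou", "abc123", "princess"]
translated = ["ichliebedich", "abc123", "prinzessin"]
pairs     = [(o, t) for o, t in zip(originals, translated) if o != t]
unchanged = [o for o, t in zip(originals, translated) if o == t]
assert pairs == [("iloveyou", "ichliebedich"), ("princess", "prinzessin")]
assert unchanged == ["abc123"]  # identical lines count as untranslated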
translate_final.py
ADDED
@@ -0,0 +1,114 @@
"""Translate a password dictionary using the finetuned model."""

import argparse
import re

import torch
from tqdm import tqdm
from unsloth import FastLanguageModel

max_seq_length = 2048  # Choose any! Unsloth auto-supports RoPE scaling internally.
dtype = torch.float16  # None for auto detection. float16 for Tesla T4/V100, bfloat16 for Ampere+.
load_in_4bit = True    # Use 4-bit quantization to reduce memory usage. Can be False.

# Load the finetuned LoRA model directly for inference
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "lora_model",  # the model saved after finetuning
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
tokenizer.padding_side = "left"         # Left-pad so generation continues from the prompt end

def extract_response(text):
    # Grab everything after "### Response:" and strip the end-of-text token
    match = re.search(r"### Response:\n(.*?)$", text, re.DOTALL)
    if match:
        response = match.group(1)
        response = response.replace("<|end_of_text|>", "")
        if not response.endswith("\n"):
            response = response + "\n"
        return response
    else:
        raise ValueError("No response found in the text.")

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

def process_batch(batch):
    inputs = []
    chunk_size = 10  # Number of passwords per prompt; adjust as needed
    for i in range(0, len(batch), chunk_size):
        chunk = ''.join(batch[i:i+chunk_size])
        inputs.append(alpaca_prompt.format(
            "Translate this passwords while keeping the original format.",  # instruction (sic, matches training)
            chunk,  # input
            "",     # output - left blank for generation
        ))

    input_tokens = tokenizer(inputs, return_tensors="pt", padding=True).to("cuda")
    outputs = model.generate(**input_tokens, max_new_tokens=64, use_cache=True)
    return [extract_response(text) for text in tokenizer.batch_decode(outputs)]

BATCH_SIZE = 1000

def process_file(infile, outfile):
    try:
        with open(infile, 'r', encoding='latin1') as file:
            lines = file.readlines()

        translated_lines = []

        # tqdm provides a progress bar over the batches
        for i in tqdm(range(0, len(lines), BATCH_SIZE)):
            translated_batch = process_batch(lines[i:i+BATCH_SIZE])
            translated_lines.extend(translated_batch)

        # Write the translated text to the output file
        with open(outfile, 'w', encoding='utf-8') as file:
            file.writelines(translated_lines)

    except FileNotFoundError:
        print("The input file was not found.")

def main():
    parser = argparse.ArgumentParser(description="Translate text file content to German.")
    parser.add_argument("-i", "--input_file", required=True, help="Path to the input text file")
    parser.add_argument("-o", "--output_file", required=True, help="Path to the output text file where translated text will be saved")

    args = parser.parse_args()

    process_file(args.input_file, args.output_file)

if __name__ == "__main__":
    main()
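A standalone check of extract_response against a mocked generation (the strings are made up; no model call involved):

mock = alpaca_prompt.format(
    "Translate this passwords while keeping the original format.",
    "iloveyou\nprincess\n",
    "",
) + "ichliebedich\nprinzessin<|end_of_text|>"
assert extract_response(mock) == "ichliebedich\nprinzessin\n"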
translate_oai.py
ADDED
@@ -0,0 +1,82 @@
"""Script that uses the OpenAI API to translate passwords from English to German. Used to build a starting dataset for the password translation task."""

import argparse

from openai import OpenAI

client = OpenAI()

SYSTEM_PROMPT = """Translate the following password list to German. RESPECT the original casing even when it is grammatically incorrect. Don't add spaces or separators between words if they are not in the original. Respond only with the translated words one per line, nothing else.
Words:
password
iloveyou
princess
rockyou
abc123
nicole
loveyou

Translations:
passwort
ichliebedich
prinzessin
rockdich
abc123
nicole
liebedich
"""

TRANSLATE_PROMPT = """Words:
<<INPUT>>

Translations:
"""

def translate_to_german(text):
    """Translate English text to German using the OpenAI API."""
    chat_response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": TRANSLATE_PROMPT.replace("<<INPUT>>", text)}
        ])
    response = chat_response.choices[0].message.content
    # Keep each chunk newline-terminated so the output stays line-aligned
    if not response.endswith("\n"):
        response += "\n"
    return response


def process_file(input_file_path, output_file_path):
    """Process the file in chunks and translate each chunk."""
    try:
        with open(input_file_path, 'r', encoding='latin1') as file:
            lines = file.readlines()

        translated_lines = []
        chunk_size = 10  # Adjust the chunk size as needed

        # Process the file in chunks
        for i in range(0, len(lines), chunk_size):
            chunk = ''.join(lines[i:i+chunk_size])
            print("SENT", chunk)  # Debug print to trace what is sent for translation
            translated_chunk = translate_to_german(chunk)
            print("GOT", translated_chunk)  # Debug print to see the translation
            translated_lines.append(translated_chunk)

        # Write the translated text to another file
        with open(output_file_path, 'w', encoding='utf-8') as file:
            file.writelines(translated_lines)

    except FileNotFoundError:
        print("The input file was not found.")


def main():
    parser = argparse.ArgumentParser(description="Translate text file content to German.")
    parser.add_argument("-i", "--input_file", required=True, help="Path to the input text file")
    parser.add_argument("-o", "--output_file", required=True, help="Path to the output text file where translated text will be saved")

    args = parser.parse_args()

    process_file(args.input_file, args.output_file)

if __name__ == "__main__":
    main()
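The split step downstream relies on every translated chunk keeping exactly one output line per input password. A small sanity check one could run after each API call (not part of the original script):

def check_alignment(chunk, translated_chunk):
    # both strings should contain the same number of lines
    if len(chunk.splitlines()) != len(translated_chunk.splitlines()):
        print("WARNING: misaligned chunk, pairs will drift:\n", chunk)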
translate_ollama.py
ADDED
@@ -0,0 +1,79 @@
"""Original code for testing translation with Ollama. Results were not of the required quality."""

import argparse

import ollama

SYSTEM_PROMPT = """Translate the following password list to German. RESPECT the original casing even when it is grammatically incorrect. Don't add spaces or separators between words if they are not in the original. Respond only with the translated words one per line, nothing else.
Words:
password
iloveyou
princess
rockyou
abc123
nicole
loveyou

Translations:
passwort
ichliebedich
prinzessin
rockdich
abc123
nicole
liebedich
"""

TRANSLATE_PROMPT = """Words:
<<INPUT>>

Translations:
"""

def translate_to_german(text):
    """Translate English text to German using the Ollama model."""
    response = ollama.chat(
        model='llama3',
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {
                'role': 'user',
                'content': TRANSLATE_PROMPT.replace("<<INPUT>>", text)
            },
        ]
    )
    # Return an empty string if the response is missing the expected fields
    if 'message' in response and 'content' in response['message']:
        return response['message']['content']
    return ''

def process_file(input_file_path, output_file_path):
    """Process the file in chunks and translate each chunk."""
    try:
        with open(input_file_path, 'r', encoding='latin1') as file:
            lines = file.readlines()

        translated_lines = []
        chunk_size = 10

        # Process the file in chunks of 10 lines
        for i in range(0, len(lines), chunk_size):
            chunk = ''.join(lines[i:i+chunk_size])
            print("SENT", chunk)
            translated_chunk = translate_to_german(chunk)
            print("GOT", translated_chunk)
            translated_lines.append(translated_chunk)

        # Write the translated text to another file
        with open(output_file_path, 'w', encoding='utf-8') as file:
            file.writelines(translated_lines)

    except FileNotFoundError:
        print("The input file was not found.")


def main():
    parser = argparse.ArgumentParser(description="Translate text file content to German.")
    parser.add_argument("-i", "--input_file", required=True, help="Path to the input text file")
    parser.add_argument("-o", "--output_file", required=True, help="Path to the output text file where translated text will be saved")

    args = parser.parse_args()

    process_file(args.input_file, args.output_file)

if __name__ == "__main__":
    main()