nferruz committed on
Commit
045ef38
1 Parent(s): 18b580e

Update README.md

Files changed (1): README.md (+48 −76)
README.md CHANGED
@@ -160,19 +160,39 @@ ancestrally-reconstructed sets, or after searching against metagenomics database
  as it will learn new properties from your dataset and potentially improve the generation quality
  (especially for poorly populated EC classes).

- To fine-tune ZymCTRL, you will need to process your sequences quite a bit. The scripts below can exactly do that without any
- modifications. The only requisite is to start with an input file, 'sequences.fasta' which contains all the sequences in a fasta format.

  We recommend using at least 200 sequences to obtain the best results. But we've seen it work with fewer sequences, so if you don't have
  that many, still give it a go.

  ```
  import random
- import transformers
  from transformers import AutoTokenizer

- # 1. Read the source file
- with open('sequences.fasta', 'r') as fn:
      data = fn.readlines()
  fn.close()

@@ -181,49 +201,39 @@ sequences={}
  for line in data:
      if '>' in line:
          name = line.strip()
-         sequences[name] = ['2.7.3.12'] # modify with the actual EC class.
          continue
      sequences[name].append(line.strip())

- # Process fasta files to be in single string - run this part only if the fastas were formated to 60 characters
- processed_sequences = {}
- for name, sequence in sequences.items():
-     processed_sequences[f"{sequence[0]};{name}"] = ''.join([x for x in sequence[1:]])
-
- # Shuffle sequences
- sequences_list = [(key,value) for key,value in processed_sequences.items()]
  random.shuffle(sequences_list)

- # Load tokenizer
- tokenizer = AutoTokenizer.from_pretrained('/path/to/ZymCTRL')
-
- # the objective is to get here strings, that when tokenized, will span a window length of 1024.
- # for each sequence group its length and untokenized string
-
  print("processing dataset")
  processed_dataset = []
  for i in sequences_list:
      # length of the control code
-     label = i[0].split(';')[0]
      sequence = i[1].strip()
      separator = '<sep>'
-     control_code_length = len(tokenizer(label+separator)['input_ids'])
      available_space = 1021 - control_code_length # It is not 1024 because '<|endoftext|>', and start and end

-     # Option 1: the sequence is larger than the available space (3-4% of sequences in BRENDA are over 1024)
      if len(sequence) > available_space:
          total_length = control_code_length + len(sequence[:available_space]) + 1
-         seq = f"{label}{separator}{sequence[:available_space]}<|endoftext|>"
          processed_dataset.append((total_length, seq))

      # Option 2 & 3: the sequence fits in the block_size space, with or without padding
      else:
          total_length = control_code_length + len(sequence) + 3
          # here the sequence fits, so the start/end tokens are included
-         seq = f"{label}{separator}<start>{sequence}<end><|endoftext|>"
          processed_dataset.append((total_length, seq))

- # Helper function to group sequences
  def grouper(iterable):
      prev = None
      group = ''
@@ -241,50 +251,30 @@ def grouper(iterable):
      total_sum = 0
      yield group

- # Group sequences
  print("grouping processed dataset")
  grouped_dataset=dict(enumerate(grouper(processed_dataset),1))

- # Save the processed file out
- fn = open("./2.7.3.13_processed.txt",'w')
- for key,value in grouped_dataset.items():
-     fn.write(value)
-     fn.write("\n")
- fn.close()
-
- fn = open("./2.7.3.13_processed.txt",'w')
  for key,value in grouped_dataset.items():
      padding_len = 1024 - len(tokenizer(value)['input_ids'])
      padding = "<pad>"*padding_len
-     print(len(tokenizer(value+padding)['input_ids']))
      fn.write(value+padding)
      fn.write("\n")
- fn.close()
- ```
- The previous script will prepare a text file with the correct format for tokenization.
- Now we can use the tokenizer to convert its contents to tokens.
-
- ```
- from datasets import load_dataset
- import transformers
- from transformers.testing_utils import CaptureLogger
-
- # Load the tokenizer again
- from transformers import AutoTokenizer
- tokenizer = AutoTokenizer.from_pretrained('/agh/projects/noelia/NLP/zymCTRL/dataset_preparation/tokenizer')
-

- #Load the data files
  data_files = {}
  dataset_args = {}
- validation_split_percentage = 10 # for a split 90/10
- data_files["train"] = './2.7.3.12_processed.txt'
  extension = "text"
- raw_datasets = load_dataset(extension, data_files=data_files, cache_dir='.', **dataset_args)
  tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

- # Load datasets using the HF datasets library:
  raw_datasets["train"] = load_dataset(extension,
      data_files=data_files,
      split=f"train[{validation_split_percentage}%:]",
@@ -298,7 +288,6 @@ raw_datasets["validation"] = load_dataset(extension,
      **dataset_args,)

  def tokenize_function(examples):
-     " This function tokenizes input"
      with CaptureLogger(tok_logger) as cl:
          output = tokenizer(examples["text"])
      # clm input could be much much longer than block_size
@@ -308,7 +297,6 @@ def tokenize_function(examples):
          )
      return output

- # tokenize in parallel
  tokenized_datasets = raw_datasets.map(
      tokenize_function,
      batched=True,
@@ -318,24 +306,6 @@ tokenized_datasets = raw_datasets.map(
      desc="Running tokenizer on dataset",
  )

- train_dataset = tokenized_datasets["train"]
- eval_dataset = tokenized_datasets["validation"]
-
- train_dataset.save_to_disk('./dataset/train')
- eval_dataset.save_to_disk('./dataset/eval')
-
- # This has saved the datasets tokenized. Now we need to group them into the block size of 1024
- from datasets import load_from_disk
-
- train_dataset = load_from_disk('./2.7.3.13/dataset/train')
- eval_dataset = load_from_disk('./2.7.3.13/dataset/eval')
-
- from datasets.dataset_dict import DatasetDict
- tokenized_datasets = DatasetDict()
-
- tokenized_datasets["train"] = train_dataset
- tokenized_datasets["validation"] = eval_dataset
-
  block_size = 1024
  def group_texts(examples):
      # Concatenate all texts.
@@ -364,8 +334,10 @@ lm_datasets = tokenized_datasets.map(
  train_dataset = lm_datasets["train"]
  eval_dataset = lm_datasets["validation"]

- train_dataset.save_to_disk('./dataset/train2')
- eval_dataset.save_to_disk('./dataset/eval2')

  ```
  The processed datasets will be inside the folder dataset/, called train2 and eval2.
  You could also put the two previous scripts into a single one and run it in one go (that is what we do).

  as it will learn new properties from your dataset and potentially improve the generation quality
  (especially for poorly populated EC classes).

+ To fine-tune ZymCTRL, you can use the script below to process your sequences. The only requirement is an input file,
+ 'sequences.fasta', which contains all the sequences in FASTA format. Please follow the format below: each sequence must sit on a
+ single line, with no line breaks ('\n') inside a sequence and no separators between records (a helper sketch for flattening
+ line-wrapped FASTA files follows the example). In the script, set the variable ec_label to the BRENDA EC class you'd like to
+ fine-tune on. The script will produce a file called {ec_label}_processed.txt and a folder with the training and validation
+ datasets (90/10 split).
+ ```
+ >Sequence1
+ MMMMYMPLKVCD..
+ >Sequence2
+ MQWMXMYMPLKVCD..
+ >Sequence3
+ MPLKVCWMXMYMPLD..
+ ```
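If your starting FASTA file wraps sequences over several lines (for example at 60 characters per line), a minimal sketch along the following lines can flatten each record onto a single line first. 'raw.fasta' is a hypothetical name for the wrapped input file; the output follows the 'sequences.fasta' format shown above.
```
# Minimal sketch (not part of the original script): flatten a line-wrapped FASTA
# file into one sequence per line. 'raw.fasta' is a placeholder input name.
records = {}
name = None
with open('raw.fasta', 'r') as fn:
    for line in fn:
        line = line.strip()
        if not line:
            continue
        if line.startswith('>'):
            name = line
            records[name] = []
        else:
            records[name].append(line)

with open('sequences.fasta', 'w') as out:
    for name, chunks in records.items():
        out.write(f"{name}\n{''.join(chunks)}\n")
```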
  We recommend using at least 200 sequences to obtain the best results. But we've seen it work with fewer sequences, so if you don't have
  that many, still give it a go.

  ```
  import random
  from transformers import AutoTokenizer

+ from datasets import load_dataset
+ import transformers
+ from transformers.testing_utils import CaptureLogger
+
+ ## DEFINE THESE VARIABLES
+ tokenizer = AutoTokenizer.from_pretrained('AI4PD/ZymCTRL')
+ ec_label = '1.1.1.1' # CHANGE TO YOUR EC LABEL
+ validation_split_percentage = 10 # change if needed
+ sequence_file = 'sequences.fasta'
+
+ # Load sequences: read the source file
+ with open(sequence_file, 'r') as fn:
      data = fn.readlines()
  fn.close()

  for line in data:
      if '>' in line:
          name = line.strip()
+         sequences[name] = []
          continue
      sequences[name].append(line.strip())

+ # Put the sequences into a list and shuffle their order randomly
+ sequences_list = [(key,value[0]) for key,value in sequences.items()]
  random.shuffle(sequences_list)

+ # The objective is to build strings that, when tokenized, span the 1024-token window.
+ # For each sequence, store its length together with the untokenized string.
  print("processing dataset")
  processed_dataset = []
  for i in sequences_list:
      # length of the control code
      sequence = i[1].strip()
      separator = '<sep>'
+     control_code_length = len(tokenizer(ec_label+separator)['input_ids'])
      available_space = 1021 - control_code_length # It is not 1024 because '<|endoftext|>', and start and end

+     # Option 1: the sequence is larger than the available space (3-4% of sequences)
      if len(sequence) > available_space:
          total_length = control_code_length + len(sequence[:available_space]) + 1
+         seq = f"{ec_label}{separator}{sequence[:available_space]}<|endoftext|>"
          processed_dataset.append((total_length, seq))

      # Option 2 & 3: the sequence fits in the block_size space, with or without padding
      else:
          total_length = control_code_length + len(sequence) + 3
          # here the sequence fits, so the start/end tokens are included
+         seq = f"{ec_label}{separator}<start>{sequence}<end><|endoftext|>"
          processed_dataset.append((total_length, seq))

+ # Group sequences
  def grouper(iterable):
      prev = None
      group = ''

      total_sum = 0
      yield group

  print("grouping processed dataset")
  grouped_dataset=dict(enumerate(grouper(processed_dataset),1))

+ # Write the processed file out for the tokenizer to read
+ fn = open(f"{ec_label}_processed.txt",'w')
  for key,value in grouped_dataset.items():
      padding_len = 1024 - len(tokenizer(value)['input_ids'])
      padding = "<pad>"*padding_len
      fn.write(value+padding)
      fn.write("\n")
+ fn.close()

+ ## TOKENIZE
+ # adapted from the trainer file
  data_files = {}
  dataset_args = {}
+
+ data_files["train"] = f"{ec_label}_processed.txt"
  extension = "text"
  tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

+ raw_datasets = load_dataset(extension, data_files=data_files, cache_dir='.', **dataset_args)
+
  raw_datasets["train"] = load_dataset(extension,
      data_files=data_files,
      split=f"train[{validation_split_percentage}%:]",
      **dataset_args,)

  def tokenize_function(examples):
      with CaptureLogger(tok_logger) as cl:
          output = tokenizer(examples["text"])
      # clm input could be much much longer than block_size
          )
      return output

  tokenized_datasets = raw_datasets.map(
      tokenize_function,
      batched=True,
      desc="Running tokenizer on dataset",
  )

  block_size = 1024
  def group_texts(examples):
      # Concatenate all texts.
  train_dataset = lm_datasets["train"]
  eval_dataset = lm_datasets["validation"]

+ train_dataset.save_to_disk('./dataset/train')
+ eval_dataset.save_to_disk('./dataset/eval')
  ```
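For reference, each line that the script writes to {ec_label}_processed.txt packs one or more control-code/sequence records (EC label, '<sep>', '<start>', sequence, '<end>', '<|endoftext|>') and is padded with '<pad>' up to the 1024-token window. The sequences below are placeholders:
```
1.1.1.1<sep><start>MAVKL..<end><|endoftext|>1.1.1.1<sep><start>MTTQR..<end><|endoftext|><pad><pad>..<pad>
```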
  The processed datasets will be inside the folder dataset/, called train and eval (a quick way to sanity-check them is sketched below).
  You can also split the processing and tokenization steps into separate scripts, but the script above runs everything in one go (that is what we do).
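As a quick sanity check (a sketch, not part of the script above), the saved datasets can be reloaded with datasets.load_from_disk and inspected:
```
from datasets import load_from_disk

# Reload the datasets written by save_to_disk above.
train_dataset = load_from_disk('./dataset/train')
eval_dataset = load_from_disk('./dataset/eval')

print(train_dataset)                       # number of examples and column names
print(len(train_dataset[0]['input_ids']))  # each example should span block_size (1024) tokens
```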