import os
import subprocess
import sys

import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM


def modify_sequences(sequence, line_width=60):
    """Format one sequence as a text record for ProtGPT2 fine-tuning.

    The sequence is upper-cased, wrapped into lines of at most
    ``line_width`` characters, and preceded by a line that contains only the
    ``<|endoftext|>`` token (the original code stored this token in a
    variable named ``fasta``, so it presumably stands in for a FASTA header).

    Args:
        sequence: The sequence to format, as a string.
        line_width: Maximum number of characters per wrapped line. Defaults
            to 60, the width that was previously hard-coded.

    Returns:
        The formatted record as one string without a trailing newline. An
        empty ``sequence`` produces the token line followed by a single
        newline and nothing else.

    Raises:
        ValueError: If ``line_width`` is not positive.
    """
    if line_width <= 0:
        # A non-positive step would either fail inside range() or silently
        # drop the whole sequence, so reject it explicitly.
        raise ValueError(f"line_width must be positive, got {line_width!r}")

    residues = sequence.upper()
    wrapped = "\n".join(
        residues[start:start + line_width]
        for start in range(0, len(residues), line_width)
    )
    return "<|endoftext|>" + "\n" + wrapped


def to_txt_file(df, filename):
    """Write every entry of ``df['Sequence']`` to a text file.

    Each entry is written followed by a single newline. An existing file at
    ``filename`` is overwritten.

    Args:
        df: Object whose ``'Sequence'`` item iterates over strings (the
            script in this file passes pandas DataFrames).
        filename: Path of the text file to create.

    Raises:
        TypeError: If an entry is not a string (string concatenation fails).
    """
    # Explicit encoding keeps the output independent of the platform's
    # default locale.
    with open(filename, "w", encoding="utf-8") as handle:
        handle.writelines(sequence + "\n" for sequence in df["Sequence"])


# ---------------------------------------------------------------------------
# Data preparation: read the membrane CSV splits, format the sequences and
# write them out as plain-text files for fine-tuning.
# ---------------------------------------------------------------------------
path = "/workspace/sg666/MDpLM"

data_dir = os.path.join(path, "data", "membrane")
protgpt2_dir = os.path.join(path, "benchmarks", "Generation", "ProtGPT2")
train_txt = os.path.join(protgpt2_dir, "protgpt2_train.txt")
test_txt = os.path.join(protgpt2_dir, "protgpt2_test.txt")

train = pd.read_csv(os.path.join(data_dir, "train.csv"))
val = pd.read_csv(os.path.join(data_dir, "val.csv"))
test = pd.read_csv(os.path.join(data_dir, "test.csv"))

# The validation split is folded into the training data; the test split is
# what gets passed as --validation_file below.
train = pd.concat([train, val])

train['Sequence'] = train['Sequence'].apply(modify_sequences)
test['Sequence'] = test['Sequence'].apply(modify_sequences)

to_txt_file(train, train_txt)
to_txt_file(test, test_txt)

# ---------------------------------------------------------------------------
# Fine-tuning: launch run_clm.py in a child process.
# ---------------------------------------------------------------------------
# NOTE(review): tokenizer and model are not referenced again in the visible
# part of this file; the child process is only given the model name. They are
# kept so the module's globals stay unchanged -- confirm whether they are
# still needed.
tokenizer = AutoTokenizer.from_pretrained("nferruz/ProtGPT2")
model = AutoModelForCausalLM.from_pretrained("nferruz/ProtGPT2")

finetune_protgpt2_command = [
    # sys.executable runs the child with the same interpreter/environment as
    # this script instead of whatever "python" resolves to on PATH.
    # run_clm.py is still resolved relative to the current working directory.
    sys.executable, "run_clm.py",
    "--model_name_or_path", "nferruz/ProtGPT2",
    # Pass the exact files written above; bare file names would only resolve
    # when the working directory happens to be protgpt2_dir.
    "--train_file", train_txt,
    "--validation_file", test_txt,
    "--tokenizer_name", "nferruz/ProtGPT2",
    "--num_train_epochs", "10",
    "--logging_steps", "1",
    "--logging_dir", "test",
    "--do_train",
    "--do_eval",
    "--output_dir", os.path.join(protgpt2_dir, "finetuned_models"),
    "--overwrite_output_dir",
    "--learning_rate", "3e-04",
    "--per_device_train_batch_size", "2",
    # NOTE(review): newer transformers releases appear to rename this flag to
    # --eval_strategy -- confirm against the installed version.
    "--evaluation_strategy", "epoch",
]

try:
    # Output is captured, so nothing from the child is shown unless it fails.
    result = subprocess.run(finetune_protgpt2_command, check=True, text=True, capture_output=True)
except subprocess.CalledProcessError as e:
    print("Command failed with the following error:")
    print(e.stderr)
    print("Command output:")
    print(e.stdout)
    # Propagate the failure so callers of this script do not see exit code 0.
    sys.exit(e.returncode)