|
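"""Generate de novo protein sequences with a finetuned ProtGPT2 model and rank them by perplexity."""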
import math

import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
|
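# Perplexity of a sequence under the model: exp of the mean cross-entropy
# loss obtained by passing the sequence as its own labels.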
def calculate_perplexity(sequence, model, tokenizer):
    # Wrap the sequence in ProtGPT2's <|endoftext|> token, matching the training format
    sequence = "<|endoftext|>" + sequence + "<|endoftext|>"
    input_ids = torch.tensor(tokenizer.encode(sequence)).unsqueeze(0)
    input_ids = input_ids.to(model.device)
    with torch.no_grad():
        # labels=input_ids makes the model return the mean cross-entropy loss
        outputs = model(input_ids, labels=input_ids)
    loss = outputs.loss
    return math.exp(loss.item())
|
|
if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    path = "/workspace/sg666/MDpLM/benchmarks/Generation/ProtGPT2"
|
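    # Load the finetuned ProtGPT2 checkpoint and its tokenizer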
    model_path = path + "/finetuned_models/checkpoint-4510"
    model = AutoModelForCausalLM.from_pretrained(model_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
|
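    # Build the generation pipeline from the already-loaded model and tokenizer
    # (avoids reloading the checkpoint); eos_token_id=0 is <|endoftext|> in ProtGPT2's vocabulary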
    protgpt2 = pipeline('text-generation', model=model, tokenizer=tokenizer, device=device)
    sequences = protgpt2("", max_length=100, do_sample=True, top_k=950,
                         repetition_penalty=1.5, num_return_sequences=100, eos_token_id=0)
|
    generated_sequences = []
    perplexities = []
|
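    # Score each generated sequence with the finetuned model's perplexity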
    for item in sequences:
        raw_sequence = item['generated_text']
        ppl = calculate_perplexity(raw_sequence, model, tokenizer)
        generated_sequences.append(raw_sequence)
        perplexities.append(ppl)
|
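    # Strip newlines (ProtGPT2 emits FASTA-like multi-line sequences) and <|endoftext|> tokens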
    cleaned_sequences = [seq.replace('\n', '').replace('<|endoftext|>', '') for seq in generated_sequences]
|
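    # Rank sequences from lowest (best) to highest perplexity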
    df = pd.DataFrame({"Sequence": cleaned_sequences, "Perplexity": perplexities})
    df.sort_values(by='Perplexity', inplace=True)
|
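    # Write the ranked sequences and their perplexities to CSV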
    df.to_csv(path + "/protgpt2_generated_sequences.csv", index=False)
|
    avg_generation_ppl = df['Perplexity'].mean()
    print(f'Average de novo generation perplexity: {avg_generation_ppl}')
|