import csv

import pandas as pd
import torch

from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = 't5-literary-coreference'
# Fall back to CPU when no GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print("Loading in data")

df = pd.read_csv('example_input.csv')
df = df.sample(frac=1).reset_index(drop=True)  # Shuffle rows; drop the old index so it is not carried into the Dataset as a column

to_annotate = Dataset.from_pandas(df)

speech_excerpts = DatasetDict({"annotate": to_annotate})

print("Loading models")
# Change max_model_length to fit your data
tokenizer = AutoTokenizer.from_pretrained("t5-3b", model_max_length=500) 
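# Note: the tokenizer comes from the "t5-3b" base checkpoint, which the
# fine-tuned model loaded below presumably shares its vocabulary with.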

def preprocess_function(examples, input_text="input", output_text="output"):
    # Tokenize the input excerpts
    model_inputs = tokenizer(examples[input_text], max_length=500, truncation=True)

    # Tokenize the expected annotations; their token ids become the labels
    targets = tokenizer(examples[output_text], max_length=500, truncation=True)

    model_inputs["labels"] = targets["input_ids"]

    return model_inputs

tokenized_speech_excerpts = speech_excerpts.map(preprocess_function, batched=True)
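# Note: tokenized_speech_excerpts is not consumed by the generation loop
# below, which re-tokenizes each excerpt on the fly; the mapped dataset is
# what you would feed a Trainer if you were fine-tuning instead.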

model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device=device)
model.eval()  # Inference only; disable dropout

print("Begin creating annotations")
header = ["input", "model_output"]
rows = []

for item in speech_excerpts["annotate"]:
    # Truncate here as well, so over-length excerpts cannot exceed the model's limit
    inputs = tokenizer(item["input"], max_length=500, truncation=True, return_tensors="pt").to(device=device)
    with torch.no_grad():  # No gradients needed for inference
        result = model.generate(**inputs, max_length=500)
    rows.append([item["input"], tokenizer.decode(result[0], skip_special_tokens=True)])
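# For larger datasets, batching the generation calls is usually much faster
# than generating one excerpt at a time. A minimal sketch (assuming a batch
# size of 8 fits in GPU memory):
#
# for start in range(0, len(speech_excerpts["annotate"]), 8):
#     batch = speech_excerpts["annotate"][start:start + 8]["input"]
#     inputs = tokenizer(batch, max_length=500, truncation=True, padding=True,
#                        return_tensors="pt").to(device=device)
#     with torch.no_grad():
#         results = model.generate(**inputs, max_length=500)
#     for text, ids in zip(batch, results):
#         rows.append([text, tokenizer.decode(ids, skip_special_tokens=True)])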

# Write the results out as CSV; newline="" prevents blank rows on Windows
with open("results.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(rows)

print("Finished")