Upload 2 files
Browse files- get_annotations.py +51 -0
- get_ent_clusters.py +82 -0
get_annotations.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import csv
|
3 |
+
|
4 |
+
from datasets import Dataset, DatasetDict
|
5 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
6 |
+
|
# --- Configuration ---
# Path / hub id of the fine-tuned coreference model loaded further below.
# NOTE(review): the tokenizer is loaded from the base "t5-3b" checkpoint,
# not from model_name -- confirm the two share a vocabulary.
model_name = 't5-literary-coreference'
device = 'cuda'  # assumes a CUDA-capable GPU is available -- TODO confirm

print("Loading in data")

# Expected columns: at least "input" and "output" (used by preprocess_function
# and the generation loop below).
df = pd.read_csv('example_input.csv')
df = df.sample(frac=1)  # Shuffle dataframe contents

# Wrap the dataframe in a datasets.Dataset / DatasetDict under the
# "annotate" split so it can be .map()-ed and iterated below.
to_annotate = Dataset.from_pandas(df)

speech_excerpts = DatasetDict({"annotate": to_annotate})

print("Loading models")
# Change max_model_length to fit your data
tokenizer = AutoTokenizer.from_pretrained("t5-3b", model_max_length=500)
def preprocess_function(examples, input_text="input", output_text="output"):
    """Tokenize one batch of examples for seq2seq training/eval.

    Encodes the source column and attaches the tokenized target column's
    ids under the "labels" key, the layout transformers seq2seq models
    expect. Relies on the module-level `tokenizer`; both sides are
    truncated at 500 tokens.
    """
    encoded = tokenizer(examples[input_text], max_length=500, truncation=True)
    encoded["labels"] = tokenizer(
        examples[output_text], max_length=500, truncation=True
    )["input_ids"]
    return encoded
# Batched tokenization of the whole split using the function above.
# NOTE(review): tokenized_speech_excerpts is never used after this line --
# the generation loop below re-tokenizes each item individually; confirm
# whether this map call is still needed.
tokenized_speech_excerpts = speech_excerpts.map(preprocess_function, batched=True)

# Load the fine-tuned seq2seq model and move it to the configured device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device=device)

print("Begin creating annotations")
header = ["input", "model_output"]
rows = []

# Generate one annotation per example (unbatched); outputs are capped at
# 500 tokens to match the tokenizer's model_max_length above. Each result
# row pairs the raw input text with the decoded model output.
for item in speech_excerpts["annotate"]:
    input_ids = tokenizer(item["input"], return_tensors="pt").input_ids
    result = model.generate(input_ids.to(device=device), max_length = 500)
    rows.append([item["input"], tokenizer.decode(result[0], skip_special_tokens = True)])
# Write the (input, model_output) pairs. newline="" is required by the csv
# module to avoid blank rows on Windows, and the with-block guarantees the
# file is flushed and closed even if writing raises.
with open("results.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(rows)

print("Finished")
get_ent_clusters.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import os
|
3 |
+
import re
|
4 |
+
import csv
|
5 |
+
|
def extract_paren(annotation):
    """Pull every bracketed entity mention out of an annotation string.

    Scans for each "[" and copies characters until the matching "]",
    tracking nesting depth (inner "["/"]" pairs adjust the depth and trim
    the last three collected characters when they close). A completed
    mention is kept only when it carries a ": N" cluster tag, in which
    case "|W" is appended, where W is the number of spaces before the
    mention minus one per earlier ": N" tag -- presumably a word index
    into the untagged text (verify against downstream use).
    """
    ents = []
    for start, ch in enumerate(annotation):
        if ch != "[":
            continue
        ent = "["
        depth = 0
        for pos in range(start + 1, len(annotation)):
            c = annotation[pos]
            if c == "[":
                depth += 1
            elif c == "]":
                if depth > 0:
                    depth -= 1
                    # closing a nested mention: drop its trailing tag chars
                    ent = ent[:len(ent) - 3]
                else:
                    ent += "]"
                    if re.search(r": [0-9]{1,3}", ent):
                        prefix = annotation[:start]
                        tags_before = len(re.findall(r": [0-9]{1,3}", prefix))
                        ent += "|" + str(prefix.count(" ") - tags_before)
                        ents.append(ent)
                    break
            else:
                ent += c
    return ents
def create_clusters(ents):
    """Group extracted entity strings by their ": N" cluster id.

    Each entry in `ents` is expected to contain a ": N" tag (as produced
    by extract_paren). The tag and any square brackets are stripped from
    the entity text, and the cleaned string is appended to the list for
    cluster N. Entries without a tag are reported and skipped.
    """
    clusters = {}
    for ent in ents:
        tag = re.search(r": [0-9]{1,3}", ent)
        if tag is None:
            # No cluster tag -- malformed entity; report and move on.
            print("OH NO:", ent)
            print()
            continue
        cluster_id = int(re.search(r"[0-9]{1,3}", tag.group()).group())
        cleaned = ent.replace("[", "").replace("]", "").replace(tag.group(), "")
        clusters.setdefault(cluster_id, []).append(cleaned)
    return clusters
headers = ["input", "model_output", "model_output_clusters"]

# results.csv is produced by get_annotations.py (columns: input, model_output).
df = pd.read_csv("results.csv")

rows = []
for _, row in df.iterrows():
    annotation = row["model_output"]

    # Skip non-string cells (e.g. NaN from empty model outputs).
    if not isinstance(annotation, str):
        continue

    ann_ents = extract_paren(annotation)
    # Empty extraction -> empty cluster dict (avoids the redundant
    # double-assignment the original had).
    ann_clusters = create_clusters(ann_ents) if ann_ents else {}

    # Clusters are serialized via str() so they fit in a CSV cell.
    rows.append([row["input"], annotation, str(ann_clusters)])

# newline="" is required by the csv module to avoid blank rows on Windows;
# the with-block guarantees the file is closed even if writing raises.
with open("cluster_results.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(rows)