# address-extraction/train.py
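"""Fine-tune dbmdz/bert-base-turkish-cased for address span extraction.

Downloads approved annotations from the annotation server (the endpoints below
match doccano's REST API), aligns the character-level spans to BIO token
labels, trains a token-classification model, and saves it to ./model.
"""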
import io
import shutil
import time
import zipfile

import orjson
import requests
import torch
from datasets import Dataset as HFDataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import Dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
API_URL = "http://dockerbase.duo:8000"
PROJECT_ID = 1
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
def load_data():
    """Download approved annotations and the span-type label set from the API."""
    # Authenticate and grab a session token.
    res = requests.post(
        API_URL + "/v1/auth/login/",
        json={"username": "admin", "password": "123"},
    )
    token = res.json()["key"]

    # Kick off an asynchronous export of the approved annotations as JSONL.
    res = requests.post(
        API_URL + f"/v1/projects/{PROJECT_ID}/download",
        json={"format": "JSONL", "exportApproved": True},
        headers={"Authorization": "Token " + token},
    )
    task_id = res.json()["task_id"]

    # Poll until the export task has finished.
    ready = False
    print("Waiting for export task to be ready.", end="", flush=True)
    while not ready:
        res = requests.get(
            API_URL + "/v1/tasks/status/" + str(task_id),
            headers={"Authorization": "Token " + token},
        )
        ready = res.json()["ready"]
        if not ready:
            time.sleep(1)
            print(".", end="", flush=True)
    print("")

    # Fetch the finished export: a zip archive containing admin.jsonl.
    res = requests.get(
        API_URL + f"/v1/projects/{PROJECT_ID}/download",
        params={"taskId": task_id},
        headers={"Authorization": "Token " + token},
    )
    zip_file = io.BytesIO(res.content)
    with zipfile.ZipFile(zip_file, "r") as zip_ref:
        data = zip_ref.read("admin.jsonl").decode("utf-8")

    # Span types define the label inventory for the project.
    res = requests.get(
        API_URL + f"/v1/projects/{PROJECT_ID}/span-types",
        headers={"Authorization": "Token " + token},
    )
    labels = res.json()
    return labels, [orjson.loads(line) for line in data.split("\n") if line]
labels, data = load_data()

# Build a BIO tag inventory: "O" maps to 0, then a B-/I- pair per span type.
label_to_id = {}
for i, label in enumerate(labels):
    label_to_id["B-" + label["text"]] = i * 2 + 1
    label_to_id["I-" + label["text"]] = i * 2 + 2
label_to_id["O"] = 0
id_to_label = {v: k for k, v in label_to_id.items()}
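# Illustration (hypothetical span types): for labels ["il", "ilce"] this yields
# {"B-il": 1, "I-il": 2, "B-ilce": 3, "I-ilce": 4, "O": 0}.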
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-base-turkish-cased", num_labels=len(label_to_id)).to(device)
model.config.id2label = id_to_label
model.config.label2id = label_to_id
def preprocess_data(item, tokenizer, label_to_id):
    """Tokenize one example and align its character spans to BIO token labels."""
    text = item["text"]
    inputs = tokenizer(
        text,
        return_offsets_mapping=True,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    offset_mapping = inputs["offset_mapping"]

    # Default every position (including padding and special tokens) to "O".
    labels = ["O"] * 128
    last_label = "O"
    for token_idx, (off_start, off_end) in enumerate(offset_mapping[0]):
        # Special and padding tokens have zero-width offsets; skip them.
        if off_start == off_end:
            continue
        for start, end, label in item["label"]:
            if start <= off_start and off_end <= end:
                # First token of a span gets B-, continuation tokens get I-.
                if last_label == label:
                    labels[token_idx] = "I-" + label
                else:
                    labels[token_idx] = "B-" + label
                last_label = label
                break
        else:
            # No span covers this token; reset so the next span starts with B-.
            last_label = "O"

    # Convert label strings to ids.
    labels = [label_to_id[label] for label in labels]
    return {
        "input_ids": input_ids.flatten(),
        "attention_mask": attention_mask.flatten(),
        "labels": labels,
    }
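# Illustration (made-up example): for text "Kadıköy İstanbul" with a single
# span (0, 7, "ilce"), every subword token inside characters 0-7 is tagged
# B-ilce / I-ilce and all remaining positions stay "O".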
class AddressDataset(Dataset):
    """Thin torch Dataset over preprocessed items (currently unused; the
    Trainer consumes the Hugging Face dataset built below directly)."""

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        item = self.dataset[index]
        return {key: torch.tensor(val) for key, val in item.items()}
# Materialize the preprocessed examples as a Hugging Face dataset,
# then hold out 20% for evaluation.
dataset = HFDataset.from_generator(
    lambda: (preprocess_data(item, tokenizer, label_to_id) for item in data)
)
dataset = dataset.train_test_split(test_size=0.2)
dataset = DatasetDict({
    "train": dataset["train"],
    "test": dataset["test"],
})
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=35,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # logging_dir="./logs",
    # logging_first_step=True,
    # evaluation_strategy="epoch",
    # save_strategy="epoch",
    logging_strategy="epoch",
    # load_best_model_at_end=True,
)
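# Note: with evaluation_strategy left at its default ("no"), no evaluation
# runs during training; metrics come only from trainer.evaluate() below.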
def compute_metrics(pred, id_to_label):
    """Sequence-level multi-label scores: which span types occur in each example."""
    label_ids = pred.label_ids
    pred_ids = pred.predictions.argmax(-1)

    # Map id sequences back to tag strings, dropping the "O" tag so that
    # only actual span types contribute to the scores.
    label_tags = [
        [id_to_label[i] for i in seq if id_to_label[i] != "O"] for seq in label_ids
    ]
    pred_tags = [
        [id_to_label[i] for i in seq if id_to_label[i] != "O"] for seq in pred_ids
    ]

    # Binarize the per-example tag sets over the full label inventory.
    mlb = MultiLabelBinarizer()
    mlb.fit([list(id_to_label.values())])
    y_true = mlb.transform(label_tags)
    y_pred = mlb.transform(pred_tags)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average="micro"),
        "recall": recall_score(y_true, y_pred, average="micro"),
        "f1": f1_score(y_true, y_pred, average="micro"),
    }
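# Illustration (made-up numbers): if an example's gold tags reduce to the set
# {"B-il", "I-il"} and its predictions to {"B-il"}, the binarized rows differ
# in one column, counting as a false negative for micro recall.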
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=lambda p: compute_metrics(p, id_to_label),
)
trainer.train()
trainer.evaluate()

# Persist the fine-tuned model (and tokenizer) and keep a copy of the config,
# which carries the id2label / label2id mapping, at the repo root.
trainer.save_model("./model")
shutil.copy("./model/config.json", "./config.json")
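
# --- Usage sketch (not part of the original training run) ---
# A minimal sanity check of the saved model via the transformers pipeline API;
# the sample address below is made up. Because the tokenizer was passed to the
# Trainer, save_model("./model") is assumed to have saved it alongside the model.
from transformers import pipeline

ner = pipeline(
    "token-classification",
    model="./model",
    tokenizer="./model",
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1,
)
print(ner("Caferağa Mah. Moda Cad. No:10 Kadıköy / İstanbul"))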