Commit d13c5f2 · Ryan Kim committed · 1 parent: 11a657b
train and validation data stored; train.py can now generate models
Files changed:
- data/train.json (+2 -2)
- data/val.json (+2 -2)
- src/train.py (+92 -19)
data/train.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:bbe06c7232f904c0501c4dc5b950e4243d887cd7ffab04bfeaa12732514b47c8
+size 58602006
data/val.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:d25635377a60d308197c5a8c0c7df7575953d29cfab91b6c11316266c6a5b27c
+size 32744803
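Both JSON files are stored through Git LFS, so the diff above only touches the pointer files (the version / oid / size triplet) with a new object hash and size; the actual data lives in LFS storage. For reference, main() in src/train.py reads these files back with plain json.load. A minimal sketch, assuming the "labels"/"abstracts"/"claims" keys that LoadDataset writes in this commit:

import json

def load_split(path):
    # The committed files are dicts of parallel lists:
    # {"labels": [...], "abstracts": [...], "claims": [...]}
    with open(path) as f:
        return json.load(f)

train = load_split("./data/train.json")
print(len(train["labels"]), "training examples")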
src/train.py CHANGED
@@ -4,11 +4,16 @@ import numpy as np
 import os
 import json
 import torch
+import sys
+
 from torch.utils.data import Dataset, DataLoader
 from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
 from transformers import Trainer, TrainingArguments, AdamW
 
+torch.backends.cuda.matmul.allow_tf32 = True
 model_name = "distilbert-base-uncased"
+upsto_abstracts_model_path = './models/upsto_abstracts'
+upsto_claims_model_path = './models/upsto_claims'
 
 class USPTODataset(Dataset):
     def __init__(self, encodings, labels):
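One note on the imports this hunk keeps: AdamW is imported from transformers, where it has long been deprecated in favor of the torch-native optimizer. If that import ever breaks on a newer release, torch.optim.AdamW is a drop-in replacement for how it is used below; a sketch, assuming otherwise identical code:

from torch.optim import AdamW  # instead of: from transformers import AdamW

optim = AdamW(model.parameters(), lr=5e-5)  # same call for this usage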
@@ -67,30 +72,26 @@ def LoadDataset():
     trainDF2 = trainDF.replace({"decision": yKey})
     valDF2 = valDF.replace({"decision": yKey})
 
-    # We
-
-    print("Combining columns and renaming `decision` to `label`")
+    # We re-label the `decision` column to `label`.
+    print("Renaming `decision` to `label`")
     trainDF3 = trainDF2.rename(columns={'decision': 'label'})
-    trainDF3['text'] = trainDF3['abstract'] + ' ' + trainDF3['claims']
-    trainDF3.drop(columns=["abstract","claims"],inplace=True)
-
     valDF3 = valDF2.rename(columns={'decision': 'label'})
-    valDF3['text'] = valDF3['abstract'] + ' ' + valDF3['claims']
-    valDF3.drop(columns=["abstract","claims"],inplace=True)
 
     # We can grab the data for each column so that we have a list of values for training labels,
     # training texts, validation labels, and validation texts.
     print("Extracting label and text data from dataframes")
     trainData = {
         "labels":trainDF3["label"].tolist(),
-        "text":trainDF3["text"].tolist(),
+        "abstracts":trainDF3["abstract"].tolist(),
+        "claims":trainDF3["claims"].tolist(),
     }
     valData = {
         "labels":valDF3["label"].tolist(),
-        "text":valDF3["text"].tolist(),
+        "abstracts":valDF3["abstract"].tolist(),
+        "claims":valDF3["claims"].tolist(),
     }
-    print(f'TRAINING:\t# labels: {len(trainData["labels"])}\t# texts: {len(trainData["text"])}')
-    print(f'VALID:\t# labels: {len(valData["labels"])}\t# texts: {len(valData["text"])}')
+    #print(f'TRAINING:\t# labels: {len(trainData["labels"])}\t# texts: {len(trainData["text"])}')
+    #print(f'VALID:\t# labels: {len(valData["labels"])}\t# texts: {len(valData["text"])}')
 
     if not os.path.exists("./data"):
         os.makedirs('./data')
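The reshape above is the heart of this commit: instead of concatenating abstract and claims into one text column, the loader now keeps them as separate lists, so two separate models can be trained on them (see the TrainModel hunk below). A toy illustration of the replace/rename pattern, with a hypothetical yKey mapping (its real definition sits earlier in LoadDataset, outside this diff):

import pandas as pd

yKey = {"ACCEPTED": 1, "REJECTED": 0}  # hypothetical; real mapping not shown in the diff
df = pd.DataFrame({
    "decision": ["ACCEPTED", "REJECTED"],
    "abstract": ["An improved widget...", "A method of..."],
    "claims":   ["1. A widget comprising...", "1. A method comprising..."],
})
df2 = df.replace({"decision": yKey}).rename(columns={"decision": "label"})
data = {
    "labels":    df2["label"].tolist(),
    "abstracts": df2["abstract"].tolist(),
    "claims":    df2["claims"].tolist(),
}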
@@ -102,6 +103,79 @@ def LoadDataset():
 
     return trainData, valData
 
+def TrainModel(trainData, valData):
+    print("=== ENCODING DATA ===")
+    #print(len(trainData["labels"]), len(trainData["text"]), len(valData["labels"]), len(valData["text"]))
+    print("\t- initializing tokenizer")
+    tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
+    print("\t- encoding training data")
+    train_abstracts_encodings = tokenizer(trainData["abstracts"], truncation=True, padding=True)
+    train_claims_encodings = tokenizer(trainData["claims"], truncation=True, padding=True)
+    #print("\t- encoding validation data")
+    #val_abstracts_encodings = tokenizer(valData["abstracts"], truncation=True, padding=True)
+    #val_claims_encodings = tokenizer(valData["claims"], truncation=True, padding=True)
+
+    print("=== CREATING DATASETS ===")
+    print("\t- initializing dataset for training data")
+    train_abstracts_dataset = USPTODataset(train_abstracts_encodings, trainData["labels"])
+    train_claims_dataset = USPTODataset(train_claims_encodings, trainData["labels"])
+    #print("\t- initializing dataset for validation data")
+    #val_abstracts_dataset = USPTODataset(val_abstracts_encodings, valData["labels"])
+    #val_claims_dataset = USPTODataset(val_claims_encodings, valData["labels"])
+
+    print("=== PREPARING MODEL ===")
+    print("\t- setting up device")
+    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+    print("\t- initializing model")
+    model = DistilBertForSequenceClassification.from_pretrained(model_name)
+    model.to(device)
+    model.train()
+
+    print("== PREPARING TRAINING ===")
+    print("\t- initializing trainers")
+    train_abstracts_loader = DataLoader(train_abstracts_dataset, batch_size=4, shuffle=True)
+    train_claims_loader = DataLoader(train_claims_dataset, batch_size=4, shuffle=True)
+    #train_claims_loader = DataLoader(train_claims_dataset, batch_size=4, shuffle=True)
+    print("\t- initializing optim")
+    optim = AdamW(model.parameters(), lr=5e-5)
+
+    def Train(loader, save_path, num_train_epochs=2):
+        batch_num = len(loader)
+        for epoch in range(num_train_epochs):
+            print(f'\t- Training epoch {epoch+1}/{num_train_epochs}')
+            batch_count = 0
+            for batch in loader:
+                print(f'{batch_count}|{batch_num} - {round((batch_count/batch_num)*100)}%', end="")
+                #print('\t\t- optim zero grad')
+                optim.zero_grad()
+                #print('\t\t- input_ids')
+                input_ids = batch['input_ids'].to(device)
+                #print('\t\t- attention_mask')
+                attention_mask = batch['attention_mask'].to(device)
+                #print('\t\t- labels0')
+                labels = batch['labels'].to(device)
+                #print('\t\t- outputs')
+                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
+
+                #print('\t\t- loss')
+                loss = outputs[0]
+                #print('\t\t- backwards')
+                loss.backward()
+                #print('\t\t- step')
+                optim.step()
+
+                batch_count += 1
+                print("\r", end="")
+
+        model.eval()
+        model.save_pretrained(save_path, from_pt=True)
+        print(f'Saved model in {save_path}!')
+
+    print("=== TRAINING ABSTRACTS ===")
+    Train(train_abstracts_loader,upsto_abstracts_model_path)
+    print("=== TRAINING CLAIMS ===")
+    Train(train_claims_loader,upsto_claims_model_path)
+
 def main():
     trainDataPath = "./data/train.json"
     valDataPath = "./data/val.json"
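The hand-rolled progress printing inside Train (a batch counter followed by a bare carriage return) is easy to get wrong; a common alternative is tqdm, which handles the same bookkeeping in one line. A sketch of how the inner loop could look, assuming tqdm is installed:

from tqdm import tqdm

for batch in tqdm(loader, desc=f"epoch {epoch+1}/{num_train_epochs}"):
    optim.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    # With labels passed in, the HF model output exposes the loss directly.
    loss = model(input_ids, attention_mask=attention_mask, labels=labels).loss
    loss.backward()
    optim.step()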
@@ -109,6 +183,7 @@ def main():
     valData = None
 
     if os.path.exists(trainDataPath) and os.path.exists(valDataPath):
+        print("Loading from existing data files")
         ftrain = open(trainDataPath)
         trainData = json.load(ftrain)
         ftrain.close()
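A small robustness note on this loading block: explicit open/close pairs are usually written with a context manager, which also closes the file if json.load raises. An equivalent sketch, assuming the val file is read the same way in the lines this diff does not show:

with open(trainDataPath) as ftrain:
    trainData = json.load(ftrain)
with open(valDataPath) as fval:
    valData = json.load(fval)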
@@ -118,15 +193,12 @@ def main():
     else:
         trainData, valData = LoadDataset()
 
-    print(len(trainData["labels"]), len(trainData["text"]), len(valData["labels"]), len(valData["text"]))
-
-    tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
-    train_encodings = tokenizer(trainData["text"], truncation=True, padding=True)
-    val_encodings = tokenizer(valData["text"], truncation=True, padding=True)
+    #print(len(trainData["labels"]), len(trainData["text"]), len(valData["labels"]), len(valData["text"]))
+    print("Data loaded successfully!")
 
-
-    val_dataset = USPTODataset(val_encodings, valData["labels"])
+    TrainModel(trainData, valData)
 
+    """
     train_args = TrainingArguments(
         output_dir="./results",
         num_train_epochs=2,
@@ -147,6 +219,7 @@ def main():
         eval_dataset=val_dataset
     )
     trainer.train()
+    """
 
 if __name__ == "__main__":
     main()
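With the old Trainer path fenced off inside a docstring, the artifacts this script now produces are the two save_pretrained directories. A hypothetical inference sketch for reloading one of them (paths as defined at the top of this commit's train.py):

import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained('./models/upsto_abstracts')
model.eval()

enc = tokenizer("An example patent abstract.", truncation=True, padding=True, return_tensors="pt")
with torch.no_grad():
    pred = model(**enc).logits.argmax(dim=-1).item()
print("predicted label:", pred)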