Ryan Kim committed on
Commit
d13c5f2
·
1 Parent(s): 11a657b

Train and validation data stored. train.py can now generate models

Browse files
Files changed (3) hide show
  1. data/train.json +2 -2
  2. data/val.json +2 -2
  3. src/train.py +92 -19
data/train.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6b9de2fb205c4a8fe3d082ae0441872f983ed8b07ae4bb965c7cab2822ecc453
3
- size 58540950
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbe06c7232f904c0501c4dc5b950e4243d887cd7ffab04bfeaa12732514b47c8
3
+ size 58602006
data/val.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cea25d0297302c68eccc08d662d03b35af3f99106a2e0854a44ceace0a9204a1
3
- size 32710564
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d25635377a60d308197c5a8c0c7df7575953d29cfab91b6c11316266c6a5b27c
3
+ size 32744803
src/train.py CHANGED
@@ -4,11 +4,16 @@ import numpy as np
4
  import os
5
  import json
6
  import torch
 
 
7
  from torch.utils.data import Dataset, DataLoader
8
  from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
9
  from transformers import Trainer, TrainingArguments, AdamW
10
 
 
11
  model_name = "distilbert-base-uncased"
 
 
12
 
13
  class USPTODataset(Dataset):
14
  def __init__(self, encodings, labels):
@@ -67,30 +72,26 @@ def LoadDataset():
67
  trainDF2 = trainDF.replace({"decision": yKey})
68
  valDF2 = valDF.replace({"decision": yKey})
69
 
70
- # We combine the `abstract` and `claims` columns into a single `text` column.
71
- # We also re-label the `decision` column to `label`.
72
- print("Combining columns and renaming `decision` to `label`")
73
  trainDF3 = trainDF2.rename(columns={'decision': 'label'})
74
- trainDF3['text'] = trainDF3['abstract'] + ' ' + trainDF3['claims']
75
- trainDF3.drop(columns=["abstract","claims"],inplace=True)
76
-
77
  valDF3 = valDF2.rename(columns={'decision': 'label'})
78
- valDF3['text'] = valDF3['abstract'] + ' ' + valDF3['claims']
79
- valDF3.drop(columns=["abstract","claims"],inplace=True)
80
 
81
  # We can grab the data for each column so that we have a list of values for training labels,
82
  # training texts, validation labels, and validation texts.
83
  print("Extracting label and text data from dataframes")
84
  trainData = {
85
  "labels":trainDF3["label"].tolist(),
86
- "text":trainDF3["text"].tolist()
 
87
  }
88
  valData = {
89
  "labels":valDF3["label"].tolist(),
90
- "text":valDF3["text"].tolist()
 
91
  }
92
- print(f'TRAINING:\t# labels: {len(trainData["labels"])}\t# texts: {len(trainData["text"])}')
93
- print(f'VALID:\t# labels: {len(valData["labels"])}\t# texts: {len(valData["text"])}')
94
 
95
  if not os.path.exists("./data"):
96
  os.makedirs('./data')
@@ -102,6 +103,79 @@ def LoadDataset():
102
 
103
  return trainData, valData
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  def main():
106
  trainDataPath = "./data/train.json"
107
  valDataPath = "./data/val.json"
@@ -109,6 +183,7 @@ def main():
109
  valData = None
110
 
111
  if os.path.exists(trainDataPath) and os.path.exists(valDataPath):
 
112
  ftrain = open(trainDataPath)
113
  trainData = json.load(ftrain)
114
  ftrain.close()
@@ -118,15 +193,12 @@ def main():
118
  else:
119
  trainData, valData = LoadDataset()
120
 
121
- print(len(trainData["labels"]), len(trainData["text"]), len(valData["labels"]), len(valData["text"]))
122
-
123
- tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
124
- train_encodings = tokenizer(trainData["text"], truncation=True, padding=True)
125
- val_encodings = tokenizer(valData["text"], truncation=True, padding=True)
126
 
127
- train_dataset = USPTODataset(train_encodings, trainData["labels"])
128
- val_dataset = USPTODataset(val_encodings, valData["labels"])
129
 
 
130
  train_args = TrainingArguments(
131
  output_dir="./results",
132
  num_train_epochs=2,
@@ -147,6 +219,7 @@ def main():
147
  eval_dataset=val_dataset
148
  )
149
  trainer.train()
 
150
 
151
  if __name__ == "__main__":
152
  main()
 
4
  import os
5
  import json
6
  import torch
7
+ import sys
8
+
9
  from torch.utils.data import Dataset, DataLoader
10
  from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
11
  from transformers import Trainer, TrainingArguments, AdamW
12
 
13
+ torch.backends.cuda.matmul.allow_tf32 = True
14
  model_name = "distilbert-base-uncased"
15
+ upsto_abstracts_model_path = './models/upsto_abstracts'
16
+ upsto_claims_model_path = './models/upsto_claims'
17
 
18
  class USPTODataset(Dataset):
19
  def __init__(self, encodings, labels):
 
72
  trainDF2 = trainDF.replace({"decision": yKey})
73
  valDF2 = valDF.replace({"decision": yKey})
74
 
75
+ # We re-label the `decision` column to `label`.
76
+ print("Renaming `decision` to `label`")
 
77
  trainDF3 = trainDF2.rename(columns={'decision': 'label'})
 
 
 
78
  valDF3 = valDF2.rename(columns={'decision': 'label'})
 
 
79
 
80
  # We can grab the data for each column so that we have a list of values for training labels,
81
  # training texts, validation labels, and validation texts.
82
  print("Extracting label and text data from dataframes")
83
  trainData = {
84
  "labels":trainDF3["label"].tolist(),
85
+ "abstracts":trainDF3["abstract"].tolist(),
86
+ "claims":trainDF3["claims"].tolist(),
87
  }
88
  valData = {
89
  "labels":valDF3["label"].tolist(),
90
+ "abstracts":valDF3["abstract"].tolist(),
91
+ "claims":valDF3["claims"].tolist(),
92
  }
93
+ #print(f'TRAINING:\t# labels: {len(trainData["labels"])}\t# texts: {len(trainData["text"])}')
94
+ #print(f'VALID:\t# labels: {len(valData["labels"])}\t# texts: {len(valData["text"])}')
95
 
96
  if not os.path.exists("./data"):
97
  os.makedirs('./data')
 
103
 
104
  return trainData, valData
105
 
106
+ def TrainModel(trainData, valData):
107
+ print("=== ENCODING DATA ===")
108
+ #print(len(trainData["labels"]), len(trainData["text"]), len(valData["labels"]), len(valData["text"]))
109
+ print("\t- initializing tokenizer")
110
+ tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
111
+ print("\t- encoding training data")
112
+ train_abstracts_encodings = tokenizer(trainData["abstracts"], truncation=True, padding=True)
113
+ train_claims_encodings = tokenizer(trainData["claims"], truncation=True, padding=True)
114
+ #print("\t- encoding validation data")
115
+ #val_abstracts_encodings = tokenizer(valData["abstracts"], truncation=True, padding=True)
116
+ #val_claims_encodings = tokenizer(valData["claims"], truncation=True, padding=True)
117
+
118
+ print("=== CREATING DATASETS ===")
119
+ print("\t- initializing dataset for training data")
120
+ train_abstracts_dataset = USPTODataset(train_abstracts_encodings, trainData["labels"])
121
+ train_claims_dataset = USPTODataset(train_claims_encodings, trainData["labels"])
122
+ #print("\t- initializing dataset for validation data")
123
+ #val_abstracts_dataset = USPTODataset(val_abstracts_encodings, valData["labels"])
124
+ #val_claims_dataset = USPTODataset(val_claims_encodings, valData["labels"])
125
+
126
+ print("=== PREPARING MODEL ===")
127
+ print("\t- setting up device")
128
+ device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
129
+ print("\t- initializing model")
130
+ model = DistilBertForSequenceClassification.from_pretrained(model_name)
131
+ model.to(device)
132
+ model.train()
133
+
134
+ print("== PREPARING TRAINING ===")
135
+ print("\t- initializing trainers")
136
+ train_abstracts_loader = DataLoader(train_abstracts_dataset, batch_size=4, shuffle=True)
137
+ train_claims_loader = DataLoader(train_claims_dataset, batch_size=4, shuffle=True)
138
+ #train_claims_loader = DataLoader(train_claims_dataset, batch_size=4, shuffle=True)
139
+ print("\t- initializing optim")
140
+ optim = AdamW(model.parameters(), lr=5e-5)
141
+
142
+ def Train(loader, save_path, num_train_epochs=2):
143
+ batch_num = len(loader)
144
+ for epoch in range(num_train_epochs):
145
+ print(f'\t- Training epoch {epoch+1}/{num_train_epochs}')
146
+ batch_count = 0
147
+ for batch in loader:
148
+ print(f'{batch_count}|{batch_num} - {round((batch_count/batch_num)*100)}%', end="")
149
+ #print('\t\t- optim zero grad')
150
+ optim.zero_grad()
151
+ #print('\t\t- input_ids')
152
+ input_ids = batch['input_ids'].to(device)
153
+ #print('\t\t- attention_mask')
154
+ attention_mask = batch['attention_mask'].to(device)
155
+ #print('\t\t- labels0')
156
+ labels = batch['labels'].to(device)
157
+ #print('\t\t- outputs')
158
+ outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
159
+
160
+ #print('\t\t- loss')
161
+ loss = outputs[0]
162
+ #print('\t\t- backwards')
163
+ loss.backward()
164
+ #print('\t\t- step')
165
+ optim.step()
166
+
167
+ batch_count += 1
168
+ print("\r", end="")
169
+
170
+ model.eval()
171
+ model.save_pretrained(save_path, from_pt=True)
172
+ print(f'Saved model in {save_path}!')
173
+
174
+ print("=== TRAINING ABSTRACTS ===")
175
+ Train(train_abstracts_loader,upsto_abstracts_model_path)
176
+ print("=== TRAINING CLAIMS ===")
177
+ Train(train_claims_loader,upsto_claims_model_path)
178
+
179
  def main():
180
  trainDataPath = "./data/train.json"
181
  valDataPath = "./data/val.json"
 
183
  valData = None
184
 
185
  if os.path.exists(trainDataPath) and os.path.exists(valDataPath):
186
+ print("Loading from existing data files")
187
  ftrain = open(trainDataPath)
188
  trainData = json.load(ftrain)
189
  ftrain.close()
 
193
  else:
194
  trainData, valData = LoadDataset()
195
 
196
+ #print(len(trainData["labels"]), len(trainData["text"]), len(valData["labels"]), len(valData["text"]))
197
+ print("Data loaded successfully!")
 
 
 
198
 
199
+ TrainModel(trainData, valData)
 
200
 
201
+ """
202
  train_args = TrainingArguments(
203
  output_dir="./results",
204
  num_train_epochs=2,
 
219
  eval_dataset=val_dataset
220
  )
221
  trainer.train()
222
+ """
223
 
224
  if __name__ == "__main__":
225
  main()