from datasets import load_dataset
import pandas as pd
import os
import json
import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments

model_name = "distilbert-base-uncased"
class USPTODataset(Dataset):
    """Wraps tokenizer encodings and labels as a PyTorch Dataset for the Trainer."""
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Return one example as a dict of tensors, with the label under 'labels'.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
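# A quick illustration (hypothetical values) of what USPTODataset yields per index:
#   ds = USPTODataset(tokenizer(["some patent text"], truncation=True, padding=True), [1])
#   ds[0] -> {'input_ids': tensor([...]), 'attention_mask': tensor([...]), 'labels': tensor(1)}
# This per-example dict of tensors is the format the Hugging Face Trainer expects.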
def LoadDataset():
    print("=== LOADING THE DATASET ===")
    # Load the HUPD sample split, filtered to patents filed in January 2016.
    dataset_dict = load_dataset('HUPD/hupd',
        name='sample',
        data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
        icpr_label=None,
        train_filing_start_date='2016-01-01',
        train_filing_end_date='2016-01-21',
        val_filing_start_date='2016-01-22',
        val_filing_end_date='2016-01-31',
    )
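    # load_dataset returns a DatasetDict whose 'train' and 'validation' splits
    # correspond to the filing-date windows passed above.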
print("Separating between training and validation data") | |
df_train = pd.DataFrame(dataset_dict['train'] ) | |
df_val = pd.DataFrame(dataset_dict['validation'] ) | |
print("=== PRE-PROCESSING THE DATASET ===") | |
#We are interested in the following columns: | |
# - Abstract | |
# - Claims | |
# - Decision <- our `y` | |
# Let's preprocess them both out of our training and validation data | |
# Also, consider that the "Decision" column has three types of values: "Accepted", "Rejected", and "Pending". To remove unecessary baggage, we will be only looking for "Accepted" and "Rejected". | |
necessary_columns = ["abstract","claims","decision"] | |
output_values = ['ACCEPTED','REJECTED'] | |
print("Dropping unused columns") | |
trainFeaturesToDrop = [col for col in list(df_train.columns) if col not in necessary_columns] | |
trainDF = df_train.dropna() | |
trainDF.drop(columns=trainFeaturesToDrop, inplace=True) | |
trainDF = trainDF[trainDF['decision'].isin(output_values)] | |
valFeaturesToDrop = [col for col in list(df_val.columns) if col not in necessary_columns] | |
valDF = df_val.dropna() | |
valDF.drop(columns=valFeaturesToDrop, inplace=True) | |
valDF = valDF[valDF['decision'].isin(output_values)] | |
    # Replace the string values in the `decision` column with numerical labels:
    # "ACCEPTED" becomes 1 and "REJECTED" becomes 0.
    print("Replacing values in `decision` column")
    yKey = {"ACCEPTED": 1, "REJECTED": 0}
    trainDF2 = trainDF.replace({"decision": yKey})
    valDF2 = valDF.replace({"decision": yKey})
    # Combine the `abstract` and `claims` columns into a single `text` column,
    # and rename the `decision` column to `label`.
    print("Combining columns and renaming `decision` to `label`")
    trainDF3 = trainDF2.rename(columns={'decision': 'label'})
    trainDF3['text'] = trainDF3['abstract'] + ' ' + trainDF3['claims']
    trainDF3 = trainDF3.drop(columns=["abstract", "claims"])

    valDF3 = valDF2.rename(columns={'decision': 'label'})
    valDF3['text'] = valDF3['abstract'] + ' ' + valDF3['claims']
    valDF3 = valDF3.drop(columns=["abstract", "claims"])
    # Pull each column out as a plain list, giving us training labels,
    # training texts, validation labels, and validation texts.
    print("Extracting label and text data from dataframes")
    trainData = {
        "labels": trainDF3["label"].tolist(),
        "text": trainDF3["text"].tolist()
    }
    valData = {
        "labels": valDF3["label"].tolist(),
        "text": valDF3["text"].tolist()
    }
    print(f'TRAINING:\t# labels: {len(trainData["labels"])}\t# texts: {len(trainData["text"])}')
    print(f'VALID:\t# labels: {len(valData["labels"])}\t# texts: {len(valData["text"])}')
    # Cache the processed splits to disk so later runs can skip the download
    # and preprocessing steps.
    if not os.path.exists("./data"):
        os.makedirs("./data")
    with open("./data/train.json", "w") as outfile:
        json.dump(trainData, outfile, indent=2)
    with open("./data/val.json", "w") as outfile:
        json.dump(valData, outfile, indent=2)
    return trainData, valData
def main():
    trainDataPath = "./data/train.json"
    valDataPath = "./data/val.json"

    # Reuse the cached splits from a previous run if they exist; otherwise
    # build them from scratch with LoadDataset().
    if os.path.exists(trainDataPath) and os.path.exists(valDataPath):
        with open(trainDataPath) as ftrain:
            trainData = json.load(ftrain)
        with open(valDataPath) as fval:
            valData = json.load(fval)
    else:
        trainData, valData = LoadDataset()

    print(len(trainData["labels"]), len(trainData["text"]), len(valData["labels"]), len(valData["text"]))
    tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
    train_encodings = tokenizer(trainData["text"], truncation=True, padding=True)
    val_encodings = tokenizer(valData["text"], truncation=True, padding=True)

    train_dataset = USPTODataset(train_encodings, trainData["labels"])
    val_dataset = USPTODataset(val_encodings, valData["labels"])
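    # Optional sanity check (a minimal sketch, not part of the training flow):
    # peek at one batch to confirm the dataset yields the tensors the Trainer
    # expects. Uncomment to use.
    # from torch.utils.data import DataLoader
    # batch = next(iter(DataLoader(train_dataset, batch_size=2)))
    # print({k: v.shape for k, v in batch.items()})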
    train_args = TrainingArguments(
        output_dir="./results",          # where checkpoints are written
        num_train_epochs=2,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,                # linear learning-rate warmup steps
        learning_rate=5e-5,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10
    )
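    # A minimal sketch (an assumption, not in the original script): passing a
    # compute_metrics function to the Trainer below would report accuracy at
    # evaluation time instead of only eval_loss.
    # def compute_metrics(eval_pred):
    #     logits, labels = eval_pred
    #     return {"accuracy": (logits.argmax(axis=-1) == labels).mean()}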
    model = DistilBertForSequenceClassification.from_pretrained(model_name)
    trainer = Trainer(
        model=model,
        args=train_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )
    trainer.train()
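    # Optionally evaluate on the validation split after training; with no
    # compute_metrics supplied, trainer.evaluate() reports eval_loss only.
    # metrics = trainer.evaluate()
    # print(metrics)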
if __name__ == "__main__":
    main()