csgy-6613-project-rk2546 / src /patent_train.py
Ryan Kim
attempting to build script for training
e2ba2f3
raw
history blame
1.56 kB
from datasets import load_dataset
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Import data for specifically January 2016.
#
# `name` has to be one of the configurations defined by the HUPD loading
# script; an unknown name makes `load_dataset` raise
# "BuilderConfig ... not found". Per the HUPD dataset card, 'sample' is the
# January 2016 subset, which is what this script wants.
dataset_dict = load_dataset(
    'HUPD/hupd',
    name='sample',
    # NOTE(review): this is a `blob/` (web page) URL rather than a `resolve/`
    # (raw file) URL -- confirm whether the loader actually reads it.
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    # NOTE(review): passed as None, so presumably no label filtering; the
    # loader's parameter may be spelled `ipcr_label` -- confirm.
    icpr_label=None,
    # Train on applications filed 1-21 January 2016 ...
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    # ... and validate on those filed 22-31 January 2016.
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
)

# Show what was loaded.
print(dataset_dict)
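"""
Reference notes only (this string is never executed): presumably the fields
carried by each record of the dataset loaded above -- confirm against the
dataset card.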
data_fields = {
"application_number": "...",
"publication_number": "...",
"title": "...",
"decision": "...",
"date_produced": "...",
"date_published": "...",
"main_cpc_label": "...",
"cpc_labels": ["...", "...", "..."],
"main_ipcr_label": "...",
"ipcr_labels": ["...", "...", "..."],
"patent_number": "...",
"filing_date": "...",
"patent_issue_date": "...",
"abandon_date": "...",
"uspc_class": "...",
"uspc_subclass": "...",
"examiner_id": "...",
"examiner_name_last": "...",
"examiner_name_first": "...",
"examiner_name_middle": "...",
"inventor_list": [
{
"inventor_name_last": "...",
"inventor_name_first": "...",
"inventor_city": "...",
"inventor_state": "...",
"inventor_country": "..."
}
],
"abstract": "...",
"claims": "...",
"background": "...",
"summary": "...",
"full_description": "..."
}
"""