Ryan Kim committed on
Commit
e2ba2f3
1 Parent(s): 49709ee

attempting to build script for training

Browse files
Files changed (1) hide show
  1. src/patent_train.py +57 -0
src/patent_train.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ from transformers import pipeline
3
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
+
5
+ # Load data specifically for January 2016 (train: filings from Jan 1-21, validation: filings from Jan 22-31)
6
+ dataset_dict = load_dataset(
7
+ 'HUPD/hupd',
8
+ name='Jan2016 Sample',
9
+ data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
10
+ icpr_label=None,
11
+ train_filing_start_date='2016-01-01',
12
+ train_filing_end_date='2016-01-21',
13
+ val_filing_start_date='2016-01-22',
14
+ val_filing_end_date='2016-01-31',
15
+ )
16
+
17
+ print(dataset_dict)
18
+
19
+ """
20
+ data_fields = {
21
+ "application_number": "...",
22
+ "publication_number": "...",
23
+ "title": "...",
24
+ "decision": "...",
25
+ "date_produced": "...",
26
+ "date_published": "...",
27
+ "main_cpc_label": "...",
28
+ "cpc_labels": ["...", "...", "..."],
29
+ "main_ipcr_label": "...",
30
+ "ipcr_labels": ["...", "...", "..."],
31
+ "patent_number": "...",
32
+ "filing_date": "...",
33
+ "patent_issue_date": "...",
34
+ "abandon_date": "...",
35
+ "uspc_class": "...",
36
+ "uspc_subclass": "...",
37
+ "examiner_id": "...",
38
+ "examiner_name_last": "...",
39
+ "examiner_name_first": "...",
40
+ "examiner_name_middle": "...",
41
+ "inventor_list": [
42
+ {
43
+ "inventor_name_last": "...",
44
+ "inventor_name_first": "...",
45
+ "inventor_city": "...",
46
+ "inventor_state": "...",
47
+ "inventor_country": "..."
48
+ }
49
+ ],
50
+ "abstract": "...",
51
+ "claims": "...",
52
+ "background": "...",
53
+ "summary": "...",
54
+ "full_description": "..."
55
+ }
56
+ """
57
+