# title: ENA dataset utility functions
# author: Taewook Kang, Kyubyung Kang
# date: 2024.3.27
# license: MIT
# reference: https://pyautocad.readthedocs.io/en/latest/_modules/pyautocad/api.html
# version
# 0.1. 2024.3.27. create file
#
import json
import os
import re
import numpy as np
from transformers import BertTokenizer
def load_train_chunk_data(data_dir, sort_fname=False):
    geom_list = []
    fnames = os.listdir(data_dir)
    if sort_fname:  # sort by the first number embedded in each file name
        fnames.sort(key=lambda x: int(re.search(r'\d+', x).group()))

    xsec_count = 0
    for file_name in fnames:
        if not file_name.endswith('.json'):
            continue
        with open(os.path.join(data_dir, file_name), 'r') as f:
            chunk = json.load(f)

        for xsec in chunk:
            xsec_count += 1
            geom = xsec['geom']
            for g in geom:
                g['station'] = xsec['station']
                features = g['earthwork_feature']
                if len(features) == 0:  # skip geometry without earthwork features
                    continue
                geom_list.append(g)

    print(f'Loaded {xsec_count} cross sections')
    return geom_list
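
# Usage sketch (hypothetical path; assumes each chunk JSON file is a list of
# cross sections shaped like {'station': ..., 'geom': [{'label': ...,
# 'earthwork_feature': ['word(count)', ...]}, ...]}):
#   geom_list = load_train_chunk_data('./dataset/train_chunks', sort_fname=True)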
def update_feature_dims_token(geom_list):
    # Load the BERT tokenizer and register a dedicated padding token.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    tokenizer.add_tokens(['padding'])
    padding_token_id = tokenizer.convert_tokens_to_ids('padding')  # add_tokens() returns a count, not an id

    feature_dims = []
    max_token = 0
    for geom in geom_list:
        geom['feature_dims'] = []
        for feature in geom['earthwork_feature']:
            # Each feature string is looked up as a single token; unknown strings map to [UNK].
            token_id = tokenizer.convert_tokens_to_ids(feature)
            geom['feature_dims'].append(token_id)

            word, _ = extract_word_and_count(feature)
            if word is None or word in feature_dims:
                continue
            feature_dims.append(word)
        max_token = max(max_token, len(geom['feature_dims']))

    for geom in geom_list:  # pad every token sequence to the same length
        geom['feature_dims'] += [padding_token_id] * (max_token - len(geom['feature_dims']))

    print(f'Max token length: {max_token}')
    return feature_dims
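
# Usage sketch (runs after load_train_chunk_data; the return value holds the
# distinct feature words, while each geom['feature_dims'] becomes a fixed-length,
# padded list of BERT token ids):
#   words = update_feature_dims_token(geom_list)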
def extract_word_and_count(s):
    # Parse strings such as 'word(3)' into ('word', 3); a bare word defaults to count 1.
    match = re.match(r'(\w+)(?:\((\d+)\))?', s)
    if match:
        word, count = match.groups()
        count = int(count) if count else 1
        return word, count
    return None, None
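
# Examples (the count in parentheses is optional and defaults to 1):
#   extract_word_and_count('slope(3)')  # -> ('slope', 3)
#   extract_word_and_count('slope')     # -> ('slope', 1)
#   extract_word_and_count('(3)')       # -> (None, None)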
def update_feature_dims_freq(geom_list, augment=False):
    # Collect the sorted vocabulary of feature words across all geometry.
    feature_dims = []
    for geom in geom_list:
        for feature in geom['earthwork_feature']:
            word, count = extract_word_and_count(feature)
            if word is None or count is None:
                continue
            if word in feature_dims:
                continue
            feature_dims.append(word)
    feature_dims.sort()

    # Build a frequency vector per geometry, tracking the per-dimension maximum count.
    max_feature_dims_count = [0.0] * len(feature_dims)
    for geom in geom_list:
        geom['feature_dims'] = [0.0] * len(feature_dims)
        geom['feature_text'] = ''
        for feature in geom['earthwork_feature']:
            word, count = extract_word_and_count(feature)
            if word is None or count is None:
                continue
            geom['feature_text'] += f'{word}({count}) '
            index = feature_dims.index(word)
            geom['feature_dims'][index] = count
            max_feature_dims_count[index] = max(max_feature_dims_count[index], count)

    # Normalize feature_dims by using max_feature_dims_count.
    for geom in geom_list:
        for i in range(len(geom['feature_dims'])):
            geom['feature_dims'][i] /= max_feature_dims_count[i]

    # Augment the feature_dims dataset with squared terms.
    if augment:
        for geom in geom_list:
            geom['feature_dims_aug'] = []
            for i in range(len(geom['feature_dims'])):
                geom['feature_dims_aug'].append(geom['feature_dims'][i])
                geom['feature_dims_aug'].append(geom['feature_dims'][i] ** 2)

    print(f'feature dims({len(feature_dims)}): {feature_dims}')
    return feature_dims
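
# Usage sketch (frequency encoding as an alternative to the token encoding above;
# each geom['feature_dims'] becomes a max-normalized count vector in [0, 1]):
#   dims = update_feature_dims_freq(geom_list, augment=True)
#   # with augment=True, geom['feature_dims_aug'] interleaves each value with its square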
def update_onehot_encoding(geom_list):
    # Collect the distinct labels, preserving first-seen order.
    label_kinds = []
    for geom in geom_list:
        label = geom['label']
        if label not in label_kinds:
            label_kinds.append(label)

    # Encode each label as a one-hot vector over label_kinds.
    for geom in geom_list:
        onehot = np.zeros(len(label_kinds))
        onehot[label_kinds.index(geom['label'])] = 1.0
        geom['label_onehot'] = onehot

    return label_kinds
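
# Minimal end-to-end sketch, assuming a './dataset' directory of chunk JSON files
# in the format described above (path and settings are placeholders):
if __name__ == '__main__':
    geoms = load_train_chunk_data('./dataset', sort_fname=True)
    update_feature_dims_freq(geoms, augment=True)
    labels = update_onehot_encoding(geoms)
    print(f'label kinds({len(labels)}): {labels}')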