# title: ENA dataset utility functions
# author: Taewook Kang, Kyubyung Kang
# date: 2024.3.27
# license: MIT
# reference: https://pyautocad.readthedocs.io/en/latest/_modules/pyautocad/api.html
# version
#   0.1. 2024.3.27. create file
# 
import json
import os
import re
import numpy as np
from transformers import BertTokenizer

def load_train_chunk_data(data_dir, sort_fname=False):
	geom_list = []
	fnames = os.listdir(data_dir)
	if sort_fname:  # sort by the first number embedded in each file name
		fnames.sort(key=lambda x: int(re.search(r'\d+', x).group()))
	xsec_count = 0
	for file_name in fnames:
		if not file_name.endswith('.json'):
			continue
		with open(os.path.join(data_dir, file_name), 'r') as f:
			chunk = json.load(f)
			for xsec in chunk:  # each chunk file holds a list of cross sections
				xsec_count += 1
				for g in xsec['geom']:
					g['station'] = xsec['station']
					if len(g['earthwork_feature']) == 0:  # skip geometry without earthwork features
						continue
					geom_list.append(g)
	print(f'Loaded {xsec_count} cross sections')
	return geom_list
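
# Usage sketch (the path and JSON schema below are assumptions inferred from the
# loader, not a documented format): each chunk file is expected to be a JSON list
# of cross sections, where a cross section carries a 'station' value and a 'geom'
# list whose items hold an 'earthwork_feature' list.
#   geom_list = load_train_chunk_data('./dataset/train', sort_fname=True)
#   print(geom_list[0]['station'], geom_list[0]['earthwork_feature'])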

def update_feature_dims_token(geom_list):
	tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)  # load the BERT tokenizer

	feature_dims = []
	max_token = 0
	tokenizer.add_tokens(['padding'])  # add_tokens() returns the count of added tokens,
	padding_token_id = tokenizer.convert_tokens_to_ids('padding')  # so look up the id separately
	for geom in geom_list:
		geom['feature_dims'] = []
		for feature in geom['earthwork_feature']:
			# map each feature string to a single token id ([UNK] if not in the vocabulary)
			token_id = tokenizer.convert_tokens_to_ids(feature)
			geom['feature_dims'].append(token_id)

			word, count = extract_word_and_count(feature)
			if word in feature_dims:  # collect each distinct feature word once
				continue
			feature_dims.append(word)

		max_token = max(max_token, len(geom['feature_dims']))

	# pad every record's token id list to the longest one in the dataset
	for geom in geom_list:
		geom['feature_dims'] += [padding_token_id] * (max_token - len(geom['feature_dims']))

	print(f'Max token length: {max_token}')
	return feature_dims
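
# Usage sketch: after update_feature_dims_token(), every record's 'feature_dims'
# is a list of BERT token ids padded with the custom 'padding' token id to the
# longest list in the dataset. Loading 'bert-base-uncased' downloads the
# pretrained vocabulary on first use, so network access is assumed.
#   vocab_words = update_feature_dims_token(geom_list)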

def extract_word_and_count(s):
	match = re.match(r'(\w+)(?:\((\d+)\))?', s)
	if match:
		word, count = match.groups()
		count = int(count) if count else 1
		return word, count
	
	return None, None
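
# Examples: the regex parses feature strings of the form 'word(count)', and a
# bare 'word' defaults to a count of 1.
#   extract_word_and_count('cut(3)')   # -> ('cut', 3)
#   extract_word_and_count('slope')    # -> ('slope', 1)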

def update_feature_dims_freq(geom_list, augment=False):
	# first pass: collect the distinct feature words as the vocabulary (feature dimensions)
	feature_dims = []
	for geom in geom_list:
		for feature in geom['earthwork_feature']:
			word, count = extract_word_and_count(feature)
			if word is None or count is None:
				continue
			if word in feature_dims:
				continue
			feature_dims.append(word)

	feature_dims.sort()

	# second pass: build a count vector per geometry, aligned to the sorted vocabulary
	max_feature_dims_count = [0.0] * len(feature_dims)
	for geom in geom_list:
		geom['feature_dims'] = [0.0] * len(feature_dims)
		geom['feature_text'] = ''

		for feature in geom['earthwork_feature']:
			word, count = extract_word_and_count(feature)
			if word is None or count is None:
				continue
			geom['feature_text'] += f'{word}({count}) '
			index = feature_dims.index(word)

			geom['feature_dims'][index] = count
			max_feature_dims_count[index] = max(max_feature_dims_count[index], count)

	# normalize feature_dims by using max_feature_dims_count
	for geom in geom_list:
		for i in range(len(geom['feature_dims'])):
			geom['feature_dims'][i] /= max_feature_dims_count[i]

	# augment the feature_dims dataset with squared terms
	if augment:
		for geom in geom_list:
			geom['feature_dims_aug'] = []
			for value in geom['feature_dims']:
				geom['feature_dims_aug'].append(value)
				geom['feature_dims_aug'].append(value * value)

	print(f'feature dims({len(feature_dims)}): {feature_dims}')
	return feature_dims
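
# Usage sketch: after update_feature_dims_freq(), each record holds a dense
# frequency vector aligned to the returned (sorted) vocabulary, scaled to [0, 1]
# by the per-dimension maximum count.
#   dims = update_feature_dims_freq(geom_list, augment=True)
#   x = np.array([g['feature_dims'] for g in geom_list])  # shape (len(geom_list), len(dims))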

def update_onehot_encoding(geom_list):
	# collect the distinct label kinds in order of first appearance
	label_kinds = []
	for geom in geom_list:
		label = geom['label']
		if label not in label_kinds:
			label_kinds.append(label)

	# encode each geometry's label as a one-hot vector over label_kinds
	for geom in geom_list:
		onehot = np.zeros(len(label_kinds))
		onehot[label_kinds.index(geom['label'])] = 1.0
		geom['label_onehot'] = onehot
	return label_kinds
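
# Minimal end-to-end sketch. The './dataset/train' path is illustrative; the chunk
# files there must follow the JSON schema expected by load_train_chunk_data().
if __name__ == '__main__':
	geoms = load_train_chunk_data('./dataset/train', sort_fname=True)
	dims = update_feature_dims_freq(geoms, augment=True)
	labels = update_onehot_encoding(geoms)
	print(f'{len(dims)} feature dims, {len(labels)} label kinds')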