mac999 committed
Commit af359c9 · verified · 1 Parent(s): 50e147d

Upload 7 files

Files changed (7)
  1. config.json +120 -0
  2. create_earthwork_dataset.py +232 -0
  3. ena_dataset.py +137 -0
  4. ena_run_model.py +548 -0
  5. eval_model.py +75 -0
  6. extract_ewlog.py +244 -0
  7. prepare_dataset.py +460 -0
config.json ADDED
@@ -0,0 +1,120 @@
{
    "layers": [
        { "layer": "Nru_Geo_Crs_Center", "label": "center" },
        { "layer": "Nru_Crs_Pave_Surface", "label": "pave_surface" },
        { "layer": "Nru_Crs_Pave_Subgrade", "label": "pave_subgrade" },
        { "layer": "Nru_Geo_Surface", "label": "ground" },
        { "layer": "Nru_Crs_Pave_Bottom", "label": "pave_bottom" },
        { "layer": "Nru_Geo_Underground_1", "label": "rr" },
        { "layer": "Nru_Geo_Underground_2", "label": "br" },
        { "layer": "Nru_Crs_Slope", "label": "slope" },
        { "layer": "Nru_Stru_Bench", "label": "struct" },
        { "layer": "Nru_Stru_Frt_Sodan", "label": "struct" },
        { "layer": "Nru_Stru_Smr", "label": "struct" },
        { "layer": "Nru_Stru_Smr_Ending", "label": "struct" },
        { "layer": "Nru_Stru_Ditch", "label": "struct" },
        { "layer": "Nru_Stru_Ditch_Bench", "label": "struct" },
        { "layer": "Nru_Stru_Frt", "label": "struct" },
        { "layer": "Nru_Crs_Ew_깎기_토사", "label": "cut_ea" },
        { "layer": "Nru_Crs_Ew_깎기_리핑암", "label": "cut_rr" },
        { "layer": "Nru_Crs_Ew_일반발파", "label": "cut_br" },
        { "layer": "Nru_Crs_Ew_대규모발파", "label": "cut_br" },
        { "layer": "Nru_Crs_Ew_중규모진동제어발파", "label": "cut_br" },
        { "layer": "Nru_Crs_Ew_터파기_토사", "label": "cut_ditch" },
        { "layer": "Nru_Crs_Ew_쌓기_노상", "label": "fill_subbed" },
        { "layer": "Nru_Crs_Ew_쌓기_노체", "label": "fill_subbody" },
        { "layer": "Nru_Crs_Pave_Layer-1", "label": "pave_layer1" },
        { "layer": "Nru_Crs_Pave_Layer-2", "label": "pave_layer2" },
        { "layer": "Nru_Crs_Pave_Layer-3", "label": "pave_layer3" },
        { "layer": "Nru_Crs_Pave_Layer-4", "label": "pave_layer4" },
        { "layer": "Nru_Crs_Steps", "label": "steps" },
        { "layer": "Nru_Crs_Curb", "label": "curb" }
    ]
}
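Note: config.json is the CAD-layer-to-training-label map consumed by get_layer_to_label() in create_earthwork_dataset.py below. A minimal sketch of the lookup, assuming config.json is in the working directory:

import json

with open('config.json', encoding='utf-8') as f:
    cfg = json.load(f)

# resolve a CAD layer name to its label; unmapped layers yield ''
label = next((item['label'] for item in cfg['layers'] if item['layer'] == 'Nru_Crs_Slope'), '')
print(label)  # slope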
create_earthwork_dataset.py ADDED
@@ -0,0 +1,232 @@
# title: create earthwork train dataset
# author: Taewook Kang
# date: 2024.3.27
# description: create earthwork train dataset
# license: MIT
# reference: https://pyautocad.readthedocs.io/en/latest/_modules/pyautocad/api.html
# version
#   0.1. 2024.3.27. create file
#
import os, math, argparse, json, re, traceback, numpy as np, pandas as pd, trimesh, laspy, shutil
import pyautocad, open3d as o3d, seaborn as sns, win32com.client, pythoncom
import matplotlib.pyplot as plt
from scipy.spatial import distance
from tqdm import trange, tqdm
from math import pi

def get_layer_to_label(cfg, layer):
    layers = cfg['layers']
    for lay in layers:
        if lay['layer'] == layer:
            return lay['label']
    return ''

def get_entity_from_acad(entity_names=['AcDbLine', 'AcDbPolyline', 'AcDbText']):
    acad = pyautocad.Autocad(create_if_not_exists=True)
    selections = acad.get_selection('Select entities to extract geometry')

    geoms = []
    for entity in tqdm(selections):
        try:
            if entity.EntityName in entity_names:
                geoms.append(entity)
        except Exception as e:
            print(f'error: {e}')
            continue

    if not geoms:
        print("No entities found in the drawing.")
        return

    return geoms

def get_bbox(polyline):
    xmin, ymin, xmax, ymax = polyline[0][0], polyline[0][1], polyline[0][0], polyline[0][1]
    for x, y in polyline:
        xmin = min(xmin, x)
        ymin = min(ymin, y)
        xmax = max(xmax, x)
        ymax = max(ymax, y)
    return (xmin, ymin, xmax, ymax)

def get_xsections_from_acad(cfg):
    entities = get_entity_from_acad()

    # extract cross section frames
    xsec_list = []
    xsec_entities = []
    for entity in entities:
        if entity.Layer == 'Nru_Frame_Crs_Design' and entity.EntityName == 'AcDbPolyline':
            polyline = []
            vertex_list = entity.Coordinates
            for i in range(0, len(vertex_list), 2):
                polyline.append((vertex_list[i], vertex_list[i+1]))
            if len(polyline) < 2:
                continue
            bbox = get_bbox(polyline)

            xsec = {'bbox': bbox, 'station': '', 'geom': []}
            xsec_list.append(xsec)
        else:
            xsec_entities.append(entity)
    if len(xsec_entities) == 0:
        print("No cross section found in the drawing.")
        return []

    # assign a station string to each cross section from the text inside its frame
    for xsec in xsec_list:
        for entity in xsec_entities:
            if entity.EntityName != 'AcDbText':
                continue
            pt = (entity.InsertionPoint[0], entity.InsertionPoint[1])
            bbox = xsec['bbox']
            if pt[0] < bbox[0] or pt[1] < bbox[1] or pt[0] > bbox[2] or pt[1] > bbox[3]:
                continue
            xsec_station = entity.TextString
            pattern = r'\d+\+\d+\.\d+'
            match = re.search(pattern, xsec_station)
            if match:
                xsec_station = match.group()
            else:
                xsec_station = '-1+000.00'
            xsec['station'] = xsec_station

    if len(xsec_list) == 0:
        xsec = {'bbox': (-9999999999.0, -9999999999.0, 9999999999.0, 9999999999.0), 'station': '0+000.00'}
        xsec_list.append(xsec)

    xsec_list = sorted(xsec_list, key=lambda x: x['station'])  # sort by station string, format 'xxx+xxx.xx'

    # extract geometry in each cross section
    for xsec in tqdm(xsec_list):
        for entity in xsec_entities:
            label = get_layer_to_label(cfg, entity.Layer)
            if label == '':
                continue

            closed = False
            polyline = []
            if entity.EntityName == 'AcDbLine':
                # take only the XY components of the 3D start/end points
                polyline = [(entity.StartPoint[0], entity.StartPoint[1]), (entity.EndPoint[0], entity.EndPoint[1])]
                closed = False
            elif entity.EntityName == 'AcDbPolyline':
                vertex_list = entity.Coordinates
                for i in range(0, len(vertex_list), 2):
                    polyline.append((vertex_list[i], vertex_list[i+1]))
                closed = entity.Closed
            else:
                continue

            xsec_bbox = xsec['bbox']
            entity_bbox = get_bbox(polyline)
            if entity_bbox[0] < xsec_bbox[0] or entity_bbox[1] < xsec_bbox[1] or entity_bbox[2] > xsec_bbox[2] or entity_bbox[3] > xsec_bbox[3]:
                continue

            geo = {
                'label': label,
                'polyline': polyline,
                'closed': closed,
                'earthwork_feature': []
            }
            xsec['geom'].append(geo)

    return xsec_list

# simple matplotlib viewer for the extracted cross sections
_draw_xsection_index = 0
_xsections = None
_plot_ax = None

def draw_xsections(ax, index):
    xsec = _xsections[index]
    for geo in xsec['geom']:
        station = xsec['station']
        ax.set_title(f'station: {station}')
        polyline = np.array(geo['polyline'])
        ax.plot(polyline[:, 0], polyline[:, 1], label=geo['label'])
    ax.set_aspect('equal', 'box')

def next_button(event):
    global _draw_xsection_index, _xsections, _plot_ax
    _draw_xsection_index += 1
    if _draw_xsection_index >= len(_xsections):
        _draw_xsection_index = 0
    _plot_ax.clear()
    draw_xsections(_plot_ax, _draw_xsection_index)
    _plot_ax.figure.canvas.draw_idle()  # refresh the canvas after redrawing

def prev_button(event):
    global _draw_xsection_index, _xsections, _plot_ax
    _draw_xsection_index -= 1
    if _draw_xsection_index < 0:
        _draw_xsection_index = len(_xsections) - 1
    _plot_ax.clear()
    draw_xsections(_plot_ax, _draw_xsection_index)
    _plot_ax.figure.canvas.draw_idle()  # refresh the canvas after redrawing

def on_key_press(event):
    if event.key == 'right':
        next_button(None)
    elif event.key == 'left':
        prev_button(None)

def show_xsections(xsections):
    from matplotlib.widgets import Button
    global _draw_xsection_index, _xsections, _plot_ax
    _xsections = xsections

    fig = plt.figure()
    _plot_ax = fig.subplots()
    plt.subplots_adjust(left=0.3, bottom=0.25)
    draw_xsections(_plot_ax, _draw_xsection_index)

    # define prev/next buttons and wire up their callbacks
    axprev = fig.add_axes([0.7, 0.05, 0.1, 0.075])
    bprev = Button(axprev, 'prev', color="white")
    bprev.on_clicked(prev_button)
    axnext = fig.add_axes([0.81, 0.05, 0.1, 0.075])
    bnext = Button(axnext, 'next', color="white")
    bnext.on_clicked(next_button)

    fig.canvas.mpl_connect('key_press_event', on_key_press)

    plt.show()

def main():
    parser = argparse.ArgumentParser(description='create earthwork train dataset')
    parser.add_argument('--config', type=str, default='config.json', help='config file')
    parser.add_argument('--output', type=str, default='output/', help='output directory')
    parser.add_argument('--view', type=str, default='output/chain_chunk_6.json', help='view file')

    args = parser.parse_args()
    try:
        if len(args.view) > 0:
            with open(args.view, 'r') as f:
                xsections = json.load(f)
            show_xsections(xsections)
            return

        cfg = None
        with open(args.config, 'r', encoding='utf-8') as f:
            cfg = json.load(f)

        # continue numbering from any chunk files already in the output directory
        chunk_index = 0
        file_names = os.listdir(args.output)
        if len(file_names):
            pattern = r'chain_chunk_(\d+)\.json'
            indices = [int(re.match(pattern, file_name).group(1)) for file_name in file_names if re.match(pattern, file_name)]
            chunk_index = max(indices) + 1 if indices else 0

        print(file_names)

        while True:
            xsections = get_xsections_from_acad(cfg)
            if len(xsections) == 0:
                break
            geo_file = os.path.join(args.output, f'chain_chunk_{chunk_index}.json')
            with open(geo_file, 'w') as f:
                json.dump(xsections, f, indent=4)
            print(f'{geo_file} was saved in {args.output}')
            chunk_index += 1
    except Exception as e:
        print(f'error: {e}')
        traceback.print_exc()

if __name__ == '__main__':
    main()
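Note: the script attaches to a running AutoCAD session via pyautocad and prompts for an entity selection, so it must run on Windows with a drawing open. A typical invocation under those assumptions:

python create_earthwork_dataset.py --config config.json --output output/ --view ""

Pass an empty --view to extract new chain_chunk_N.json files; pass an existing chunk file (the default is output/chain_chunk_6.json) to browse its cross sections in the matplotlib viewer instead.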
ena_dataset.py ADDED
@@ -0,0 +1,137 @@
# title: ENA dataset utility functions
# author: Taewook Kang, Kyubyung Kang
# date: 2024.3.27
# license: MIT
# reference: https://pyautocad.readthedocs.io/en/latest/_modules/pyautocad/api.html
# version
#   0.1. 2024.3.27. create file
#
import json, os, re, logging, numpy as np
from transformers import BertTokenizer

def load_train_chunk_data(data_dir, sort_fname=False):
    geom_list = []
    fnames = os.listdir(data_dir)
    if sort_fname:
        fnames.sort(key=lambda x: int(re.search(r'\d+', x).group()))
    xsec_count = 0
    for file_name in fnames:
        if not file_name.endswith('.json'):
            continue
        with open(os.path.join(data_dir, file_name), 'r') as f:
            chunk = json.load(f)
            for xsec in chunk:
                xsec_count += 1
                geom = xsec['geom']
                for g in geom:
                    g['station'] = xsec['station']
                    features = g['earthwork_feature']
                    if len(features) == 0:
                        continue
                    geom_list.append(g)
    print(f'Loaded {xsec_count} cross sections')
    return geom_list

def update_feature_dims_token(geom_list):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)  # load the BERT tokenizer

    feature_dims = []
    max_token = 0
    tokenizer.add_tokens(['padding'])  # add_tokens() returns the number of tokens added, not the id
    padding_token_id = tokenizer.convert_tokens_to_ids('padding')
    for geom in geom_list:
        label = geom['label']
        geom['feature_dims'] = []
        for feature in geom['earthwork_feature']:
            # token = tokenizer.tokenize(feature)
            token_ids = tokenizer.convert_tokens_to_ids(feature)
            geom['feature_dims'].append(token_ids)

            word, count = extract_word_and_count(feature)
            if word in feature_dims:
                continue
            feature_dims.append(word)

        max_token = max(max_token, len(geom['feature_dims']))

    # pad every token sequence to the same length
    for geom in geom_list:
        label = geom['label']
        geom['feature_dims'] += [padding_token_id] * (max_token - len(geom['feature_dims']))

    print(f'Max token length: {max_token}')
    return feature_dims

def extract_word_and_count(s):
    # parse 'word(count)' strings; count defaults to 1 when omitted
    match = re.match(r'(\w+)(?:\((\d+)\))?', s)
    if match:
        word, count = match.groups()
        count = int(count) if count else 1
        return word, count

    return None, None

def update_feature_dims_freq(geom_list, augment=False):
    feature_dims = []
    for geom in geom_list:
        label = geom['label']
        geom['feature_dims'] = []
        for feature in geom['earthwork_feature']:
            word, count = extract_word_and_count(feature)
            if word is None or count is None:
                continue
            if word in feature_dims:
                continue
            feature_dims.append(word)

    feature_dims.sort()

    max_feature_dims_count = [0.0] * len(feature_dims)
    for geom in geom_list:
        label = geom['label']
        geom['feature_dims'] = [0.0] * len(feature_dims)
        geom['feature_text'] = ''

        for feature in geom['earthwork_feature']:
            word, count = extract_word_and_count(feature)
            if word is None or count is None:
                continue
            geom['feature_text'] += f'{word}({count}) '
            index = feature_dims.index(word)

            geom['feature_dims'][index] = count
            max_feature_dims_count[index] = max(max_feature_dims_count[index], count)

    # normalize feature_dims using max_feature_dims_count
    for geom in geom_list:
        label = geom['label']
        for i in range(len(geom['feature_dims'])):
            geom['feature_dims'][i] /= max_feature_dims_count[i]

    # augment the feature_dims dataset with squared terms
    if augment:
        for geom in geom_list:
            label = geom['label']
            geom['feature_dims_aug'] = []
            for i in range(len(geom['feature_dims'])):
                geom['feature_dims_aug'].append(geom['feature_dims'][i])
                geom['feature_dims_aug'].append(geom['feature_dims'][i] * geom['feature_dims'][i])

    print(f'feature dims({len(feature_dims)}): {feature_dims}')
    return feature_dims

def update_onehot_encoding(geom_list):
    label_kinds = []
    for geom in geom_list:
        label = geom['label']
        if label not in label_kinds:
            label_kinds.append(label)

    # one-hot encode each geometry's label against the collected label kinds
    for geom in geom_list:
        label = geom['label']

        onehot = np.zeros(len(label_kinds))
        onehot[label_kinds.index(label)] = 1.0
        geom['label_onehot'] = onehot
    return label_kinds
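Note: earthwork features are stored as 'word(count)' strings; extract_word_and_count() parses them with the regex above, defaulting the count to 1. A quick sanity check:

>>> extract_word_and_count('slope(3)')
('slope', 3)
>>> extract_word_and_count('ground')
('ground', 1)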
ena_run_model.py ADDED
@@ -0,0 +1,548 @@
# title: ENA model runner
# author: Taewook Kang, Kyubyung Kang
# date: 2024.3.27
# description: ENA model test and evaluation
# license: MIT
# version
#   0.1. 2024.3.27. create file
#
import json, os, re, math, random, string, logging
import torch, torch.nn as nn, torch.optim as optim, numpy as np, matplotlib.pyplot as plt, seaborn as sns
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, BertModel
from sklearn.metrics import confusion_matrix
from collections import defaultdict
from datetime import datetime
from tqdm import tqdm
from ena_dataset import load_train_chunk_data, update_feature_dims_freq, update_onehot_encoding

# write log file using logger
logging.basicConfig(filename='./ewnet_logs.txt', level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s', datefmt='%Y%m%d %H:%M')
logger = logging.getLogger('ewnet')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'device: {device}')

# hyperparameters of the model currently under test
hyperparam = None

# MLP classifier
class EarthworkNetMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout_ratio=0.2):
        super(EarthworkNetMLP, self).__init__()

        models = []
        models.append(nn.Linear(input_dim, hidden_dim[0]))
        models.append(nn.ReLU())
        models.append(nn.BatchNorm1d(hidden_dim[0]))  # batch normalization after activation
        models.append(nn.Dropout(dropout_ratio))

        for i in range(1, len(hidden_dim)):
            models.append(nn.Linear(hidden_dim[i-1], hidden_dim[i]))
            models.append(nn.ReLU())
            models.append(nn.BatchNorm1d(hidden_dim[i]))
            models.append(nn.Dropout(dropout_ratio))

        models.append(nn.Linear(hidden_dim[-1], output_dim))
        self.layers = nn.Sequential(*models)

    def forward(self, x):
        x = self.layers(x)
        return x

# LSTM classifier
class EarthworkNetLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2, dropout_ratio=0.2):
        super(EarthworkNetLSTM, self).__init__()

        # sequence series data, e.g. token pattern (slope angle): top(0.5), bottom(0.5), top(0.6), bottom(0.6)...
        # time series features = (token_type, curve_angle); label = (label_onehot)
        models = []

        models.append(nn.LSTM(input_dim, hidden_dim[0], num_layers, batch_first=True, dropout=dropout_ratio))
        for i in range(1, len(hidden_dim)):
            models.append(nn.Linear(hidden_dim[i-1], hidden_dim[i]))

        models.append(nn.Linear(hidden_dim[-1], output_dim))
        self.layers = nn.Sequential(*models)

    def forward(self, x):
        # the LSTM layer returns (output, state); unpack it, pass plain tensors elsewhere
        for layer in self.layers:
            if type(layer) == torch.nn.modules.rnn.LSTM:
                x, _ = layer(x)
            else:
                x = layer(x)

        return x

# dataset mapping earthwork_feature -> label
class EarthworkDataset(Dataset):
    def __init__(self, raw_data):
        self.raw_dataset = raw_data

    def __len__(self):
        return len(self.raw_dataset)

    def __getitem__(self, idx):
        features = self.raw_dataset[idx]['feature_dims']  # already tokenized from 'feature_text'
        label = self.raw_dataset[idx]['label_onehot']
        features = torch.tensor(features, dtype=torch.float32).to(device)
        label = torch.tensor(label, dtype=torch.float32).to(device)
        return features, label

def decode_data_to_geom(input_dataset, predictions, labels, input_feature_dims, label_kinds):
    global hyperparam
    match_count = 0
    for i in range(len(input_dataset)):  # batch size
        input_geom_features = input_dataset[i].cpu().numpy()
        prediction_index = predictions[i].item()
        label_index = labels[i].cpu().numpy()

        geom_features = []
        for j in range(len(input_feature_dims)):
            if input_geom_features[j] == 0.0:
                continue
            geom_features.append(f'{input_feature_dims[j]}({input_geom_features[j]:.2f})')

        prediction_label = label_kinds[prediction_index]
        label = label_kinds[label_index]

        match = prediction_label == label
        if match:
            match_count += 1
        logger.debug(f'{hyperparam["model"]} {hyperparam["hidden_dim"]} Equal : {prediction_label == label}, Label: {label}, Predicted: {prediction_label}, Geom: {geom_features}')

    return match_count

def test_mlp_model(model, batch_size, test_raw_dataset, input_feature_dims, label_kinds):
    print(f'test data count: {len(test_raw_dataset)}')
    test_dataset = EarthworkDataset(test_raw_dataset)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

    # test model
    accuracies = []
    rmse = 0.0
    correct = 0
    total = 0
    total_match = 0
    with torch.no_grad():
        for i, (data, labels) in enumerate(test_dataloader):
            outputs = model(data)
            _, predicted = torch.max(outputs.data, 1)
            _, labels = torch.max(labels.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            accuracies.append(correct / total)

            match_count = decode_data_to_geom(data, predicted, labels, input_feature_dims, label_kinds)
            total_match += match_count

    average_accuracy = correct / total
    print(f'Match count: {total_match}, Total count: {total}')
    print(f'Accuracy of the network on the test data: {average_accuracy:.4f}')
    return accuracies, average_accuracy

def run_MLP_LSTM(model_file_list, base_model):
    global hyperparam

    # prepare train dataset
    data_dir = './dataset'
    geom_list = load_train_chunk_data(data_dir)
    input_feature_dims = update_feature_dims_freq(geom_list)  # or update_feature_dims_token(geom_list)
    label_kinds = update_onehot_encoding(geom_list)

    train_raw_dataset = geom_list[:int(len(geom_list) * 0.8)]
    test_raw_dataset = geom_list[int(len(geom_list) * 0.8):]
    print(f'total data count: {len(geom_list)}')
    print(f'train data count: {len(train_raw_dataset)}, test data count: {len(test_raw_dataset)}')

    # load each trained model variant and evaluate it
    param_layers = [[128], [128, 64, 32], [256, 128, 64]]
    if base_model == 'MLP':
        param_layers = [[128, 64, 32], [64, 128, 64], [64, 128, 64, 32], [32, 64, 32]]
    for index, param_layer in enumerate(param_layers):
        logger.debug(f'model : {base_model}')

        params = {
            'model': base_model,
            'input_dim': len(input_feature_dims),
            'hidden_dim': param_layer,
            'output_dim': len(label_kinds),
            'batch_size': 32,
            'epochs': 150,
            'lr': 0.001
        }
        hyperparam = params
        # create a model matching the saved weights
        model = EarthworkNetMLP(params['input_dim'], params['hidden_dim'], params['output_dim']).to(device)
        if base_model == 'LSTM':
            model = EarthworkNetLSTM(params['input_dim'], params['hidden_dim'], params['output_dim']).to(device)
        model_file = './' + model_file_list[index]
        model.load_state_dict(torch.load(model_file))
        model.eval()

        accuracies, acc = test_mlp_model(model, params['batch_size'], test_raw_dataset, input_feature_dims, label_kinds)

# generate random training data
def generate_random_text(label_index, length=100):
    base_text = f'This is text for label R{label_index + 1}. '
    random_text_length = max(0, length - len(base_text))  # length of the random text to generate
    random_text = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(random_text_length))
    return base_text + random_text

# dataset for the transformer model
class EarthworkTransformDataset(Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        input_ids_tensor = torch.tensor(self.input_ids[idx]).to(device)
        attention_mask_tensor = torch.tensor(self.attention_mask[idx]).to(device)
        label_tensor = torch.tensor(self.labels[idx]).to(device)
        return input_ids_tensor, attention_mask_tensor, label_tensor

# custom transformer
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, vocab_size=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(vocab_size, d_model)
        position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float()
            * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[:, : x.size(1), :]
        return self.dropout(x)

class EarthworkNetTransformer(nn.Module):
    def __init__(
        self,
        input_feature_size,
        d_model,
        num_labels,
        nhead=8,
        dim_feedforward=2048,
        dim_fc=[64, 32],
        num_layers=6,
        dropout=0.1,
        activation="relu",
        classifier_dropout=0.1,
    ):
        super().__init__()

        self.d_model = d_model
        # self.pos_encoder = PositionalEncoding(d_model=d_model, dropout=dropout, vocab_size=vocab_size)

        self.input_fc = nn.Linear(input_feature_size, d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout
        )

        self.src_mask = None
        self.nhead = nhead
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_layers,
            # TBD. output_attentions=True
        )
        # use ModuleList so the fully connected layers are registered as submodules
        self.fc_layers = nn.ModuleList()
        fc_layers_dims = [d_model] + dim_fc + [num_labels]
        for i in range(1, len(fc_layers_dims)):
            fc = nn.Linear(fc_layers_dims[i-1], fc_layers_dims[i]).to(device)
            self.fc_layers.append(fc)

        self.init_weights()

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        for fc in self.fc_layers:
            fc.bias.data.zero_()
            fc.weight.data.uniform_(-initrange, initrange)

    def forward(self, x, attention_mask):
        # x = self.pos_encoder(x)
        if self.src_mask is None or self.src_mask.size(0) != len(x):
            device = x.device
            mask = self.generate_square_subsequent_mask(len(x)).to(device)
            self.src_mask = mask

        x = x.float()
        x = self.input_fc(x)
        x = self.transformer_encoder(x, mask=self.src_mask)
        for fc in self.fc_layers:
            x = fc(x)

        return x

def run_transform(model_file_list):
    data_dir = './dataset'
    geom_list = load_train_chunk_data(data_dir)
    input_feature_dims = update_feature_dims_freq(geom_list)  # or update_feature_dims_token(geom_list)
    label_kinds = update_onehot_encoding(geom_list)
    num_labels = len(label_kinds)
    max_input_string = max(len(d['feature_text']) for d in geom_list)
    max_input_string = 320  # nhead=8. 320=8*40

    train_raw_dataset = geom_list[:int(len(geom_list) * 0.8)]
    test_raw_dataset = geom_list[int(len(geom_list) * 0.8):]
    print(f'total data count: {len(geom_list)}')
    print(f'train data count: {len(train_raw_dataset)}, test data count: {len(test_raw_dataset)}')

    # tokenize and pad sequences
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    max_length = max_input_string

    batch_sizes = [32, 64, 128]
    for index, batch_size in enumerate(batch_sizes):
        encoding = {'input_ids': [], 'attention_mask': []}
        for d in train_raw_dataset:
            token_text = tokenizer(d['feature_text'], padding='max_length', truncation=True, max_length=max_length)
            if len(token_text['input_ids']) < max_length:  # fill the rest with the padding token
                token_text['input_ids'] += [tokenizer.pad_token_id] * (max_length - len(token_text['input_ids']))
                token_text['attention_mask'] += [0] * (max_length - len(token_text['attention_mask']))
            encoding['input_ids'].append(token_text['input_ids'])
            encoding['attention_mask'].append(token_text['attention_mask'])

        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']

        label2id = {label: i for i, label in enumerate(sorted(set(d['label'] for d in train_raw_dataset)))}
        id2label = {v: k for k, v in label2id.items()}
        labels = [label2id[d['label']] for d in train_raw_dataset]  # convert labels to numerical format

        # hyperparameters
        logger.debug(f'model : transformer')

        params = {
            'model': 'transformer',
            'input_dim': len(input_feature_dims),
            'hidden_dim': [64],
            'output_dim': len(label2id),
            'batch_size': batch_size,
            'epochs': 300,
            'lr': 1e-5
        }

        dim_fc = params['hidden_dim']
        epochs = params['epochs']

        # model
        model = EarthworkNetTransformer(input_feature_size=max_length, d_model=512, num_labels=len(label2id), dim_fc=dim_fc).to(device)
        dataset = EarthworkTransformDataset(input_ids, attention_mask, labels)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        # test the model
        model_file = './' + model_file_list[index]
        model.load_state_dict(torch.load(model_file))
        model.eval()

        for i, test_raw in enumerate(test_raw_dataset):
            label = test_raw['label']
            input_text = test_raw['feature_text']
            encoding = tokenizer(input_text, return_tensors='pt', padding='max_length', truncation=True, max_length=max_length)
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)
            output = model(input_ids, attention_mask)
            predicted_label = id2label[output.argmax().item()]

            feature_dims = input_text.split(' ')
            logger.debug(f'{params["model"]} {params["batch_size"]} Equal : {predicted_label == label}, Label: {label}, Predicted: {predicted_label}, Geom: {feature_dims}')

        print(f'test data count: {len(test_raw_dataset)}')
        encoding = tokenizer([d['feature_text'] for d in test_raw_dataset], padding='max_length', truncation=True, max_length=max_length)
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']

        label2id = {label: i for i, label in enumerate(sorted(set(d['label'] for d in test_raw_dataset)))}
        id2label = {v: k for k, v in label2id.items()}
        labels = [label2id[d['label']] for d in test_raw_dataset]  # convert labels to numerical format

        test_dataset = EarthworkTransformDataset(input_ids, attention_mask, labels)
        test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)

        correct = 0
        total = 0
        accuracies = []
        with torch.no_grad():
            for i, (input_ids, attention_mask, labels) in enumerate(tqdm(test_dataloader, desc="test")):
                outputs = model(input_ids, attention_mask)
                _, predicted = torch.max(outputs, 1)
                total += len(labels)
                correct += (predicted == labels).sum().item()
                accuracies.append(correct / total)

        average_accuracy = correct / total
        print(f'Accuracy of the network on the test data: {average_accuracy:.4f}')

# BERT model
class EarthworkBertDataset(Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        input_ids_tensor = torch.tensor(self.input_ids[idx]).to(device)
        attention_mask_tensor = torch.tensor(self.attention_mask[idx]).to(device)
        label_tensor = torch.tensor(self.labels[idx]).to(device)
        return input_ids_tensor, attention_mask_tensor, label_tensor

# BERT-based classifier
class EarthworkNetTransformerBert(torch.nn.Module):
    def __init__(self, num_labels):
        super(EarthworkNetTransformerBert, self).__init__()
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels, output_attentions=True)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        return outputs['logits'], outputs['attentions']

def run_bert(model_file):
    # prepare train dataset
    data_dir = './dataset'
    geom_list = load_train_chunk_data(data_dir)
    input_feature_dims = update_feature_dims_freq(geom_list)  # or update_feature_dims_token(geom_list)
    label_kinds = update_onehot_encoding(geom_list)
    num_labels = len(label_kinds)
    max_input_string = max(len(d['feature_text']) for d in geom_list)

    train_raw_dataset = geom_list[:int(len(geom_list) * 0.8)]
    test_raw_dataset = geom_list[int(len(geom_list) * 0.8):]
    print(f'total data count: {len(geom_list)}')
    print(f'train data count: {len(train_raw_dataset)}, test data count: {len(test_raw_dataset)}')

    # tokenize and pad sequences
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    max_length = max_input_string

    encoding = tokenizer([d['feature_text'] for d in train_raw_dataset], padding=True, truncation=True, max_length=max_length)
    input_ids = encoding['input_ids']  # TBD. shape is 50?
    attention_mask = encoding['attention_mask']

    label2id = {label: i for i, label in enumerate(sorted(set(d['label'] for d in train_raw_dataset)))}
    id2label = {v: k for k, v in label2id.items()}
    labels = [label2id[d['label']] for d in train_raw_dataset]  # convert labels to numerical format

    # initialize model
    model = EarthworkNetTransformerBert(num_labels=len(label2id)).to(device)

    epochs = 150
    batch_size = 32
    params = {
        'model': 'BERT',
        'input_dim': len(input_feature_dims),
        'hidden_dim': 512,
        'output_dim': len(label2id),
        'batch_size': batch_size,
        'epochs': epochs,
        'lr': 1e-5,
    }

    dataset = EarthworkBertDataset(input_ids, attention_mask, labels)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # test the model
    logger.debug(f'model : bert')

    model_file = './' + model_file
    model.load_state_dict(torch.load(model_file))
    model.eval()

    for i, test_raw in enumerate(test_raw_dataset):
        label = test_raw['label']
        input_text = test_raw['feature_text']
        encoding = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        output, att = model(input_ids, attention_mask)
        predicted_label = id2label[output.argmax().item()]

        feature_dims = input_text.split(' ')
        logger.debug(f'{params["model"]} Equal : {predicted_label == label}, Label: {label}, Predicted: {predicted_label}, Geom: {feature_dims}')

        # plot the attention matrix of the last layer's last head
        attention_matrix = att[-1]
        attention_layer = attention_matrix[-1]
        attention_mat = attention_layer[-1]
        att_mat = attention_mat.detach().cpu().numpy()
        fig, ax = plt.subplots()
        cax = ax.matshow(att_mat, cmap='viridis')
        fig.colorbar(cax)
        plt.savefig(f'./graph/bert_attention_{i}.png')
        plt.close()

    print(f'test data count: {len(test_raw_dataset)}')
    encoding = tokenizer([d['feature_text'] for d in test_raw_dataset], padding=True, truncation=True, max_length=max_length)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    label2id = {label: i for i, label in enumerate(sorted(set(d['label'] for d in test_raw_dataset)))}
    id2label = {v: k for k, v in label2id.items()}
    labels = [label2id[d['label']] for d in test_raw_dataset]  # convert labels to numerical format

    test_dataset = EarthworkBertDataset(input_ids, attention_mask, labels)
    test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)

    correct = 0
    total = 0
    accuracies = []
    with torch.no_grad():
        for i, (input_ids, attention_mask, labels) in enumerate(tqdm(test_dataloader, desc="test")):
            outputs, att = model(input_ids, attention_mask)
            _, predicted = torch.max(outputs, 1)
            total += len(labels)
            correct += (predicted == labels).sum().item()
            accuracies.append(correct / total)
            y_score = torch.nn.functional.softmax(outputs, dim=1)

    average_accuracy = correct / total
    print(f'Accuracy of the network on the test data: {average_accuracy:.4f}')

if __name__ == '__main__':
    models = ['earthwork_model_20240503_1650.pth', 'earthwork_model_20240503_1714.pth', 'earthwork_model_20240503_1716.pth', 'earthwork_model_20240503_1718.pth']
    run_MLP_LSTM(models, 'MLP')

    models = ['earthwork_model_20240503_1730.pth', 'earthwork_model_20240503_1732.pth', 'earthwork_model_20240503_1734.pth']
    run_MLP_LSTM(models, 'LSTM')

    models = ['earthwork_trans_model_20240503_2003.pth', 'earthwork_trans_model_20240503_2014.pth', 'earthwork_trans_model_20240503_2021.pth']
    run_transform(models)

    run_bert('earthwork_trans_model_20240504_0103.pth')
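Note: the MLP variant can be exercised standalone; a minimal sketch with illustrative dimensions (23 features and 10 labels are made up here; the real values come from update_feature_dims_freq() and update_onehot_encoding()):

import torch

model = EarthworkNetMLP(input_dim=23, hidden_dim=[128, 64, 32], output_dim=10)
model.eval()              # BatchNorm1d requires eval mode for a single-sample batch
x = torch.rand(1, 23)     # one normalized feature-frequency vector
logits = model(x)
print(logits.argmax(dim=1))  # predicted label index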
eval_model.py ADDED
@@ -0,0 +1,75 @@
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
from sklearn.metrics import average_precision_score

# sample data: binary single-label case
y_true = [0, 1, 1, 0, 1, 1]
y_pred = [0, 0, 1, 0, 0, 1]

# sample data: multi-label case (overrides the arrays above)
y_true = [[0, 1, 1], [0, 1, 1], [1, 0, 1]]
y_pred = [[0, 0, 1], [0, 0, 1], [1, 0, 0]]

class model_metrics:
    def __init__(self):
        self.clear()

    def clear(self):
        self.accuracy = 0.0
        self.recall = 0.0
        self.precision = 0.0
        self.f1 = 0.0
        self.mAP = 0.0
        self.cm = np.asarray([])

        self.count = 0
        self.total_accuracy = 0.0
        self.total_recall = 0.0
        self.total_precision = 0.0
        self.total_f1 = 0.0
        self.total_mAP = 0.0
        self.total_cm = np.asarray([])

    def get_indicators(self):
        return self.total_accuracy / self.count, self.total_recall / self.count, self.total_precision / self.count, self.total_f1 / self.count, self.total_mAP / self.count, self.total_cm / self.count

    def dump(self):
        print(f"Accuracy: {self.accuracy}")
        print(f"Recall: {self.recall}")
        print(f"Precision: {self.precision}")
        print(f"F1 Score: {self.f1}")
        print(f"mAP: {self.mAP}")
        print(f"Confusion Matrix: \n{self.cm}")

        print(f'average accuracy: {self.total_accuracy / self.count}')
        print(f'average recall: {self.total_recall / self.count}')
        print(f'average precision: {self.total_precision / self.count}')
        print(f'average f1: {self.total_f1 / self.count}')
        print(f'average mAP: {self.total_mAP / self.count}')
        print(f'average confusion matrix: \n{self.total_cm / self.count}')

    def calc_metrics(self, y_true, y_pred, y_score):
        self.accuracy = accuracy_score(y_true, y_pred)
        self.recall = recall_score(y_true, y_pred, average='weighted')
        self.precision = precision_score(y_true, y_pred, average='micro')
        self.cm = confusion_matrix(y_true, y_pred)
        self.count += 1

        # f1 and mAP keep their previous values here; only the other metrics are recomputed
        self.total_accuracy += self.accuracy
        self.total_recall += self.recall
        self.total_precision += self.precision
        self.total_f1 += self.f1
        self.total_mAP += self.mAP
        self.total_cm = self.cm  # TBD

        return self.accuracy, self.recall, self.precision, self.f1, self.mAP, self.cm

    def calc_metrics_multi(self, y_true, y_pred):
        self.accuracy = accuracy_score(y_true, y_pred)
        self.recall = recall_score(y_true, y_pred, average='micro')
        self.precision = precision_score(y_true, y_pred, average='micro')
        self.f1 = f1_score(y_true, y_pred, average='micro')
        self.mAP = average_precision_score(y_true, y_pred, average='micro')
        self.count += 1

        return self.accuracy, self.recall, self.precision, self.f1, self.mAP, self.cm
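Note: a minimal usage sketch with the multi-label sample arrays defined at the top of the file:

m = model_metrics()
m.calc_metrics_multi(y_true, y_pred)  # micro-averaged accuracy/recall/precision/F1/mAP
m.dump()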
extract_ewlog.py ADDED
@@ -0,0 +1,244 @@
import os, re, numpy as np
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt_xsec  # second alias used for cross-section plots (same module object as plt)
from datetime import datetime

input_log_file = './ewnet_logs_TRANS3_20240708.txt'
flag_all_xsections = True
prev_station = ''

now = datetime.now()
now_str = now.strftime('%Y%m%d_%H%M')

label_list = ['pave_layer1', 'pave_layer2', 'pave_layer3', 'pave_layer4', 'cut_ea', 'cut_rr', 'cut_br', 'cut_ditch', 'fill_subbed', 'fill_subbody', 'curb', 'above', 'below', 'pave_int', 'pave_surface', 'pave_subgrade', 'ground', 'pave_bottom', 'rr', 'br', 'slope', 'struct', 'steps']
color_list = [[0.8,0.8,0.8],[0.6,0.6,0.6],[0.4,0.4,0.4],[0.2,0.2,0.2],[0.8,0.4,0.2],[0.8,0.6,0.2],[0.8,0.8,0.2],[0.6,0.8,0.2],[0.3,0.8,0.3],[0.3,0.6,0.3],[0.3,0.4,0.3],[0.0,0.8,0.0],[0.6,0.0,0.0],[0.8,0.0,0.0],[1.0,0.0,0.0],[0.2,0.2,0.6],[0.0,1.0,0.0],[0.2,0.2,1.0],[0.4,0.2,1.0],[0.6,0.2,1.0],[0.2,0.8,0.6],[0.8,0.2,1.0],[1.0,0.2,1.0]]

# make output folder
if not os.path.exists('./graph'):
    os.makedirs('./graph')

def draw_colorbox_list():
    global label_list, color_list

    fig, ax = plt.subplots(figsize=(9.2, 5))
    ax.invert_yaxis()
    ax.set_xlim(0, 1.5)
    fig.set_size_inches(12, 7)

    token_list = ['item1', 'item2', 'item3', 'item4', 'item5', 'item6']
    for i, (colname, color) in enumerate(zip(label_list, color_list)):
        width = 1.0 / len(label_list)
        widths = [width] * len(token_list)
        starts = width * i
        rects = ax.barh(token_list, widths, left=starts, height=0.5, label=colname, color=color)

        text_color = 'white' if np.max(color) < 0.4 else 'black'
    ax.legend()
    plt.savefig('./graph/box_colors.png')
    plt.close()

def output_graph_metrics(index, tag, text):
    global label_list, color_list

    prediction = ''
    area = 0.0
    tokens = []
    polyline = []
    geom_index = text.find('Geom:')
    if geom_index >= 0:
        pred_label = ''
        label_index = text.find('Predicted: ')
        if label_index >= 0:
            pred = text[label_index + 11:geom_index]
            labels = pred.split(', ')
            if len(labels) > 0:
                prediction = labels[0]
                pred_label = labels[0] + '(0.3'  # placeholder ratio so the predicted label gets a bar segment

        polyline_index = text.find('Polyline:')
        if polyline_index > 0:
            pred = text[geom_index + 6:polyline_index - 2]
            polyline_text = text[polyline_index + 10:]
            polyline = eval(polyline_text)
        else:
            pred = text[geom_index + 6:]
        pred = pred.replace('[', '').replace(']', '')
        pred = pred.replace(')', '').replace("'", '')
        tokens = pred.split(',')
        if len(tokens) <= 1:
            tokens = pred.split(' ')
        if len(tokens) > 0:
            tokens.insert(0, pred_label)
            last = tokens[-1]
            if len(last) == 0:
                tokens.pop()
    else:
        return '', 0.0, []

    token_list = [token.split('(')[0] for token in tokens]
    token_list = [token.replace(' ', '') for token in token_list]
    ratios = [float(token.split('(')[1]) for token in tokens]
    results = {token_list[0]: ratios}

    labels = [label.replace(" ", "") for label in list(results.keys())]
    data = np.array(list(results.values()))
    data_cum = data.cumsum(axis=1)
    token_colors = [color_list[label_list.index(label)] for label in token_list]

    global plt_xsec, now_str, flag_all_xsections
    if flag_all_xsections == False:
        fig, ax = plt.subplots(figsize=(9.2, 5))
        ax.invert_yaxis()
        ax.xaxis.set_visible(False)
        ax.set_xlim(0, np.sum(data, axis=1).max())
        fig.set_size_inches(15, 0.5)

        for i, (colname, color) in enumerate(zip(token_list, token_colors)):
            widths = data[:, i]
            starts = data_cum[:, i] - widths
            if i > 0:
                starts += 0.02
            rects = ax.barh(labels, widths, left=starts, height=0.5, label=colname, color=color)

            if i != 0:
                text_color = 'white' if np.max(color) < 0.4 else 'black'
                ax.bar_label(rects, label_type='center', color=text_color)
        ax.legend(ncols=len(token_list), bbox_to_anchor=(0, 1), loc='lower right', fontsize='small')

        tag = tag.replace(' ', '_')
        tag = tag.replace(':', '')

        if text.find('True') > 0:
            plt.savefig(f'./graph/box_list_{now_str}_{tag}_{index}_T.png')
        else:
            plt.savefig(f'./graph/box_list_{now_str}_{tag}_{index}_F.png')
        plt.close()
    else:
        if polyline[0] != polyline[-1]:
            polyline.append(polyline[0])
        x, y = zip(*polyline)
        color = color_list[label_list.index(prediction)]

        plt_xsec.fill(x, y, color=color)
        centroid_x = sum(x) / len(x)
        centroid_y = sum(y) / len(y)
        area = 0.5 * abs(sum(x[i]*y[i+1] - x[i+1]*y[i] for i in range(len(polyline)-1)))  # shoelace formula

        if prediction.find('pave') < 0:
            plt_xsec.text(centroid_x, centroid_y, f'{prediction}={area:.2f}', horizontalalignment='center', verticalalignment='center', fontsize=5, color='black')

    return prediction, area, token_list

output_stations = ['4+440.00000', '3+780.00000', '3+800.00000', '3+880.00000', '3+940.00000']

def output_logs(tag, equal='none'):
    global input_log_file, plt_xsec, now_str, prev_station, flag_all_xsections, output_stations

    text_list = []
    logs = []

    with open(input_log_file, 'r') as file:
        for index, label in enumerate(label_list):
            file.seek(0)
            for line in file:
                if flag_all_xsections == False and line.find(tag) < 0:
                    continue
                tag_model = tag.split(' ')[0]
                if flag_all_xsections == True and line.find(tag_model) < 0:
                    continue
                if flag_all_xsections == False and line.find('Label: ' + label) < 0:
                    continue
                line = line.replace('\n', '')
                if equal == 'none':
                    text_list.append(line)
                elif line.find(equal) > 0:
                    text_list.append(line)
                    if flag_all_xsections == False:
                        break
            if flag_all_xsections:
                break

    if len(text_list) == 0:
        return logs

    def extract_station(text):
        sta_index = text.find('Station:') + 9  # start of station value
        end_index = text.find(',', sta_index)
        return text[sta_index:end_index] if end_index != -1 else text[sta_index:]

    text_list = sorted(text_list, key=extract_station)
    station = ''
    for index, text in enumerate(text_list):
        sta_index = text.find('Station:')
        equal_index = text.find('Equal: ')
        equal_check = 'T' if text.find('True') > 0 else 'F'

        if sta_index > 0 and equal_index > 0:
            station = text[sta_index + 9:equal_index - 2]
            print(station)

        if len(output_stations) and station not in output_stations:
            continue

        # start a new cross-section figure whenever the station changes
        if prev_station != station:
            if len(prev_station) > 0:
                plt_xsec.savefig(f'./graph/polygon_{now_str}_{tag}_{prev_station}_{equal_check}.png', dpi=300)
                plt_xsec.close()

            plt_xsec.figure()
            plt_xsec.gca().set_xlim([-60, 60])
            plt_xsec.gca().axis('equal')
            plt_xsec.gca().text(0, 0, f'{station}', fontsize=12, color='black')

            prev_station = station

        text = text.replace('\n', '')
        label, area, tokens = output_graph_metrics(index, tag, text)
        log = {
            'index': index,
            'station': station,
            'label': label,
            'area': area,
            'tokens': tokens
        }
        logs.append(log)

        if index == len(text_list) - 1:
            plt_xsec.savefig(f'./graph/polygon_{now_str}_{tag}_{prev_station}_{equal_check}.png', dpi=300)
            plt_xsec.close()

    return logs

def main():
    draw_colorbox_list()

    summary_log_file = open('./graph/summary_log.csv', 'a')
    if summary_log_file is None:
        return
    summary_log_file.write(f'model, true area, true count, false area, false count\n')

    tags = ['MLP [128, 64, 32]', 'MLP [64, 128, 64]', 'MLP [64, 128, 64, 32]', 'LSTM [128]', 'LSTM [128, 64, 32]', 'LSTM [256, 128, 64]', 'transformer 32', 'transformer 64', 'transformer 128', 'BERT']
    for tag in tags:
        print(tag)
        if len(output_stations) > 0:
            logs1 = output_logs(tag)
            continue

        logs1 = output_logs(tag, 'Equal: True')
        logs2 = output_logs(tag, 'Equal: False')
        if len(logs1) == 0 or len(logs2) == 0:
            continue
        area1 = area2 = 0
        area1 += sum([log['area'] for log in logs1])
        area2 += sum([log['area'] for log in logs2])
        log_record = f'{tag}, {area1}, {len(logs1)}, {area2}, {len(logs2)}'
        summary_log_file.write(f'{log_record}\n')

        if flag_all_xsections:
            break

    summary_log_file.close()

if __name__ == '__main__':
    main()
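Note: output_logs() scans the log for the markers 'Station:', 'Equal: ', 'Label: ', 'Predicted: ', 'Geom:' and optionally 'Polyline:'. A representative line it can parse (hypothetical values; the real file is ./ewnet_logs_TRANS3_20240708.txt written by ena_run_model.py):

20240708 10:15 DEBUG MLP [128, 64, 32] Station: 3+780.00000, Equal: True, Label: slope, Predicted: slope, Geom: ['above(2)', 'below(1)']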
prepare_dataset.py ADDED
@@ -0,0 +1,460 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # title: create earthwwork train dataset
2
+ # author: Taewook Kang
3
+ # date: 2024.3.27
4
+ # description: create earthwork train dataset
5
+ # license: MIT
6
+ # version
7
+ # 0.1. 2024.3.27. create file
8
+ #
9
+ import os, math, argparse, json, re, traceback, numpy as np, pandas as pd, trimesh, laspy, shutil
10
+ import logging, matplotlib.pyplot as plt, shapely
11
+ from shapely.geometry import Polygon, LineString
12
+ from scipy.spatial import distance
13
+ from tqdm import trange, tqdm
14
+ from math import pi
15
+
16
+ logging.basicConfig(level=logging.DEBUG, filename='logs.txt',
17
+ format='%(asctime)s %(levelname)s %(message)s',
18
+ datefmt='%H:%M:%S')
19
+ logger = logging.getLogger("prep")
20
+
21
+ _precision = 0.00001
22
+
23
+ def get_bbox(polyline):
24
+ polyline_np = np.array(polyline)
25
+ xmin, ymin = np.amin(polyline_np, axis=0)
26
+ xmax, ymax = np.amax(polyline_np, axis=0)
27
+ return (xmin, ymin, xmax, ymax)
28
+
29
+ def get_center_point(pline):
30
+ if len(pline) == 0:
31
+ return (0, 0)
32
+ xs = [p[0] for p in pline]
33
+ ys = [p[1] for p in pline]
34
+ return (sum(xs) / len(pline), sum(ys) / len(pline))
35
+
36
+ def intersect_line(line1, line2):
37
+ (x1, y1), (x2, y2) = line1
38
+ (x3, y3), (x4, y4) = line2
39
+
40
+ denominator = (x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4)
41
+ if denominator == 0:
42
+ return None # lines are parallel
43
+
44
+ x = ((x1 * y2 - y1 * x2) * (x3 - x4) - (x1 - x2) * (x3 * y4 - y3 * x4)) / denominator
45
+ y = ((x1 * y2 - y1 * x2) * (y3 - y4) - (y1 - y2) * (x3 * y4 - y3 * x4)) / denominator
46
+
47
+ # check (x, y) in line1 and line2
48
+ if x < min(x1, x2) or x > max(x1, x2) or x < min(x3, x4) or x > max(x3, x4):
49
+ return None
50
+
51
+ return (x, y)
52
+
53
+ def get_positions_pline(base_pline, target_pline):
54
+ target_pos_marks = []
55
+ for i in range(len(target_pline)):
56
+ target = [target_pline[i], (target_pline[i][0], target_pline[i][1] + 1e+10)] # vertical line to check below
57
+ pos = 0.0
58
+ for j in range(len(base_pline) - 1):
59
+ base = [base_pline[j], base_pline[j + 1]]
60
+ intersect = intersect_line(base, target)
61
+ if intersect == None:
62
+ continue
63
+
64
+ if equal(intersect[1], target[0][1]):
65
+ pos = 0.0
66
+ break
67
+
68
+ pos = -1.0 if intersect[1] > target[0][1] else 1.0
69
+ break
70
+ target_pos_marks.append(pos)
71
+
72
+ return target_pos_marks
73
+
74
+ def get_below_pline(base_pline, target_pline):
75
+ pos_marks = get_positions_pline(base_pline, target_pline)
76
+ average = sum(pos_marks) / len(pos_marks)
77
+ return average < 0.0
78
+
79
+ def get_geometry(xsec, label):
80
+ for geom in xsec['geom']:
81
+ if geom['label'] == label:
82
+ return geom
83
+ return None
84
+
85
+ def is_point_in_rect(point1, point2, perp):
86
+ return is_point_in_rectangle(point1[0], point1[1], point2[0], point2[1], perp[0], perp[1])
87
+
88
+ def is_point_in_rectangle(x1, y1, x2, y2, x, y):
89
+ # Ensure that x1 <= x2 and y1 <= y2
90
+ x1, x2 = min(x1, x2), max(x1, x2)
91
+ y1, y2 = min(y1, y2), max(y1, y2)
92
+
93
+ # Check if (x, y) is within the rectangle
94
+ return x1 <= x <= x2 and y1 <= y <= y2
95
+
96
+ def sign_distance(a, b, c, p):
97
+ d = math.sqrt(a*a + b*b)
98
+ if d == 0.0:
99
+ lp = 0.0
100
+ else:
101
+ lp = (a * p[0] + b * p[1] + c) / d
102
+ return lp
103
+
104
+ def equal(a, b):
105
+ return abs(a - b) < _precision
106
+
107
+ def equal_point(p1, p2):
108
+ return equal(p1[0], p2[0]) and equal(p1[1], p2[1])
109
+
110
+ def get_angle(x1, y1, x2, y2):
+     # angle of the line (x1, y1)-(x2, y2) in radians, normalized to [0, 2*pi);
+     # returns -1.0 for a degenerate (zero-length) line
+     pi = math.pi
+
+     # Calculate the direction vector of the line.
+     dx = x2 - x1
+     dy = y2 - y1
+
+     # Base angle for vertical lines and the general case
+     if dx == 0 and dy == 0:
+         return -1.0
+     if dy < 0 and dx == 0:
+         angle_radius = pi + pi / 2
+     elif dy > 0 and dx == 0:
+         angle_radius = pi / 2
+     else:
+         angle_radius = math.atan(dy / dx)
+
+     # Adjust the angle for different quadrants
+     if dy >= 0 and dx > 0:
+         pass
+     if dy < 0 and dx > 0:
+         angle_radius += 2 * pi
+     elif dx < 0:
+         angle_radius += pi
+
+     return angle_radius
+
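+ # Illustrative examples (not part of the original source):
+ #   get_angle(0, 0, 1, 1)  -> math.pi / 4   (45 degrees)
+ #   get_angle(0, 0, -1, 0) -> math.pi       (180 degrees)
+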
+ def line_coefficients(point1, point2):
+     # coefficients (A, B, C) of the line A*x + B*y + C = 0 through two points
+     x1, y1 = point1
+     x2, y2 = point2
+
+     A = y2 - y1
+     B = x1 - x2
+     C = x2*y1 - x1*y2
+
+     return A, B, C
+
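+ # Illustrative example (not part of the original source): the line y = x becomes x - y = 0:
+ #   line_coefficients((0, 0), (1, 1)) -> (1, -1, 0)
+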
+ def sign_point_on_line(point1, point2, new_point):
+     # -1.0 or 1.0 depending on which side of the line new_point lies; 0.0 if on the line
+     if equal_point(point1, new_point) or equal_point(point2, new_point):
+         return 0.0
+
+     line_A, line_B, line_C = line_coefficients(point1, point2)
+     x, y = new_point
+     value = line_A * x + line_B * y + line_C
+     if math.fabs(value) < _precision:
+         return 0.0
+     elif value > 0.0:
+         return 1.0
+     return -1.0
+
+ def sign_distance_on_line(line_point1, line_point2, point):
+     # signed perpendicular distance from point to the line through the two line points
+     direction = sign_point_on_line(line_point1, line_point2, point)
+     if direction == 0:
+         return 0.0
+
+     if equal_point(line_point1, line_point2):  # degenerate (zero-length) line
+         return 0.0
+
+     x, y = point
+     x1, y1 = line_point1
+     x2, y2 = line_point2
+
+     if equal(x1, x2):  # vertical line: x = x1
+         a = 1
+         b = 0
+         c = -x1
+     else:
+         m = (y2 - y1) / (x2 - x1)
+         a = -m
+         b = 1
+         c = -y1 + (m * x1)
+
+     dist = abs(a * x + b * y + c) / math.sqrt(a * a + b * b)
+     dist *= float(direction)
+
+     return dist
+
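+ # Illustrative example (not part of the original source): a point two units above
+ # a left-to-right horizontal line gets a negative sign:
+ #   sign_distance_on_line((0, 0), (1, 0), (0.5, 2)) -> -2.0
+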
+ def is_point_on_line(point1, point2, perp):
+     # True if perp lies on the segment (point1, point2)
+     if not is_point_in_rect(point1, point2, perp):
+         return False
+     direction = sign_point_on_line(point1, point2, perp)
+     return math.fabs(direction) < _precision
+
+ def is_overlap_line(line, part_seg):
+     # True if part_seg overlaps line along its length; touching only at an
+     # endpoint does not count as overlap
+     p1, p2 = line
+     p3, p4 = part_seg
+
+     f1 = is_point_on_line(p1, p2, p3)
+     f2 = is_point_on_line(p1, p2, p4)
+
+     if (f1 or f2) and f1 != f2:  # a dangling endpoint is not an overlap
+         if f1 and (equal_point(p1, p3) or equal_point(p2, p3)):
+             return False
+         if f2 and (equal_point(p1, p4) or equal_point(p2, p4)):
+             return False
+
+     return f1 or f2
+
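+ # Illustrative examples (not part of the original source):
+ #   is_overlap_line(((0, 0), (2, 0)), ((1, 0), (3, 0))) -> True   (partial overlap)
+ #   is_overlap_line(((0, 0), (2, 0)), ((2, 0), (3, 0))) -> False  (shares only an endpoint)
+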
+ def is_on_pline(polyline, base_line):
+     # True if base_line overlaps any segment of the polyline
+     p1 = base_line[0]
+     p2 = base_line[1]
+
+     for i in range(len(polyline) - 1):
+         p3 = polyline[i]
+         p4 = polyline[i + 1]
+         if is_overlap_line((p1, p2), (p3, p4)) or is_overlap_line((p3, p4), (p1, p2)):
+             return True
+
+     return False
+
+ def get_match_line_labels(xsec, base_geom, base_line):
+     # collect labels of open polylines (excluding the centerline) that overlap base_line
+     labels = []
+     for geom in xsec['geom']:
+         if geom == base_geom:
+             continue
+         geom_label = geom['label']
+         base_label = base_geom['label']
+         if geom_label == base_label:
+             continue
+         closed = geom['closed']
+         if closed:  # only open polylines are considered
+             continue
+         if geom_label == 'center':
+             continue
+
+         polyline = geom['polyline']
+         if is_on_pline(polyline, base_line):
+             labels.append(geom['label'])
+     return labels
+
+ def get_seq_feature_tokens(xsec, geom, closed_type):
+     # for each segment of the geometry, collect the labels of overlapping polylines
+     polyline = geom['polyline']
+     closed = geom['closed']
+     if closed != closed_type:
+         return []
+
+     lines = []
+     for i in range(len(polyline) - 1):
+         line = (polyline[i], polyline[i + 1])
+         lines.append(line)
+
+     geom_tokens = []
+     for line in lines:
+         labels = get_match_line_labels(xsec, geom, line)
+         if len(labels) == 0:
+             continue
+         geom_tokens.extend(labels)
+
+     return geom_tokens
+
+ def translate_geometry(xsec, cp):
+     # translate every geometry so that cp (the road centerline point) becomes the origin
+     for geom in xsec['geom']:
+         polyline = geom['polyline']
+         geom['polyline'] = [(p[0] - cp[0], p[1] - cp[1]) for p in polyline]
+
+     return xsec
+
+ def is_closed(polyline):
+     return equal_point(polyline[0], polyline[-1])
+
+ def summarize_feature(features):
+     # run-length compress consecutive identical labels, recursing into nested lists
+     sum_features = []
+     if len(features) == 0:
+         return sum_features
+
+     index = 0
+     while index < len(features):
+         f = features[index]
+         sum_feature = f
+         if isinstance(f, list):
+             sum_feature = summarize_feature(f)
+             if len(sum_feature) == 1:
+                 sum_feature = sum_feature[0]
+         elif isinstance(f, str):
+             label = f
+             # find the last index of the run of identical labels starting at index
+             last_index = index
+             for i in range(index + 1, len(features)):
+                 if isinstance(features[i], str) and features[i] == label:
+                     last_index = i
+                 else:
+                     break
+             if last_index != index:
+                 sum_feature = f'{f}({last_index - index + 1})'
+                 index = last_index
+
+         sum_features.append(sum_feature)
+         index += 1
+
+     return sum_features
+
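+ # Illustrative example (not part of the original source):
+ #   summarize_feature(['slope', 'slope', 'ground']) -> ['slope(2)', 'ground']
+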
+ def get_intersection_count(xsec, base_geom, target_label):
+     # count how many geometries matching target_label cross the vertical ray
+     # cast upward from the centroid of base_geom
+     pave_top = get_geometry(xsec, 'pave_surface')
+     if pave_top is None:  # no pavement surface in this cross-section
+         return 0
+
+     polyline = base_geom['polyline']
+     polygon = Polygon(polyline)
+     base_p1 = polygon.centroid
+     base_p2 = (base_p1.x, base_p1.y + 1e+10)
+     vertical_line = LineString([base_p1, base_p2])
+
+     count = 0
+     for target_geom in xsec['geom']:
+         if base_geom == target_geom:
+             continue
+         label = target_geom['label']
+         if re.search(target_label, label) is None:
+             continue
+
+         # check intersection with the vertical ray
+         target_line = LineString(target_geom['polyline'])
+         ip = shapely.intersection(target_line, vertical_line)  # https://shapely.readthedocs.io/en/stable/reference/shapely.intersection.html
+         if ip.is_empty:
+             continue
+         count += 1
+
+     return count
+
+ def update_xsection_feature(xsec):
+     # derive earthwork features (above/below ground, pavement intersections,
+     # neighbouring labels) for each closed geometry in the cross-section
+     gnd_geom = get_geometry(xsec, 'ground')
+     if gnd_geom is None:
+         return None
+
+     center = get_geometry(xsec, 'center')
+     if center is None or 'polyline' not in center:
+         return None
+     cp = get_center_point(center['polyline'])
+
+     xsec = translate_geometry(xsec, cp)
+     station = xsec['station']
+
+     index = 0
+     while index < len(xsec['geom']):
+         geom = xsec['geom'][index]
+         label = geom['label']
+         polyline = geom['polyline']
+         closed = geom['closed']
+         if len(polyline) <= 2 or not closed:
+             index += 1
+             continue
+
+         pt1 = polyline[0]
+         pt2 = polyline[-1]
+         if not equal_point(pt1, pt2):  # close the polyline if it is not already closed
+             polyline.append(pt1)
+
+         # noise filtering
+         polygon = Polygon(polyline)  # calculate the area of the polyline as a polygon
+         if math.fabs(polygon.area) < _precision:
+             xsec['geom'].pop(index)  # remove the degenerate geometry
+             continue
+
+         # processing
+         geom.setdefault('earthwork_feature', [])  # assumption: input JSON may omit this key
+         if get_below_pline(gnd_geom['polyline'], polyline):
+             geom['earthwork_feature'].append('below')
+         else:
+             geom['earthwork_feature'].append('above')
+
+         if re.search('pave_.*', label):
+             pave_int_count = get_intersection_count(xsec, geom, 'pave_.*')
+             geom['earthwork_feature'].append(f'pave_int({pave_int_count})')
+
+         tokens = get_seq_feature_tokens(xsec, geom, True)
+         geom['earthwork_feature'].extend(tokens)
+
+         geom['earthwork_feature'] = summarize_feature(geom['earthwork_feature'])
+
+         logger.debug(f'{station}. {label} feature: {geom["earthwork_feature"]}')
+         index += 1
+
+     return xsec
+
+ def update_xsections_feature(xsections):
+     # update the closed flag of each polyline
+     for xsec in xsections:
+         for geom in xsec['geom']:
+             label = geom['label']
+             polyline = geom['polyline']
+             if len(polyline) < 2:
+                 continue
+             closed = is_closed(polyline)
+             if not closed:  # exception: pavement layers are treated as closed
+                 closed = re.search('pave_layer.*', label) is not None
+             geom['closed'] = closed
+
+     # update features
+     out_xsections = []
+     for xsec in xsections:
+         out_xsec = update_xsection_feature(xsec)
+         if out_xsec is None:
+             continue
+         out_xsections.append(out_xsec)
+
+     return out_xsections
+
+ def main():
+     parser = argparse.ArgumentParser(description='create earthwork train dataset')
+     parser.add_argument('--input', type=str, default='output/', help='input folder')
+     parser.add_argument('--output', type=str, default='dataset/', help='output folder')
+
+     args = parser.parse_args()
+     try:
+         os.makedirs(args.output, exist_ok=True)  # make sure the output folder exists
+
+         file_names = os.listdir(args.input)
+         for file_name in tqdm(file_names):
+             if not file_name.endswith('.json'):
+                 continue
+             print(f'processing {file_name}')
+             data = None
+             with open(os.path.join(args.input, file_name), 'r') as f:
+                 data = json.load(f)
+
+             out_xsections = update_xsections_feature(data)
+
+             output_file = os.path.join(args.output, file_name)
+             with open(output_file, 'w') as f:
+                 json.dump(out_xsections, f, indent=4)
+
+     except Exception as e:
+         print(f'error: {e}')
+         traceback.print_exc()
+
+ if __name__ == '__main__':
+     main()
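+
+ # Example usage (based on the argparse defaults above):
+ #   python create_earthwork_dataset.py --input output/ --output dataset/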