trandackhoa
/

NER-services-location

Model card Files Files and versions Community

NER-services-location / train.py

trandackhoa's picture

Upload 20 files

627e25a verified 15 days ago

history blame contribute delete

1.3 kB

	import json
	from datasets import Dataset


	def _find_entity_start(lowered_words, entity_words, labels):
	    """Return the word index at which *entity_words* should be labelled.

	    The first matching window whose labels are all still ``'O'`` wins, so a
	    repeated entity text is assigned to the next free mention instead of
	    re-labelling the first one. If every matching window is already (partly)
	    labelled, the first match is returned. ``None`` means the entity has no
	    words or does not occur in the text.
	    """
	    span = len(entity_words)
	    if span == 0:
	        # An empty word list equals every empty slice; treat it as "not found"
	        # rather than as a match at index 0.
	        return None

	    fallback = None
	    for i in range(len(lowered_words) - span + 1):
	        if lowered_words[i:i + span] != entity_words:
	            continue
	        if all(label == 'O' for label in labels[i:i + span]):
	            return i
	        if fallback is None:
	            fallback = i
	    return fallback


	def prepare_data(file_path):
	    """Load annotated queries from JSON and build word-level BIO labels.

	    The JSON document must be an object with a ``queries`` list. Each query
	    has a ``text`` string and an ``entities`` list whose items unpack to
	    ``(start, end, entity_type, entity_text)``. The text is tokenised with
	    ``str.split()`` and each entity is located by a case-insensitive match
	    of its whitespace-split words against consecutive tokens; the first
	    token is tagged ``B-<type>``, the rest ``I-<type>``, and all other tokens
	    stay ``'O'``. The ``start``/``end`` values (presumably character
	    offsets -- TODO confirm) are not used for matching.

	    Entities that cannot be located are reported on stdout and skipped.

	    Args:
	        file_path: Path to the UTF-8 encoded JSON annotation file.

	    Returns:
	        The result of ``Dataset.from_list`` over one
	        ``{'words': [...], 'labels': [...]}`` record per query.
	    """
	    with open(file_path, 'r', encoding='utf-8') as f:
	        data = json.load(f)

	    processed_data = []
	    for query in data['queries']:
	        words = query['text'].split()
	        # Lower-case once per query instead of once per candidate window.
	        lowered_words = [w.lower() for w in words]
	        labels = ['O'] * len(words)  # 'O' for Outside (not an entity)

	        for _start, _end, entity_type, entity_text in query['entities']:
	            entity_words = entity_text.lower().split()
	            first = _find_entity_start(lowered_words, entity_words, labels)
	            if first is None:
	                print(f"Warning: Entity '{entity_text}' not found in text '{query['text']}'")
	                continue

	            labels[first] = f'B-{entity_type}'
	            for k in range(first + 1, first + len(entity_words)):
	                labels[k] = f'I-{entity_type}'

	        processed_data.append({'words': words, 'labels': labels})

	    return Dataset.from_list(processed_data)


	# Build the train and eval splits, in that order, when the module runs.
	# NOTE: both inputs are hard-coded absolute paths under /home/ebk; they must
	# exist on the machine executing this script.
	train_dataset, eval_dataset = map(
	    prepare_data,
	    (
	        '/home/ebk/PycharmProjects/pythonProject/tripgo-hotel/train_dataset.json',
	        '/home/ebk/PycharmProjects/pythonProject/tripgo-hotel/eval_dataset.json',
	    ),
	)