File size: 1,297 Bytes
627e25a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import json
from datasets import Dataset


def _find_entity_start(lowered_words, entity_words, labels):
    """Return the start index of the word window equal to *entity_words*.

    A window whose labels are all still 'O' is preferred, so that an entity
    text occurring several times in one query is mapped to successive
    occurrences rather than always to the first one. If every matching
    window is already labelled, the first match is returned (the caller
    then overwrites it). Returns None when *entity_words* is empty or no
    window matches.
    """
    span = len(entity_words)
    if span == 0:
        # An empty entity would trivially "match" at index 0 without
        # labelling anything; report it as not found instead.
        return None

    first_match = None
    for i in range(len(lowered_words) - span + 1):
        if lowered_words[i:i + span] != entity_words:
            continue
        if all(label == 'O' for label in labels[i:i + span]):
            return i
        if first_match is None:
            first_match = i
    return first_match


def prepare_data(file_path):
    """Load annotated queries from a JSON file and build a word/label dataset.

    The file must contain an object with a 'queries' list. Each query has a
    'text' string and an 'entities' list whose items unpack into four values:
    (start, end, entity_type, entity_text). Only the type and text are used;
    the first two values (presumably offsets -- TODO confirm) are ignored.

    The text is split on whitespace, and each entity is located by a
    case-insensitive comparison of whole whitespace-separated words. The
    first word of a located entity is labelled 'B-<entity_type>', the
    remaining words 'I-<entity_type>'; all other words keep the label 'O'.
    Entities that cannot be located are reported with a printed warning and
    otherwise skipped.

    Args:
        file_path: Path of the UTF-8 encoded JSON file to read.

    Returns:
        The result of ``Dataset.from_list`` over one
        ``{'words': [...], 'labels': [...]}`` record per query.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    processed_data = []
    for query in data['queries']:
        words = query['text'].split()
        # Lower-case once per query instead of once per candidate window.
        lowered_words = [w.lower() for w in words]
        labels = ['O'] * len(words)  # 'O' for Outside (not an entity)

        for _start, _end, entity_type, entity_text in query['entities']:
            entity_words = entity_text.lower().split()
            match_at = _find_entity_start(lowered_words, entity_words, labels)
            if match_at is None:
                print(f"Warning: Entity '{entity_text}' not found in text '{query['text']}'")
                continue

            labels[match_at] = f'B-{entity_type}'
            for k in range(match_at + 1, match_at + len(entity_words)):
                labels[k] = f'I-{entity_type}'

        processed_data.append({'words': words, 'labels': labels})

    return Dataset.from_list(processed_data)


# Build the training split first, then the evaluation split, each from its
# own JSON annotation file.
train_dataset, eval_dataset = (
    prepare_data('/home/ebk/PycharmProjects/pythonProject/tripgo-hotel/train_dataset.json'),
    prepare_data('/home/ebk/PycharmProjects/pythonProject/tripgo-hotel/eval_dataset.json'),
)