File size: 1,506 Bytes
b599481
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import json
from collections import defaultdict

# Triple store: head entity id -> list of (relation id, tail entity id) pairs.
kg = defaultdict(list)
# Self-growing vocabularies: looking up an unseen key stores the mapping's
# current size as its id, so ids are handed out as 0, 1, 2, ... in the order
# keys are first looked up.
entity2id = defaultdict(lambda: len(entity2id))
relation2id = defaultdict(lambda: len(relation2id))

# id2info.json is read as a mapping whose values are dicts; each dict has a
# 'name' entry (the head entity) and every other key is treated as a relation.
with open('id2info.json', encoding='utf-8') as f:
    id2info = json.load(f)

for record in id2info.values():
    head_name = record['name']
    for relation, raw in record.items():
        if relation == 'name':
            continue
        # A list value yields one triple per element; any other value yields
        # exactly one triple.
        tails = raw if isinstance(raw, list) else [raw]
        for tail_name in tails:
            # Lookup order matters for id assignment: head entity first, then
            # the relation, then the tail entity.
            head_id = entity2id[head_name]
            relation_id = relation2id[relation]
            tail_id = entity2id[tail_name]
            kg[head_id].append((relation_id, tail_id))

# Report: number of head entities with edges, total entities, total relations.
print(len(kg), len(entity2id), len(relation2id))

# Persist the graph and both vocabularies. The json module writes the integer
# keys of kg as strings and the tuples as arrays.
for out_path, payload in (
    ('kg.json', kg),
    ('entity2id.json', entity2id),
    ('relation2id.json', relation2id),
):
    with open(out_path, 'w', encoding='utf-8') as out:
        json.dump(payload, out, ensure_ascii=False)

# Collect the ids of every item mentioned in the dialogs that is already a
# known entity. data.jsonl is parsed one JSON document per line; each document
# has a 'dialog' sequence whose turns carry an 'item' collection.
item_ids = set()
with open('data.jsonl', encoding='utf-8') as f:
    for raw_line in f:
        dialog = json.loads(raw_line)['dialog']
        for turn in dialog:
            # The membership test does not trigger the default factory, so
            # unknown items are skipped rather than being assigned new ids.
            item_ids.update(
                entity2id[name] for name in turn['item'] if name in entity2id
            )
print(len(item_ids))

# Written out as a sorted list.
item_ids = sorted(item_ids)
with open('item_ids.json', 'w', encoding='utf-8') as out:
    json.dump(item_ids, out, ensure_ascii=False)