# ========================================================================
# Copyright 2018 Emory University
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========================================================================
import glob
import json
import os
import random
import re

import numpy as np
from collections import Counter, OrderedDict, defaultdict
from copy import deepcopy

__author__ = 'Jinho D. Choi'

# JSON field names shared by all tasks
SEASON_ID = 'season_id'
EPISODES = 'episodes'
EPISODE_ID = 'episode_id'
EPISODE = 'episode'
SCENES = 'scenes'
SCENE_ID = 'scene_id'
UTTERANCES = 'utterances'
UTTERANCE_ID = 'utterance_id'
SPEAKERS = 'speakers'
TRANSCRIPT = 'transcript'
TRANSCRIPT_WITH_NOTE = 'transcript_with_note'
TOKENS = 'tokens'
TOKENS_WITH_NOTE = 'tokens_with_note'

# character identification
CHARACTER_ENTITIES = 'character_entities'

# emotion detection
EMOTION = 'emotion'

# movie
CAPTION = 'caption'

# reading comprehension
RC_ENTITIES = 'rc_entities'
PLOTS = 'plots'
P_ENT = 'p_ent'
U_ENT = 'u_ent'
S_ENT = 's_ent'
QUERY = 'query'
ANSWER = 'answer'


# =================================== Ordered JSON ===================================

class NoIndent(object):
    """Wrapper marking a value that should be serialized on a single line."""

    def __init__(self, value):
        self.value = value


class NoIndentEncoder(json.JSONEncoder):
    """JSON encoder that pretty-prints everything except NoIndent values.

    NoIndent values are first serialized as '@@@<key>@@@' placeholder strings,
    then substituted back with their compact (non-indented) serializations.
    """
    REGEX = re.compile(r'@@@(\d+)@@@')

    def __init__(self, *args, **kwargs):
        super(NoIndentEncoder, self).__init__(*args, **kwargs)
        self.kwargs = dict(kwargs)
        del self.kwargs['indent']
        self._replacements = {}

    def default(self, o):
        if isinstance(o, NoIndent):
            key = len(self._replacements)
            self._replacements[key] = json.dumps(o.value, **self.kwargs)
            return '@@@%d@@@' % key
        return super(NoIndentEncoder, self).default(o)

    def encode(self, o):
        result = super(NoIndentEncoder, self).encode(o)
        out = []
        m = self.REGEX.search(result)

        while m:
            key = int(m.group(1))
            # the -1/+1 offsets strip the quotes surrounding the placeholder string
            out.append(result[:m.start(0) - 1])
            out.append(self._replacements[key])
            result = result[m.end(0) + 1:]
            m = self.REGEX.search(result)

        # keep the tail after the last placeholder; dropping it truncated the
        # closing brackets and forced callers to append them by hand
        out.append(result)
        return ''.join(out)


def pair(key, d, noindent=False):
    s = d[key]
    if isinstance(s, str):
        s = ' '.join(s.split())
    return (key, NoIndent(s)) if noindent else (key, s)


def ordered_json(input, plot=True, wo_note=True, wi_note=True, caption=True, character_entities=True, emotion=True, rc_entities=True):
    s = json.load(open(input)) if isinstance(input, str) else input
    season = OrderedDict([pair(SEASON_ID, s), pair(EPISODES, s)])
    if len(s) != len(season):
        print('Error: 0')
    episodes = season[EPISODES]

    for i, e in enumerate(episodes):
        episode = OrderedDict([pair(EPISODE_ID, e), pair(SCENES, e)])
        if len(e) != len(episode):
            print('Error: 1')
        episodes[i] = episode
        scenes = episode[SCENES]

        for j, c in enumerate(scenes):
            scene = [pair(SCENE_ID, c), pair(UTTERANCES, c)]
            if plot and PLOTS in c:
                scene.append(pair(PLOTS, c))
            if rc_entities and RC_ENTITIES in c:
                scene.append((RC_ENTITIES, c[RC_ENTITIES]))
                for d in c[RC_ENTITIES].values():
                    for k, v in d.items():
                        d[k] = NoIndent(v)
            scene = OrderedDict(scene)
            if len(c) != len(scene):
                print('Error 2: ' + scene[SCENE_ID])
            scenes[j] = scene
            utterances = scene[UTTERANCES]

            for k, u in enumerate(utterances):
                utterance = [pair(UTTERANCE_ID, u), pair(SPEAKERS, u, True)]
                if wo_note:
                    utterance.append(pair(TRANSCRIPT, u))
                    utterance.append((TOKENS, [NoIndent(t) for t in u[TOKENS]]))
                if wi_note:
                    utterance.append(pair(TRANSCRIPT_WITH_NOTE, u))
                    twn = u[TOKENS_WITH_NOTE]
                    utterance.append((TOKENS_WITH_NOTE, [NoIndent(t) for t in twn] if twn else twn))
                if character_entities and CHARACTER_ENTITIES in u:
                    utterance.append((CHARACTER_ENTITIES, [NoIndent(t) for t in u[CHARACTER_ENTITIES]]))
                if emotion and EMOTION in u:
                    utterance.append((EMOTION, NoIndent(u[EMOTION])))
                if caption and CAPTION in u:
                    utterance.append((CAPTION, NoIndent(u[CAPTION])))
                utterance = OrderedDict(utterance)
                if len(u) != len(utterance):
                    print('Error: 3')
                utterances[k] = utterance

    # NoIndentEncoder now preserves the tail of the encoded string, so the
    # closing brackets no longer need to be appended manually here
    return json.dumps(season, cls=NoIndentEncoder, indent=2)
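
# A minimal usage sketch (not part of the original pipeline; the values are
# invented for illustration): NoIndent keeps selected values on one line
# while the surrounding document is pretty-printed with indent=2.
def _demo_no_indent():
    doc = OrderedDict([
        ('utterance_id', 'u001'),                    # indented normally
        ('speakers', NoIndent(['Monica Geller'])),   # kept on one line
        ('tokens', [NoIndent(['There', "'s", 'nothing', 'to', 'tell', '!'])]),
    ])
    print(json.dumps(doc, cls=NoIndentEncoder, indent=2))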
# =================================== General ===================================

def general_stats(json_dir):
    def stats(json_file):
        num_scenes = 0
        num_utterances = 0
        num_utterances_wn = 0
        num_sentences = 0
        num_sentences_wn = 0
        num_tokens = 0
        num_tokens_wn = 0
        speaker_list = set()

        season = json.load(open(json_file))
        episodes = season[EPISODES]

        for episode in episodes:
            scenes = episode[SCENES]
            num_scenes += len(scenes)

            for scene in scenes:
                utterances = scene[UTTERANCES]
                num_utterances_wn += len(utterances)

                for utterance in utterances:
                    speaker_list.update(utterance[SPEAKERS])
                    tokens = utterance[TOKENS]

                    if tokens:
                        num_utterances += 1
                        num_sentences += len(tokens)
                        num_tokens += sum(len(t) for t in tokens)

                    tokens_wn = utterance[TOKENS_WITH_NOTE] or tokens
                    num_sentences_wn += len(tokens_wn)
                    num_tokens_wn += sum(len(t) for t in tokens_wn)

        return [season[SEASON_ID], len(episodes), num_scenes, num_utterances, num_sentences, num_tokens, speaker_list, num_utterances_wn, num_sentences_wn, num_tokens_wn]

    g_speaker_list = set()
    print('\t'.join(['Season ID', 'Episodes', 'Scenes', 'Utterances', 'Sentences', 'Tokens', 'Speakers', 'Utterances (WN)', 'Sentences (WN)', 'Tokens (WN)']))

    for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
        l = stats(json_file)
        g_speaker_list.update(l[6])
        l[6] = len(l[6])
        print('\t'.join(map(str, l)))

    print('All speakers: %s' % len(g_speaker_list))


def compare_peer(input_dir1, input_dir2):
    for input_file1 in sorted(glob.glob(os.path.join(input_dir1, '*.json'))):
        input_file2 = os.path.join(input_dir2, os.path.basename(input_file1))
        print(os.path.basename(input_file1))

        season1 = json.load(open(input_file1))
        season2 = json.load(open(input_file2))
        season_id = season1[SEASON_ID]

        episodes1 = season1[EPISODES]
        episodes2 = season2[EPISODES]
        if len(episodes1) != len(episodes2):
            print('Episode mismatch: %s - %d, %d' % (season_id, len(episodes1), len(episodes2)))

        for episode1, episode2 in zip(episodes1, episodes2):
            episode_id = episode1[EPISODE_ID]
            scenes1 = episode1[SCENES]
            scenes2 = episode2[SCENES]
            if len(scenes1) != len(scenes2):
                print('Scene mismatch: %s - %d, %d' % (episode_id, len(scenes1), len(scenes2)))

            for scene1, scene2 in zip(scenes1, scenes2):
                scene_id = scene1[SCENE_ID]
                utterances1 = scene1[UTTERANCES]
                utterances2 = scene2[UTTERANCES]
                if len(utterances1) != len(utterances2):
                    print('Utterance mismatch: %s - %d, %d' % (scene_id, len(utterances1), len(utterances2)))

                for utterance1, utterance2 in zip(utterances1, utterances2):
                    utterance_id = utterance1[UTTERANCE_ID]
                    tokens1 = utterance1[TOKENS]
                    tokens2 = utterance2[TOKENS]
                    if len(tokens1) != len(tokens2):
                        print('Token mismatch: %s - %d, %d' % (utterance_id, len(tokens1), len(tokens2)))
                    # compare only up to the shorter list so a length mismatch cannot raise IndexError
                    m = [i for i in range(min(len(tokens1), len(tokens2))) if tokens1[i] != tokens2[i]]
                    if m:
                        print('Token mismatch: %s - %s' % (utterance_id, str(m)))

                    tokens1 = utterance1[TOKENS_WITH_NOTE]
                    tokens2 = utterance2[TOKENS_WITH_NOTE]
                    if tokens1 is None or tokens2 is None:
                        # report when only one side is missing its note tokens
                        if (tokens1 is None) != (tokens2 is None):
                            print('Token WN mismatch: %s - one side missing' % utterance_id)
                        continue
                    if len(tokens1) != len(tokens2):
                        print('Token WN mismatch: %s - %d, %d' % (utterance_id, len(tokens1), len(tokens2)))
                    m = [i for i in range(min(len(tokens1), len(tokens2))) if tokens1[i] != tokens2[i]]
                    if m:
                        print('Token WN mismatch: %s - %s' % (utterance_id, str(m)))
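
# A hypothetical convenience sketch (an assumption, not used by the pipeline):
# the season -> episode -> scene -> utterance traversal recurs in every stats
# function in this module; a generator like this captures the pattern once.
def _iter_utterances(season):
    for episode in season[EPISODES]:
        for scene in episode[SCENES]:
            for utterance in scene[UTTERANCES]:
                yield episode[EPISODE_ID], scene[SCENE_ID], utterance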
# =================================== Character Identification ===================================

def extract_character_identification(input_dir, output_dir):
    """
    trn: episodes 1-19
    dev: episodes 20-21
    tst: episodes 22-end
    """
    trn = {SEASON_ID: 'trn', EPISODES: []}
    dev = {SEASON_ID: 'dev', EPISODES: []}
    tst = {SEASON_ID: 'tst', EPISODES: []}

    def get_entities(entity_list):
        return [entity for entity in entity_list if entity[-1] != 'Non-Entity']

    # only the first four seasons carry character entity annotation
    for i, input_file in enumerate(sorted(glob.glob(os.path.join(input_dir, '*.json')))):
        if i >= 4:
            break
        season = json.load(open(input_file))
        print(input_file)

        for episode in season[EPISODES]:
            episode_id = int(episode[EPISODE_ID].split('_')[1][1:])
            d = tst if episode_id >= 22 else dev if episode_id >= 20 else trn
            d[EPISODES].append(episode)
            scenes = []

            for scene in episode[SCENES]:
                utterances = []

                for utterance in scene[UTTERANCES]:
                    if utterance[TOKENS]:
                        utterances.append(utterance)
                        if CHARACTER_ENTITIES in utterance:
                            utterance[CHARACTER_ENTITIES] = [get_entities(entity_list) for entity_list in utterance[CHARACTER_ENTITIES]]
                        else:
                            print(utterance[UTTERANCE_ID])

                if utterances:
                    scene[UTTERANCES] = utterances
                    scenes.append(scene)

            episode[SCENES] = scenes

    with open(os.path.join(output_dir, 'character-identification-trn.json'), 'w') as fout:
        fout.write(ordered_json(trn, plot=False, wi_note=False, caption=False, emotion=False, rc_entities=False))

    with open(os.path.join(output_dir, 'character-identification-dev.json'), 'w') as fout:
        fout.write(ordered_json(dev, plot=False, wi_note=False, caption=False, emotion=False, rc_entities=False))

    with open(os.path.join(output_dir, 'character-identification-tst.json'), 'w') as fout:
        fout.write(ordered_json(tst, plot=False, wi_note=False, caption=False, emotion=False, rc_entities=False))


def entity_stats(json_dir):
    def stats(json_file):
        speaker_list = []
        entity_list = []
        num_scenes = 0
        num_utterances = 0
        num_tokens = 0
        num_mentions = 0

        season = json.load(open(json_file))
        episodes = season[EPISODES]

        for episode in episodes:
            scenes = episode[SCENES]
            num_scenes += len(scenes)

            for scene in scenes:
                utterances = scene[UTTERANCES]
                num_utterances += len(utterances)

                for utterance in utterances:
                    num_tokens += sum(len(t) for t in utterance[TOKENS])
                    speaker_list.extend(utterance[SPEAKERS])
                    if len(utterance[TOKENS]) != len(utterance[CHARACTER_ENTITIES]):
                        print(utterance[UTTERANCE_ID])

                    for character_entities in utterance[CHARACTER_ENTITIES]:
                        num_mentions += len(character_entities)
                        for entities in character_entities:
                            entity_list.extend(entities[2:])

        g_speaker_list.extend(speaker_list)
        g_entity_list.extend(entity_list)
        return [season[SEASON_ID], len(episodes), num_scenes, num_utterances, num_tokens, len(set(speaker_list)), num_mentions, len(set(entity_list))]

    g_speaker_list = []
    g_entity_list = []
    print('\t'.join(['Dataset', 'Episodes', 'Scenes', 'Utterances', 'Tokens', 'Speakers', 'Mentions', 'Entities']))

    for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
        l = stats(json_file)
        print('\t'.join(map(str, l)))

    print('All speakers: %s' % len(set(g_speaker_list)))
    print('All entities: %s' % len(set(g_entity_list)))
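
# A hypothetical annotation sketch (values invented for illustration): each
# sentence in CHARACTER_ENTITIES holds one list per mention of the form
# [begin_token, end_token, character, ...], and get_entities() above drops
# mentions whose final label is 'Non-Entity', e.g.
#   [[0, 1, 'Ross Geller'], [2, 3, 'Non-Entity']] -> [[0, 1, 'Ross Geller']]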
# =================================== Emotion Detection ===================================

def extract_emotion_detection(input_dir, output_dir):
    trn = {SEASON_ID: 'trn', EPISODES: []}
    dev = {SEASON_ID: 'dev', EPISODES: []}
    tst = {SEASON_ID: 'tst', EPISODES: []}

    # episode IDs held out for the development and test splits
    DEV = {'s01_e15', 's01_e20', 's02_e10', 's02_e20', 's03_e01', 's03_e09', 's03_e21', 's04_e01', 's04_e06', 's04_e10', 's04_e21'}
    TST = {'s01_e01', 's01_e10', 's02_e08', 's02_e23', 's03_e08', 's03_e20', 's04_e02', 's04_e20', 's04_e17'}

    # only the first four seasons carry emotion annotation
    for i, input_file in enumerate(sorted(glob.glob(os.path.join(input_dir, '*.json')))):
        if i >= 4:
            break
        season = json.load(open(input_file))
        print(input_file)

        for episode in season[EPISODES]:
            episode_id = episode[EPISODE_ID]
            d = tst if episode_id in TST else dev if episode_id in DEV else trn
            d[EPISODES].append(episode)
            scenes = []

            for scene in episode[SCENES]:
                utterances = []
                emotions = 0
                misses = []

                for utterance in scene[UTTERANCES]:
                    if utterance[TOKENS]:
                        if EMOTION in utterance:
                            utterance[EMOTION] = utterance[EMOTION][0]
                            emotions += 1
                        else:
                            misses.append(utterance[UTTERANCE_ID])
                        utterances.append(utterance)

                # keep only scenes that carry at least one emotion annotation
                if emotions > 0:
                    if emotions != len(utterances):
                        print(misses)
                    scene[UTTERANCES] = utterances
                    scenes.append(scene)

            episode[SCENES] = scenes

    with open(os.path.join(output_dir, 'emotion-detection-trn.json'), 'w') as fout:
        fout.write(ordered_json(trn, plot=False, wi_note=False, caption=False, character_entities=False, rc_entities=False))

    with open(os.path.join(output_dir, 'emotion-detection-dev.json'), 'w') as fout:
        fout.write(ordered_json(dev, plot=False, wi_note=False, caption=False, character_entities=False, rc_entities=False))

    with open(os.path.join(output_dir, 'emotion-detection-tst.json'), 'w') as fout:
        fout.write(ordered_json(tst, plot=False, wi_note=False, caption=False, character_entities=False, rc_entities=False))


def emotion_stats(json_dir):
    def stats(json_file):
        emotions = {}
        num_scenes = 0
        num_utterances = 0
        episode_ids = []

        season = json.load(open(json_file))
        episodes = season[EPISODES]

        for episode in episodes:
            episode_ids.append(episode[EPISODE_ID])
            scenes = episode[SCENES]
            num_scenes += len(scenes)

            for scene in scenes:
                utterances = scene[UTTERANCES]
                num_utterances += len(utterances)

                for utterance in utterances:
                    e = utterance[EMOTION]
                    emotions[e] = emotions.get(e, 0) + 1

        print(episode_ids)
        # default to 0 so a split missing an emotion does not raise KeyError
        return [season[SEASON_ID], len(episodes), num_scenes, num_utterances] + [emotions.get(e, 0) for e in emotion_list]

    emotion_list = ['Joyful', 'Mad', 'Neutral', 'Peaceful', 'Powerful', 'Sad', 'Scared']
    print('\t'.join(['Dataset', 'Episodes', 'Scenes', 'Utterances'] + emotion_list))

    for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
        l = stats(json_file)
        print('\t'.join(map(str, l)))
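
# Hypothetical invocation (the paths are placeholders, not the author's):
#   extract_emotion_detection('character-mining/json', 'emotion-detection/json')
#   emotion_stats('emotion-detection/json')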
# =================================== Reading Comprehension ===================================

def relabel(samples):
    """Renumber the @ent labels within each sample so they are dense and local to that sample."""
    re_samples = []

    for sample in samples:
        sam = {}
        q_words = sample[QUERY].split(' ')
        d_words = []

        for utter in sample[UTTERANCES]:
            d_words += utter[SPEAKERS]
            d_words += utter[TOKENS]

        # first occurrence order defines the new entity IDs
        entity_dict = {}
        entity_id = 0
        for word in d_words + q_words:
            if word.startswith('@ent') and word not in entity_dict:
                entity_dict[word] = '@ent%02d' % entity_id
                entity_id += 1

        re_document = []
        for utter in sample[UTTERANCES]:
            sent = {SPEAKERS: ' '.join(entity_dict.get(w, w) for w in utter[SPEAKERS]),
                    TOKENS: ' '.join(entity_dict.get(w, w) for w in utter[TOKENS])}
            re_document.append(sent)

        sam[SCENE_ID] = sample[SCENE_ID]
        sam[QUERY] = ' '.join(entity_dict.get(w, w) for w in q_words)
        sam[ANSWER] = entity_dict[sample[ANSWER]]
        sam[UTTERANCES] = re_document
        re_samples.append(sam)

    return re_samples


def extract_reading_comprehension(json_dir, output_dir):
    season_samples = defaultdict(list)
    random.seed(1234)

    for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
        season = json.load(open(json_file))

        for episode in season[EPISODES]:
            for scene in episode[SCENES]:
                if PLOTS in scene and scene[PLOTS]:
                    masking_map = {}
                    for vi, ki in enumerate(scene[RC_ENTITIES].keys()):
                        masking_map[ki] = '@ent%02d' % vi

                    # mask entity mentions in the plot sentences
                    masked_passages = []
                    for i, passage in enumerate(scene[PLOTS]):
                        masked_sentence = []
                        ent_list = {}
                        for ent, index_list in scene[RC_ENTITIES].items():
                            for index in index_list[P_ENT]:
                                if i == index[0]:
                                    ent_list[index[1]] = (index[1], index[2], ent)
                        jump = 0
                        for j, token in enumerate(passage.split(' ')):
                            if jump > 0:
                                jump -= 1
                                continue
                            if j in ent_list:
                                masked_sentence.append(masking_map[ent_list[j][2]])
                                jump = ent_list[j][1] - ent_list[j][0] - 1
                            else:
                                masked_sentence.append(token)
                        masked_passages.append(masked_sentence)

                    # mask entity mentions in the dialog tokens and speakers
                    masked_dialog = []
                    for i, utterance in enumerate(scene[UTTERANCES]):
                        if utterance[TOKENS_WITH_NOTE] is not None:
                            tokens = [w for sent in utterance[TOKENS_WITH_NOTE] for w in sent]
                        else:
                            tokens = [w for sent in utterance[TOKENS] for w in sent]

                        masked_utter = {SPEAKERS: utterance[SPEAKERS], TOKENS: []}
                        ent_list = {}
                        for ent, index_list in scene[RC_ENTITIES].items():
                            for index in index_list[U_ENT]:
                                if i == index[0]:
                                    ent_list[index[1]] = (index[1], index[2], ent)
                            for index in index_list[S_ENT]:
                                if i == index[0]:
                                    masked_utter[SPEAKERS][index[1]] = masking_map[ent]
                        jump = 0
                        for j, token in enumerate(tokens):
                            if jump > 0:
                                jump -= 1
                                continue
                            if j in ent_list:
                                masked_utter[TOKENS].append(masking_map[ent_list[j][2]])
                                jump = ent_list[j][1] - ent_list[j][0] - 1
                            else:
                                masked_utter[TOKENS].append(token)
                        masked_dialog.append(masked_utter)

                    dialog_entities = Counter()
                    for ent, ent_list in scene[RC_ENTITIES].items():
                        if len(ent_list[U_ENT]) > 0 or len(ent_list[S_ENT]) > 0:
                            dialog_entities.update([masking_map[ent]])

                    # generate one cloze query per masked plot entity that also appears in the dialog
                    for sentence in masked_passages:
                        for i, token in enumerate(sentence):
                            if token.startswith('@ent') and token in dialog_entities:
                                sample = {}
                                query = deepcopy(sentence)
                                query[i] = '@placeholder'
                                sample[QUERY] = ' '.join(query)
                                sample[ANSWER] = token
                                sample[UTTERANCES] = masked_dialog
                                sample[SCENE_ID] = scene[SCENE_ID]
                                season_samples[season[SEASON_ID]].append(sample)

    trn = []
    dev = []
    tst = []
    for season_id, s_samples in season_samples.items():
        n = len(s_samples)
        random.shuffle(s_samples)
        trn.extend(s_samples[:int(0.8 * n)])
        dev.extend(s_samples[int(0.8 * n):int(0.9 * n)])
        tst.extend(s_samples[int(0.9 * n):])

    trn = relabel(trn)
    dev = relabel(dev)
    tst = relabel(tst)

    with open(os.path.join(output_dir, 'trn.json'), 'w') as fout:
        fout.write(json.dumps(trn, indent=2))

    with open(os.path.join(output_dir, 'dev.json'), 'w') as fout:
        fout.write(json.dumps(dev, indent=2))

    with open(os.path.join(output_dir, 'tst.json'), 'w') as fout:
        fout.write(json.dumps(tst, indent=2))
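
# A hypothetical cloze sample (values invented for illustration): one plot
# sentence yields one query per masked entity that also occurs in the dialog.
#   plot  : '@ent00 convinces @ent01 to move in with him .'
#   query : '@placeholder convinces @ent01 to move in with him .'
#   answer: '@ent00'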
def reading_stats(json_dir):
    # all columns are averaged per query; {E} counts distinct entities and [E] counts entity mentions
    def create(dataset, num_queries, num_entity_count_query, num_entity_type_query, num_entity_count_utt, num_entity_type_utt, num_utterances):
        return [dataset, num_queries,
                num_utterances / num_queries,
                num_entity_type_query / num_queries,
                num_entity_count_query / num_queries,
                num_entity_type_utt / num_queries,
                num_entity_count_utt / num_queries]

    def stats(json_file):
        documents = json.load(open(json_file))
        num_queries = len(documents)
        num_entity_count_query = 0
        num_entity_type_query = 0
        num_entity_count_utt = 0
        num_entity_type_utt = 0
        num_utterances = 0

        for doc in documents:
            ents = [doc[ANSWER] if q == '@placeholder' else q for q in doc[QUERY].split() if q.startswith('@ent') or q == '@placeholder']
            num_entity_count_query += len(ents)
            num_entity_type_query += len(set(ents))
            num_utterances += len(doc[UTTERANCES])

            ents = []
            for utterance in doc[UTTERANCES]:
                ents.extend(utterance[SPEAKERS].split())
                ents.extend(t for t in utterance[TOKENS].split() if t.startswith('@ent'))
            num_entity_type_utt += len(set(ents))
            num_entity_count_utt += len(ents)

        return [num_queries, num_entity_count_query, num_entity_type_query, num_entity_count_utt, num_entity_type_utt, num_utterances]

    print('\t'.join(['Dataset', 'Queries', 'U / Q', '{E} / Q', '[E] / Q', '{E} / U', '[E] / U']))
    g_num = np.zeros(6)

    for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
        l = stats(json_file)
        g_num += np.array(l)
        # assumes the split files are named trn*.json, dev*.json, tst*.json
        dataset = os.path.basename(json_file)[:3].upper()
        print('\t'.join(map(str, create(dataset, *l))))

    print('\t'.join(map(str, create('Total', *g_num))))


# =================================== Main ===================================

if __name__ == '__main__':
    # json_dir = '/Users/jdchoi/Git/character-mining/json'
    # general_stats(json_dir)

    # character identification
    # input_dir = '/Users/jdchoi/Git/character-mining/json'
    # output_dir = '/Users/jdchoi/Git/character-identification/json'
    # extract_character_identification(input_dir, output_dir)
    # entity_stats(output_dir)

    # emotion detection
    # input_dir = '/Users/jdchoi/Git/character-mining/json'
    # output_dir = '/Users/jdchoi/Git/emotion-detection/json'
    # extract_emotion_detection(input_dir, output_dir)
    # emotion_stats(output_dir)

    # reading comprehension
    # json_dir = '/Users/jdchoi/Git/character-mining/json'
    output_dir = '/Users/jdchoi/Git/reading-comprehension/json'
    # extract_reading_comprehension(json_dir, output_dir)
    reading_stats(output_dir)

    # input_dir = '/Users/jdchoi/Git/character-mining/json'
    # ann_dir = '/Users/jdchoi/Downloads/dataset'
    # output_dir = '/Users/jdchoi/Git/character-mining/json/em'
    # merge_em(input_dir, ann_dir, output_dir)

    # input_dir1 = '/Users/jdchoi/Git/character-mining-dev/json-bak'
    # input_dir2 = '/Users/jdchoi/Downloads/Friends_newly_compiled'
    # output_dir = '/Users/jdchoi/Git/character-mining/json'
    # merge_rc(input_dir1, input_dir2, output_dir)
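
# A hypothetical reading of the reading_stats() table (numbers invented):
#   Dataset  Queries  U / Q  {E} / Q  [E] / Q  {E} / U  [E] / U
#   TRN      10000    18.4   3.2      5.1      12.6     59.8
# i.e. on average each query spans 18.4 utterances, mentions 3.2 distinct
# entities 5.1 times, and its dialog mentions 12.6 distinct entities 59.8 times.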

# def merge_rc(input_dir1, input_dir2, output_dir):
#     def get_entities(rc_entities):
#         plot = rc_entities['plot_entities']
#         speaker = rc_entities['speaker_entities']
#         utterance = rc_entities['utterance_entities']
#         entities = {}
#
#         if plot:
#             for name, ts in plot.items():
#                 d = entities.setdefault(name, OrderedDict([(P_ENT, []), (U_ENT, []), (S_ENT, [])]))
#                 d[P_ENT] = [t[:-1] for t in ts]
#
#         for name, ts in utterance.items():
#             d = entities.setdefault(name, OrderedDict([(P_ENT, []), (U_ENT, []), (S_ENT, [])]))
#             d[U_ENT] = [t[:-1] for t in ts]
#
#         for name, ts in speaker.items():
#             d = entities.setdefault(name, OrderedDict([(P_ENT, []), (U_ENT, []), (S_ENT, [])]))
#             d[S_ENT] = [t[:-1] for t in ts]
#
#         return entities
#
#     for input_file1 in sorted(glob.glob(os.path.join(input_dir1, '*.json'))):
#         input_file2 = os.path.join(input_dir2, os.path.basename(input_file1))
#         print(os.path.basename(input_file1))
#
#         season1 = json.load(open(input_file1))
#         season2 = json.load(open(input_file2))
#
#         episodes1 = season1[EPISODES]
#         episodes2 = season2[EPISODES]
#
#         for episode1, episode2 in zip(episodes1, episodes2):
#             scenes1 = episode1[SCENES]
#             scenes2 = episode2[SCENES]
#
#             for scene1, scene2 in zip(scenes1, scenes2):
#                 scene1[PLOTS] = scene2[PLOTS]
#                 scene1[RC_ENTITIES] = get_entities(scene2[RC_ENTITIES])
#
#         with open(os.path.join(output_dir, os.path.basename(input_file1)), 'w') as fout:
#             fout.write(ordered_json(season1))
#
#
# def merge_em(input_dir, ann_dir, output_dir):
#     def extend_ann(ann_file, ls):
#         fin = open(ann_file)
#
#         for i, line in enumerate(fin):
#             if i == 0: continue
#             l = line.split()
#             season_id = int(l[0]) - 1
#             episode_id = int(l[1]) - 1
#             scene_id = int(l[2]) - 1
#             utterance_id = int(l[3])
#             annotation = l[4:8]
#             gold = l[10]
#             ls.append((season_id, episode_id, scene_id, utterance_id, annotation, gold))
#
#     annotations = []
#     for ann_file in glob.glob(os.path.join(ann_dir, '*.tsv')):
#         extend_ann(ann_file, annotations)
#     seasons = [json.load(open(input_file)) for input_file in sorted(glob.glob(os.path.join(input_dir, '*.json')))]
#
#     for season_id, episode_id, scene_id, utterance_id, annotation, gold in annotations:
#         utterance = seasons[season_id][EPISODES][episode_id][SCENES][scene_id][UTTERANCES][utterance_id]
#         if EMOTION in utterance:
#             if utterance[EMOTION] != gold: print(utterance[UTTERANCE_ID])
#             utterance[EMOTION] = [gold, annotation]
#         else:
#             print(utterance[UTTERANCE_ID])
#
#     for i, season in enumerate(seasons):
#         with open(os.path.join(output_dir, 'friends_season_0%d.json' % (i + 1)), 'w') as fout:
#             fout.write(ordered_json(season))
#
#
# def extract_reading_comprehension_padded(json_dir, output_dir, des_size):
#     season_samples = defaultdict(list)
#     random.seed(1234)
#
#     for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
#         data = json.load(open(json_file))
#         for episode_dict in data[EPISODES]:
#             for idx, scene_dict in enumerate(episode_dict[SCENES]):
#                 if scene_dict[PLOTS] is not None:
#                     entities = Counter()
#                     entities.update(scene_dict[RC_ENTITIES].keys())
#
#                     cur = idx
#                     dialog_len = len(scene_dict[UTTERANCES])
#                     while dialog_len < des_size and cur < len(episode_dict[SCENES]) - 1:
#                         cur += 1
#                         entities.update(episode_dict[SCENES][cur][RC_ENTITIES].keys())
#                         dialog_len += len(episode_dict[SCENES][cur][UTTERANCES])
#                     if dialog_len < des_size:
#                         cur = idx
#                         while cur > 0 and dialog_len < des_size:
#                             cur -= 1
#                             entities.update(episode_dict[SCENES][cur][RC_ENTITIES].keys())
#                             dialog_len += len(episode_dict[SCENES][cur][UTTERANCES])
#
#                     masking_map = {}
#                     for vi, ki in enumerate(entities.keys()):
#                         masking_map[ki] = '@ent%02d' % vi
#
#                     masked_passages = []
#                     for i, passage in enumerate(scene_dict[PLOTS]):
#                         masked_sentence = []
#                         ent_list = {}
#                         for ent, index_list in scene_dict[RC_ENTITIES].items():
#                             for index in index_list[P_ENT]:
#                                 if i == index[0]:
#                                     ent_list[index[1]] = (index[1], index[2], ent)
#                         jump = 0
#                         for j, token in enumerate(passage.split(' ')):
#                             if jump > 0:
#                                 jump -= 1
#                                 continue
#                             if j in ent_list:
#                                 masked_sentence.append(masking_map[ent_list[j][2]])
#                                 jump = ent_list[j][1] - ent_list[j][0] - 1
#                             else:
#                                 masked_sentence.append(token)
#                         masked_passages.append(masked_sentence)
#
#                     cur = idx
#                     dialog_len = len(scene_dict[UTTERANCES])
#                     next_dialog = []
#                     while dialog_len < des_size and cur < len(episode_dict[SCENES]) - 1:
#                         cur += 1
#                         for i, utterance in enumerate(episode_dict[SCENES][cur][UTTERANCES]):
#                             if utterance[TOKENS_WITH_NOTE] is not None:
#                                 tokens = [w for sent in utterance[TOKENS_WITH_NOTE] for w in sent]
#                             else:
#                                 tokens = [w for sent in utterance[TOKENS] for w in sent]
#
#                             masked_utter = {SPEAKERS: utterance[SPEAKERS], TOKENS: []}
#                             ent_list = {}
#                             for ent, index_list in episode_dict[SCENES][cur][RC_ENTITIES].items():
#                                 for index in index_list[U_ENT]:
#                                     if i == index[0]:
#                                         ent_list[index[1]] = (index[1], index[2], ent)
#                                 for index in index_list[S_ENT]:
#                                     if i == index[0]:
#                                         masked_utter[SPEAKERS][index[1]] = masking_map[ent]
#                             jump = 0
#                             for j, token in enumerate(tokens):
#                                 if jump > 0:
#                                     jump -= 1
#                                     continue
#                                 if j in ent_list:
#                                     masked_utter[TOKENS].append(masking_map[ent_list[j][2]])
#                                     jump = ent_list[j][1] - ent_list[j][0] - 1
#                                 else:
#                                     masked_utter[TOKENS].append(token)
#                             next_dialog.append(masked_utter)
#                             dialog_len += 1
#                             if dialog_len == des_size:
#                                 break
#
#                     prev_dialog = []
#                     if dialog_len < des_size:
#                         cur = idx
#                         while dialog_len < des_size and cur > 0:
#                             cur -= 1
#                             for i, utterance in enumerate(reversed(episode_dict[SCENES][cur][UTTERANCES])):
#                                 if utterance[TOKENS_WITH_NOTE] is not None:
#                                     tokens = [w for sent in utterance[TOKENS_WITH_NOTE] for w in sent]
#                                 else:
#                                     tokens = [w for sent in utterance[TOKENS] for w in sent]
#
#                                 masked_utter = {}
#                                 masked_utter[SPEAKERS] = utterance[SPEAKERS]
#                                 masked_utter[TOKENS] = []
#                                 ent_list = {}
#                                 for ent, index_list in episode_dict[SCENES][cur][RC_ENTITIES].items():
#                                     for index in index_list[U_ENT]:
#                                         if i == len(episode_dict[SCENES][cur][UTTERANCES]) - index[0] - 1:
#                                             ent_list[index[1]] = (index[1], index[2], ent)
#                                     for index in index_list[S_ENT]:
#                                         if i == len(episode_dict[SCENES][cur][UTTERANCES]) - index[0] - 1:
#                                             masked_utter[SPEAKERS][index[1]] = masking_map[ent]
#                                 jump = 0
#                                 for j, token in enumerate(tokens):
#                                     if jump > 0:
#                                         jump -= 1
#                                         continue
#                                     if j in ent_list:
#                                         masked_utter[TOKENS].append(masking_map[ent_list[j][2]])
#                                         jump = ent_list[j][1] - ent_list[j][0] - 1
#                                     else:
#                                         masked_utter[TOKENS].append(token)
#                                 prev_dialog.append(masked_utter)
#                                 dialog_len += 1
#                                 if dialog_len == des_size:
#                                     break
#
#                     masked_dialog = []
#                     for i, utterance in enumerate(scene_dict[UTTERANCES]):
#                         if utterance[TOKENS_WITH_NOTE] is not None:
#                             tokens = [w for sent in utterance[TOKENS_WITH_NOTE] for w in sent]
#                         else:
#                             tokens = [w for sent in utterance[TOKENS] for w in sent]
#
#                         masked_utter = {SPEAKERS: utterance[SPEAKERS], TOKENS: []}
#                         ent_list = {}
#                         for ent, index_list in scene_dict[RC_ENTITIES].items():
#                             for index in index_list[U_ENT]:
#                                 if i == index[0]:
#                                     ent_list[index[1]] = (index[1], index[2], ent)
#                             for index in index_list[S_ENT]:
#                                 if i == index[0]:
#                                     masked_utter[SPEAKERS][index[1]] = masking_map[ent]
#                         jump = 0
#                         for j, token in enumerate(tokens):
#                             if jump > 0:
#                                 jump -= 1
#                                 continue
#                             if j in ent_list:
#                                 masked_utter[TOKENS].append(masking_map[ent_list[j][2]])
#                                 jump = ent_list[j][1] - ent_list[j][0] - 1
#                             else:
#                                 masked_utter[TOKENS].append(token)
#                         masked_dialog.append(masked_utter)
#
#                     dialog_entities = Counter()
#                     for ent, ent_list in scene_dict[RC_ENTITIES].items():
#                         if len(ent_list[U_ENT]) > 0 or len(ent_list[S_ENT]) > 0:
#                             dialog_entities.update([masking_map[ent]])
#
#                     full_dialog = []
#                     for u in reversed(prev_dialog):
#                         full_dialog.append(u)
#                     for u in masked_dialog:
#                         full_dialog.append(u)
#                     for u in next_dialog:
#                         full_dialog.append(u)
#
#                     for sentence in masked_passages:
#                         for i, token in enumerate(sentence):
#                             if token.startswith('@ent') and token in dialog_entities:
#                                 sample = {}
#                                 query = deepcopy(sentence)
#                                 query[i] = '@placeholder'
#                                 sample[QUERY] = ' '.join(query)
#                                 sample[ANSWER] = token
#                                 sample[UTTERANCES] = full_dialog
#                                 sample[SCENE_ID] = scene_dict[SCENE_ID]
#                                 season_samples[data[SEASON_ID]].append(sample)
#
#     trn = []
#     dev = []
#     tst = []
#     for season_id, s_samples in season_samples.items():
#         l = len(s_samples)
#         random.shuffle(s_samples)
#         trn.extend(s_samples[:int(0.8 * l)])
#         dev.extend(s_samples[int(0.8 * l):int(0.9 * l)])
#         tst.extend(s_samples[int(0.9 * l):])
#
#     trn = relabel(trn)
#     dev = relabel(dev)
#     tst = relabel(tst)
#     print(len(trn), len(dev), len(tst))
#
#     with open(os.path.join(output_dir, 'trn-%d.json' % des_size), 'w') as fout:
#         fout.write(json.dumps(trn, indent=2))
#
#     with open(os.path.join(output_dir, 'dev-%d.json' % des_size), 'w') as fout:
#         fout.write(json.dumps(dev, indent=2))
#
#     with open(os.path.join(output_dir, 'tst-%d.json' % des_size), 'w') as fout:
#         fout.write(json.dumps(tst, indent=2))