# ========================================================================
# Copyright 2018 Emory University
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========================================================================
import glob
import json
import os
import random
import re

import numpy as np
from collections import Counter, OrderedDict, defaultdict
from copy import deepcopy

__author__ = 'Jinho D. Choi'

# JSON field names shared by all tasks
SEASON_ID = 'season_id'
EPISODES = 'episodes'
EPISODE_ID = 'episode_id'
EPISODE = 'episode'
SCENES = 'scenes'
SCENE_ID = 'scene_id'
UTTERANCES = 'utterances'
UTTERANCE_ID = 'utterance_id'
SPEAKERS = 'speakers'
TRANSCRIPT = 'transcript'
TRANSCRIPT_WITH_NOTE = 'transcript_with_note'
TOKENS = 'tokens'
TOKENS_WITH_NOTE = 'tokens_with_note'

# character identification
CHARACTER_ENTITIES = 'character_entities'

# emotion detection
EMOTION = 'emotion'

# movie
CAPTION = 'caption'

# reading comprehension
RC_ENTITIES = 'rc_entities'
PLOTS = 'plots'
P_ENT = 'p_ent'
U_ENT = 'u_ent'
S_ENT = 's_ent'
QUERY = 'query'
ANSWER = 'answer'


# =================================== Ordered JSON ===================================

class NoIndent(object):
    """Wrapper marking a value that should be serialized on a single line."""

    def __init__(self, value):
        self.value = value


class NoIndentEncoder(json.JSONEncoder):
    """JSON encoder that pretty-prints everything except NoIndent values.

    NoIndent values are first serialized as '@@@<key>@@@' placeholder strings,
    then substituted back with their compact (non-indented) serializations.
    """
    REGEX = re.compile(r'@@@(\d+)@@@')

    def __init__(self, *args, **kwargs):
        super(NoIndentEncoder, self).__init__(*args, **kwargs)
        self.kwargs = dict(kwargs)
        del self.kwargs['indent']
        self._replacements = {}

    def default(self, o):
        if isinstance(o, NoIndent):
            key = len(self._replacements)
            self._replacements[key] = json.dumps(o.value, **self.kwargs)
            return '@@@%d@@@' % key
        return super(NoIndentEncoder, self).default(o)

    def encode(self, o):
        result = super(NoIndentEncoder, self).encode(o)
        out = []
        m = self.REGEX.search(result)

        while m:
            key = int(m.group(1))
            # the -1/+1 offsets strip the quotes surrounding the placeholder string
            out.append(result[:m.start(0) - 1])
            out.append(self._replacements[key])
            result = result[m.end(0) + 1:]
            m = self.REGEX.search(result)

        # keep the tail after the last placeholder; dropping it truncated the
        # closing brackets and forced callers to append them by hand
        out.append(result)
        return ''.join(out)


def pair(key, d, noindent=False):
    s = d[key]
    if isinstance(s, str):
        s = ' '.join(s.split())
    return (key, NoIndent(s)) if noindent else (key, s)


def ordered_json(input, plot=True, wo_note=True, wi_note=True, caption=True, character_entities=True, emotion=True, rc_entities=True):
    s = json.load(open(input)) if isinstance(input, str) else input
    season = OrderedDict([pair(SEASON_ID, s), pair(EPISODES, s)])
    if len(s) != len(season):
        print('Error: 0')
    episodes = season[EPISODES]

    for i, e in enumerate(episodes):
        episode = OrderedDict([pair(EPISODE_ID, e), pair(SCENES, e)])
        if len(e) != len(episode):
            print('Error: 1')
        episodes[i] = episode
        scenes = episode[SCENES]

        for j, c in enumerate(scenes):
            scene = [pair(SCENE_ID, c), pair(UTTERANCES, c)]
            if plot and PLOTS in c:
                scene.append(pair(PLOTS, c))
            if rc_entities and RC_ENTITIES in c:
                scene.append((RC_ENTITIES, c[RC_ENTITIES]))
                for d in c[RC_ENTITIES].values():
                    for k, v in d.items():
                        d[k] = NoIndent(v)
            scene = OrderedDict(scene)
            if len(c) != len(scene):
                print('Error 2: ' + scene[SCENE_ID])
            scenes[j] = scene
            utterances = scene[UTTERANCES]

            for k, u in enumerate(utterances):
                utterance = [pair(UTTERANCE_ID, u), pair(SPEAKERS, u, True)]
                if wo_note:
                    utterance.append(pair(TRANSCRIPT, u))
                    utterance.append((TOKENS, [NoIndent(t) for t in u[TOKENS]]))
                if wi_note:
                    utterance.append(pair(TRANSCRIPT_WITH_NOTE, u))
                    twn = u[TOKENS_WITH_NOTE]
                    utterance.append((TOKENS_WITH_NOTE, [NoIndent(t) for t in twn] if twn else twn))
                if character_entities and CHARACTER_ENTITIES in u:
                    utterance.append((CHARACTER_ENTITIES, [NoIndent(t) for t in u[CHARACTER_ENTITIES]]))
                if emotion and EMOTION in u:
                    utterance.append((EMOTION, NoIndent(u[EMOTION])))
                if caption and CAPTION in u:
                    utterance.append((CAPTION, NoIndent(u[CAPTION])))
                utterance = OrderedDict(utterance)
                if len(u) != len(utterance):
                    print('Error: 3')
                utterances[k] = utterance

    # NoIndentEncoder now preserves the tail of the encoded string, so the
    # closing brackets no longer need to be appended manually here
    return json.dumps(season, cls=NoIndentEncoder, indent=2)
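
# A minimal usage sketch (not part of the original pipeline; the values are
# invented for illustration): NoIndent keeps selected values on one line
# while the surrounding document is pretty-printed with indent=2.
def _demo_no_indent():
    doc = OrderedDict([
        ('utterance_id', 'u001'),                    # indented normally
        ('speakers', NoIndent(['Monica Geller'])),   # kept on one line
        ('tokens', [NoIndent(['There', "'s", 'nothing', 'to', 'tell', '!'])]),
    ])
    print(json.dumps(doc, cls=NoIndentEncoder, indent=2))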
# =================================== General ===================================

def general_stats(json_dir):
    def stats(json_file):
        num_scenes = 0
        num_utterances = 0
        num_utterances_wn = 0
        num_sentences = 0
        num_sentences_wn = 0
        num_tokens = 0
        num_tokens_wn = 0
        speaker_list = set()

        season = json.load(open(json_file))
        episodes = season[EPISODES]

        for episode in episodes:
            scenes = episode[SCENES]
            num_scenes += len(scenes)

            for scene in scenes:
                utterances = scene[UTTERANCES]
                num_utterances_wn += len(utterances)

                for utterance in utterances:
                    speaker_list.update(utterance[SPEAKERS])
                    tokens = utterance[TOKENS]

                    if tokens:
                        num_utterances += 1
                        num_sentences += len(tokens)
                        num_tokens += sum(len(t) for t in tokens)

                    tokens_wn = utterance[TOKENS_WITH_NOTE] or tokens
                    num_sentences_wn += len(tokens_wn)
                    num_tokens_wn += sum(len(t) for t in tokens_wn)

        return [season[SEASON_ID], len(episodes), num_scenes, num_utterances, num_sentences, num_tokens, speaker_list, num_utterances_wn, num_sentences_wn, num_tokens_wn]

    g_speaker_list = set()
    print('\t'.join(['Season ID', 'Episodes', 'Scenes', 'Utterances', 'Sentences', 'Tokens', 'Speakers', 'Utterances (WN)', 'Sentences (WN)', 'Tokens (WN)']))

    for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
        l = stats(json_file)
        g_speaker_list.update(l[6])
        l[6] = len(l[6])
        print('\t'.join(map(str, l)))

    print('All speakers: %s' % len(g_speaker_list))


def compare_peer(input_dir1, input_dir2):
    for input_file1 in sorted(glob.glob(os.path.join(input_dir1, '*.json'))):
        input_file2 = os.path.join(input_dir2, os.path.basename(input_file1))
        print(os.path.basename(input_file1))

        season1 = json.load(open(input_file1))
        season2 = json.load(open(input_file2))
        season_id = season1[SEASON_ID]

        episodes1 = season1[EPISODES]
        episodes2 = season2[EPISODES]
        if len(episodes1) != len(episodes2):
            print('Episode mismatch: %s - %d, %d' % (season_id, len(episodes1), len(episodes2)))

        for episode1, episode2 in zip(episodes1, episodes2):
            episode_id = episode1[EPISODE_ID]
            scenes1 = episode1[SCENES]
            scenes2 = episode2[SCENES]
            if len(scenes1) != len(scenes2):
                print('Scene mismatch: %s - %d, %d' % (episode_id, len(scenes1), len(scenes2)))

            for scene1, scene2 in zip(scenes1, scenes2):
                scene_id = scene1[SCENE_ID]
                utterances1 = scene1[UTTERANCES]
                utterances2 = scene2[UTTERANCES]
                if len(utterances1) != len(utterances2):
                    print('Utterance mismatch: %s - %d, %d' % (scene_id, len(utterances1), len(utterances2)))

                for utterance1, utterance2 in zip(utterances1, utterances2):
                    utterance_id = utterance1[UTTERANCE_ID]
                    tokens1 = utterance1[TOKENS]
                    tokens2 = utterance2[TOKENS]
                    if len(tokens1) != len(tokens2):
                        print('Token mismatch: %s - %d, %d' % (utterance_id, len(tokens1), len(tokens2)))
                    # compare only up to the shorter list so a length mismatch cannot raise IndexError
                    m = [i for i in range(min(len(tokens1), len(tokens2))) if tokens1[i] != tokens2[i]]
                    if m:
                        print('Token mismatch: %s - %s' % (utterance_id, str(m)))

                    tokens1 = utterance1[TOKENS_WITH_NOTE]
                    tokens2 = utterance2[TOKENS_WITH_NOTE]
                    if tokens1 is None or tokens2 is None:
                        # report when only one side is missing its note tokens
                        if (tokens1 is None) != (tokens2 is None):
                            print('Token WN mismatch: %s - one side missing' % utterance_id)
                        continue
                    if len(tokens1) != len(tokens2):
                        print('Token WN mismatch: %s - %d, %d' % (utterance_id, len(tokens1), len(tokens2)))
                    m = [i for i in range(min(len(tokens1), len(tokens2))) if tokens1[i] != tokens2[i]]
                    if m:
                        print('Token WN mismatch: %s - %s' % (utterance_id, str(m)))
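
# A hypothetical convenience sketch (an assumption, not used by the pipeline):
# the season -> episode -> scene -> utterance traversal recurs in every stats
# function in this module; a generator like this captures the pattern once.
def _iter_utterances(season):
    for episode in season[EPISODES]:
        for scene in episode[SCENES]:
            for utterance in scene[UTTERANCES]:
                yield episode[EPISODE_ID], scene[SCENE_ID], utterance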
# =================================== Character Identification ===================================

def extract_character_identification(input_dir, output_dir):
    """
    trn: episodes 1-19
    dev: episodes 20-21
    tst: episodes 22-end
    """
    trn = {SEASON_ID: 'trn', EPISODES: []}
    dev = {SEASON_ID: 'dev', EPISODES: []}
    tst = {SEASON_ID: 'tst', EPISODES: []}

    def get_entities(entity_list):
        return [entity for entity in entity_list if entity[-1] != 'Non-Entity']

    # only the first four seasons carry character entity annotation
    for i, input_file in enumerate(sorted(glob.glob(os.path.join(input_dir, '*.json')))):
        if i >= 4:
            break
        season = json.load(open(input_file))
        print(input_file)

        for episode in season[EPISODES]:
            episode_id = int(episode[EPISODE_ID].split('_')[1][1:])
            d = tst if episode_id >= 22 else dev if episode_id >= 20 else trn
            d[EPISODES].append(episode)
            scenes = []

            for scene in episode[SCENES]:
                utterances = []

                for utterance in scene[UTTERANCES]:
                    if utterance[TOKENS]:
                        utterances.append(utterance)
                        if CHARACTER_ENTITIES in utterance:
                            utterance[CHARACTER_ENTITIES] = [get_entities(entity_list) for entity_list in utterance[CHARACTER_ENTITIES]]
                        else:
                            print(utterance[UTTERANCE_ID])

                if utterances:
                    scene[UTTERANCES] = utterances
                    scenes.append(scene)

            episode[SCENES] = scenes

    with open(os.path.join(output_dir, 'character-identification-trn.json'), 'w') as fout:
        fout.write(ordered_json(trn, plot=False, wi_note=False, caption=False, emotion=False, rc_entities=False))

    with open(os.path.join(output_dir, 'character-identification-dev.json'), 'w') as fout:
        fout.write(ordered_json(dev, plot=False, wi_note=False, caption=False, emotion=False, rc_entities=False))

    with open(os.path.join(output_dir, 'character-identification-tst.json'), 'w') as fout:
        fout.write(ordered_json(tst, plot=False, wi_note=False, caption=False, emotion=False, rc_entities=False))


def entity_stats(json_dir):
    def stats(json_file):
        speaker_list = []
        entity_list = []
        num_scenes = 0
        num_utterances = 0
        num_tokens = 0
        num_mentions = 0

        season = json.load(open(json_file))
        episodes = season[EPISODES]

        for episode in episodes:
            scenes = episode[SCENES]
            num_scenes += len(scenes)

            for scene in scenes:
                utterances = scene[UTTERANCES]
                num_utterances += len(utterances)

                for utterance in utterances:
                    num_tokens += sum(len(t) for t in utterance[TOKENS])
                    speaker_list.extend(utterance[SPEAKERS])
                    if len(utterance[TOKENS]) != len(utterance[CHARACTER_ENTITIES]):
                        print(utterance[UTTERANCE_ID])

                    for character_entities in utterance[CHARACTER_ENTITIES]:
                        num_mentions += len(character_entities)
                        for entities in character_entities:
                            entity_list.extend(entities[2:])

        g_speaker_list.extend(speaker_list)
        g_entity_list.extend(entity_list)
        return [season[SEASON_ID], len(episodes), num_scenes, num_utterances, num_tokens, len(set(speaker_list)), num_mentions, len(set(entity_list))]

    g_speaker_list = []
    g_entity_list = []
    print('\t'.join(['Dataset', 'Episodes', 'Scenes', 'Utterances', 'Tokens', 'Speakers', 'Mentions', 'Entities']))

    for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
        l = stats(json_file)
        print('\t'.join(map(str, l)))

    print('All speakers: %s' % len(set(g_speaker_list)))
    print('All entities: %s' % len(set(g_entity_list)))
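
# A hypothetical annotation sketch (values invented for illustration): each
# sentence in CHARACTER_ENTITIES holds one list per mention of the form
# [begin_token, end_token, character, ...], and get_entities() above drops
# mentions whose final label is 'Non-Entity', e.g.
#   [[0, 1, 'Ross Geller'], [2, 3, 'Non-Entity']] -> [[0, 1, 'Ross Geller']]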
# =================================== Emotion Detection ===================================

def extract_emotion_detection(input_dir, output_dir):
    trn = {SEASON_ID: 'trn', EPISODES: []}
    dev = {SEASON_ID: 'dev', EPISODES: []}
    tst = {SEASON_ID: 'tst', EPISODES: []}

    # episode IDs held out for the development and test splits
    DEV = {'s01_e15', 's01_e20', 's02_e10', 's02_e20', 's03_e01', 's03_e09', 's03_e21', 's04_e01', 's04_e06', 's04_e10', 's04_e21'}
    TST = {'s01_e01', 's01_e10', 's02_e08', 's02_e23', 's03_e08', 's03_e20', 's04_e02', 's04_e20', 's04_e17'}

    # only the first four seasons carry emotion annotation
    for i, input_file in enumerate(sorted(glob.glob(os.path.join(input_dir, '*.json')))):
        if i >= 4:
            break
        season = json.load(open(input_file))
        print(input_file)

        for episode in season[EPISODES]:
            episode_id = episode[EPISODE_ID]
            d = tst if episode_id in TST else dev if episode_id in DEV else trn
            d[EPISODES].append(episode)
            scenes = []

            for scene in episode[SCENES]:
                utterances = []
                emotions = 0
                misses = []

                for utterance in scene[UTTERANCES]:
                    if utterance[TOKENS]:
                        if EMOTION in utterance:
                            utterance[EMOTION] = utterance[EMOTION][0]
                            emotions += 1
                        else:
                            misses.append(utterance[UTTERANCE_ID])
                        utterances.append(utterance)

                # keep only scenes that carry at least one emotion annotation
                if emotions > 0:
                    if emotions != len(utterances):
                        print(misses)
                    scene[UTTERANCES] = utterances
                    scenes.append(scene)

            episode[SCENES] = scenes

    with open(os.path.join(output_dir, 'emotion-detection-trn.json'), 'w') as fout:
        fout.write(ordered_json(trn, plot=False, wi_note=False, caption=False, character_entities=False, rc_entities=False))

    with open(os.path.join(output_dir, 'emotion-detection-dev.json'), 'w') as fout:
        fout.write(ordered_json(dev, plot=False, wi_note=False, caption=False, character_entities=False, rc_entities=False))

    with open(os.path.join(output_dir, 'emotion-detection-tst.json'), 'w') as fout:
        fout.write(ordered_json(tst, plot=False, wi_note=False, caption=False, character_entities=False, rc_entities=False))


def emotion_stats(json_dir):
    def stats(json_file):
        emotions = {}
        num_scenes = 0
        num_utterances = 0
        episode_ids = []

        season = json.load(open(json_file))
        episodes = season[EPISODES]

        for episode in episodes:
            episode_ids.append(episode[EPISODE_ID])
            scenes = episode[SCENES]
            num_scenes += len(scenes)

            for scene in scenes:
                utterances = scene[UTTERANCES]
                num_utterances += len(utterances)

                for utterance in utterances:
                    e = utterance[EMOTION]
                    emotions[e] = emotions.get(e, 0) + 1

        print(episode_ids)
        # default to 0 so a split missing an emotion does not raise KeyError
        return [season[SEASON_ID], len(episodes), num_scenes, num_utterances] + [emotions.get(e, 0) for e in emotion_list]

    emotion_list = ['Joyful', 'Mad', 'Neutral', 'Peaceful', 'Powerful', 'Sad', 'Scared']
    print('\t'.join(['Dataset', 'Episodes', 'Scenes', 'Utterances'] + emotion_list))

    for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
        l = stats(json_file)
        print('\t'.join(map(str, l)))
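
# Hypothetical invocation (the paths are placeholders, not the author's):
#   extract_emotion_detection('character-mining/json', 'emotion-detection/json')
#   emotion_stats('emotion-detection/json')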
# =================================== Reading Comprehension ===================================

def relabel(samples):
    """Renumber the @ent labels within each sample so they are dense and local to that sample."""
    re_samples = []

    for sample in samples:
        sam = {}
        q_words = sample[QUERY].split(' ')
        d_words = []

        for utter in sample[UTTERANCES]:
            d_words += utter[SPEAKERS]
            d_words += utter[TOKENS]

        # first occurrence order defines the new entity IDs
        entity_dict = {}
        entity_id = 0
        for word in d_words + q_words:
            if word.startswith('@ent') and word not in entity_dict:
                entity_dict[word] = '@ent%02d' % entity_id
                entity_id += 1

        re_document = []
        for utter in sample[UTTERANCES]:
            sent = {SPEAKERS: ' '.join(entity_dict.get(w, w) for w in utter[SPEAKERS]),
                    TOKENS: ' '.join(entity_dict.get(w, w) for w in utter[TOKENS])}
            re_document.append(sent)

        sam[SCENE_ID] = sample[SCENE_ID]
        sam[QUERY] = ' '.join(entity_dict.get(w, w) for w in q_words)
        sam[ANSWER] = entity_dict[sample[ANSWER]]
        sam[UTTERANCES] = re_document
        re_samples.append(sam)

    return re_samples


def extract_reading_comprehension(json_dir, output_dir):
    season_samples = defaultdict(list)
    random.seed(1234)

    for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
        season = json.load(open(json_file))

        for episode in season[EPISODES]:
            for scene in episode[SCENES]:
                if PLOTS in scene and scene[PLOTS]:
                    masking_map = {}
                    for vi, ki in enumerate(scene[RC_ENTITIES].keys()):
                        masking_map[ki] = '@ent%02d' % vi

                    # mask entity mentions in the plot sentences
                    masked_passages = []
                    for i, passage in enumerate(scene[PLOTS]):
                        masked_sentence = []
                        ent_list = {}
                        for ent, index_list in scene[RC_ENTITIES].items():
                            for index in index_list[P_ENT]:
                                if i == index[0]:
                                    ent_list[index[1]] = (index[1], index[2], ent)
                        jump = 0
                        for j, token in enumerate(passage.split(' ')):
                            if jump > 0:
                                jump -= 1
                                continue
                            if j in ent_list:
                                masked_sentence.append(masking_map[ent_list[j][2]])
                                jump = ent_list[j][1] - ent_list[j][0] - 1
                            else:
                                masked_sentence.append(token)
                        masked_passages.append(masked_sentence)

                    # mask entity mentions in the dialog tokens and speakers
                    masked_dialog = []
                    for i, utterance in enumerate(scene[UTTERANCES]):
                        if utterance[TOKENS_WITH_NOTE] is not None:
                            tokens = [w for sent in utterance[TOKENS_WITH_NOTE] for w in sent]
                        else:
                            tokens = [w for sent in utterance[TOKENS] for w in sent]

                        masked_utter = {SPEAKERS: utterance[SPEAKERS], TOKENS: []}
                        ent_list = {}
                        for ent, index_list in scene[RC_ENTITIES].items():
                            for index in index_list[U_ENT]:
                                if i == index[0]:
                                    ent_list[index[1]] = (index[1], index[2], ent)
                            for index in index_list[S_ENT]:
                                if i == index[0]:
                                    masked_utter[SPEAKERS][index[1]] = masking_map[ent]
                        jump = 0
                        for j, token in enumerate(tokens):
                            if jump > 0:
                                jump -= 1
                                continue
                            if j in ent_list:
                                masked_utter[TOKENS].append(masking_map[ent_list[j][2]])
                                jump = ent_list[j][1] - ent_list[j][0] - 1
                            else:
                                masked_utter[TOKENS].append(token)
                        masked_dialog.append(masked_utter)

                    dialog_entities = Counter()
                    for ent, ent_list in scene[RC_ENTITIES].items():
                        if len(ent_list[U_ENT]) > 0 or len(ent_list[S_ENT]) > 0:
                            dialog_entities.update([masking_map[ent]])

                    # generate one cloze query per masked plot entity that also appears in the dialog
                    for sentence in masked_passages:
                        for i, token in enumerate(sentence):
                            if token.startswith('@ent') and token in dialog_entities:
                                sample = {}
                                query = deepcopy(sentence)
                                query[i] = '@placeholder'
                                sample[QUERY] = ' '.join(query)
                                sample[ANSWER] = token
                                sample[UTTERANCES] = masked_dialog
                                sample[SCENE_ID] = scene[SCENE_ID]
                                season_samples[season[SEASON_ID]].append(sample)

    trn = []
    dev = []
    tst = []
    for season_id, s_samples in season_samples.items():
        n = len(s_samples)
        random.shuffle(s_samples)
        trn.extend(s_samples[:int(0.8 * n)])
        dev.extend(s_samples[int(0.8 * n):int(0.9 * n)])
        tst.extend(s_samples[int(0.9 * n):])

    trn = relabel(trn)
    dev = relabel(dev)
    tst = relabel(tst)

    with open(os.path.join(output_dir, 'trn.json'), 'w') as fout:
        fout.write(json.dumps(trn, indent=2))

    with open(os.path.join(output_dir, 'dev.json'), 'w') as fout:
        fout.write(json.dumps(dev, indent=2))

    with open(os.path.join(output_dir, 'tst.json'), 'w') as fout:
        fout.write(json.dumps(tst, indent=2))
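
# A hypothetical cloze sample (values invented for illustration): one plot
# sentence yields one query per masked entity that also occurs in the dialog.
#   plot  : '@ent00 convinces @ent01 to move in with him .'
#   query : '@placeholder convinces @ent01 to move in with him .'
#   answer: '@ent00'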
def reading_stats(json_dir):
    # all columns are averaged per query; {E} counts distinct entities and [E] counts entity mentions
    def create(dataset, num_queries, num_entity_count_query, num_entity_type_query, num_entity_count_utt, num_entity_type_utt, num_utterances):
        return [dataset, num_queries,
                num_utterances / num_queries,
                num_entity_type_query / num_queries,
                num_entity_count_query / num_queries,
                num_entity_type_utt / num_queries,
                num_entity_count_utt / num_queries]

    def stats(json_file):
        documents = json.load(open(json_file))
        num_queries = len(documents)
        num_entity_count_query = 0
        num_entity_type_query = 0
        num_entity_count_utt = 0
        num_entity_type_utt = 0
        num_utterances = 0

        for doc in documents:
            ents = [doc[ANSWER] if q == '@placeholder' else q for q in doc[QUERY].split() if q.startswith('@ent') or q == '@placeholder']
            num_entity_count_query += len(ents)
            num_entity_type_query += len(set(ents))
            num_utterances += len(doc[UTTERANCES])

            ents = []
            for utterance in doc[UTTERANCES]:
                ents.extend(utterance[SPEAKERS].split())
                ents.extend(t for t in utterance[TOKENS].split() if t.startswith('@ent'))
            num_entity_type_utt += len(set(ents))
            num_entity_count_utt += len(ents)

        return [num_queries, num_entity_count_query, num_entity_type_query, num_entity_count_utt, num_entity_type_utt, num_utterances]

    print('\t'.join(['Dataset', 'Queries', 'U / Q', '{E} / Q', '[E] / Q', '{E} / U', '[E] / U']))
    g_num = np.zeros(6)

    for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
        l = stats(json_file)
        g_num += np.array(l)
        # assumes the split files are named trn*.json, dev*.json, tst*.json
        dataset = os.path.basename(json_file)[:3].upper()
        print('\t'.join(map(str, create(dataset, *l))))

    print('\t'.join(map(str, create('Total', *g_num))))


# =================================== Main ===================================

if __name__ == '__main__':
    # json_dir = '/Users/jdchoi/Git/character-mining/json'
    # general_stats(json_dir)

    # character identification
    # input_dir = '/Users/jdchoi/Git/character-mining/json'
    # output_dir = '/Users/jdchoi/Git/character-identification/json'
    # extract_character_identification(input_dir, output_dir)
    # entity_stats(output_dir)

    # emotion detection
    # input_dir = '/Users/jdchoi/Git/character-mining/json'
    # output_dir = '/Users/jdchoi/Git/emotion-detection/json'
    # extract_emotion_detection(input_dir, output_dir)
    # emotion_stats(output_dir)

    # reading comprehension
    # json_dir = '/Users/jdchoi/Git/character-mining/json'
    output_dir = '/Users/jdchoi/Git/reading-comprehension/json'
    # extract_reading_comprehension(json_dir, output_dir)
    reading_stats(output_dir)

    # input_dir = '/Users/jdchoi/Git/character-mining/json'
    # ann_dir = '/Users/jdchoi/Downloads/dataset'
    # output_dir = '/Users/jdchoi/Git/character-mining/json/em'
    # merge_em(input_dir, ann_dir, output_dir)

    # input_dir1 = '/Users/jdchoi/Git/character-mining-dev/json-bak'
    # input_dir2 = '/Users/jdchoi/Downloads/Friends_newly_compiled'
    # output_dir = '/Users/jdchoi/Git/character-mining/json'
    # merge_rc(input_dir1, input_dir2, output_dir)
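
# A hypothetical reading of the reading_stats() table (numbers invented):
#   Dataset  Queries  U / Q  {E} / Q  [E] / Q  {E} / U  [E] / U
#   TRN      10000    18.4   3.2      5.1      12.6     59.8
# i.e. on average each query spans 18.4 utterances, mentions 3.2 distinct
# entities 5.1 times, and its dialog mentions 12.6 distinct entities 59.8 times.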

# def merge_rc(input_dir1, input_dir2, output_dir):
#     def get_entities(rc_entities):
#         plot = rc_entities['plot_entities']
#         speaker = rc_entities['speaker_entities']
#         utterance = rc_entities['utterance_entities']
#         entities = {}
#
#         if plot:
#             for name, ts in plot.items():
#                 d = entities.setdefault(name, OrderedDict([(P_ENT, []), (U_ENT, []), (S_ENT, [])]))
#                 d[P_ENT] = [t[:-1] for t in ts]
#
#         for name, ts in utterance.items():
#             d = entities.setdefault(name, OrderedDict([(P_ENT, []), (U_ENT, []), (S_ENT, [])]))
#             d[U_ENT] = [t[:-1] for t in ts]
#
#         for name, ts in speaker.items():
#             d = entities.setdefault(name, OrderedDict([(P_ENT, []), (U_ENT, []), (S_ENT, [])]))
#             d[S_ENT] = [t[:-1] for t in ts]
#
#         return entities
#
#     for input_file1 in sorted(glob.glob(os.path.join(input_dir1, '*.json'))):
#         input_file2 = os.path.join(input_dir2, os.path.basename(input_file1))
#         print(os.path.basename(input_file1))
#
#         season1 = json.load(open(input_file1))
#         season2 = json.load(open(input_file2))
#
#         episodes1 = season1[EPISODES]
#         episodes2 = season2[EPISODES]
#
#         for episode1, episode2 in zip(episodes1, episodes2):
#             scenes1 = episode1[SCENES]
#             scenes2 = episode2[SCENES]
#
#             for scene1, scene2 in zip(scenes1, scenes2):
#                 scene1[PLOTS] = scene2[PLOTS]
#                 scene1[RC_ENTITIES] = get_entities(scene2[RC_ENTITIES])
#
#         with open(os.path.join(output_dir, os.path.basename(input_file1)), 'w') as fout:
#             fout.write(ordered_json(season1))
#
#
# def merge_em(input_dir, ann_dir, output_dir):
#     def extend_ann(ann_file, ls):
#         fin = open(ann_file)
#
#         for i, line in enumerate(fin):
#             if i == 0: continue
#             l = line.split()
#             season_id = int(l[0]) - 1
#             episode_id = int(l[1]) - 1
#             scene_id = int(l[2]) - 1
#             utterance_id = int(l[3])
#             annotation = l[4:8]
#             gold = l[10]
#             ls.append((season_id, episode_id, scene_id, utterance_id, annotation, gold))
#
#     annotations = []
#     for ann_file in glob.glob(os.path.join(ann_dir, '*.tsv')):
#         extend_ann(ann_file, annotations)
#     seasons = [json.load(open(input_file)) for input_file in sorted(glob.glob(os.path.join(input_dir, '*.json')))]
#
#     for season_id, episode_id, scene_id, utterance_id, annotation, gold in annotations:
#         utterance = seasons[season_id][EPISODES][episode_id][SCENES][scene_id][UTTERANCES][utterance_id]
#         if EMOTION in utterance:
#             if utterance[EMOTION] != gold: print(utterance[UTTERANCE_ID])
#             utterance[EMOTION] = [gold, annotation]
#         else:
#             print(utterance[UTTERANCE_ID])
#
#     for i, season in enumerate(seasons):
#         with open(os.path.join(output_dir, 'friends_season_0%d.json' % (i + 1)), 'w') as fout:
#             fout.write(ordered_json(season))
#
#
# def extract_reading_comprehension_padded(json_dir, output_dir, des_size):
#     season_samples = defaultdict(list)
#     random.seed(1234)
#
#     for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
#         data = json.load(open(json_file))
#         for episode_dict in data[EPISODES]:
#             for idx, scene_dict in enumerate(episode_dict[SCENES]):
#                 if scene_dict[PLOTS] is not None:
#                     entities = Counter()
#                     entities.update(scene_dict[RC_ENTITIES].keys())
#
#                     cur = idx
#                     dialog_len = len(scene_dict[UTTERANCES])
#                     while dialog_len < des_size and cur < len(episode_dict[SCENES]) - 1:
#                         cur += 1
#                         entities.update(episode_dict[SCENES][cur][RC_ENTITIES].keys())
#                         dialog_len += len(episode_dict[SCENES][cur][UTTERANCES])
#                     if dialog_len < des_size:
#                         cur = idx
#                         while cur > 0 and dialog_len < des_size:
#                             cur -= 1
#                             entities.update(episode_dict[SCENES][cur][RC_ENTITIES].keys())
#                             dialog_len += len(episode_dict[SCENES][cur][UTTERANCES])
#
#                     masking_map = {}
#                     for vi, ki in enumerate(entities.keys()):
#                         masking_map[ki] = '@ent%02d' % vi
#
#                     masked_passages = []
#                     for i, passage in enumerate(scene_dict[PLOTS]):
#                         masked_sentence = []
#                         ent_list = {}
#                         for ent, index_list in scene_dict[RC_ENTITIES].items():
#                             for index in index_list[P_ENT]:
#                                 if i == index[0]:
#                                     ent_list[index[1]] = (index[1], index[2], ent)
#                         jump = 0
#                         for j, token in enumerate(passage.split(' ')):
#                             if jump > 0:
#                                 jump -= 1
#                                 continue
#                             if j in ent_list:
#                                 masked_sentence.append(masking_map[ent_list[j][2]])
#                                 jump = ent_list[j][1] - ent_list[j][0] - 1
#                             else:
#                                 masked_sentence.append(token)
#                         masked_passages.append(masked_sentence)
#
#                     cur = idx
#                     dialog_len = len(scene_dict[UTTERANCES])
#                     next_dialog = []
#                     while dialog_len < des_size and cur < len(episode_dict[SCENES]) - 1:
#                         cur += 1
#                         for i, utterance in enumerate(episode_dict[SCENES][cur][UTTERANCES]):
#                             if utterance[TOKENS_WITH_NOTE] is not None:
#                                 tokens = [w for sent in utterance[TOKENS_WITH_NOTE] for w in sent]
#                             else:
#                                 tokens = [w for sent in utterance[TOKENS] for w in sent]
#
#                             masked_utter = {SPEAKERS: utterance[SPEAKERS], TOKENS: []}
#                             ent_list = {}
#                             for ent, index_list in episode_dict[SCENES][cur][RC_ENTITIES].items():
#                                 for index in index_list[U_ENT]:
#                                     if i == index[0]:
#                                         ent_list[index[1]] = (index[1], index[2], ent)
#                                 for index in index_list[S_ENT]:
#                                     if i == index[0]:
#                                         masked_utter[SPEAKERS][index[1]] = masking_map[ent]
#                             jump = 0
#                             for j, token in enumerate(tokens):
#                                 if jump > 0:
#                                     jump -= 1
#                                     continue
#                                 if j in ent_list:
#                                     masked_utter[TOKENS].append(masking_map[ent_list[j][2]])
#                                     jump = ent_list[j][1] - ent_list[j][0] - 1
#                                 else:
#                                     masked_utter[TOKENS].append(token)
#                             next_dialog.append(masked_utter)
#                             dialog_len += 1
#                             if dialog_len == des_size:
#                                 break
#
#                     prev_dialog = []
#                     if dialog_len < des_size:
#                         cur = idx
#                         while dialog_len < des_size and cur > 0:
#                             cur -= 1
#                             for i, utterance in enumerate(reversed(episode_dict[SCENES][cur][UTTERANCES])):
#                                 if utterance[TOKENS_WITH_NOTE] is not None:
#                                     tokens = [w for sent in utterance[TOKENS_WITH_NOTE] for w in sent]
#                                 else:
#                                     tokens = [w for sent in utterance[TOKENS] for w in sent]
#
#                                 masked_utter = {}
#                                 masked_utter[SPEAKERS] = utterance[SPEAKERS]
#                                 masked_utter[TOKENS] = []
#                                 ent_list = {}
#                                 for ent, index_list in episode_dict[SCENES][cur][RC_ENTITIES].items():
#                                     for index in index_list[U_ENT]:
#                                         if i == len(episode_dict[SCENES][cur][UTTERANCES]) - index[0] - 1:
#                                             ent_list[index[1]] = (index[1], index[2], ent)
#                                     for index in index_list[S_ENT]:
#                                         if i == len(episode_dict[SCENES][cur][UTTERANCES]) - index[0] - 1:
#                                             masked_utter[SPEAKERS][index[1]] = masking_map[ent]
#                                 jump = 0
#                                 for j, token in enumerate(tokens):
#                                     if jump > 0:
#                                         jump -= 1
#                                         continue
#                                     if j in ent_list:
#                                         masked_utter[TOKENS].append(masking_map[ent_list[j][2]])
#                                         jump = ent_list[j][1] - ent_list[j][0] - 1
#                                     else:
#                                         masked_utter[TOKENS].append(token)
#                                 prev_dialog.append(masked_utter)
#                                 dialog_len += 1
#                                 if dialog_len == des_size:
#                                     break
#
#                     masked_dialog = []
#                     for i, utterance in enumerate(scene_dict[UTTERANCES]):
#                         if utterance[TOKENS_WITH_NOTE] is not None:
#                             tokens = [w for sent in utterance[TOKENS_WITH_NOTE] for w in sent]
#                         else:
#                             tokens = [w for sent in utterance[TOKENS] for w in sent]
#
#                         masked_utter = {SPEAKERS: utterance[SPEAKERS], TOKENS: []}
#                         ent_list = {}
#                         for ent, index_list in scene_dict[RC_ENTITIES].items():
#                             for index in index_list[U_ENT]:
#                                 if i == index[0]:
#                                     ent_list[index[1]] = (index[1], index[2], ent)
#                             for index in index_list[S_ENT]:
#                                 if i == index[0]:
#                                     masked_utter[SPEAKERS][index[1]] = masking_map[ent]
#                         jump = 0
#                         for j, token in enumerate(tokens):
#                             if jump > 0:
#                                 jump -= 1
#                                 continue
#                             if j in ent_list:
#                                 masked_utter[TOKENS].append(masking_map[ent_list[j][2]])
#                                 jump = ent_list[j][1] - ent_list[j][0] - 1
#                             else:
#                                 masked_utter[TOKENS].append(token)
#                         masked_dialog.append(masked_utter)
#
#                     dialog_entities = Counter()
#                     for ent, ent_list in scene_dict[RC_ENTITIES].items():
#                         if len(ent_list[U_ENT]) > 0 or len(ent_list[S_ENT]) > 0:
#                             dialog_entities.update([masking_map[ent]])
#
#                     full_dialog = []
#                     for u in reversed(prev_dialog):
#                         full_dialog.append(u)
#                     for u in masked_dialog:
#                         full_dialog.append(u)
#                     for u in next_dialog:
#                         full_dialog.append(u)
#
#                     for sentence in masked_passages:
#                         for i, token in enumerate(sentence):
#                             if token.startswith('@ent') and token in dialog_entities:
#                                 sample = {}
#                                 query = deepcopy(sentence)
#                                 query[i] = '@placeholder'
#                                 sample[QUERY] = ' '.join(query)
#                                 sample[ANSWER] = token
#                                 sample[UTTERANCES] = full_dialog
#                                 sample[SCENE_ID] = scene_dict[SCENE_ID]
#                                 season_samples[data[SEASON_ID]].append(sample)
#
#     trn = []
#     dev = []
#     tst = []
#     for season_id, s_samples in season_samples.items():
#         l = len(s_samples)
#         random.shuffle(s_samples)
#         trn.extend(s_samples[:int(0.8 * l)])
#         dev.extend(s_samples[int(0.8 * l):int(0.9 * l)])
#         tst.extend(s_samples[int(0.9 * l):])
#
#     trn = relabel(trn)
#     dev = relabel(dev)
#     tst = relabel(tst)
#     print(len(trn), len(dev), len(tst))
#
#     with open(os.path.join(output_dir, 'trn-%d.json' % des_size), 'w') as fout:
#         fout.write(json.dumps(trn, indent=2))
#
#     with open(os.path.join(output_dir, 'dev-%d.json' % des_size), 'w') as fout:
#         fout.write(json.dumps(dev, indent=2))
#
#     with open(os.path.join(output_dir, 'tst-%d.json' % des_size), 'w') as fout:
#         fout.write(json.dumps(tst, indent=2))