# ========================================================================
# Copyright 2018 Emory University
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========================================================================
import glob
import json
import os
import re
import random
import numpy as np
from copy import deepcopy
from collections import Counter, OrderedDict, defaultdict
__author__ = 'Jinho D. Choi'
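# Utility functions for the Character Mining corpus (Friends transcripts): ordered/pretty
# JSON serialization, corpus statistics, and extraction of the character identification,
# emotion detection, and reading comprehension subtasks.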
SEASON_ID = 'season_id'
EPISODES = 'episodes'
EPISODE_ID = 'episode_id'
EPISODE = 'episode'
SCENES = 'scenes'
SCENE_ID = 'scene_id'
UTTERANCES = 'utterances'
UTTERANCE_ID = 'utterance_id'
SPEAKERS = 'speakers'
TRANSCRIPT = 'transcript'
TRANSCRIPT_WITH_NOTE = 'transcript_with_note'
TOKENS = 'tokens'
TOKENS_WITH_NOTE = 'tokens_with_note'
# character identification
CHARACTER_ENTITIES = 'character_entities'
# emotion detection
EMOTION = 'emotion'
# movie
CAPTION = 'caption'
# reading comprehension
RC_ENTITIES = 'rc_entities'
PLOTS = 'plots'
P_ENT = 'p_ent'
U_ENT = 'u_ent'
S_ENT = 's_ent'
QUERY = 'query'
ANSWER = 'answer'
# =================================== Ordered JSON ===================================
class NoIndent(object):
def __init__(self, value):
self.value = value
class NoIndentEncoder(json.JSONEncoder):
REGEX = re.compile(r'@@@(\d+)@@@')
def __init__(self, *args, **kwargs):
super(NoIndentEncoder, self).__init__(*args, **kwargs)
self.kwargs = dict(kwargs)
del self.kwargs['indent']
self._replacements = {}
def default(self, o):
if isinstance(o, NoIndent):
key = len(self._replacements)
self._replacements[key] = json.dumps(o.value, **self.kwargs)
return "@@@%d@@@" % (key)
else:
return super(NoIndentEncoder, self).default(o)
    def encode(self, o):
        result = super(NoIndentEncoder, self).encode(o)
        out = []
        m = self.REGEX.search(result)
        while m:
            key = int(m.group(1))
            # strip the quotes around the placeholder and splice in the pre-serialized value
            out.append(result[:m.start(0)-1])
            out.append(self._replacements[key])
            result = result[m.end(0)+1:]
            m = self.REGEX.search(result)
        # keep whatever follows the last placeholder (e.g., the document's closing brackets)
        out.append(result)
        return ''.join(out)
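# Illustrative sketch (not part of the original pipeline): NoIndent marks a value that
# NoIndentEncoder serializes on a single line while the surrounding document keeps the
# given indent. The sample utterance id and tokens below are hypothetical.
def _noindent_example():
    doc = OrderedDict([
        (UTTERANCE_ID, 's01_e01_c01_u001'),  # hypothetical id
        (TOKENS, NoIndent([['There', "'s", 'nothing', 'to', 'tell', '!']])),
    ])
    # TOKENS is emitted as a one-line JSON array; the outer object is pretty-printed with indent=2.
    return json.dumps(doc, cls=NoIndentEncoder, indent=2)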
def pair(key, d, noindent=False):
s = d[key]
if isinstance(s, str): s = ' '.join(s.split())
return (key, NoIndent(s)) if noindent else (key, s)
def ordered_json(input, plot=True, wo_note=True, wi_note=True, caption=True, character_entities=True, emotion=True, rc_entities=True):
s = json.load(open(input)) if isinstance(input, str) else input
season = OrderedDict([pair(SEASON_ID, s), pair(EPISODES, s)])
    if len(s) != len(season): print('Error: unhandled keys at the season level')
episodes = season[EPISODES]
for i, e in enumerate(episodes):
episode = OrderedDict([pair(EPISODE_ID, e), pair(SCENES, e)])
        if len(e) != len(episode): print('Error: unhandled keys in ' + episode[EPISODE_ID])
episodes[i] = episode
scenes = episode[SCENES]
for j, c in enumerate(scenes):
scene = [pair(SCENE_ID, c), pair(UTTERANCES, c)]
if plot and PLOTS in c: scene.append(pair(PLOTS, c))
if rc_entities and RC_ENTITIES in c:
scene.append((RC_ENTITIES, c[RC_ENTITIES]))
for d in c[RC_ENTITIES].values():
for k, v in d.items(): d[k] = NoIndent(v)
scene = OrderedDict(scene)
            if len(c) != len(scene): print('Error: unhandled keys in ' + scene[SCENE_ID])
scenes[j] = scene
utterances = scene[UTTERANCES]
for k, u in enumerate(utterances):
utterance = [pair(UTTERANCE_ID, u), pair(SPEAKERS, u, True)]
if wo_note:
utterance.append(pair(TRANSCRIPT, u))
utterance.append((TOKENS, [NoIndent(t) for t in u[TOKENS]]))
if wi_note:
utterance.append(pair(TRANSCRIPT_WITH_NOTE, u))
twn = u[TOKENS_WITH_NOTE]
utterance.append((TOKENS_WITH_NOTE, [NoIndent(t) for t in twn] if twn else twn))
if character_entities and CHARACTER_ENTITIES in u:
utterance.append((CHARACTER_ENTITIES, [NoIndent(t) for t in u[CHARACTER_ENTITIES]]))
if emotion and EMOTION in u:
utterance.append((EMOTION, NoIndent(u[EMOTION])))
if caption and CAPTION in u:
utterance.append((CAPTION, NoIndent(u[CAPTION])))
utterance = OrderedDict(utterance)
                if len(u) != len(utterance): print('Error: unhandled keys in ' + utterance[UTTERANCE_ID])
utterances[k] = utterance
    return json.dumps(season, cls=NoIndentEncoder, indent=2)
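# Minimal usage sketch (hypothetical paths): ordered_json() accepts either a loaded season
# dict or a path to a season JSON file and returns the serialized string, e.g.
#   with open('friends_season_01.ordered.json', 'w') as fout:
#       fout.write(ordered_json('friends_season_01.json', caption=False))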
# =================================== General ===================================
def general_stats(json_dir):
def stats(json_file):
num_scenes = 0
num_utterances = 0
num_utterances_wn = 0
num_sentences = 0
num_sentences_wn = 0
num_tokens = 0
num_tokens_wn = 0
speaker_list = set()
season = json.load(open(json_file))
episodes = season[EPISODES]
for episode in episodes:
scenes = episode[SCENES]
num_scenes += len(scenes)
for scene in scenes:
utterances = scene[UTTERANCES]
num_utterances_wn += len(utterances)
for utterance in utterances:
speaker_list.update(utterance[SPEAKERS])
tokens = utterance[TOKENS]
if tokens:
num_utterances += 1
num_sentences += len(tokens)
num_tokens += sum([len(t) for t in tokens])
tokens_wn = utterance[TOKENS_WITH_NOTE] or tokens
num_sentences_wn += len(tokens_wn)
num_tokens_wn += sum([len(t) for t in tokens_wn])
        return [season[SEASON_ID], len(episodes), num_scenes, num_utterances, num_sentences, num_tokens, speaker_list,
                num_utterances_wn, num_sentences_wn, num_tokens_wn]
g_speaker_list = set()
print('\t'.join(['Season ID', 'Episodes', 'Scenes', 'Utterances', 'Sentences', 'Tokens', 'Speakers', 'Utterances (WN)', 'Sentences (WN)', 'Tokens (WN)']))
for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
l = stats(json_file)
g_speaker_list.update(l[6])
l[6] = len(l[6])
print('\t'.join(map(str, l)))
print('All speakers: %s' % (len(g_speaker_list)))
def compare_peer(input_dir1, input_dir2):
for input_file1 in sorted(glob.glob(os.path.join(input_dir1, '*.json'))):
input_file2 = os.path.join(input_dir2, os.path.basename(input_file1))
print(os.path.basename(input_file1))
season1 = json.load(open(input_file1))
season2 = json.load(open(input_file2))
season_id = season1[SEASON_ID]
episodes1 = season1[EPISODES]
episodes2 = season2[EPISODES]
if len(episodes1) != len(episodes2):
print('Episode mismatch: %s - %d, %d' % (season_id, len(episodes1), len(episodes2)))
for episode1, episode2 in zip(episodes1, episodes2):
episode_id = episode1[EPISODE_ID]
scenes1 = episode1[SCENES]
scenes2 = episode2[SCENES]
if len(scenes1) != len(scenes2):
print('Scene mismatch: %s - %d, %d' % (episode_id, len(scenes1), len(scenes2)))
for scene1, scene2 in zip(scenes1, scenes2):
scene_id = scene1[SCENE_ID]
utterances1 = scene1[UTTERANCES]
utterances2 = scene2[UTTERANCES]
if len(utterances1) != len(utterances2):
print('Utterance mismatch: %s - %d, %d' % (scene_id, len(utterances1), len(utterances2)))
for utterance1, utterance2 in zip(utterances1, utterances2):
utterance_id = utterance1[UTTERANCE_ID]
tokens1 = utterance1[TOKENS]
tokens2 = utterance2[TOKENS]
if len(tokens1) != len(tokens2):
print('Token mismatch: %s - %d, %d' % (utterance_id, len(tokens1), len(tokens2)))
                    m = [i for i in range(min(len(tokens1), len(tokens2))) if tokens1[i] != tokens2[i]]
if m:
print('Token mismatch: %s - %s' % (utterance_id, str(m)))
tokens1 = utterance1[TOKENS_WITH_NOTE]
tokens2 = utterance2[TOKENS_WITH_NOTE]
                    if tokens1 is None or tokens2 is None:
                        if (tokens1 is None) != (tokens2 is None):
                            print('Token WN mismatch: %s - notes on one side only' % utterance_id)
                        continue
if len(tokens1) != len(tokens2):
print('Token WN mismatch: %s - %d, %d' % (utterance_id, len(tokens1), len(tokens2)))
                    m = [i for i in range(min(len(tokens1), len(tokens2))) if tokens1[i] != tokens2[i]]
if m:
print('Token WN mismatch: %s - %s' % (utterance_id, str(m)))
# =================================== Character Identification ===================================
def extract_character_identification(input_dir, output_dir):
"""
trn: episodes 1-19
dev: episodes 20-21
tst: episodes 22-end
"""
trn = {SEASON_ID: 'trn', EPISODES: []}
dev = {SEASON_ID: 'dev', EPISODES: []}
tst = {SEASON_ID: 'tst', EPISODES: []}
def get_entities(entity_list):
return [entity for entity in entity_list if entity[-1] != 'Non-Entity']
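    # Each sentence in CHARACTER_ENTITIES holds a list of mentions; a mention is expected to
    # look like [begin_token, end_token, label, ...] (cf. entities[2:] in entity_stats below),
    # e.g. [0, 1, 'Ross Geller'] or [3, 4, 'Non-Entity'] (hypothetical values), and
    # get_entities() drops the mentions whose last label is 'Non-Entity'.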
for i, input_file in enumerate(sorted(glob.glob(os.path.join(input_dir, '*.json')))):
if i >= 4: break
season = json.load(open(input_file))
print(input_file)
for episode in season[EPISODES]:
episode_id = int(episode[EPISODE_ID].split('_')[1][1:])
d = tst if episode_id >= 22 else dev if episode_id >= 20 else trn
d[EPISODES].append(episode)
scenes = []
for scene in episode[SCENES]:
utterances = []
for utterance in scene[UTTERANCES]:
if utterance[TOKENS]:
utterances.append(utterance)
if CHARACTER_ENTITIES in utterance:
utterance[CHARACTER_ENTITIES] = [get_entities(entity_list) for entity_list in utterance[CHARACTER_ENTITIES]]
else:
print(utterance[UTTERANCE_ID])
if utterances:
scene[UTTERANCES] = utterances
scenes.append(scene)
episode[SCENES] = scenes
with open(os.path.join(output_dir, 'character-identification-trn.json'), 'w') as fout:
fout.write(ordered_json(trn, plot=False, wi_note=False, caption=False, emotion=False, rc_entities=False))
with open(os.path.join(output_dir, 'character-identification-dev.json'), 'w') as fout:
fout.write(ordered_json(dev, plot=False, wi_note=False, caption=False, emotion=False, rc_entities=False))
with open(os.path.join(output_dir, 'character-identification-tst.json'), 'w') as fout:
fout.write(ordered_json(tst, plot=False, wi_note=False, caption=False, emotion=False, rc_entities=False))
def entity_stats(json_dir):
def stats(json_file):
speaker_list = []
entity_list = []
num_scenes = 0
num_utterances = 0
num_tokens = 0
num_mentions = 0
season = json.load(open(json_file))
episodes = season[EPISODES]
for episode in episodes:
scenes = episode[SCENES]
num_scenes += len(scenes)
for scene in scenes:
utterances = scene[UTTERANCES]
num_utterances += len(utterances)
for utterance in utterances:
num_tokens += sum([len(t) for t in utterance[TOKENS]])
speaker_list.extend(utterance[SPEAKERS])
if len(utterance[TOKENS]) != len(utterance[CHARACTER_ENTITIES]):
                        print(utterance[UTTERANCE_ID])
for character_entities in utterance[CHARACTER_ENTITIES]:
num_mentions += len(character_entities)
for entities in character_entities:
entity_list.extend(entities[2:])
g_speaker_list.extend(speaker_list)
g_entity_list.extend(entity_list)
return [season[SEASON_ID], len(episodes), num_scenes, num_utterances, num_tokens, len(set(speaker_list)), num_mentions, len(set(entity_list))]
g_speaker_list = []
g_entity_list = []
print('\t'.join(['Dataset', 'Episodes', 'Scenes', 'Utterances', 'Tokens', 'Speakers', 'Mentions', 'Entities']))
for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
l = stats(json_file)
print('\t'.join(map(str, l)))
print('All speakers: %s' % (len(set(g_speaker_list))))
print('All entities: %s' % (len(set(g_entity_list))))
# =================================== Emotion Detection ===================================
def extract_emotion_detection(input_dir, output_dir):
trn = {SEASON_ID: 'trn', EPISODES: []}
dev = {SEASON_ID: 'dev', EPISODES: []}
tst = {SEASON_ID: 'tst', EPISODES: []}
DEV = {'s01_e15', 's01_e20', 's02_e10', 's02_e20', 's03_e01', 's03_e09', 's03_e21', 's04_e01', 's04_e06', 's04_e10', 's04_e21'}
TST = {'s01_e01', 's01_e10', 's02_e08', 's02_e23', 's03_e08', 's03_e20', 's04_e02', 's04_e20', 's04_e17'}
def get_entities(entity_list):
return [entity for entity in entity_list if entity[-1] != 'Non-Entity']
for i, input_file in enumerate(sorted(glob.glob(os.path.join(input_dir, '*.json')))):
if i >= 4: break
season = json.load(open(input_file))
print(input_file)
for episode in season[EPISODES]:
episode_id = episode[EPISODE_ID]
d = tst if episode_id in TST else dev if episode_id in DEV else trn
d[EPISODES].append(episode)
scenes = []
for scene in episode[SCENES]:
utterances = []
emotions = 0
misses = []
for utterance in scene[UTTERANCES]:
if utterance[TOKENS]:
if EMOTION in utterance:
utterance[EMOTION] = utterance[EMOTION][0]
emotions += 1
else:
misses.append(utterance[UTTERANCE_ID])
utterances.append(utterance)
if emotions > 0:
if emotions != len(utterances): print(misses)
scene[UTTERANCES] = utterances
scenes.append(scene)
episode[SCENES] = scenes
with open(os.path.join(output_dir, 'emotion-detection-trn.json'), 'w') as fout:
fout.write(ordered_json(trn, plot=False, wi_note=False, caption=False, character_entities=False, rc_entities=False))
with open(os.path.join(output_dir, 'emotion-detection-dev.json'), 'w') as fout:
fout.write(ordered_json(dev, plot=False, wi_note=False, caption=False, character_entities=False, rc_entities=False))
with open(os.path.join(output_dir, 'emotion-detection-tst.json'), 'w') as fout:
fout.write(ordered_json(tst, plot=False, wi_note=False, caption=False, character_entities=False, rc_entities=False))
def emotion_stats(json_dir):
def stats(json_file):
emotions = {}
num_scenes = 0
num_utterances = 0
episode_ids = []
season = json.load(open(json_file))
episodes = season[EPISODES]
for episode in episodes:
episode_ids.append(episode[EPISODE_ID])
scenes = episode[SCENES]
num_scenes += len(scenes)
for scene in scenes:
utterances = scene[UTTERANCES]
num_utterances += len(utterances)
for utterance in utterances:
e = utterance[EMOTION]
                    emotions[e] = emotions.get(e, 0) + 1
        print(episode_ids)
        return [season[SEASON_ID], len(episodes), num_scenes, num_utterances] + [emotions.get(e, 0) for e in emotion_list]
emotion_list = ['Joyful', 'Mad', 'Neutral', 'Peaceful', 'Powerful', 'Sad', 'Scared']
print('\t'.join(['Dataset', 'Episodes', 'Scenes', 'Utterances'] + emotion_list))
for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
l = stats(json_file)
print('\t'.join(map(str, l)))
# =================================== Reading Comprehension ===================================
def relabel(samples):
re_samples = []
for sample in samples:
sam = {}
q_words = sample[QUERY].split(' ')
d_words = []
for utter in sample[UTTERANCES]:
d_words += utter[SPEAKERS]
d_words += utter[TOKENS]
entity_dict = {}
entity_id = 0
for word in d_words + q_words:
if (word.startswith('@ent')) and (word not in entity_dict):
entity_dict[word] = '@ent%02d' % entity_id
entity_id += 1
re_document = []
for utter in sample[UTTERANCES]:
sent = {SPEAKERS: ' '.join(
[entity_dict[w] if w in entity_dict else w for w in utter[SPEAKERS]]),
TOKENS: ' '.join([entity_dict[w] if w in entity_dict else w for w in utter[TOKENS]])}
re_document.append(sent)
sam[SCENE_ID] = sample[SCENE_ID]
sam[QUERY] = ' '.join([entity_dict[w] if w in entity_dict else w for w in q_words])
sam[ANSWER] = entity_dict[sample[ANSWER]]
sam[UTTERANCES] = re_document
re_samples.append(sam)
return re_samples
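# Illustrative sketch (hypothetical ids): relabel() renumbers the global '@entNN' markers of each
# sample to consecutive ids in order of first appearance in the dialog and then the query, and
# joins the speaker/token/query lists into strings; e.g. a sample that only mentions '@ent07'
# and '@ent19' comes out with those markers rewritten to '@ent00' and '@ent01'.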
def extract_reading_comprehension(json_dir, output_dir):
season_samples = defaultdict(list)
random.seed(1234)
for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
season = json.load(open(json_file))
for episode in season[EPISODES]:
for scene in episode[SCENES]:
if PLOTS in scene and scene[PLOTS]:
masking_map = {}
for vi, ki in enumerate(scene[RC_ENTITIES].keys()):
masking_map[ki] = '@ent%02d' % vi
masked_passages = []
for i, passage in enumerate(scene[PLOTS]):
masked_sentence = []
ent_list = {}
for ent, index_list in scene[RC_ENTITIES].items():
for index in index_list[P_ENT]:
if i == index[0]:
ent_list[index[1]] = (index[1], index[2], ent)
jump = 0
for j, token in enumerate(passage.split(' ')):
if jump > 0:
jump -= 1
continue
if j in ent_list:
masked_sentence.append(masking_map[ent_list[j][2]])
jump = ent_list[j][1] - ent_list[j][0] - 1
else:
masked_sentence.append(token)
masked_passages.append(masked_sentence)
masked_dialog = []
for i, utterance in enumerate(scene[UTTERANCES]):
if utterance[TOKENS_WITH_NOTE] is not None:
tokens = [w for sent in utterance[TOKENS_WITH_NOTE] for w in sent]
else:
tokens = [w for sent in utterance[TOKENS] for w in sent]
masked_utter = {SPEAKERS: utterance[SPEAKERS], TOKENS: []}
ent_list = {}
for ent, index_list in scene[RC_ENTITIES].items():
for index in index_list[U_ENT]:
if i == index[0]:
ent_list[index[1]] = (index[1], index[2], ent)
for index in index_list[S_ENT]:
if i == index[0]:
masked_utter[SPEAKERS][index[1]] = masking_map[ent]
jump = 0
for j, token in enumerate(tokens):
if jump > 0:
jump -= 1
continue
if j in ent_list:
masked_utter[TOKENS].append(masking_map[ent_list[j][2]])
jump = ent_list[j][1] - ent_list[j][0] - 1
else:
masked_utter[TOKENS].append(token)
masked_dialog.append(masked_utter)
dialog_entities = Counter()
for ent, ent_list in scene[RC_ENTITIES].items():
if len(ent_list[U_ENT]) > 0 or len(ent_list[S_ENT]) > 0:
dialog_entities.update([masking_map[ent]])
for sentence in masked_passages:
for i, token in enumerate(sentence):
if token.startswith('@ent') and token in dialog_entities:
sample = {}
query = deepcopy(sentence)
query[i] = '@placeholder'
sample[QUERY] = ' '.join(query)
sample[ANSWER] = token
sample[UTTERANCES] = masked_dialog
sample[SCENE_ID] = scene[SCENE_ID]
season_samples[season[SEASON_ID]].append(sample)
trn = []
dev = []
tst = []
for season_id, s_samples in season_samples.items():
n = len(s_samples)
random.shuffle(s_samples)
trn.extend(s_samples[:int(0.8 * n)])
dev.extend(s_samples[int(0.8 * n):int(0.9 * n)])
tst.extend(s_samples[int(0.9 * n):])
trn = relabel(trn)
dev = relabel(dev)
tst = relabel(tst)
with open(os.path.join(output_dir, 'trn.json'), 'w') as fout:
fout.write(json.dumps(trn, indent=2))
with open(os.path.join(output_dir, 'dev.json'), 'w') as fout:
fout.write(json.dumps(dev, indent=2))
with open(os.path.join(output_dir, 'tst.json'), 'w') as fout:
fout.write(json.dumps(tst, indent=2))
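# A relabeled sample is written as a flat JSON object; a sketch with hypothetical values:
#   {
#     "scene_id": "s01_e01_c01",
#     "query": "@ent00 runs into @placeholder at the coffee shop",
#     "answer": "@ent01",
#     "utterances": [{"speakers": "@ent00", "tokens": "Hey @ent01 !"}, ...]
#   }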
def reading_stats(json_dir):
def create(dataset, num_queries, num_entity_count_query, num_entity_type_query, num_entity_count_utt, num_entity_type_utt, num_utterances):
return [dataset,
num_queries,
num_utterances / num_queries,
num_entity_type_query / num_queries,
num_entity_count_query / num_queries,
num_entity_type_utt / num_queries,
num_entity_count_utt / num_queries]
def stats(json_file):
documents = json.load(open(json_file))
num_queries = len(documents)
num_entity_count_query = 0
num_entity_type_query = 0
num_entity_count_utt = 0
num_entity_type_utt = 0
num_utterances = 0
for doc in documents:
ents = [doc[ANSWER] if q == '@placeholder' else q for q in doc[QUERY].split() if q.startswith('@ent') or q == '@placeholder']
num_entity_count_query += len(ents)
num_entity_type_query += len(set(ents))
num_utterances += len(doc[UTTERANCES])
ents = []
for utterance in doc[UTTERANCES]:
ents.extend(utterance[SPEAKERS].split())
ents.extend([t for t in utterance[TOKENS].split() if t.startswith('@ent')])
num_entity_type_utt += len(set(ents))
num_entity_count_utt += len(ents)
return [num_queries, num_entity_count_query, num_entity_type_query, num_entity_count_utt, num_entity_type_utt, num_utterances]
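    # Header abbreviations: Q = query, U = utterances (dialog), {E} = distinct '@ent' entities,
    # [E] = entity mention count; every column after 'Queries' is averaged per query, with the
    # last two columns counting entities that occur in the dialog (speakers + tokens).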
print('\t'.join(['Dataset', 'Queries', 'U / Q', '{E} / Q', '[E] / Q', '{E} / U', '[E] / U']))
g_num = np.zeros(6)
for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
l = stats(json_file)
g_num += np.array(l)
print('\t'.join(map(str, create(json_file[-15:-12].upper(), *l))))
print('\t'.join(map(str, create('Total', *g_num))))
# =================================== Main ===================================
if __name__ == '__main__':
# json_dir = '/Users/jdchoi/Git/character-mining/json'
# general_stats(json_dir)
# character identification
# input_dir = '/Users/jdchoi/Git/character-mining/json'
# output_dir = '/Users/jdchoi/Git/character-identification/json'
# extract_character_identification(input_dir, output_dir)
# entity_stats(output_dir)
    # emotion detection
# input_dir = '/Users/jdchoi/Git/character-mining/json'
# output_dir = '/Users/jdchoi/Git/emotion-detection/json'
# extract_emotion_detection(input_dir, output_dir)
# emotion_stats(output_dir)
# reading comprehension
# json_dir = '/Users/jdchoi/Git/character-mining/json'
output_dir = '/Users/jdchoi/Git/reading-comprehension/json'
# extract_reading_comprehension(json_dir, output_dir)
reading_stats(output_dir)
# input_dir = '/Users/jdchoi/Git/character-mining/json'
# ann_dir = '/Users/jdchoi/Downloads/dataset'
# output_dir = '/Users/jdchoi/Git/character-mining/json/em'
# merge_em(input_dir, ann_dir, output_dir)
# input_dir1 = '/Users/jdchoi/Git/character-mining-dev/json-bak'
# input_dir2 = '/Users/jdchoi/Downloads/Friends_newly_compiled'
# output_dir = '/Users/jdchoi/Git/character-mining/json'
# merge_rc(input_dir1, input_dir2, output_dir)
# def merge_rc(input_dir1, input_dir2, output_dir):
# def get_entities(rc_entities):
# plot = rc_entities['plot_entities']
# speaker = rc_entities['speaker_entities']
# utterance = rc_entities['utterance_entities']
# entities = {}
#
# if plot:
# for name, ts in plot.items():
# d = entities.setdefault(name, OrderedDict([(P_ENT, []), (U_ENT, []), (S_ENT, [])]))
# d[P_ENT] = [t[:-1] for t in ts]
#
# for name, ts in utterance.items():
# d = entities.setdefault(name, OrderedDict([(P_ENT, []), (U_ENT, []), (S_ENT, [])]))
# d[U_ENT] = [t[:-1] for t in ts]
#
# for name, ts in speaker.items():
# d = entities.setdefault(name, OrderedDict([(P_ENT, []), (U_ENT, []), (S_ENT, [])]))
# d[S_ENT] = [t[:-1] for t in ts]
#
# return entities
#
# for input_file1 in sorted(glob.glob(os.path.join(input_dir1, '*.json'))):
# input_file2 = os.path.join(input_dir2, os.path.basename(input_file1))
# print(os.path.basename(input_file1))
#
# season1 = json.load(open(input_file1))
# season2 = json.load(open(input_file2))
#
# episodes1 = season1[EPISODES]
# episodes2 = season2[EPISODES]
#
# for episode1, episode2 in zip(episodes1, episodes2):
# scenes1 = episode1[SCENES]
# scenes2 = episode2[SCENES]
#
# for scene1, scene2 in zip(scenes1, scenes2):
# scene1[PLOTS] = scene2[PLOTS]
# scene1[RC_ENTITIES] = get_entities(scene2[RC_ENTITIES])
#
# with open(os.path.join(output_dir, os.path.basename(input_file1)), 'w') as fout:
# fout.write(ordered_json(season1))
#
#
# def merge_em(input_dir, ann_dir, output_dir):
# def extend_ann(ann_file, ls):
# fin = open(ann_file)
#
# for i, line in enumerate(fin):
# if i == 0: continue
# l = line.split()
# season_id = int(l[0]) - 1
# episode_id = int(l[1]) - 1
# scene_id = int(l[2]) - 1
# utterance_id = int(l[3])
# annotation = l[4:8]
# gold = l[10]
# ls.append((season_id, episode_id, scene_id, utterance_id, annotation, gold))
#
#
# annotations = []
# for ann_file in glob.glob(os.path.join(ann_dir, '*.tsv')): extend_ann(ann_file, annotations)
# seasons = [json.load(open(input_file)) for input_file in sorted(glob.glob(os.path.join(input_dir, '*.json')))]
#
# for season_id, episode_id, scene_id, utterance_id, annotation, gold in annotations:
# utterance = seasons[season_id][EPISODES][episode_id][SCENES][scene_id][UTTERANCES][utterance_id]
# if EMOTION in utterance:
# if utterance[EMOTION] != gold: print(utterance[UTTERANCE_ID])
# utterance[EMOTION] = [gold, annotation]
# else:
# print(utterance[UTTERANCE_ID])
#
# for i, season in enumerate(seasons):
# with open(os.path.join(output_dir, 'friends_season_0%d.json' % (i+1)), 'w') as fout:
# fout.write(ordered_json(season))
#
# def extract_reading_comprehension_padded(json_dir, output_dir, des_size):
# season_samples = defaultdict(list)
# random.seed(1234)
#
# for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
# data = json.load(open(json_file))
# for episode_dict in data[EPISODES]:
# for idx, scene_dict in enumerate(episode_dict[SCENES]):
# if scene_dict[PLOTS] is not None:
#
# entities = Counter()
# entities.update(scene_dict[RC_ENTITIES].keys())
#
# cur = idx
# dialog_len = len(scene_dict[UTTERANCES])
# while dialog_len < des_size and cur < len(episode_dict[SCENES]) - 1:
# cur += 1
# entities.update(episode_dict[SCENES][cur][RC_ENTITIES].keys())
# dialog_len += len(episode_dict[SCENES][cur][UTTERANCES])
# if dialog_len < des_size:
# cur = idx
# while (cur > 0 and dialog_len < des_size):
# cur -= 1
# entities.update(episode_dict[SCENES][cur][RC_ENTITIES].keys())
# dialog_len += len(episode_dict[SCENES][cur][UTTERANCES])
#
# masking_map = {}
# for vi, ki in enumerate(entities.keys()):
# masking_map[ki] = '@ent%02d' % vi
#
# masked_passages = []
# for i, passage in enumerate(scene_dict[PLOTS]):
# masked_sentence = []
# ent_list = {}
# for ent, index_list in scene_dict[RC_ENTITIES].items():
# for index in index_list[P_ENT]:
# if i == index[0]:
# ent_list[index[1]] = (index[1], index[2], ent)
# jump = 0
# for j, token in enumerate(passage.split(' ')):
# if jump > 0:
# jump -= 1
# continue
# if j in ent_list:
# masked_sentence.append(masking_map[ent_list[j][2]])
# jump = ent_list[j][1] - ent_list[j][0] - 1
# else:
# masked_sentence.append(token)
# masked_passages.append(masked_sentence)
#
# cur = idx
# dialog_len = len(scene_dict[UTTERANCES])
# next_dialog = []
# while dialog_len < des_size and cur < len(episode_dict[SCENES]) - 1:
# cur += 1
# for i, utterance in enumerate(episode_dict[SCENES][cur][UTTERANCES]):
# if utterance[TOKENS_WITH_NOTE] is not None:
# tokens = [w for sent in utterance[TOKENS_WITH_NOTE] for w in sent]
# else:
# tokens = [w for sent in utterance[TOKENS] for w in sent]
#
# masked_utter = {SPEAKERS: utterance[SPEAKERS], TOKENS: []}
# ent_list = {}
# for ent, index_list in episode_dict[SCENES][cur][RC_ENTITIES].items():
# for index in index_list[U_ENT]:
# if i == index[0]:
# ent_list[index[1]] = (index[1], index[2], ent)
# for index in index_list[S_ENT]:
# if i == index[0]:
# masked_utter[SPEAKERS][index[1]] = masking_map[ent]
# jump = 0
# for j, token in enumerate(tokens):
# if jump > 0:
# jump -= 1
# continue
# if j in ent_list:
# masked_utter[TOKENS].append(masking_map[ent_list[j][2]])
# jump = ent_list[j][1] - ent_list[j][0] - 1
# else:
# masked_utter[TOKENS].append(token)
# next_dialog.append(masked_utter)
# dialog_len += 1
# if dialog_len == des_size:
# break
#
# prev_dialog = []
# if dialog_len < des_size:
# cur = idx
# while dialog_len < des_size and cur > 0:
# cur -= 1
# for i, utterance in enumerate(reversed(episode_dict[SCENES][cur][UTTERANCES])):
# if utterance[TOKENS_WITH_NOTE] is not None:
# tokens = [w for sent in utterance[TOKENS_WITH_NOTE] for w in sent]
# else:
# tokens = [w for sent in utterance[TOKENS] for w in sent]
#
# masked_utter = {}
# masked_utter[SPEAKERS] = utterance[SPEAKERS]
# masked_utter[TOKENS] = []
# ent_list = {}
# for ent, index_list in episode_dict[SCENES][cur][RC_ENTITIES].items():
# for index in index_list[U_ENT]:
# if i == len(episode_dict[SCENES][cur][UTTERANCES]) - index[0] - 1:
# ent_list[index[1]] = (index[1], index[2], ent)
# for index in index_list[S_ENT]:
# if i == len(episode_dict[SCENES][cur][UTTERANCES]) - index[0] - 1:
# masked_utter[SPEAKERS][index[1]] = masking_map[ent]
# jump = 0
# for j, token in enumerate(tokens):
# if jump > 0:
# jump -= 1
# continue
# if j in ent_list:
# masked_utter[TOKENS].append(masking_map[ent_list[j][2]])
# jump = ent_list[j][1] - ent_list[j][0] - 1
# else:
# masked_utter[TOKENS].append(token)
# prev_dialog.append(masked_utter)
# dialog_len += 1
# if dialog_len == des_size:
# break
#
# masked_dialog = []
# for i, utterance in enumerate(scene_dict[UTTERANCES]):
# if utterance[TOKENS_WITH_NOTE] is not None:
# tokens = [w for sent in utterance[TOKENS_WITH_NOTE] for w in sent]
# else:
# tokens = [w for sent in utterance[TOKENS] for w in sent]
#
# masked_utter = {SPEAKERS: utterance[SPEAKERS], TOKENS: []}
# ent_list = {}
# for ent, index_list in scene_dict[RC_ENTITIES].items():
# for index in index_list[U_ENT]:
# if i == index[0]:
# ent_list[index[1]] = (index[1], index[2], ent)
# for index in index_list[S_ENT]:
# if i == index[0]:
# masked_utter[SPEAKERS][index[1]] = masking_map[ent]
# jump = 0
# for j, token in enumerate(tokens):
# if jump > 0:
# jump -= 1
# continue
# if j in ent_list:
# masked_utter[TOKENS].append(masking_map[ent_list[j][2]])
# jump = ent_list[j][1] - ent_list[j][0] - 1
# else:
# masked_utter[TOKENS].append(token)
# masked_dialog.append(masked_utter)
#
# dialog_entities = Counter()
# for ent, ent_list in scene_dict[RC_ENTITIES].items():
# if len(ent_list[U_ENT]) > 0 or len(ent_list[S_ENT]) > 0:
# dialog_entities.update([masking_map[ent]])
#
# full_dialog = []
# for u in reversed(prev_dialog):
# full_dialog.append(u)
# for u in masked_dialog:
# full_dialog.append(u)
# for u in next_dialog:
# full_dialog.append(u)
#
# for sentence in masked_passages:
# for i, token in enumerate(sentence):
# if token.startswith('@ent') and token in dialog_entities:
# sample = {}
# query = deepcopy(sentence)
# query[i] = '@placeholder'
# sample[QUERY] = ' '.join(query)
# sample[ANSWER] = token
# sample[UTTERANCES] = full_dialog
# sample[SCENE_ID] = scene_dict[SCENE_ID]
# season_samples[data[SEASON_ID]].append(sample)
#
# trn = []
# dev = []
# tst = []
# for season_id, s_samples in season_samples.items():
# l = len(s_samples)
# random.shuffle(s_samples)
# trn.extend(s_samples[:int(0.8 * l)])
# dev.extend(s_samples[int(0.8 * l):int(0.9 * l)])
# tst.extend(s_samples[int(0.9 * l):])
#
# trn = relabel(trn)
# dev = relabel(dev)
# tst = relabel(tst)
# print(len(trn), len(dev), len(tst))
#
# with open(os.path.join(output_dir, 'trn-%d.json' % des_size), 'w') as fout:
# fout.write(json.dumps(trn, indent=2))
#
# with open(os.path.join(output_dir, 'dev-%d.json' % des_size), 'w') as fout:
# fout.write(json.dumps(dev, indent=2))
#
# with open(os.path.join(output_dir, 'tst-%d.json' % des_size), 'w') as fout:
# fout.write(json.dumps(tst, indent=2))