malvika2003's picture
Upload folder using huggingface_hub
db5855f verified
# ========================================================================
# Copyright 2018 Emory University
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========================================================================
import glob
import json
import os
from collections import Counter, OrderedDict
__author__ = 'Jinho D. Choi'
SEASON_ID = 'season_id'
EPISODES = 'episodes'
EPISODE_ID = 'episode_id'
EPISODE = 'episode'
SCENES = 'scenes'
SCENE_ID = 'scene_id'
UTTERANCES = 'utterances'
UTTERANCE_ID = 'utterance_id'
SPEAKERS = 'speakers'
TRANSCRIPT = 'transcript'
TRANSCRIPT_WITH_NOTE = 'transcript_with_note'
TOKENS = 'tokens'
TOKENS_WITH_NOTE = 'tokens_with_note'
CHARACTER_ENTITIES = 'character_entities'
EMOTION = 'emotion'
CAPTION = 'caption'
def ordered_print(json_file, s=None):
    """Rewrite a season JSON with a fixed key order and save it as ``<json_file>.v2``.

    At every level (season, episode, scene, utterance) only the expected keys
    are kept, in a fixed order, and string values have runs of whitespace
    collapsed into single spaces.  If a level holds a different number of keys
    than the ones copied (i.e. some key would be dropped), ``Error: <n>`` is
    printed with n = 0 (season), 1 (episode), 2 (scene) or 3 (utterance);
    processing continues regardless.

    :param json_file: path of the season JSON file; the result is written to
        ``json_file + '.v2'``.
    :param s: an already loaded season dict.  When ``None`` the season is read
        from ``json_file``.  The nested episode/scene/utterance lists of ``s``
        are replaced in place with their ordered counterparts.
    :raises KeyError: if a mandatory key is missing at any level.
    """
    def pair(key, d):
        # Returns (key, value); string values get their whitespace normalised.
        value = d[key]
        if isinstance(value, str):
            value = ' '.join(value.split())
        return key, value

    if s is None:
        # Context manager so the input handle is closed right after parsing.
        with open(json_file) as fin:
            s = json.load(fin)

    season = OrderedDict([pair(SEASON_ID, s), pair(EPISODES, s)])
    if len(s) != len(season):
        print('Error: 0')
    episodes = season[EPISODES]

    for i, e in enumerate(episodes):
        episode = OrderedDict([pair(EPISODE_ID, e), pair(SCENES, e)])
        if len(e) != len(episode):
            print('Error: 1')
        episodes[i] = episode
        scenes = episode[SCENES]

        for j, c in enumerate(scenes):
            scene = OrderedDict([pair(SCENE_ID, c), pair(UTTERANCES, c)])
            if len(c) != len(scene):
                print('Error: 2')
            scenes[j] = scene
            utterances = scene[UTTERANCES]

            for k, u in enumerate(utterances):
                # Mandatory utterance fields, in output order.
                utterance = [
                    pair(UTTERANCE_ID, u),
                    pair(SPEAKERS, u),
                    pair(TRANSCRIPT, u),
                    pair(TRANSCRIPT_WITH_NOTE, u),
                    pair(TOKENS, u),
                    pair(TOKENS_WITH_NOTE, u),
                ]
                # Optional annotation layers are appended only when present.
                for optional_key in (CHARACTER_ENTITIES, EMOTION, CAPTION):
                    if optional_key in u:
                        utterance.append(pair(optional_key, u))
                if len(u) != len(utterance):
                    print('Error: 3')
                utterances[k] = OrderedDict(utterance)

    with open(json_file + '.v2', 'w') as fout:
        json.dump(season, fout, indent=4)
def general_stats(json_file):
    """Collect size statistics for one season JSON file.

    ``utterance[TOKENS]`` is treated as a list of sentences, each a list of
    tokens.  The "with note" (``_wn``) counts use ``TOKENS_WITH_NOTE`` and fall
    back to ``TOKENS`` when the former is empty; the plain counts only include
    utterances whose ``TOKENS`` list is non-empty.

    :param json_file: path of the season JSON file.
    :return: a list ``[season_id, #episodes, #scenes, #utterances, #sentences,
        #tokens, speakers, #utterances_wn, #sentences_wn, #tokens_wn]`` where
        ``speakers`` is the set of all speaker names in the season.
    """
    num_scenes = 0
    num_utterances = 0
    num_utterances_wn = 0
    num_sentences = 0
    num_sentences_wn = 0
    num_tokens = 0
    num_tokens_wn = 0
    all_speakers = set()

    # Context manager so the file handle is closed right after parsing.
    with open(json_file) as fin:
        season = json.load(fin)
    episodes = season[EPISODES]

    for episode in episodes:
        scenes = episode[SCENES]
        num_scenes += len(scenes)

        for scene in scenes:
            utterances = scene[UTTERANCES]
            # Every utterance counts here, even one that consists of a note only.
            num_utterances_wn += len(utterances)

            for utterance in utterances:
                all_speakers.update(utterance[SPEAKERS])
                tokens = utterance[TOKENS]

                if tokens:
                    num_utterances += 1
                    num_sentences += len(tokens)
                    num_tokens += sum(len(t) for t in tokens)

                tokens_wn = utterance[TOKENS_WITH_NOTE] or tokens
                num_sentences_wn += len(tokens_wn)
                num_tokens_wn += sum(len(t) for t in tokens_wn)

    # A list (not a tuple) is returned because callers replace index 6 in place.
    return [
        season[SEASON_ID],
        len(episodes),
        num_scenes,
        num_utterances,
        num_sentences,
        num_tokens,
        all_speakers,
        num_utterances_wn,
        num_sentences_wn,
        num_tokens_wn,
    ]
def print_general_stats(json_dir):
    """Print a tab-separated table of general statistics, one row per season.

    Every ``*.json`` file directly inside ``json_dir`` is processed in sorted
    filename order via ``general_stats``.  After the table, the number of
    distinct speakers across all processed files is printed.

    :param json_dir: directory containing the season JSON files.
    """
    all_speakers = set()
    # One header per value in the row; the last three are the counts that
    # include notes.
    print('\t'.join([
        'Season ID', 'Episodes', 'Scenes', 'Utterances', 'Sentences', 'Tokens', 'Speakers',
        'Utterances (with note)', 'Sentences (with note)', 'Tokens (with note)',
    ]))

    for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
        stats = general_stats(json_file)
        # Index 6 holds the speaker set: merge it globally, then print its size.
        all_speakers.update(stats[6])
        stats[6] = len(stats[6])
        print('\t'.join(map(str, stats)))

    print('All speakers: %s' % (len(all_speakers)))
def entity_stats(json_dir, max_seasons=4):
    """Print a tab-separated table of character-entity statistics per season.

    Only utterances that carry a ``CHARACTER_ENTITIES`` field and have a
    non-empty ``TOKENS`` list are counted.  A scene counts as annotated when it
    contains at least one such utterance.  Mentions that contain the label
    ``'Non-Entity'`` are skipped.  A mention with exactly three elements is
    counted as singular, any other as plural; everything from the third element
    on is treated as an entity label (the first two elements are presumably the
    token span of the mention -- verify against the corpus format).

    Columns: season id, episodes, annotated scenes, utterances, tokens,
    distinct speakers, singular mentions, plural mentions, all mentions,
    clusters (distinct entity labels per scene, summed over scenes) and
    distinct entity labels in the season.  After the table the numbers of
    distinct speakers and entity labels across all processed seasons are
    printed.

    :param json_dir: directory containing the season JSON files.
    :param max_seasons: process at most this many files, taken in sorted
        filename order; ``None`` processes every file.
    """
    g_speakers = set()
    g_entities = set()
    # Header order mirrors the order of the values printed for each season.
    print('\t'.join([
        'Season ID', 'Episodes', 'Scenes', 'Utterances', 'Tokens', 'Speakers',
        'Singular', 'Plural', 'Mentions', 'Clusters', 'Entities',
    ]))

    json_files = sorted(glob.glob(os.path.join(json_dir, '*.json')))
    if max_seasons is not None:
        json_files = json_files[:max_seasons]

    for json_file in json_files:
        speakers = set()
        entity_labels = set()
        num_clusters = 0
        num_scenes = 0
        num_utterances = 0
        num_tokens = 0
        num_mentions = 0
        num_singular_mentions = 0
        num_plural_mentions = 0

        # Context manager so the file handle is closed right after parsing.
        with open(json_file) as fin:
            season = json.load(fin)
        episodes = season[EPISODES]

        for episode in episodes:
            for scene in episode[SCENES]:
                annotated = False
                cluster_set = set()

                for utterance in scene[UTTERANCES]:
                    if CHARACTER_ENTITIES not in utterance or not utterance[TOKENS]:
                        continue

                    annotated = True
                    num_utterances += 1
                    # TOKENS is a list of sentences, so count the tokens inside
                    # each sentence rather than the number of sentences.
                    num_tokens += sum(len(t) for t in utterance[TOKENS])
                    speakers.update(utterance[SPEAKERS])

                    for sentence_mentions in utterance[CHARACTER_ENTITIES]:
                        for mention in sentence_mentions:
                            if 'Non-Entity' in mention:
                                continue

                            labels = mention[2:]
                            entity_labels.update(labels)
                            cluster_set.update(labels)

                            if len(mention) == 3:
                                num_singular_mentions += 1
                            else:
                                num_plural_mentions += 1
                            num_mentions += 1

                if annotated:
                    num_scenes += 1
                num_clusters += len(cluster_set)

        g_speakers.update(speakers)
        g_entities.update(entity_labels)
        print('\t'.join(map(str, [
            season[SEASON_ID], len(episodes), num_scenes, num_utterances, num_tokens,
            len(speakers), num_singular_mentions, num_plural_mentions, num_mentions,
            num_clusters, len(entity_labels),
        ])))

    print('All speakers: %s' % (len(g_speakers)))
    print('All entities: %s' % (len(g_entities)))
if __name__ == '__main__':
    import sys

    # Directory holding the season JSON files.  The first command-line argument
    # overrides the built-in default path.
    json_dir = sys.argv[1] if len(sys.argv) > 1 else '/Users/jdchoi/Git/character-mining-dev/json'
    # print_general_stats(json_dir)
    entity_stats(json_dir)
    #
    # # for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
    # #     print(json_file)
    # #     ordered_print(json_file)