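# NOTE: the hosting page's status banner ("Spaces: Runtime error") was captured
# here during extraction; it is not part of the module.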
# ========================================================================
# Copyright 2018 Emory University
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========================================================================
import glob
import json
import os
from collections import Counter, OrderedDict
from itertools import islice
__author__ = 'Jinho D. Choi'

# Keys used to navigate the season JSON files read by the functions below.
SEASON_ID = 'season_id'
EPISODES = 'episodes'
EPISODE_ID = 'episode_id'
EPISODE = 'episode'
SCENES = 'scenes'
SCENE_ID = 'scene_id'
UTTERANCES = 'utterances'
UTTERANCE_ID = 'utterance_id'
SPEAKERS = 'speakers'
TRANSCRIPT = 'transcript'
TRANSCRIPT_WITH_NOTE = 'transcript_with_note'
TOKENS = 'tokens'
TOKENS_WITH_NOTE = 'tokens_with_note'
CHARACTER_ENTITIES = 'character_entities'
EMOTION = 'emotion'
CAPTION = 'caption'
# Season 1: raw speaker/entity name -> normalized name.
S01_MAP = {
    'David Hasselhof': 'David',
    'Angela Delvecchio': 'Angela Delveccio',
    'Barry': 'Barry Farber',
    'Carl': "Carl (Rachel's date)",
    'Carol': 'Carol Willick',
    'Frannie': 'Franny',
    'Janice': 'Janice Litman Goralnik',
    'Jill': 'Jill Goodacre',
    'Lizzie': 'Lizzy',
    'Luisa': 'Luisa Gianetti',
    'Mindy': 'Mindy Hunter',
    'Nurse': 'Nurse Sizemore',
    'Paul': 'Paul the Wine Guy',
    'Ronni Rappelano': 'Ronni Rapalono',
    'Steve': 'Steve (drug addict)',
}
# Season 2: raw speaker/entity name -> normalized name.
S02_MAP = {
    'Richard': 'Richard Burke',
    'Eddie': 'Eddie Menuek',
    'Susie': 'Susie Moss',
    'Mr. Green': 'Leonard Green',
    'Van Damme': 'Jean-Claude Van Damme',
    'Stephanie': 'Stephanie Schiffer',
    'Estelle': 'Estelle Leonard',
    'Janice': 'Janice Litman Goralnik',
    'Steve': 'Steven Fisher',
    'Lipson': 'Dean Lipson',
    'Ben': 'Ben Geller',
    'Mindy': 'Mindy Hunter',
    'Carol': 'Carol Willick',
    'Susan': 'Susan Bunch',
    'Mr. Boyle': 'Buddy Boyles',
    'Barry': 'Barry Farber',
}
# Season 3: raw speaker/entity name -> normalized name.
S03_MAP = {
    'Pete': 'Peter Becker',
    # The misspelled key previously mapped to itself (a no-op); it now
    # normalizes to the same spelling that 'Pete' maps to.
    'Peter Bekcer': 'Peter Becker',
    'Janice': 'Janice Litman Goralnik',
    'Kate': 'Kate Miller',
    'Mark': 'Mark Robinson',
    'Richard': 'Richard Burke',
    'Dr. Green': 'Leonard Green',
    'Robert': 'Robert Bobby',
    'Phoebe Sr.': 'Phoebe Abbott',
    'Julio': 'Julio (poet)',
    'Alice': 'Alice Knight',
    'Sarah': 'Sarah Tuttle',
    'Eric': 'Eric (photographer)',
    'Whitfield': 'Sherman Whitfield',
    'Susan': 'Susan Bunch',
    'Ben': 'Ben Geller',
    'Cookie': 'Cookie Tribbiani',
    'Estelle': 'Estelle Leonard',
    'Stevenson': 'Parker Stevenson',
    'Michelle': 'Michelle Burke',
    'Carol': 'Carol Willick',
    'Johnson': 'Dr. Johnson',
}
# Season 4: raw speaker/entity name -> normalized name.
S04_MAP = {
    'Emily': 'Emily Waltham',
    'Joshua': 'Joshua Burgin',
    'Phoebe Sr.': 'Phoebe Abbott',
    'Tim': 'Timothy Burke',
    'Alice': 'Alice Knight',
    'Janice': 'Janice Litman Goralnik',
    'Chip': 'Chip Matthews',
    'Rick': 'Rick Sanoven',
    'Ursula': 'Ursula Buffay',
    'Amanda': "Amanda (Ross' date)",
    'Susan': 'Susan Bunch',
    'Dr. Timothy Burke': 'Timothy Burke',
    'Mrs. Waltham': 'Andrea Waltham',
    'Mr. Waltham': 'Stephen Waltham',
}
# Season 5: raw speaker/entity name -> normalized name.
S05_MAP = {
    'Steve': 'Steve Cera',
    'Emily': 'Emily Waltham',
    'Janice': 'Janice Litman Goralnik',
    'Ursula': 'Ursula Buffay',
    'Mrs. Waltham': 'Andrea Waltham',
    'Mr. Waltham': 'Stephen Waltham',
    'Alice': 'Alice Knight',
    'Estelle': 'Estelle Leonard',
    'Ben': 'Ben Geller',
}
# Season 6: raw speaker/entity name -> normalized name.
S06_MAP = {
    'Paul': 'Paul Stevens',
    'Janine': 'Janine Lecroix',
    'Elizabeth': 'Elizabeth Stevens',
    'Jill': 'Jill Green',
    'Richard': 'Richard Burke',
    'Dana': 'Dana Keystone',
    'Estelle': 'Estelle Leonard',
    'Ursula': 'Ursula Buffay',
    'Susan': 'Susan Bunch',
    'Carl': "Carl (Joey's lookalike)",
    'Ben': 'Ben Geller',
    'Janice': 'Janice Litman Goralnik',
}
# Season 7: raw speaker/entity name -> normalized name.
S07_MAP = {
    'Tag': 'Tag Jones',
    'Ben': 'Ben Geller',
    'Melissa': 'Melissa Warburton',
    'Richard': 'Richard Burke',
    'Kristen': 'Kristen Leigh',
    'Janine': 'Janine Lecroix',
    'Cassie': 'Cassie Geller',
    'Megan': 'Megan Bailey',
    'Ursula': 'Ursula Buffay',
    'Morse': 'Ned Morse',
    'Mrs. Bing': 'Nora Tyler Bing',
    'Mr. Bing': 'Charles Bing',
    'Estelle': 'Estelle Leonard',
    'Julie': 'Julie Graff',
    'Frannie': 'Franny',
}
# Season 8: raw speaker/entity name -> normalized name.
S08_MAP = {
    'Will': 'Will Colbert',
    'Dr. Green': 'Leonard Green',
    'Janice': 'Janice Litman Goralnik',
    'Clifford': 'Clifford Burnett',
    'Ursula': 'Ursula Buffay',
    'Tag': 'Tag Jones',
    'Bob': "Bob (Chandler's coworker)",
    'Bobby': 'Bobby Corso',
    'Katie': 'Katie (saleswoman)',
    'Julie': 'Julie Coreger',
    'Marc': 'Marc Coreger',
    'Ben': 'Ben Geller',
    'Estelle': 'Estelle Leonard',
    'Sid': 'Sid Goralnik',
}
# Season 9: raw speaker/entity name -> normalized name.
S09_MAP = {
    'Mike': 'Mike Hannigan',
    'Charlie': 'Charlie Wheeler',
    'Gavin': 'Gavin Mitchell',
    'Amy': 'Amy Green',
    'Bitsy': 'Bitsy Hannigan',
    'Janice': 'Janice Litman Goralnik',
    'Lowell': 'Lowell (mugger)',
    'Mugger': 'Lowell (mugger)',
    'Ben': 'Ben Geller',
    'Mr. Oberblau': 'Jarvis Oberblau',
    'Ms. Geller': 'Judy Geller',
}
# Season 10: raw speaker/entity name -> normalized name.
S10_MAP = {
    'Mike': 'Mike Hannigan',
    'Amy': 'Amy Green',
    'Charlie': 'Charlie Wheeler',
    'Benjamin': 'Benjamin Hobart',
    'Amanda': 'Amanda Buffamonteezi',
    'Janice': 'Janice Litman Goralnik',
    'Missy': 'Missy Goldberg',
    'Mark': 'Mark Robinson',
    'Dr. Green': 'Leonard Green',
    'Estelle': 'Estelle Leonard',
    'R Zelner': 'Mr. Zelner',
}
def entity_stats(json_file, SPEAKER_MAP):
    """Normalize the speaker names of one season file and print speaker counts.

    Each speaker of each utterance is replaced in place by
    ``SPEAKER_MAP.get(speaker, speaker)``. The updated season is then written
    to ``json_file + '.v2'`` and the normalized speakers are printed, one
    ``name<TAB>count`` line each, most frequent first.

    NOTE: a later definition in this file reuses the name ``entity_stats``
    (taking a directory), so after import that definition replaces this one.

    :param json_file: path to a season JSON file.
    :param SPEAKER_MAP: dict from raw speaker name to normalized name; names
        that are not keys of the dict are kept unchanged.
    """
    # Close the input file deterministically instead of leaking the handle.
    with open(json_file, encoding='utf-8') as fin:
        season = json.load(fin)

    speaker_list = []
    for episode in season[EPISODES]:
        for scene in episode[SCENES]:
            for utterance in scene[UTTERANCES]:
                speakers = utterance[SPEAKERS]
                # Rewrite in place so the normalized names end up in the dump.
                for i, speaker in enumerate(speakers):
                    speakers[i] = SPEAKER_MAP.get(speaker, speaker)
                speaker_list.extend(speakers)

    # Normalizing character entities was disabled (commented out) in the
    # original and is intentionally not performed here.
    with open(json_file + '.v2', 'w', encoding='utf-8') as fout:
        json.dump(season, fout, sort_keys=True, indent=4)

    print('===== Speakers =====')
    for name, count in Counter(speaker_list).most_common():
        print(name + '\t' + str(count))
def find(json_file):
    """Print the ``alloquyId`` of every alloquy that has an empty-string speaker.

    The file is read through the ``'alloquies'`` /
    ``'discourseWithoutDescription'`` / ``'characterEntities'`` keys, not the
    ``UTTERANCES`` key used by the other functions in this file.

    :param json_file: path to a season JSON file.
    :raises KeyError: if an expected key is missing from the file.
    """
    # Close the input file deterministically instead of leaking the handle.
    with open(json_file, encoding='utf-8') as fin:
        season = json.load(fin)

    for episode in season[EPISODES]:
        for scene in episode[SCENES]:
            for alloquies in scene['alloquies']:
                discourse = alloquies['discourseWithoutDescription']
                speakers = alloquies['speakers']
                if '' in speakers:
                    print(alloquies['alloquyId'])
                # Placeholder traversal: nothing is done with the entities yet,
                # but the key lookup still fails loudly if the field is absent.
                for character_entities in discourse['characterEntities']:
                    for entity_list in character_entities:
                        pass
def entity_stats(json_dir, SPEAKER_MAP=None):
    """Print one tab-separated statistics line per season file in ``json_dir``.

    The ``*.json`` files are processed in sorted path order. For each file the
    printed columns are: the season id, the number of distinct speakers, the
    number of mentions (total length of the per-sentence entity lists) and the
    number of distinct entity names after applying ``SPEAKER_MAP``.

    :param json_dir: directory containing the season ``*.json`` files.
    :param SPEAKER_MAP: optional dict from raw entity name to normalized name.
        The original body looked names up in an undefined global ``SM`` and
        raised ``NameError``; with the default ``None`` names are left as is.
    """
    name_map = {} if SPEAKER_MAP is None else SPEAKER_MAP

    for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
        # Close the input file deterministically instead of leaking the handle.
        with open(json_file, encoding='utf-8') as fin:
            season = json.load(fin)

        speaker_list = []
        entity_list = []
        num_mentions = 0

        for episode in season[EPISODES]:
            for scene in episode[SCENES]:
                for utterance in scene[UTTERANCES]:
                    speaker_list.extend(utterance[SPEAKERS])
                    if CHARACTER_ENTITIES not in utterance:
                        continue
                    for character_entities in utterance[CHARACTER_ENTITIES]:
                        num_mentions += len(character_entities)
                        for entities in character_entities:
                            # Items from index 2 on are treated as entity names;
                            # presumably the first two are the mention span
                            # offsets -- TODO confirm against the data format.
                            entity_list.extend(name_map.get(e, e) for e in entities[2:])

        columns = [season[SEASON_ID], len(set(speaker_list)), num_mentions, len(set(entity_list))]
        print('\t'.join(map(str, columns)))
def get_tokens(json_dir, max_files=4):
    """Collect the tokens of every utterance from the season files in ``json_dir``.

    The ``*.json`` files are visited in sorted path order and only the first
    ``max_files`` of them are read.

    :param json_dir: directory containing the season ``*.json`` files.
    :param max_files: maximum number of files to read; the default of 4 keeps
        the original hard-coded limit, ``None`` reads every file.
    :return: dict mapping each utterance id to that utterance's tokens.
    """
    tokens = {}
    json_files = sorted(glob.glob(os.path.join(json_dir, '*.json')))

    for json_file in islice(json_files, max_files):
        # Close the input file deterministically instead of leaking the handle.
        with open(json_file, encoding='utf-8') as fin:
            season = json.load(fin)

        for episode in season[EPISODES]:
            for scene in episode[SCENES]:
                for utterance in scene[UTTERANCES]:
                    tokens[utterance[UTTERANCE_ID]] = utterance[TOKENS]

    return tokens
def compare(main_dir='/Users/jdchoi/Git/character-mining-dev/json',
            ethan_dir='/Users/jdchoi/Downloads/enhanced-jsons'):
    """Report utterances whose sentence counts differ between two JSON directories.

    Tokens are loaded from both directories with ``get_tokens``. Sentences of
    the second directory that consist of the single token ``'_'`` are dropped
    before the comparison. For every utterance whose number of sentences
    differs, the utterance id and both token lists are printed. Any remaining
    sentence of the second directory containing the token ``'hes'`` is printed
    as well. The number of differing utterances is printed last.

    :param main_dir: directory of the reference JSON files.
    :param ethan_dir: directory of the JSON files to compare against it.
    :return: the number of utterances whose sentence counts differ.
    :raises KeyError: if an utterance id from ``main_dir`` is absent from
        ``ethan_dir``.
    """
    m_utterances = get_tokens(main_dir)
    e_utterances = get_tokens(ethan_dir)
    mismatches = 0

    for utterance_id, m_tokens in m_utterances.items():
        # `len(...) != 1` rather than `> 1` so that an empty sentence is kept
        # instead of raising IndexError on `tokens[0]`.
        e_tokens = [tokens for tokens in e_utterances[utterance_id]
                    if len(tokens) != 1 or tokens[0] != '_']

        if len(m_tokens) != len(e_tokens):
            print(utterance_id)
            print(m_tokens)
            print(e_tokens)
            mismatches += 1

        # NOTE(review): this check is assumed to run for every utterance, not
        # only for mismatching ones -- confirm against the intended nesting.
        for tokens in e_tokens:
            if 'hes' in tokens:
                print(tokens)

    print(mismatches)
    return mismatches
if __name__ == '__main__':
    # No default action: the functions in this module are meant to be called
    # explicitly, e.g. from an interactive session.
    pass