# malvika2003's picture
# Upload folder using huggingface_hub
# db5855f verified
# ========================================================================
# Copyright 2018 Emory University
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========================================================================
import glob
import json
import os
from collections import Counter, OrderedDict
# Module author metadata.
__author__ = 'Jinho D. Choi'
# String keys for the season JSON documents handled in this module.
# The functions in this file index a loaded season with EPISODES, an episode
# with SCENES, a scene with UTTERANCES, and an utterance with SPEAKERS /
# CHARACTER_ENTITIES; SEASON_ID is read from the season object itself.
SEASON_ID = 'season_id'
EPISODES = 'episodes'
EPISODE_ID = 'episode_id'
EPISODE = 'episode'
SCENES = 'scenes'
SCENE_ID = 'scene_id'
UTTERANCES = 'utterances'
UTTERANCE_ID = 'utterance_id'
SPEAKERS = 'speakers'
# NOTE(review): the keys below (except CHARACTER_ENTITIES) are not referenced
# by name in this file; presumably they are per-utterance fields -- confirm
# against the JSON schema.
TRANSCRIPT = 'transcript'
TRANSCRIPT_WITH_NOTE = 'transcript_with_note'
TOKENS = 'tokens'
TOKENS_WITH_NOTE = 'tokens_with_note'
CHARACTER_ENTITIES = 'character_entities'
EMOTION = 'emotion'
CAPTION = 'caption'
# Name replacements (name as found -> replacement name).
# NOTE(review): presumably the season-1 table meant to be passed as a speaker
# map to the statistics helpers in this module -- confirm with callers.
S01_MAP = {
    'David Hasselhof': 'David',
    'Angela Delvecchio': 'Angela Delveccio',
    'Barry': 'Barry Farber',
    'Carl': "Carl (Rachel's date)",
    'Carol': 'Carol Willick',
    'Frannie': 'Franny',
    'Janice': 'Janice Litman Goralnik',
    'Jill': 'Jill Goodacre',
    'Lizzie': 'Lizzy',
    'Luisa': 'Luisa Gianetti',
    'Mindy': 'Mindy Hunter',
    'Nurse': 'Nurse Sizemore',
    'Paul': 'Paul the Wine Guy',
    'Ronni Rappelano': 'Ronni Rapalono',
    'Steve': 'Steve (drug addict)',
}
# Name replacements (name as found -> replacement name).
# NOTE(review): presumably the season-2 table -- confirm with callers.
S02_MAP = {
    'Richard': 'Richard Burke',
    'Eddie': 'Eddie Menuek',
    'Susie': 'Susie Moss',
    'Mr. Green': 'Leonard Green',
    'Van Damme': 'Jean-Claude Van Damme',
    'Stephanie': 'Stephanie Schiffer',
    'Estelle': 'Estelle Leonard',
    'Janice': 'Janice Litman Goralnik',
    'Steve': 'Steven Fisher',
    'Lipson': 'Dean Lipson',
    'Ben': 'Ben Geller',
    'Mindy': 'Mindy Hunter',
    'Carol': 'Carol Willick',
    'Susan': 'Susan Bunch',
    'Mr. Boyle': 'Buddy Boyles',
    'Barry': 'Barry Farber',
}
# Name replacements (name as found -> replacement name).
# NOTE(review): presumably the season-3 table -- confirm with callers.
S03_MAP = {
    'Pete': 'Peter Becker',
    # The misspelled variant previously mapped to itself (a no-op); it now
    # resolves to the same replacement as 'Pete'.
    'Peter Bekcer': 'Peter Becker',
    'Janice': 'Janice Litman Goralnik',
    'Kate': 'Kate Miller',
    'Mark': 'Mark Robinson',
    'Richard': 'Richard Burke',
    'Dr. Green': 'Leonard Green',
    'Robert': 'Robert Bobby',
    'Phoebe Sr.': 'Phoebe Abbott',
    'Julio': 'Julio (poet)',
    'Alice': 'Alice Knight',
    'Sarah': 'Sarah Tuttle',
    'Eric': 'Eric (photographer)',
    'Whitfield': 'Sherman Whitfield',
    'Susan': 'Susan Bunch',
    'Ben': 'Ben Geller',
    'Cookie': 'Cookie Tribbiani',
    'Estelle': 'Estelle Leonard',
    'Stevenson': 'Parker Stevenson',
    'Michelle': 'Michelle Burke',
    'Carol': 'Carol Willick',
    'Johnson': 'Dr. Johnson',
}
# Name replacements (name as found -> replacement name).
# NOTE(review): presumably the season-4 table -- confirm with callers.
S04_MAP = {
    'Emily': 'Emily Waltham',
    'Joshua': 'Joshua Burgin',
    'Phoebe Sr.': 'Phoebe Abbott',
    'Tim': 'Timothy Burke',
    'Alice': 'Alice Knight',
    'Janice': 'Janice Litman Goralnik',
    'Chip': 'Chip Matthews',
    'Rick': 'Rick Sanoven',
    'Ursula': 'Ursula Buffay',
    'Amanda': "Amanda (Ross' date)",
    'Susan': 'Susan Bunch',
    'Dr. Timothy Burke': 'Timothy Burke',
    'Mrs. Waltham': 'Andrea Waltham',
    'Mr. Waltham': 'Stephen Waltham',
}
# Name replacements (name as found -> replacement name).
# NOTE(review): presumably the season-5 table -- confirm with callers.
S05_MAP = {
    'Steve': 'Steve Cera',
    'Emily': 'Emily Waltham',
    'Janice': 'Janice Litman Goralnik',
    'Ursula': 'Ursula Buffay',
    'Mrs. Waltham': 'Andrea Waltham',
    'Mr. Waltham': 'Stephen Waltham',
    'Alice': 'Alice Knight',
    'Estelle': 'Estelle Leonard',
    'Ben': 'Ben Geller',
}
# Name replacements (name as found -> replacement name).
# NOTE(review): presumably the season-6 table -- confirm with callers.
S06_MAP = {
    'Paul': 'Paul Stevens',
    'Janine': 'Janine Lecroix',
    'Elizabeth': 'Elizabeth Stevens',
    'Jill': 'Jill Green',
    'Richard': 'Richard Burke',
    'Dana': 'Dana Keystone',
    'Estelle': 'Estelle Leonard',
    'Ursula': 'Ursula Buffay',
    'Susan': 'Susan Bunch',
    'Carl': "Carl (Joey's lookalike)",
    'Ben': 'Ben Geller',
    'Janice': 'Janice Litman Goralnik',
}
# Name replacements (name as found -> replacement name).
# NOTE(review): presumably the season-7 table -- confirm with callers.
S07_MAP = {
    'Tag': 'Tag Jones',
    'Ben': 'Ben Geller',
    'Melissa': 'Melissa Warburton',
    'Richard': 'Richard Burke',
    'Kristen': 'Kristen Leigh',
    'Janine': 'Janine Lecroix',
    'Cassie': 'Cassie Geller',
    'Megan': 'Megan Bailey',
    'Ursula': 'Ursula Buffay',
    'Morse': 'Ned Morse',
    'Mrs. Bing': 'Nora Tyler Bing',
    'Mr. Bing': 'Charles Bing',
    'Estelle': 'Estelle Leonard',
    'Julie': 'Julie Graff',
    'Frannie': 'Franny',
}
# Name replacements (name as found -> replacement name).
# NOTE(review): presumably the season-8 table -- confirm with callers.
S08_MAP = {
    'Will': 'Will Colbert',
    'Dr. Green': 'Leonard Green',
    'Janice': 'Janice Litman Goralnik',
    'Clifford': 'Clifford Burnett',
    'Ursula': 'Ursula Buffay',
    'Tag': 'Tag Jones',
    'Bob': "Bob (Chandler's coworker)",
    'Bobby': 'Bobby Corso',
    'Katie': 'Katie (saleswoman)',
    'Julie': 'Julie Coreger',
    'Marc': 'Marc Coreger',
    'Ben': 'Ben Geller',
    'Estelle': 'Estelle Leonard',
    'Sid': 'Sid Goralnik',
}
# Name replacements (name as found -> replacement name).
# NOTE(review): presumably the season-9 table -- confirm with callers.
S09_MAP = {
    'Mike': 'Mike Hannigan',
    'Charlie': 'Charlie Wheeler',
    'Gavin': 'Gavin Mitchell',
    'Amy': 'Amy Green',
    'Bitsy': 'Bitsy Hannigan',
    'Janice': 'Janice Litman Goralnik',
    'Lowell': 'Lowell (mugger)',
    'Mugger': 'Lowell (mugger)',
    'Ben': 'Ben Geller',
    'Mr. Oberblau': 'Jarvis Oberblau',
    'Ms. Geller': 'Judy Geller',
}
# Name replacements (name as found -> replacement name).
# NOTE(review): presumably the season-10 table -- confirm with callers.
S10_MAP = {
    'Mike': 'Mike Hannigan',
    'Amy': 'Amy Green',
    'Charlie': 'Charlie Wheeler',
    'Benjamin': 'Benjamin Hobart',
    'Amanda': 'Amanda Buffamonteezi',
    'Janice': 'Janice Litman Goralnik',
    'Missy': 'Missy Goldberg',
    'Mark': 'Mark Robinson',
    'Dr. Green': 'Leonard Green',
    'Estelle': 'Estelle Leonard',
    'R Zelner': 'Mr. Zelner',
}
def entity_stats(json_file, SPEAKER_MAP):
    """Rename speakers in one season file and print speaker frequencies.

    Every name in each utterance's speaker list is replaced in place by
    ``SPEAKER_MAP.get(name, name)``. The updated season is then written to
    ``json_file + '.v2'`` (sorted keys, 4-space indent) and a tab-separated
    ``name<TAB>count`` table is printed, most frequent speaker first.

    NOTE(review): a second ``entity_stats(json_dir)`` is defined further
    down in this module and rebinds the name, so this version is not
    reachable as ``entity_stats`` once the module has finished importing.

    Args:
        json_file: Path to a season JSON file.
        SPEAKER_MAP: Mapping from a name as found in the data to its
            replacement; names without an entry are left unchanged.
    """
    # Context manager closes the input handle promptly (it used to be leaked).
    with open(json_file, encoding='utf-8') as fin:
        season = json.load(fin)

    speaker_counts = Counter()
    for episode in season[EPISODES]:
        for scene in episode[SCENES]:
            for utterance in scene[UTTERANCES]:
                speakers = utterance[SPEAKERS]
                # Slice assignment keeps the same list object inside `season`,
                # so the renamed speakers are what gets dumped below.
                speakers[:] = [SPEAKER_MAP.get(name, name) for name in speakers]
                speaker_counts.update(speakers)

    with open(json_file + '.v2', 'w') as fout:
        json.dump(season, fout, sort_keys=True, indent=4)

    print('===== Speakers =====')
    # most_common() sorts by count, descending, keeping first-seen order on ties.
    for name, count in speaker_counts.most_common():
        print(name + '\t' + str(count))
def find(json_file):
    """Print the id of every alloquy whose speaker list contains ''.

    Walks ``season[EPISODES] -> episode[SCENES] -> scene['alloquies']`` and
    prints ``alloquy['alloquyId']`` whenever the empty string is among
    ``alloquy['speakers']``.

    NOTE(review): the camelCase keys used here ('alloquies',
    'discourseWithoutDescription', 'characterEntities') differ from the
    snake_case keys used by the other functions in this module; presumably
    this targets a different/older JSON layout -- confirm before use.

    Args:
        json_file: Path to a season JSON file.
    """
    # Context manager closes the input handle promptly (it used to be leaked).
    with open(json_file, encoding='utf-8') as fin:
        season = json.load(fin)

    for episode in season[EPISODES]:
        for scene in episode[SCENES]:
            for alloquy in scene['alloquies']:
                discourse = alloquy['discourseWithoutDescription']
                if '' in alloquy['speakers']:
                    print(alloquy['alloquyId'])

                # Placeholder traversal: visits every entity list but does
                # nothing with it. Kept so the key lookups (and the errors
                # they raise on missing keys) stay as before.
                for character_entities in discourse['characterEntities']:
                    for _entity_list in character_entities:
                        pass
def entity_stats(json_dir, speaker_map=None):
    """Print one line of speaker/entity statistics per season file.

    For each ``*.json`` file in ``json_dir`` (processed in sorted path
    order) a tab-separated line is printed with: the season id, the number
    of distinct speakers, the number of mentions, and the number of distinct
    entity labels after renaming.

    The original body looked labels up in a global ``SM`` that is not
    defined in this module, which raised ``NameError`` as soon as an
    utterance carried character entities. The lookup table is now an
    optional argument.

    Args:
        json_dir: Directory containing the season JSON files.
        speaker_map: Optional mapping from an entity label as found in the
            data to its replacement. ``None`` (the default) leaves every
            label unchanged.
    """
    name_map = {} if speaker_map is None else speaker_map

    for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
        # Context manager closes the input handle promptly (it used to be leaked).
        with open(json_file, encoding='utf-8') as fin:
            season = json.load(fin)

        speakers = set()
        entity_labels = set()
        num_mentions = 0

        for episode in season[EPISODES]:
            for scene in episode[SCENES]:
                for utterance in scene[UTTERANCES]:
                    speakers.update(utterance[SPEAKERS])

                    # Utterances without annotations simply contribute nothing.
                    for character_entities in utterance.get(CHARACTER_ENTITIES, ()):
                        # Each element of `character_entities` counts as one mention.
                        num_mentions += len(character_entities)
                        for mention in character_entities:
                            # Elements from index 2 onward are treated as labels
                            # (presumably the first two are span offsets -- confirm).
                            mention[2:] = [name_map.get(label, label) for label in mention[2:]]
                            entity_labels.update(mention[2:])

        print('\t'.join(map(str, [season[SEASON_ID], len(speakers), num_mentions, len(entity_labels)])))
def get_tokens(json_dir, max_files=4):
    """Collect the tokens of every utterance, keyed by utterance id.

    Reads the ``*.json`` files in ``json_dir`` in sorted path order and
    returns a dict mapping each utterance's ``UTTERANCE_ID`` value to its
    ``TOKENS`` value. If the same id occurs more than once, the last
    occurrence wins.

    Args:
        json_dir: Directory containing the season JSON files.
        max_files: Maximum number of files to read, counted from the start
            of the sorted list. Defaults to 4, the limit that used to be
            hard-coded; pass ``None`` to read every file.

    Returns:
        dict: utterance id -> tokens, exactly as stored in the JSON.
    """
    tokens = {}
    json_files = sorted(glob.glob(os.path.join(json_dir, '*.json')))

    for json_file in json_files[:max_files]:
        # Context manager closes the input handle promptly (it used to be leaked).
        with open(json_file, encoding='utf-8') as fin:
            season = json.load(fin)

        for episode in season[EPISODES]:
            for scene in episode[SCENES]:
                for utterance in scene[UTTERANCES]:
                    # Same key strings as before, via the module constants.
                    tokens[utterance[UTTERANCE_ID]] = utterance[TOKENS]

    return tokens
def compare(main_dir='/Users/jdchoi/Git/character-mining-dev/json',
            ethan_dir='/Users/jdchoi/Downloads/enhanced-jsons'):
    """Compare the tokenization of two copies of the corpus.

    Loads tokens from both directories with ``get_tokens`` and, for every
    utterance of the first copy, compares the number of token groups with
    the second copy after dropping groups that are exactly ``['_']``. For
    each mismatch the utterance id and both token lists are printed; the
    total number of mismatches is printed at the end. Any token group of
    the second copy that contains the token ``'hes'`` is printed as well.

    Raises ``KeyError`` if an utterance id of the first copy is missing
    from the second one (unchanged behavior).

    Args:
        main_dir: Directory of the reference JSON files. Defaults to the
            path that used to be hard-coded.
        ethan_dir: Directory of the JSON files to check against the
            reference. Defaults to the path that used to be hard-coded.
    """
    m_utterances = get_tokens(main_dir)
    e_utterances = get_tokens(ethan_dir)
    mismatches = 0

    for utterance_id, m_tokens in m_utterances.items():
        # Drop placeholder groups consisting solely of '_'. Comparing against
        # ['_'] also tolerates an empty group, which used to raise IndexError.
        e_tokens = [group for group in e_utterances[utterance_id] if group != ['_']]

        if len(m_tokens) != len(e_tokens):
            print(utterance_id)
            print(m_tokens)
            print(e_tokens)
            mismatches += 1

        # NOTE(review): the source indentation was lost; this scan is assumed
        # to run for every utterance, not only for mismatching ones -- confirm.
        for group in e_tokens:
            if 'hes' in group:
                print(group)

    print(mismatches)
# Script entry point: running this file directly currently does nothing;
# the functions in this module have to be called explicitly.
if __name__ == '__main__':
    pass