Spaces:

malvika2003
/

openvino_notebooks

Runtime error

App Files Files Community

openvino_notebooks / notebooks /llm-question-answering /character-mining /scripts /stats.py

malvika2003

Upload folder using huggingface_hub

db5855f verified 9 months ago

raw

history blame contribute delete

8.21 kB

	# ========================================================================
	# Copyright 2018 Emory University
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# ========================================================================
	import glob
	import json

	import os
	from collections import Counter, OrderedDict

	__author__ = 'Jinho D. Choi'

	# Key names of the season JSON files processed by this script.
	# Season level: SEASON_ID, EPISODES.
	SEASON_ID = 'season_id'
	EPISODES = 'episodes'
	# Episode level: EPISODE_ID, SCENES.
	EPISODE_ID = 'episode_id'
	# NOTE(review): EPISODE is not referenced by the visible functions -- confirm it is still needed.
	EPISODE = 'episode'
	SCENES = 'scenes'
	# Scene level: SCENE_ID, UTTERANCES.
	SCENE_ID = 'scene_id'
	UTTERANCES = 'utterances'
	# Utterance level: the next six keys are always copied by ordered_print.
	UTTERANCE_ID = 'utterance_id'
	SPEAKERS = 'speakers'
	TRANSCRIPT = 'transcript'
	TRANSCRIPT_WITH_NOTE = 'transcript_with_note'
	TOKENS = 'tokens'
	TOKENS_WITH_NOTE = 'tokens_with_note'
	# Utterance level: these three keys are only copied by ordered_print when present.
	CHARACTER_ENTITIES = 'character_entities'
	EMOTION = 'emotion'
	CAPTION = 'caption'


	def ordered_print(json_file, s=None):
	    """Rewrite a season JSON file with its keys in a fixed order.

	    The season is rebuilt level by level (season, episode, scene, utterance)
	    as ``OrderedDict`` objects and written to ``json_file + '.v2'`` with an
	    indent of 4. String values have every run of whitespace collapsed to a
	    single space. When a level holds a different number of keys than the
	    ones copied, ``'Error: <n>'`` is printed (0 = season, 1 = episode,
	    2 = scene, 3 = utterance) and processing continues.

	    :param json_file: path of the season JSON file; the output path is this
	        path with ``.v2`` appended.
	    :param s: an already loaded season dict; when ``None`` the season is
	        read from ``json_file``. The episode, scene and utterance lists
	        inside it are updated in place.
	    :raises KeyError: if a required key is missing at any level.
	    """
	    def pair(key, d):
	        value = d[key]
	        # Normalise whitespace in string values only; lists and other
	        # values are passed through untouched.
	        if isinstance(value, str):
	            value = ' '.join(value.split())
	        return key, value

	    if s is None:
	        # Read under a context manager so the input handle is closed
	        # deterministically instead of being left to the garbage collector.
	        with open(json_file) as fin:
	            s = json.load(fin)

	    season = OrderedDict([pair(SEASON_ID, s), pair(EPISODES, s)])
	    if len(s) != len(season):
	        print('Error: 0')
	    episodes = season[EPISODES]

	    for i, e in enumerate(episodes):
	        episode = OrderedDict([pair(EPISODE_ID, e), pair(SCENES, e)])
	        if len(e) != len(episode):
	            print('Error: 1')
	        episodes[i] = episode
	        scenes = episode[SCENES]

	        for j, c in enumerate(scenes):
	            scene = OrderedDict([pair(SCENE_ID, c), pair(UTTERANCES, c)])
	            if len(c) != len(scene):
	                print('Error: 2')
	            scenes[j] = scene
	            utterances = scene[UTTERANCES]

	            for k, u in enumerate(utterances):
	                # Mandatory keys, in output order.
	                utterance = [
	                    pair(UTTERANCE_ID, u),
	                    pair(SPEAKERS, u),
	                    pair(TRANSCRIPT, u),
	                    pair(TRANSCRIPT_WITH_NOTE, u),
	                    pair(TOKENS, u),
	                    pair(TOKENS_WITH_NOTE, u),
	                ]

	                # Optional annotation keys are appended only when present.
	                for optional_key in (CHARACTER_ENTITIES, EMOTION, CAPTION):
	                    if optional_key in u:
	                        utterance.append(pair(optional_key, u))

	                if len(u) != len(utterance):
	                    print('Error: 3')
	                utterances[k] = OrderedDict(utterance)

	    with open(json_file + '.v2', 'w') as fout:
	        json.dump(season, fout, indent=4)


	def general_stats(json_file):
	    """Collect size statistics for one season JSON file.

	    Each entry of an utterance's ``tokens`` value is counted as one sentence
	    and the length of that entry as its number of tokens. The ``*_wn``
	    counters do the same over ``tokens_with_note``, falling back to
	    ``tokens`` when the with-note value is empty or ``None``.

	    :param json_file: path of the season JSON file.
	    :return: a list of ten items, in this order: season id, number of
	        episodes, number of scenes, number of utterances with non-empty
	        ``tokens``, number of sentences, number of tokens, set of speakers,
	        number of all utterances, number of with-note sentences, number of
	        with-note tokens.
	    :raises KeyError: if an expected key is missing from the file.
	    """
	    num_scenes = 0
	    num_utterances = 0
	    num_utterances_wn = 0
	    num_sentences = 0
	    num_sentences_wn = 0
	    num_tokens = 0
	    num_tokens_wn = 0
	    all_speakers = set()

	    # Read under a context manager so the file handle is closed right away.
	    with open(json_file) as fin:
	        season = json.load(fin)
	    episodes = season[EPISODES]

	    for episode in episodes:
	        scenes = episode[SCENES]
	        num_scenes += len(scenes)

	        for scene in scenes:
	            utterances = scene[UTTERANCES]
	            # Every utterance counts here, including those without tokens.
	            num_utterances_wn += len(utterances)

	            for utterance in utterances:
	                all_speakers.update(utterance[SPEAKERS])

	                tokens = utterance[TOKENS]
	                if tokens:
	                    num_utterances += 1
	                    num_sentences += len(tokens)
	                    num_tokens += sum(len(t) for t in tokens)

	                tokens_wn = utterance[TOKENS_WITH_NOTE] or tokens
	                num_sentences_wn += len(tokens_wn)
	                num_tokens_wn += sum(len(t) for t in tokens_wn)

	    return [
	        season[SEASON_ID],
	        len(episodes),
	        num_scenes,
	        num_utterances,
	        num_sentences,
	        num_tokens,
	        all_speakers,
	        num_utterances_wn,
	        num_sentences_wn,
	        num_tokens_wn,
	    ]


	def print_general_stats(json_dir):
	    """Print a tab-separated table of general statistics, one row per file.

	    Every ``*.json`` file directly inside ``json_dir`` is passed to
	    ``general_stats`` in sorted path order. The speaker set in each row is
	    replaced by its size before printing, and the number of distinct
	    speakers over all files is printed last.

	    :param json_dir: directory containing the season JSON files.
	    """
	    # One label per value returned by general_stats (ten in total); the
	    # last three are the counts taken over the with-note fields.
	    header = [
	        'Season ID', 'Episodes', 'Scenes', 'Utterances', 'Sentences', 'Tokens', 'Speakers',
	        'Utterances (with note)', 'Sentences (with note)', 'Tokens (with note)',
	    ]
	    print('\t'.join(header))

	    all_speakers = set()
	    for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
	        row = general_stats(json_file)
	        speakers = row[6]
	        all_speakers.update(speakers)
	        row[6] = len(speakers)
	        print('\t'.join(map(str, row)))

	    print('All speakers: %s' % (len(all_speakers)))


	def entity_stats(json_dir, max_seasons=4):
	    """Print a tab-separated table of character-entity statistics per file.

	    Only utterances that carry ``character_entities`` and have non-empty
	    ``tokens`` are counted. Within them, every mention that does not contain
	    ``'Non-Entity'`` contributes its labels (all items from index 2 on) to
	    the entity sets; a mention of length 3 is counted as singular, any other
	    as plural. A scene counts when it has at least one such utterance, and
	    the cluster count is the sum over scenes of the distinct labels seen in
	    each scene.

	    :param json_dir: directory containing the season JSON files.
	    :param max_seasons: number of files (in sorted path order) to process;
	        ``None`` processes all of them. Defaults to 4.
	    :raises KeyError: if an expected key is missing from a processed file.
	    """
	    # Labels are listed in the same order as the values of each row below.
	    header = [
	        'Season ID', 'Episodes', 'Scenes', 'Utterances', 'Tokens', 'Speakers',
	        'Singular', 'Plural', 'Mentions', 'Clusters', 'Entities',
	    ]
	    print('\t'.join(header))

	    all_speakers = set()
	    all_entities = set()
	    json_files = sorted(glob.glob(os.path.join(json_dir, '*.json')))

	    # Slicing with None keeps every file, so None means "no limit".
	    for json_file in json_files[:max_seasons]:
	        speakers = set()
	        entities = set()
	        num_clusters = 0
	        num_scenes = 0
	        num_utterances = 0
	        num_tokens = 0
	        num_mentions = 0
	        num_singular_mentions = 0
	        num_plural_mentions = 0

	        with open(json_file) as fin:
	            season = json.load(fin)
	        episodes = season[EPISODES]

	        for episode in episodes:
	            for scene in episode[SCENES]:
	                annotated = False
	                clusters = set()

	                for utterance in scene[UTTERANCES]:
	                    if CHARACTER_ENTITIES not in utterance or not utterance[TOKENS]:
	                        continue

	                    annotated = True
	                    num_utterances += 1
	                    # tokens holds one list per sentence, so sum the sentence
	                    # lengths rather than counting the sentences.
	                    num_tokens += sum(len(sentence) for sentence in utterance[TOKENS])
	                    speakers.update(utterance[SPEAKERS])

	                    for sentence_mentions in utterance[CHARACTER_ENTITIES]:
	                        for mention in sentence_mentions:
	                            if 'Non-Entity' in mention:
	                                continue

	                            # presumably mention[0:2] is the token span -- TODO confirm
	                            labels = mention[2:]
	                            entities.update(labels)
	                            clusters.update(labels)

	                            if len(mention) == 3:
	                                num_singular_mentions += 1
	                            else:
	                                num_plural_mentions += 1
	                            num_mentions += 1

	                if annotated:
	                    num_scenes += 1
	                num_clusters += len(clusters)

	        all_speakers.update(speakers)
	        all_entities.update(entities)
	        row = [
	            season[SEASON_ID], len(episodes), num_scenes, num_utterances, num_tokens, len(speakers),
	            num_singular_mentions, num_plural_mentions, num_mentions, num_clusters, len(entities),
	        ]
	        print('\t'.join(map(str, row)))

	    print('All speakers: %s' % (len(all_speakers)))
	    print('All entities: %s' % (len(all_entities)))




	if __name__ == '__main__':
	    import sys

	    # Directory holding the season JSON files: the first command-line
	    # argument when given, otherwise the original hard-coded location.
	    json_dir = sys.argv[1] if len(sys.argv) > 1 else '/Users/jdchoi/Git/character-mining-dev/json'

	    # Other entry points defined in this file: print_general_stats(json_dir),
	    # and ordered_print(json_file) for each '*.json' file inside json_dir.
	    entity_stats(json_dir)