Spaces:
Runtime error
Runtime error
import pandas as pd | |
import numpy as np | |
import json | |
class PrepareArticles(object):
    """
    Collect the sentences of an article object together with per-sentence metadata.

    The article object is read as ``article_obj['sub_titles']``, an iterable of
    items whose ``'sentences'`` entry is an iterable of sentence records. Each
    record must provide ``'sentence'`` and may provide ``'context'``,
    ``'alternative'`` and ``'alternative_context'``.

    This information is needed for evaluating log-perplexity of the text with
    respect to a language model and later on to test the likelihood that the
    sentence was sampled from the model with the relevant context.

    Parameters
    ----------
    article_obj : mapping
        Article with the layout described above.
    get_edits : bool
        When true, a record's ``'alternative'`` text is collected as well
        (tagged ``'<edit>'``) if it passes the token-count filter.
    min_tokens, max_tokens : int
        Inclusive bounds on the whitespace-separated token count of a text.
    max_sentences : int or None
        Optional cap on the number of collected texts. A falsy value
        (``None`` or ``0``) disables the cap.
    """

    def __init__(self, article_obj, get_edits=False, min_tokens=10, max_tokens=100, max_sentences=None):
        self.article_obj = article_obj
        self.min_tokens = min_tokens
        self.max_tokens = max_tokens
        self.get_edits = get_edits
        self.max_sentences = max_sentences

    def __call__(self, combined=True):
        """Shorthand for :meth:`parse_dataset`."""
        return self.parse_dataset(combined)

    def _within_limits(self, n_tokens):
        """Return True when ``min_tokens <= n_tokens <= max_tokens``."""
        return self.min_tokens <= n_tokens <= self.max_tokens

    def parse_dataset(self, combined=True):
        """
        Walk over all sub titles and gather the texts that pass the length filter.

        Parameters
        ----------
        combined : bool
            If true, every returned array holds one entry per collected text.
            If false, the collected values are first turned into one numpy
            array per field and that array is wrapped as the single element
            of the returned (outer) array.

        Returns
        -------
        dict
            Keys ``'text'``, ``'length'``, ``'context'`` and ``'tag'`` map to
            object-dtype numpy arrays; ``'number_in_par'`` is
            ``np.arange(1, 1 + k)`` where ``k`` is the number of outer entries.

        Notes
        -----
        * When ``max_sentences`` is set but the article yields fewer texts
          than that, all arrays are returned empty.
          NOTE(review): this looks deliberate (only articles that are long
          enough are used) - confirm with the callers.
        * The cap is checked once per sentence record, after the optional
          alternative has been added, so with ``get_edits=True`` the result
          may hold ``max_sentences + 1`` texts.
        """
        current_texts = []
        current_lengths = []
        current_contexts = []
        current_tags = []
        exceeded_max_sentences = False

        for sub_title in self.article_obj['sub_titles']:
            for sentence in sub_title['sentences']:
                original = sentence['sentence']
                # Tokenise once and reuse the count for filter and output.
                sentence_size = len(original.split())
                if self._within_limits(sentence_size):
                    current_texts.append(original)
                    current_lengths.append(sentence_size)
                    current_contexts.append(sentence['context'] if 'context' in sentence else None)
                    current_tags.append('no edits')

                # The edited variant is filtered independently of the original.
                if self.get_edits and 'alternative' in sentence:
                    alternative = sentence['alternative']
                    alternative_size = len(alternative.split())
                    if self._within_limits(alternative_size):
                        current_texts.append(alternative)
                        current_lengths.append(alternative_size)
                        current_contexts.append(
                            sentence['alternative_context'] if 'alternative_context' in sentence else None
                        )
                        current_tags.append('<edit>')

                if self.max_sentences and len(current_texts) >= self.max_sentences:
                    exceeded_max_sentences = True
                    break

            # Propagate the inner break so no further sub titles are scanned.
            if exceeded_max_sentences:
                break

        texts = []
        lengths = []
        contexts = []
        tags = []

        # Keep the collected data when there is no cap, or when the cap was
        # reached; otherwise the result stays empty (see Notes).
        if exceeded_max_sentences or not self.max_sentences:
            if combined:
                texts = current_texts
                lengths = current_lengths
                contexts = current_contexts
                tags = current_tags
            else:
                texts = [np.array(current_texts)]
                lengths = [np.array(current_lengths)]
                contexts = [np.array(current_contexts)]
                tags = [np.array(current_tags)]

        return {
            'text': np.array(texts, dtype=object),
            'length': np.array(lengths, dtype=object),
            'context': np.array(contexts, dtype=object),
            'tag': np.array(tags, dtype=object),
            'number_in_par': np.arange(1, 1 + len(texts)),
        }