import pandas as pd
import numpy as np
import json


class PrepareArticles(object):
    """
    Collect the sentences of a preprocessed article object into flat arrays.

    This information is needed for evaluating log-perplexity of the text
    with respect to a language model and later on to test the likelihood
    that the sentence was sampled from the model with the relevant context.

    The article object is read as a mapping with this layout::

        {'sub_titles': [
            {'sentences': [
                {'sentence': str,
                 'context': ...,              # optional
                 'alternative': str,          # optional
                 'alternative_context': ...}, # optional
                ...]},
            ...]}
    """

    def __init__(self, article_obj, get_edits=False, min_tokens=10, max_tokens=100, max_sentences=None):
        """
        Store the article and the filtering parameters.

        Args:
            article_obj: Mapping with a 'sub_titles' list (layout above).
            get_edits: When true, also collect each sentence's 'alternative'
                text if it is present and passes the length filter.
            min_tokens: Minimum whitespace-separated token count (inclusive).
            max_tokens: Maximum whitespace-separated token count (inclusive).
            max_sentences: Stop collecting once at least this many texts
                were gathered. A falsy value (None or 0) disables the limit.
        """
        self.article_obj = article_obj
        self.min_tokens = min_tokens
        self.max_tokens = max_tokens
        self.get_edits = get_edits
        self.max_sentences = max_sentences

    def __call__(self, combined=True):
        """Shorthand for ``parse_dataset(combined)``."""
        return self.parse_dataset(combined)

    def _is_valid_length(self, n_tokens):
        """Return True when n_tokens lies in [min_tokens, max_tokens]."""
        return self.min_tokens <= n_tokens <= self.max_tokens

    def parse_dataset(self, combined=True):
        """
        Walk over every sub title and sentence and collect the valid texts.

        Args:
            combined: If true, every returned array holds one entry per
                collected text. Otherwise the collected values are first
                wrapped in a single numpy array, which becomes the only
                element of the corresponding output.

        Returns:
            dict with the keys 'text', 'length', 'context', 'tag' (object
            arrays) and 'number_in_par' (``1..len(texts)``). When
            ``max_sentences`` is set but was never reached, the collected
            sentences are discarded and the arrays are empty.
        """
        current_texts = []
        current_lengths = []
        current_contexts = []
        current_tags = []
        exceeded_max_sentences = False

        for sub_title in self.article_obj['sub_titles']:
            for sentence in sub_title['sentences']:
                original = sentence['sentence']
                n_tokens = len(original.split())  # Tokenise once, reuse below
                if self._is_valid_length(n_tokens):
                    current_texts.append(original)
                    current_lengths.append(n_tokens)
                    current_contexts.append(sentence.get('context'))
                    current_tags.append('no edits')

                # The edited variant is filtered independently of the original
                # sentence, so it may be kept even if the original was dropped.
                if self.get_edits and 'alternative' in sentence:
                    alternative = sentence['alternative']
                    n_alt_tokens = len(alternative.split())
                    if self._is_valid_length(n_alt_tokens):
                        current_texts.append(alternative)
                        current_lengths.append(n_alt_tokens)
                        current_contexts.append(sentence.get('alternative_context'))
                        # NOTE(review): edited sentences are tagged with an
                        # empty string -- confirm this is the intended tag.
                        current_tags.append('')

                # Checked after both appends, so with get_edits the result may
                # hold one text more than max_sentences.
                if self.max_sentences and len(current_texts) >= self.max_sentences:
                    exceeded_max_sentences = True
                    break
            if exceeded_max_sentences:
                break

        texts = []
        lengths = []
        contexts = []
        tags = []

        # Keep the data when no limit is configured, or when the configured
        # limit was actually reached; otherwise return empty results.
        if exceeded_max_sentences or not self.max_sentences:
            if combined:
                texts = current_texts
                lengths = current_lengths
                contexts = current_contexts
                tags = current_tags
            else:
                texts = [np.array(current_texts)]
                lengths = [np.array(current_lengths)]
                contexts = [np.array(current_contexts)]
                tags = [np.array(current_tags)]

        return {
            'text': np.array(texts, dtype=object),
            'length': np.array(lengths, dtype=object),
            'context': np.array(contexts, dtype=object),
            'tag': np.array(tags, dtype=object),
            'number_in_par': np.arange(1, 1 + len(texts)),
        }