|
import pandas as pd |
|
import numpy as np |
|
import json |
|
|
|
class PrepareArticles(object):

    """
    Parse preprocessed article data into flat arrays of sentences.

    Sentences (and, optionally, their edited 'alternative' versions) are
    filtered by token count and collected together with their contexts and
    tags.  This information is needed for evaluating log-perplexity of the
    text with respect to a language model and later on to test the likelihood
    that the sentence was sampled from the model with the relevant context.
    """

    def __init__(self, article_obj, get_edits=False, min_tokens=10, max_tokens=100, max_sentences=None):
        # article_obj: dict with a 'sub_titles' list; each sub-title holds a
        # 'sentences' list of dicts with at least a 'sentence' key and
        # optionally 'context', 'alternative', 'alternative_context'.
        self.article_obj = article_obj
        self.min_tokens = min_tokens        # shortest sentence (in whitespace tokens) to keep
        self.max_tokens = max_tokens        # longest sentence (in whitespace tokens) to keep
        self.get_edits = get_edits          # if True, also collect edited 'alternative' sentences
        self.max_sentences = max_sentences  # stop collecting once this many sentences are gathered

    def __call__(self, combined=True):
        return self.parse_dataset(combined)

    def parse_dataset(self, combined=True):
        """
        Collect sentences from the article object.

        :param combined: if True, the result arrays are flat (one entry per
            sentence); otherwise each result field is a list of per-article
            numpy arrays wrapped in an object array.
        :return: dict with object arrays 'text', 'length', 'context', 'tag'
            and an int array 'number_in_par' (1-based running positions).
        """
        texts = []
        lengths = []
        contexts = []
        tags = []

        current_texts = []
        current_lengths = []
        current_contexts = []
        current_tags = []
        exceeded_max_sentences = False

        for sub_title in self.article_obj['sub_titles']:
            for sentence in sub_title['sentences']:
                # Keep only sentences within the configured token range.
                sentence_size = len(sentence['sentence'].split())
                if self.min_tokens <= sentence_size <= self.max_tokens:
                    current_texts.append(sentence['sentence'])
                    # Reuse the already-computed size instead of re-splitting.
                    current_lengths.append(sentence_size)
                    current_contexts.append(sentence.get('context'))
                    current_tags.append('no edits')

                # Optionally collect the edited alternative, subject to the
                # same token-range filter (split once, not three times).
                if self.get_edits and 'alternative' in sentence:
                    alternative_size = len(sentence['alternative'].split())
                    if self.min_tokens <= alternative_size <= self.max_tokens:
                        current_texts.append(sentence['alternative'])
                        current_lengths.append(alternative_size)
                        current_contexts.append(sentence.get('alternative_context'))
                        current_tags.append('<edit>')

                # Stop early once enough sentences have been collected; the
                # check follows every append, so at most max_sentences + 1
                # items can be present (when an edit pushes past the cap).
                if self.max_sentences and len(current_texts) >= self.max_sentences:
                    exceeded_max_sentences = True
                    break

            if exceeded_max_sentences:
                break

        # NOTE(review): when max_sentences is set, an article that never
        # reaches the cap is dropped entirely (empty result).  This preserves
        # the original behaviour -- confirm it is intended.
        if (self.max_sentences and exceeded_max_sentences) or (not self.max_sentences):
            if combined:
                texts = texts + current_texts
                lengths = lengths + current_lengths
                contexts = contexts + current_contexts
                tags = tags + current_tags
            else:
                texts.append(np.array(current_texts))
                lengths.append(np.array(current_lengths))
                contexts.append(np.array(current_contexts))
                tags.append(np.array(current_tags))

        return {'text': np.array(texts, dtype=object), 'length': np.array(lengths, dtype=object),
                'context': np.array(contexts, dtype=object), 'tag': np.array(tags, dtype=object),
                'number_in_par': np.arange(1, 1 + len(texts))}
|
|
|
|