import logging
import spacy
import re
import numpy as np
from src.SentenceParser import SentenceParser


class PrepareSentenceContext(object):
    """
    Parse text into sentences and extract length, tag and context information.

    This information is needed for evaluating log-perplexity of the text with
    respect to a language model and later on to test the likelihood that the
    sentence was sampled from the model with the relevant context.

    Instances are callable: ``obj(text)`` is the same as
    ``obj.parse_sentences(text)``.
    """

    def __init__(self, sentence_parser='spacy', context_policy=None, context=None):
        """
        Select the sentence splitter and the context configuration.

        Args:
            sentence_parser: ``'spacy'`` loads the ``en_core_web_sm`` pipeline
                with the tagger, attribute ruler, lemmatizer and NER disabled;
                ``'regex'`` uses the project's ``SentenceParser`` (a warning
                about its limitations is logged).
            context_policy: ``None`` for no context, or ``'previous_sentence'``
                to use the previously recorded sentence as context. These are
                the only values ``parse_sentences`` distinguishes.
            context: a fixed context; when not ``None`` it is used for every
                sentence and takes precedence over ``context_policy``.
        """
        # NOTE(review): the two checks are independent `if`s and there is no
        # fallback, so any other `sentence_parser` value leaves `self.nlp`
        # unset and the first parse call would fail with AttributeError.
        if sentence_parser == 'spacy':
            self.nlp = spacy.load("en_core_web_sm", disable=["tagger", "attribute_ruler", "lemmatizer", "ner"])
        if sentence_parser == 'regex':
            logging.warning("Regex-based parser is not good at breaking sentences like 'Dr. Stone', etc.")
            self.nlp = SentenceParser()

        self.sentence_parser_name = sentence_parser

        self.context_policy = context_policy
        self.context = context

    def __call__(self, text):
        """Shorthand for ``parse_sentences(text)``."""
        return self.parse_sentences(text)

    def parse_sentences(self, text):
        """
        Split ``text`` into sentences and record text, tag, length and context
        for each recorded sentence.

        Args:
            text: the raw text handed to ``self.nlp``.

        Returns:
            A dict of parallel sequences with keys ``'text'``, ``'length'``,
            ``'context'``, ``'tag'`` and ``'number_in_par'``; the last one is
            ``np.arange(1, 1 + number_of_recorded_sentences)``.

        Raises:
            AssertionError: when an opening match is seen while a tag is
                already active, or a closing match is seen without the
                expected preceding tag.
        """
        # NOTE(review): as written, `pattern_close`, `pattern_open` and the
        # "between" pattern below are all r"(.*?)", and every tag literal in
        # this method is the empty string. This looks like tag delimiters were
        # lost from the literals -- confirm against the upstream file. With
        # these literals `re.findall` returns empty-string matches for any
        # input, so the "both tags" branch is always the one taken.
        pattern_close = r"(.*?)"
        pattern_open = r"(.*?)"
        # Minimum number of characters a captured fragment must have to be
        # recorded (it is compared against len() of the matched string).
        MIN_TOKEN_LEN = 3

        # Parallel accumulators, one entry per recorded sentence.
        texts = []
        tags = []
        lengths = []
        contexts = []

        def update_sent(sent_text, tag, sent_length):
            # Record one sentence and resolve its context.
            texts.append(sent_text)
            tags.append(tag)
            lengths.append(sent_length)
            if self.context is not None:
                # A fixed context overrides any policy.
                context = self.context
            elif self.context_policy is None:
                context = None
            elif self.context_policy == 'previous_sentence' and len(texts) > 0:
                # NOTE(review): `sent_text` was appended just above, so
                # `texts[-1]` is the current sentence and `len(texts) > 0`
                # always holds here -- confirm whether `texts[-2]` was meant.
                context = texts[-1]
            else:
                context = None
            contexts.append(context)

        # Tag state carried from one sentence to the next.
        curr_tag = None
        parsed = self.nlp(text)
        for s in parsed.sents:
            prev_tag = curr_tag
            matches_close = re.findall(pattern_close, s.text)
            matches_open = re.findall(pattern_open, s.text)
            matches_between = re.findall(r"(.*?)", s.text)

            logging.debug(f"Current sentence: {s.text}")
            logging.debug(f"Matches open: {matches_open}")
            logging.debug(f"Matches close: {matches_close}")
            logging.debug(f"Matches between: {matches_between}")
            # The `len(s) - k` lengths below presumably discount tag tokens
            # contained in the sentence -- TODO confirm.
            if len(matches_close)>0 and len(matches_open)>0:
                logging.debug("Found an opening and a closing tag in the same sentence.")
                if prev_tag is None and len(matches_open[0]) >= MIN_TOKEN_LEN:
                    logging.debug("Openning followed by closing with some text in between.")
                    update_sent(matches_open[0], "", len(s)-2)
                    curr_tag = None
                if prev_tag == "" and len(matches_close[0]) >= MIN_TOKEN_LEN:
                    logging.warning(f"Wierd case: closing/openning followed by openning in sentence {len(texts)}")
                    update_sent(matches_close[0], prev_tag, len(s)-1)
                    curr_tag = None
                if prev_tag == "":
                    logging.debug("Closing followed by openning.")
                    curr_tag = ""
                    # NOTE(review): `matches_between[0]` is indexed without an
                    # emptiness check, and this comparison is strict (`>`)
                    # while the other length checks use `>=` -- confirm both.
                    if len(matches_between[0]) > MIN_TOKEN_LEN:
                        update_sent(matches_between[0], None, len(s)-2)
            elif len(matches_open) > 0:
                curr_tag = ""
                assert prev_tag is None, f"Found an opening tag without a closing tag in sentence num. {len(texts)}"
                if len(matches_open[0]) >= MIN_TOKEN_LEN:
                    # text and tag are in the same sentence
                    sent_text = matches_open[0]
                    update_sent(sent_text, curr_tag, len(s)-1)
            elif len(matches_close) > 0:
                curr_tag = ""
                assert prev_tag == "", f"Found a closing tag without an opening tag in sentence num. {len(texts)}"
                if len(matches_close[0]) >= MIN_TOKEN_LEN:
                    # text and tag are in the same sentence
                    update_sent(matches_close[0], prev_tag, len(s)-1)
                # Reset the carried tag state after the closing branch.
                curr_tag = None
            else:
                #if len(matches_close)==0 and len(matches_open)==0:
                # no tag
                update_sent(s.text, curr_tag, len(s))
        return {'text': texts, 'length': lengths, 'context': contexts, 'tag': tags, 'number_in_par': np.arange(1,1+len(texts))}

    def REMOVE_parse_sentences(self, text):
        texts = []
        contexts = []
        lengths = []
        tags = []
        num_in_par = []
        previous = None

        text = re.sub("(\.?)\s+", r"\1.\n", text)  # to make sure that tags are in separate sentences
        #text = re.sub("(\.?)\s+", r"\n\1.\n", text)  # to make sure that tags are in separate sentences

        parsed = self.nlp(text)

        running_sent_num = 0
        curr_tag = None
        for i, sent in enumerate(parsed.sents):
            # Here we try to track HTML-like tags.
There might be # some issues because spacy sentence parser has unexpected behavior when it comes to newlines all_tags = re.findall(r"()", str(sent)) if len(all_tags) > 1: logging.error(f"More than one tag in sentence {i}: {all_tags}") exit(1) if len(all_tags) == 1: tag = all_tags[0] if tag[:2] == '