Spaces:
Runtime error
Runtime error
File size: 7,101 Bytes
7a0ff7a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
import logging
import spacy
import re
import numpy as np
from src.SentenceParser import SentenceParser
class PrepareSentenceContext(object):
    """
    Parse text and extract length and context information

    This information is needed for evaluating log-perplexity of the text with respect to a language model
    and later on to test the likelihood that the sentence was sampled from the model with the relevant context.

    Sentences may be wrapped in ``<edit>`` ... ``</edit>`` markers; every returned
    sentence carries the tag that was active when it was read (``"<edit>"`` or ``None``).
    """

    def __init__(self, sentence_parser='spacy', context_policy=None, context=None):
        """
        Args:
            sentence_parser: ``'spacy'`` loads the ``en_core_web_sm`` pipeline (with the
                tagger, attribute ruler, lemmatizer and NER disabled); ``'regex'`` uses
                the project's ``SentenceParser``.
            context_policy: ``None`` for no context, or ``'previous_sentence'`` to use
                the preceding returned sentence as the context of each sentence.
            context: fixed context used for every sentence; when not ``None`` it takes
                precedence over ``context_policy``.

        Raises:
            ValueError: if ``sentence_parser`` is not one of the supported names.
        """
        if sentence_parser == 'spacy':
            self.nlp = spacy.load("en_core_web_sm", disable=["tagger", "attribute_ruler", "lemmatizer", "ner"])
        elif sentence_parser == 'regex':
            logging.warning("Regex-based parser is not good at breaking sentences like 'Dr. Stone', etc.")
            self.nlp = SentenceParser()
        else:
            # Fail here rather than with an AttributeError on the first call,
            # which is what happened when self.nlp was silently left unset.
            raise ValueError(
                f"Unknown sentence_parser {sentence_parser!r}; expected 'spacy' or 'regex'"
            )

        self.sentence_parser_name = sentence_parser

        self.context_policy = context_policy
        self.context = context

    def __call__(self, text):
        """Shorthand for :meth:`parse_sentences`."""
        return self.parse_sentences(text)

    def parse_sentences(self, text):
        """
        Split ``text`` into sentences and track ``<edit>`` / ``</edit>`` markers.

        ``self.nlp(text)`` must return an object with a ``sents`` iterable whose items
        expose ``.text`` and support ``len()``.

        Args:
            text: the raw text, optionally containing ``<edit>``/``</edit>`` markers.

        Returns:
            dict with parallel entries, one per kept sentence:
            ``'text'`` (sentence text with the markers stripped),
            ``'length'`` (``len(sentence)`` minus 1 per marker found in the sentence --
            presumably to discount the marker itself; confirm against the tokenizer),
            ``'context'`` (see ``__init__``), ``'tag'`` (``"<edit>"`` or ``None``) and
            ``'number_in_par'`` (1-based index as a numpy array).

        Raises:
            AssertionError: on an opening tag while an edit is already open, or a
                closing tag without a preceding opening tag.
        """
        pattern_close = r"(.*?)</edit>"
        # Greedy on purpose: a trailing lazy group ("<edit>(.*?)") always captures the
        # empty string, so text following an opening tag would never be kept.
        pattern_open = r"<edit>(.*)"
        pattern_between = r"<edit>(.*?)</edit>"
        # NOTE: despite the name, this threshold is compared against the number of
        # characters of the captured text, not a token count.
        MIN_TOKEN_LEN = 3

        texts = []
        tags = []
        lengths = []
        contexts = []

        def update_sent(sent_text, tag, sent_length):
            # The context must be resolved BEFORE the sentence is appended; otherwise
            # texts[-1] is the sentence itself rather than the previous one.
            if self.context is not None:
                context = self.context
            elif self.context_policy == 'previous_sentence' and texts:
                context = texts[-1]
            else:
                context = None

            texts.append(sent_text)
            tags.append(tag)
            lengths.append(sent_length)
            contexts.append(context)

        curr_tag = None
        parsed = self.nlp(text)
        for s in parsed.sents:
            prev_tag = curr_tag
            matches_close = re.findall(pattern_close, s.text)
            matches_open = re.findall(pattern_open, s.text)
            matches_between = re.findall(pattern_between, s.text)
            # Text enclosed by a tag pair inside this sentence ("" when there is none,
            # e.g. when the closing tag precedes the opening tag).
            between = matches_between[0] if matches_between else ""

            logging.debug(f"Current sentence: {s.text}")
            logging.debug(f"Matches open: {matches_open}")
            logging.debug(f"Matches close: {matches_close}")
            logging.debug(f"Matches between: {matches_between}")

            if len(matches_close) > 0 and len(matches_open) > 0:
                logging.debug("Found an opening and a closing tag in the same sentence.")
                if prev_tag is None and len(between) >= MIN_TOKEN_LEN:
                    logging.debug("Openning followed by closing with some text in between.")
                    update_sent(between, "<edit>", len(s)-2)
                    curr_tag = None
                if prev_tag == "<edit>" and len(matches_close[0]) >= MIN_TOKEN_LEN:
                    logging.warning(f"Wierd case: closing/openning followed by openning in sentence {len(texts)}")
                    update_sent(matches_close[0], prev_tag, len(s)-1)
                    curr_tag = None
                if prev_tag == "</edit>":
                    # NOTE(review): curr_tag is reset to None after every closing tag
                    # below, so this branch is not expected to fire; it is kept to
                    # mirror the original handling, now without the unguarded index.
                    logging.debug("Closing followed by openning.")
                    curr_tag = "<edit>"
                    if len(between) > MIN_TOKEN_LEN:
                        update_sent(between, None, len(s)-2)
            elif len(matches_open) > 0:
                curr_tag = "<edit>"
                assert prev_tag is None, f"Found an opening tag without a closing tag in sentence num. {len(texts)}"
                if len(matches_open[0]) >= MIN_TOKEN_LEN:
                    # text and tag are in the same sentence
                    sent_text = matches_open[0]
                    update_sent(sent_text, curr_tag, len(s)-1)
            elif len(matches_close) > 0:
                curr_tag = "</edit>"
                assert prev_tag == "<edit>", f"Found a closing tag without an opening tag in sentence num. {len(texts)}"
                if len(matches_close[0]) >= MIN_TOKEN_LEN:
                    # text and tag are in the same sentence
                    update_sent(matches_close[0], prev_tag, len(s)-1)
                # The edit span is over: following sentences are untagged.
                curr_tag = None
            else:
                # no tag in this sentence: it inherits the currently open tag (if any)
                update_sent(s.text, curr_tag, len(s))

        return {'text': texts, 'length': lengths, 'context': contexts, 'tag': tags,
                'number_in_par': np.arange(1, 1+len(texts))}

    def REMOVE_parse_sentences(self, text):
        """
        Legacy parser that tracks arbitrary HTML-like tags (``<name>`` / ``</name>``).

        Tags are first pushed onto their own lines so the sentence splitter tends to
        isolate them; a sentence containing exactly one tag only updates the current
        tag and is not returned.

        Args:
            text: the raw text, optionally containing HTML-like tags.

        Returns:
            dict with the same keys as :meth:`parse_sentences`; here
            ``'number_in_par'`` is a plain list of 1-based indices.

        Raises:
            SystemExit: (code 1) if a single sentence contains more than one tag.
        """
        texts = []
        contexts = []
        lengths = []
        tags = []
        num_in_par = []
        previous = None

        # to make sure that tags are in separate sentences
        # (raw string: "\." and "\s" are not valid escapes in a plain literal)
        text = re.sub(r"(</?[a-zA-Z0-9 ]+>\.?)\s+", r"\1.\n", text)
        parsed = self.nlp(text)

        running_sent_num = 0
        curr_tag = None
        for i, sent in enumerate(parsed.sents):
            # Here we try to track HTML-like tags. There might be
            # some issues because spacy sentence parser has unexpected behavior when it comes to newlines
            all_tags = re.findall(r"(</?[a-zA-Z0-9 ]+>)", str(sent))
            if len(all_tags) > 1:
                logging.error(f"More than one tag in sentence {i}: {all_tags}")
                # Same effect as exit(1) without relying on the `site` module's helper.
                raise SystemExit(1)
            if len(all_tags) == 1:
                tag = all_tags[0]
                if tag[:2] == '</':  # a closing tag
                    if curr_tag is None:
                        logging.warning(f"Closing tag without an opening tag in sentence {i}: {sent}")
                    else:
                        curr_tag = None
                else:
                    if curr_tag is not None:
                        logging.warning(f"Opening tag without a closing tag in sentence {i}: {sent}")
                    else:
                        curr_tag = tag
            else:  # if text is not a tag
                sent_text = str(sent)
                sent_length = len(sent)

                texts.append(sent_text)
                running_sent_num += 1
                num_in_par.append(running_sent_num)
                tags.append(curr_tag)
                lengths.append(sent_length)

                if self.context is not None:
                    context = self.context
                elif self.context_policy is None:
                    context = None
                elif self.context_policy == 'previous_sentence':
                    context = previous
                    previous = sent_text
                else:
                    context = None

                contexts.append(context)
        return {'text': texts, 'length': lengths, 'context': contexts, 'tag': tags,
                'number_in_par': num_in_par}