File size: 7,101 Bytes
7a0ff7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import logging
import spacy
import re
import numpy as np
from src.SentenceParser import SentenceParser

class PrepareSentenceContext(object):
    """Parse text and extract per-sentence length, tag, and context information.

    This information is needed for evaluating log-perplexity of the text with
    respect to a language model, and later on to test the likelihood that each
    sentence was sampled from the model with the relevant context.

    Sentences wrapped in ``<edit>...</edit>`` markers are tracked so that each
    emitted sentence carries the tag it appeared under.
    """

    def __init__(self, sentence_parser='spacy', context_policy=None, context=None):
        """
        :param sentence_parser: 'spacy' (default) or 'regex'; selects the
            sentence-splitting backend stored in ``self.nlp``.
        :param context_policy: ``None`` or ``'previous_sentence'``; how the
            context of each sentence is chosen when no fixed ``context`` is set.
        :param context: fixed context used for every sentence (takes precedence
            over ``context_policy``), or ``None``.
        :raises ValueError: if ``sentence_parser`` is not a known backend.
        """
        if sentence_parser == 'spacy':
            # Only sentence segmentation is needed; disable the heavier pipes.
            self.nlp = spacy.load("en_core_web_sm", disable=["tagger", "attribute_ruler", "lemmatizer", "ner"])
        elif sentence_parser == 'regex':
            logging.warning("Regex-based parser is not good at breaking sentences like 'Dr. Stone', etc.")
            self.nlp = SentenceParser()
        else:
            # Fail fast: previously an unrecognized name left self.nlp unset and
            # only surfaced as an AttributeError on the first parse call.
            raise ValueError(f"Unknown sentence_parser {sentence_parser!r}; expected 'spacy' or 'regex'")

        self.sentence_parser_name = sentence_parser

        self.context_policy = context_policy
        self.context = context

    def __call__(self, text):
        """Shortcut for :meth:`parse_sentences`."""
        return self.parse_sentences(text)

    def parse_sentences(self, text):
        """Split ``text`` into sentences and collect text/length/context/tag info.

        :param text: raw input, possibly containing ``<edit>``/``</edit>`` tags.
        :return: dict with keys ``'text'``, ``'length'``, ``'context'``,
            ``'tag'`` and ``'number_in_par'`` (1-based index, numpy array).
        """
        pattern_close = r"(.*?)</edit>"
        # NOTE: the previous pattern r"<edit>(.*?)" ended in a lazy group, which
        # always captures the empty string, so text following an opening tag in
        # the same sentence was silently dropped. Capture greedily instead.
        pattern_open = r"<edit>(.*)"
        pattern_between = r"<edit>(.*?)</edit>"
        MIN_TOKEN_LEN = 3

        texts = []
        tags = []
        lengths = []
        contexts = []

        def update_sent(sent_text, tag, sent_length):
            # Resolve the context *before* appending the current sentence:
            # previously texts[-1] was read after the append, so the
            # 'previous_sentence' policy returned the sentence itself.
            if self.context is not None:
                context = self.context
            elif self.context_policy == 'previous_sentence' and len(texts) > 0:
                context = texts[-1]
            else:
                context = None
            texts.append(sent_text)
            tags.append(tag)
            lengths.append(sent_length)
            contexts.append(context)

        curr_tag = None
        parsed = self.nlp(text)
        for s in parsed.sents:
            prev_tag = curr_tag
            matches_close = re.findall(pattern_close, s.text)
            matches_open = re.findall(pattern_open, s.text)
            matches_between = re.findall(pattern_between, s.text)

            logging.debug(f"Current sentence: {s.text}")
            logging.debug(f"Matches open: {matches_open}")
            logging.debug(f"Matches close: {matches_close}")
            logging.debug(f"Matches between: {matches_between}")
            if len(matches_close) > 0 and len(matches_open) > 0:
                logging.debug("Found an opening and a closing tag in the same sentence.")
                # Use the text *between* the tags here: matches_open extends
                # past the closing tag when both appear in one sentence.
                if prev_tag is None and len(matches_between) > 0 and len(matches_between[0]) >= MIN_TOKEN_LEN:
                    logging.debug("Openning followed by closing with some text in between.")
                    update_sent(matches_between[0], "<edit>", len(s)-2)  # minus the two tag tokens
                    curr_tag = None
                if prev_tag == "<edit>" and len(matches_close[0]) >= MIN_TOKEN_LEN:
                    logging.warning(f"Wierd case: closing/openning followed by openning in sentence {len(texts)}")
                    update_sent(matches_close[0], prev_tag, len(s)-1)
                    curr_tag = None
                if prev_tag == "</edit>":
                    logging.debug("Closing followed by openning.")
                    curr_tag = "<edit>"
                    # Guard: with a close-then-open ordering there may be no
                    # <edit>...</edit> span; indexing blindly raised IndexError.
                    if len(matches_between) > 0 and len(matches_between[0]) > MIN_TOKEN_LEN:
                        update_sent(matches_between[0], None, len(s)-2)
            elif len(matches_open) > 0:
                curr_tag = "<edit>"
                assert prev_tag is None, f"Found an opening tag without a closing tag in sentence num. {len(texts)}"
                if len(matches_open[0]) >= MIN_TOKEN_LEN:
                    # text and tag are in the same sentence
                    sent_text = matches_open[0]
                    update_sent(sent_text, curr_tag, len(s)-1)
            elif len(matches_close) > 0:
                curr_tag = "</edit>"
                assert prev_tag == "<edit>", f"Found a closing tag without an opening tag in sentence num. {len(texts)}"
                if len(matches_close[0]) >= MIN_TOKEN_LEN:
                    # text and tag are in the same sentence
                    update_sent(matches_close[0], prev_tag, len(s)-1)
                curr_tag = None
            else:
                # no tag in this sentence: keep it under the currently open tag
                update_sent(s.text, curr_tag, len(s))
        return {'text': texts, 'length': lengths, 'context': contexts, 'tag': tags,
                    'number_in_par': np.arange(1, 1+len(texts))}

    def REMOVE_parse_sentences(self, text):
        """Deprecated older implementation kept for reference (name says REMOVE).

        Forces tags onto their own sentences via a regex rewrite, then parses;
        prefer :meth:`parse_sentences`.
        """
        texts = []
        contexts = []
        lengths = []
        tags = []
        num_in_par = []
        previous = None

        # Raw string: the old non-raw literal triggered invalid-escape warnings.
        text = re.sub(r"(</?[a-zA-Z0-9 ]+>\.?)\s+", r"\1.\n", text)  # to make sure that tags are in separate sentences

        parsed = self.nlp(text)

        running_sent_num = 0
        curr_tag = None
        for i, sent in enumerate(parsed.sents):
            # Here we try to track HTML-like tags. There might be
            # some issues because spacy sentence parser has unexpected behavior when it comes to newlines
            all_tags = re.findall(r"(</?[a-zA-Z0-9 ]+>)", str(sent))
            if len(all_tags) > 1:
                # NOTE(review): exiting the whole process from library code is
                # harsh; kept as-is in this deprecated path.
                logging.error(f"More than one tag in sentence {i}: {all_tags}")
                exit(1)
            if len(all_tags) == 1:
                tag = all_tags[0]
                if tag[:2] == '</':  # a closing tag
                    if curr_tag is None:
                        logging.warning(f"Closing tag without an opening tag in sentence {i}: {sent}")
                    else:
                        curr_tag = None
                else:
                    if curr_tag is not None:
                        logging.warning(f"Opening tag without a closing tag in sentence {i}: {sent}")
                    else:
                        curr_tag = tag
            else:  # if text is not a tag
                sent_text = str(sent)
                sent_length = len(sent)

                texts.append(sent_text)
                running_sent_num += 1
                num_in_par.append(running_sent_num)
                tags.append(curr_tag)
                lengths.append(sent_length)

                if self.context is not None:
                    context = self.context
                elif self.context_policy is None:
                    context = None
                elif self.context_policy == 'previous_sentence':
                    context = previous
                    previous = sent_text
                else:
                    context = None

                contexts.append(context)
        return {'text': texts, 'length': lengths, 'context': contexts, 'tag': tags,
                'number_in_par': num_in_par}