# File size: 4,032 Bytes
# 847e3e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python
"""
Convert CLC-FCE dataset (The Cambridge Learner Corpus) to the parallel sentences format.
"""

import argparse
import glob
import os
import re
from xml.etree import cElementTree

from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm


def annotate_fce_doc(xml):
    """Take an FCE XML document (as a string) and return its paragraphs,
    one per line, with errors annotated inline as ``{mistake=>correction}``
    spans.

    Args:
        xml: the full XML text of one FCE document.

    Returns:
        A single string: the formatted paragraphs joined with newlines.
    """
    # Function-scope import: xml.etree.cElementTree (used by the module-level
    # import) was removed in Python 3.9; the plain ElementTree module uses the
    # C accelerator automatically on Python 3.3+.
    from xml.etree import ElementTree

    doc = ElementTree.fromstring(xml)
    paragraphs = doc.findall('head/text/*/coded_answer/p')
    return '\n'.join(_get_formatted_text(p) for p in paragraphs)


def _get_formatted_text(elem, ignore_tags=None):
    text = elem.text or ''
    ignore_tags = [tag.upper() for tag in (ignore_tags or [])]
    correct = None
    mistake = None

    for child in elem.getchildren():
        tag = child.tag.upper()
        if tag == 'NS':
            text += _get_formatted_text(child)

        elif tag == 'UNKNOWN':
            text += ' UNKNOWN '

        elif tag == 'C':
            assert correct is None
            correct = _get_formatted_text(child)

        elif tag == 'I':
            assert mistake is None
            mistake = _get_formatted_text(child)

        elif tag in ignore_tags:
            pass

        else:
            raise ValueError(f"Unknown tag `{child.tag}`", text)

    if correct or mistake:
        correct = correct or ''
        mistake = mistake or ''
        if '=>' not in mistake:
            text += f'{{{mistake}=>{correct}}}'
        else:
            text += mistake

    text += elem.tail or ''
    return text


def convert_fce(fce_dir):
    """Process the whole FCE directory and return the annotated documents
    (one string per XML file, in sorted filename order).

    Raises:
        UserWarning: when *fce_dir* is not a directory or does not contain
            a ``dataset`` subdirectory.
    """
    # Validate the dataset layout before reading anything.
    if not os.path.isdir(fce_dir):
        raise UserWarning(
            f"{fce_dir} is not a valid path")

    dataset_dir = os.path.join(fce_dir, 'dataset')
    if not os.path.exists(dataset_dir):
        raise UserWarning(
            f"{fce_dir} doesn't point to a dataset's root dir")

    def _read_and_annotate(path):
        # One annotated document per XML file.
        with open(path, encoding='utf-8') as fh:
            return annotate_fce_doc(fh.read())

    xml_paths = sorted(glob.glob(os.path.join(dataset_dir, '*/*.xml')))
    return [_read_and_annotate(path) for path in xml_paths]


def main():
    """Convert the FCE dataset into two parallel plain-text files:
    ``fce-original.txt`` (erroneous sentences) and ``fce-applied.txt``
    (the same sentences with corrections applied), written under
    ``args.output``.
    """
    fce = convert_fce(args.fce_dataset_path)
    # Matches one {mistake=>correction} alert with no nesting inside;
    # hoisted out of the loop so it is compiled once.
    alert_re = re.compile(r'{([^{}]*?)=>([^{}]*?)}')
    original_path = os.path.join(args.output, "fce-original.txt")
    applied_path = os.path.join(args.output, "fce-applied.txt")
    with open(original_path, 'w', encoding='utf-8') as out_original, \
            open(applied_path, 'w', encoding='utf-8') as out_applied:
        for doc in tqdm(fce, unit='doc'):
            sents = re.split(r"\n +\n", doc)
            for sent in sents:
                tokenized_sents = sent_tokenize(sent)
                for i in range(len(tokenized_sents)):
                    # sent_tokenize may split in the middle of an alert
                    # (e.g. right after "{...=>...}."); glue such a fragment
                    # onto the following sentence.  The bounds check guards
                    # against an IndexError when the pattern matches the
                    # final sentence (the original code crashed here).
                    if (i + 1 < len(tokenized_sents)
                            and re.search(r"[{>][.?!]$", tokenized_sents[i])):
                        tokenized_sents[i + 1] = tokenized_sents[i] + " " + tokenized_sents[i + 1]
                        tokenized_sents[i] = ""
                    original = alert_re.sub(r"\1", tokenized_sents[i])
                    applied = alert_re.sub(r"\2", tokenized_sents[i])
                    # Filter out empty sentences and nested/broken alerts
                    # (leftover braces or '=' after substitution).
                    if original != "" and applied != "" and not re.search(r"[{}=]", original) \
                            and not re.search(r"[{}=]", applied):
                        out_original.write(" ".join(word_tokenize(original)) + "\n")
                        out_applied.write(" ".join(word_tokenize(applied)) + "\n")


if __name__ == '__main__':
    # Command-line entry point: parse arguments into the module-level
    # `args` name (read by main()) and run the conversion.
    parser = argparse.ArgumentParser(description=(
        "Convert CLC-FCE dataset to the parallel sentences format."))
    parser.add_argument('fce_dataset_path',
                        help='Path to the folder with the FCE dataset')
    # NOTE(review): --output has no default and is not required; omitting it
    # leaves args.output as None, so main() fails while building the output
    # paths — consider required=True or a '.' default.
    parser.add_argument('--output',
                        help='Path to the output folder')
    args = parser.parse_args()

    main()