File size: 3,435 Bytes
6ed21b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import sys
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader


def replace_leaves(parse_path, conll_path, output_path):
    """Replace the leaves of bracketed parse trees with tokens from a CoNLL file.

    The trees in ``parse_path`` and the sentences in ``conll_path`` have to
    correspond to each other: same number, same order, and every tree must
    have exactly as many leaves as its sentence has token lines. The updated
    trees are written to ``output_path`` in bracketed text form, one
    ``pformat`` result per tree followed by a newline.

    Args:
        parse_path: Path of the file holding the bracketed parse trees.
        conll_path: Path of the CoNLL-style file. Sentences are separated by
            blank lines; the second whitespace-separated column of every
            token line is taken as the new leaf.
        output_path: Path of the file to write (opened in ``'w'`` mode).

    Raises:
        IndexError: If a non-blank CoNLL line has fewer than two columns.
        AssertionError: If the number of trees and sentences differs, or if
            a tree and its sentence differ in length.
    """
    # Extract the new leaves from the CoNLL file, one list per sentence.
    leaves = []
    current_leaves = []
    with open(conll_path, 'r', encoding='utf-8') as f_conll:
        for line_no, line in enumerate(f_conll, start=1):
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line: sentence boundary. Empty
                # groups are skipped so repeated blank lines do not create
                # phantom sentences.
                if current_leaves:
                    leaves.append(current_leaves)
                    current_leaves = []
                continue
            if len(fields) < 2:
                raise IndexError(
                    f"{conll_path}, line {line_no}: expected at least 2 "
                    f"columns, got {len(fields)}"
                )
            current_leaves.append(fields[1])
    # Flush the last sentence when the file does not end with a blank line.
    if current_leaves:
        leaves.append(current_leaves)

    # Read the original parse trees.
    reader = BracketParseCorpusReader('', [parse_path])
    trees = reader.parsed_sents()

    # Explicit raises (rather than ``assert``) so the checks survive
    # ``python -O``; the exception type is kept for existing callers.
    if len(trees) != len(leaves):
        raise AssertionError("The number of trees and leaves is not matched.")

    with open(output_path, 'w', encoding='utf-8') as f_output:
        for i, (tree, current_leaves) in enumerate(zip(trees, leaves)):
            leaf_positions = tree.treepositions('leaves')
            if len(leaf_positions) != len(current_leaves):
                raise AssertionError(
                    f"The number of leaves is not matched at position {i}: "
                    f"{len(leaf_positions)} vs {len(current_leaves)}\n"
                    f"{tree.leaves()}\n{current_leaves}"
                )
            for pos, leaf in zip(leaf_positions, current_leaves):
                tree[pos] = leaf
            # The huge margin keeps pformat from wrapping the tree.
            f_output.write('{}\n'.format(tree.pformat(margin=1e100)))

def replace_labels(parse_path, conll_path, output_path):
    """Replace the label of each leaf's parent node with a label from a CoNLL file.

    For every leaf of every tree in ``parse_path``, the node directly above
    the leaf (its preterminal, presumably the POS tag) gets its label set to
    the second column of the corresponding token line in ``conll_path``. The
    leaves themselves are left unchanged. Trees and sentences have to
    correspond to each other: same number, same order, and every tree must
    have exactly as many leaves as its sentence has token lines. The updated
    trees are written to ``output_path`` in bracketed text form, one
    ``pformat`` result per tree followed by a newline.

    Args:
        parse_path: Path of the file holding the bracketed parse trees.
        conll_path: Path of the CoNLL-style file. Sentences are separated by
            blank lines; the second whitespace-separated column of every
            token line is taken as the new label.
        output_path: Path of the file to write (opened in ``'w'`` mode).

    Raises:
        IndexError: If a non-blank CoNLL line has fewer than two columns.
        AssertionError: If the number of trees and sentences differs, or if
            a tree and its sentence differ in length.
    """
    # Extract the new labels from the CoNLL file, one list per sentence.
    labels = []
    current_labels = []
    with open(conll_path, 'r', encoding='utf-8') as f_conll:
        for line_no, line in enumerate(f_conll, start=1):
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line: sentence boundary. Empty
                # groups are skipped so repeated blank lines do not create
                # phantom sentences.
                if current_labels:
                    labels.append(current_labels)
                    current_labels = []
                continue
            if len(fields) < 2:
                raise IndexError(
                    f"{conll_path}, line {line_no}: expected at least 2 "
                    f"columns, got {len(fields)}"
                )
            current_labels.append(fields[1])
    # Flush the last sentence when the file does not end with a blank line.
    if current_labels:
        labels.append(current_labels)

    # Read the original parse trees.
    reader = BracketParseCorpusReader('', [parse_path])
    trees = reader.parsed_sents()

    # Explicit raises (rather than ``assert``) so the checks survive
    # ``python -O``; the exception type is kept for existing callers.
    if len(trees) != len(labels):
        raise AssertionError("The number of trees and leaves is not matched.")

    with open(output_path, 'w', encoding='utf-8') as f_output:
        for i, (tree, current_labels) in enumerate(zip(trees, labels)):
            leaf_positions = tree.treepositions('leaves')
            if len(leaf_positions) != len(current_labels):
                raise AssertionError(
                    f"The number of leaves is not matched at position {i}: "
                    f"{len(leaf_positions)} vs {len(current_labels)}\n"
                    f"{tree.leaves()}\n{current_labels}"
                )
            for pos, label in zip(leaf_positions, current_labels):
                # pos[:-1] addresses the parent of the leaf at pos.
                tree[pos[:-1]].set_label(label)
            # The huge margin keeps pformat from wrapping the tree.
            f_output.write('{}\n'.format(tree.pformat(margin=1e100)))


"""
For example:
cd schmid/MHG-Parser/self-attentive-parser-master
python src/utils.py data/mhg/MHG.parses data/mhg/MHG.mapped data/mhg/MHG_retag.parses
"""
if __name__ == '__main__':
    # Command-line entry point: expects exactly three file paths.
    # An explicit check instead of ``assert`` so it is not stripped by
    # ``python -O``; sys.exit with a string prints it to stderr and exits
    # with status 1.
    if len(sys.argv) != 4:
        sys.exit(
            "Wrong number of input file paths\n"
            f"usage: {sys.argv[0]} PARSE_PATH CONLL_PATH OUTPUT_PATH"
        )
    parse_path, conll_path, output_path = sys.argv[1:]
    # Relabels the nodes above the leaves. To replace the leaves themselves,
    # call replace_leaves(parse_path, conll_path, output_path) instead.
    replace_labels(parse_path, conll_path, output_path)