|
|
|
|
|
import collections |
|
import re |
|
|
|
|
|
class DataInput():
    """Parse a trace file into per-sentence span tables.

    Every ``read_*`` method consumes ``self.file`` line by line and rebuilds
    ``self.sentences`` as a list of ``Single`` or ``Multiple`` objects whose
    ``spans`` attribute maps a tuple of integers to the text recorded for
    that span.  The file is read sequentially and never rewound, so a second
    ``read_*`` call on the same instance finds no lines left.

    The instance works as a context manager (``with DataInput(path) as d:``);
    alternatively call ``close()`` once reading is done.

    NOTE(review): the line formats look like decoder trace output
    ("Translating:", "Trans Opt", "covered=") -- confirm the producing tool
    with the callers.
    """

    # Patterns are compiled once for the class instead of once per call/line.
    _PHRASE_SPAN = re.compile(r"\|[0-9]+-[0-9]+\|")
    _COVERED_SPAN = re.compile(r"covered=([0-9]+)-([0-9]+)")
    _BRACKET_SPAN = re.compile(r"\[([0-9]+)\.\.([0-9]+)\]")
    _VERBOSE_HEADER = re.compile(r"\[[A-Z,a-z,\ ]+;\ [0-9]+-[0-9]+\]")
    _PAREN_INDEX = re.compile(r"\([0-9]+\)")

    def __init__(self, file_name):
        """Open *file_name* for reading; no parsing happens yet."""
        self.file = open(file_name, "r")
        self.sentences = None

    def close(self):
        """Release the underlying file handle."""
        self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    # ------------------------------------------------------------------ #
    # private helpers
    # ------------------------------------------------------------------ #

    def _store(self, sentence):
        """Finalize *sentence* (if any) and append it to ``self.sentences``."""
        if sentence is not None:
            sentence.set_length()
            self.sentences.append(sentence)

    @staticmethod
    def _dotted_span(token):
        """Turn a token such as ``[3..7]:`` into the tuple ``(3, 7)``."""
        return tuple(int(i) for i in token.strip(":[]").split(".."))

    def _read_flagged(self, cell_limit, span_re):
        """Shared body of the two ``*_flag`` readers.

        Lines with fewer than six whitespace-separated fields are skipped.
        The first field is the sentence number; *span_re* must expose the two
        span bounds as groups 1 and 2.  At most *cell_limit* lines are kept
        per span.
        """
        self.sentences = []
        sentence = None
        number = -1
        for line in self.file:
            fields = line.split()
            if len(fields) < 6:
                continue
            current = int(fields[0])
            if current != number:
                # A new sentence number closes the previous sentence.
                self._store(sentence)
                sentence = Multiple()
                sentence.number = number = current
            span = tuple(int(i) for i in span_re.search(line).groups())
            cell = sentence.spans[span]
            if len(cell) < cell_limit:
                cell.append(line.strip())
        self._store(sentence)

    @classmethod
    def _mbot_search_pattern(cls, source, target, alignment,
                             source_parent, target_parent):
        """Build the ``||| source ||| target ||| --- ||| alignment |||`` string.

        *target* and *alignment* consist of components each terminated by a
        parenthesised index such as ``(1)``; those markers are replaced by
        ``||``.  Upper-case source tokens are wrapped in brackets and
        extended with the bracketed target tokens they are aligned to;
        aligned target tokens are prefixed with the bracketed source token.
        """
        def pairs(block):
            return [tuple(int(y) for y in item.split("-"))
                    for item in block.split()]

        alignment = cls._PAREN_INDEX.sub("||", alignment)
        align_blocks = alignment.split("||")[:-1]
        marked_target = cls._PAREN_INDEX.sub("||", target)
        target_parts = [part.split() for part in marked_target.split("||")][:-1]
        source_tokens = source.split()

        # Source side: only upper-case tokens get bracketed and linked.
        for i, token in enumerate(source_tokens):
            if not token.isupper():
                continue
            source_tokens[i] = "[" + token + "]"
            for k, block in enumerate(align_blocks):
                for pair in pairs(block):
                    if pair[0] == i:
                        source_tokens[i] += "[" + target_parts[k][pair[1]] + "]"

        # Target side: every aligned token is prefixed with its source label.
        for i, part in enumerate(target_parts):
            if not part:
                continue
            block_pairs = pairs(align_blocks[i])
            for j in range(len(part)):
                for pair in block_pairs:
                    if pair[1] == j:
                        label = source_tokens[pair[0]].split("]")[0]
                        part[j] = label + "][" + part[j] + "]"

        target_text = " || ".join(" ".join(part) for part in target_parts) + " ||"
        source_text = " ".join(source_tokens) + " [" + source_parent + "]"

        # Insert one target parent label in front of each component
        # separator; "!!" temporarily hides separators already handled.
        for label in cls._PAREN_INDEX.sub("", target_parent).split():
            target_text = target_text.replace("||", " [" + label + "] !!", 1)
        target_text = target_text.replace("!!", "||")

        return ("||| " + source_text + " ||| " + target_text
                + "| --- ||| " + alignment + "|")

    # ------------------------------------------------------------------ #
    # readers
    # ------------------------------------------------------------------ #

    def read_phrase(self):
        """Read one sentence per line, phrases delimited by ``|a-b|`` tokens.

        The words preceding each ``|a-b|`` token become the text of span
        ``(a, b)`` in a ``Single``.  Sentences are numbered from 1 in file
        order.  A line without any span token makes ``set_length()`` fail.
        """
        self.sentences = []
        for line in self.file:
            sentence = Single()
            # Reset per line so leftover words never leak into the next
            # sentence.
            words = []
            for word in line.split():
                if self._PHRASE_SPAN.match(word):
                    span = tuple(int(i) for i in word.strip("|").split("-"))
                    sentence.spans[span] = " ".join(words)
                    words = []
                else:
                    words.append(word)
            sentence.set_length()
            self.sentences.append(sentence)
            sentence.number = len(self.sentences)

    def read_syntax(self):
        """Read lines whose 3rd field is a sentence number and 4th a span.

        The span field looks like ``[a..b]:``; the stripped line is stored as
        the single entry of span ``(a, b)`` in a ``Single``.  A change in the
        sentence number starts a new sentence.
        """
        self.sentences = []
        sentence = None
        number = -1
        for line in self.file:
            fields = line.split()
            current = int(fields[2])
            if current != number:
                self._store(sentence)
                sentence = Single()
                sentence.number = number = current
            sentence.spans[self._dotted_span(fields[3])] = line.strip()
        self._store(sentence)

    def read_syntax_cubes(self, cell_limit):
        """Read "Trans Opt" lines that directly follow a dashed separator.

        Only the first "Trans Opt" line after each line starting with
        ``---------`` is considered.  Its 3rd field is the sentence number
        and its 4th the ``[a..b]`` span; up to *cell_limit* lines are kept
        per span in a ``Multiple``.
        """
        self.sentences = []
        sentence = None
        number = -1
        after_separator = False
        for line in self.file:
            if line.startswith("---------"):
                after_separator = True
            elif line.startswith("Trans Opt") and after_separator:
                after_separator = False
                fields = line.split()
                current = int(fields[2])
                if current != number:
                    self._store(sentence)
                    sentence = Multiple()
                    sentence.number = number = current
                cell = sentence.spans[self._dotted_span(fields[3])]
                if len(cell) < cell_limit:
                    cell.append(line.strip())
        self._store(sentence)

    def read_phrase_stack_flag(self, cell_limit):
        """Read lines carrying a ``covered=a-b`` span marker.

        See ``_read_flagged`` for the shared line handling.
        """
        self._read_flagged(cell_limit, self._COVERED_SPAN)

    def read_phrase_stack_verbose(self, cell_limit):
        """Read blocks introduced by ``[text; a-b]`` header lines.

        A line starting with ``Translating: `` opens a new sentence, numbered
        from 0.  A header line is always stored for its span; the lines
        following it, up to the next blank line, are stored while the span
        holds fewer than *cell_limit* entries.
        """
        self.sentences = []
        sentence = None
        number = -1
        span = None
        in_block = False
        for line in self.file:
            if line.startswith("Translating: "):
                self._store(sentence)
                number += 1
                sentence = Multiple()
                sentence.number = number
            elif self._VERBOSE_HEADER.match(line):
                bounds = line.split(";")[1].strip().strip("]")
                span = tuple(int(i) for i in bounds.split("-"))
                sentence.spans[span].append(line.strip())
                in_block = True
            elif in_block:
                if line.strip() == "":
                    in_block = False
                elif len(sentence.spans[span]) < cell_limit:
                    sentence.spans[span].append(line.strip())
        self._store(sentence)

    def read_syntax_cube_flag(self, cell_limit):
        """Read lines carrying an ``[a..b]`` span marker.

        See ``_read_flagged`` for the shared line handling.
        """
        self._read_flagged(cell_limit, self._BRACKET_SPAN)

    def read_mbot(self, cell_limit):
        """Read "POPPING" hypothesis blocks and store them as search patterns.

        A line starting with ``Translating:`` opens a new sentence, numbered
        from 0.  The line after a ``POPPING`` line supplies the ``[a..b]``
        span in its 2nd field.  The following "Target Phrases",
        "Alignment Info", "Source Phrase" and "Source Left-hand-side" lines
        are remembered; a "Target Left-hand-side" line completes the
        hypothesis, which is then converted by ``_mbot_search_pattern`` and
        stored for the span unless it already holds *cell_limit* entries.
        """
        self.sentences = []
        sentence = None
        number = -1
        span = None
        expect_span = False
        in_hypothesis = False
        target = ""
        source = ""
        source_parent = ""
        target_parent = ""
        alignment = ""
        for line in self.file:
            if line.startswith("Translating:"):
                self._store(sentence)
                number += 1
                sentence = Multiple()
                sentence.number = number
            elif line.startswith("POPPING"):
                expect_span = True
            elif expect_span:
                expect_span = False
                bounds = line.split()[1].strip("[").split("]")[0]
                span = tuple(int(i) for i in bounds.split(".."))
                in_hypothesis = True
            elif in_hypothesis:
                if line.startswith("Target Phrases"):
                    target = line.split(":", 1)[1].strip()
                elif line.startswith("Alignment Info"):
                    alignment = line.split(":", 1)[1].strip()
                    if alignment == "":
                        # Empty alignment still needs one component marker.
                        alignment = "(1)"
                elif line.startswith("Source Phrase"):
                    source = line.split(":", 1)[1].strip()
                elif line.startswith("Source Left-hand-side"):
                    source_parent = line.split(":", 1)[1].strip()
                elif line.startswith("Target Left-hand-side"):
                    target_parent = line.split(":", 1)[1].strip()
                    pattern = self._mbot_search_pattern(
                        source, target, alignment,
                        source_parent, target_parent)
                    # Append once, and only below the limit: the pattern
                    # used to be appended unconditionally and then a second
                    # time, which duplicated entries and ignored cell_limit.
                    cell = sentence.spans[span]
                    if len(cell) < cell_limit:
                        cell.append(pattern)
        self._store(sentence)
|
|
|
|
|
|
|
|
|
class Single():
    """Sentence record that keeps exactly one entry per span.

    Attributes (all filled in by the code that builds the record):
    number -- sentence identifier, ``None`` until assigned
    spans  -- plain dict mapping a span tuple to its entry
    length -- largest second element among the span keys, ``None`` until
              ``set_length()`` has been called
    """

    def __init__(self):
        self.number = None
        self.spans = {}
        self.length = None

    def set_length(self):
        """Set ``length`` to the largest second element of any span key.

        Raises ValueError when no span has been recorded.
        """
        upper_bounds = [key[1] for key in self.spans]
        self.length = max(upper_bounds)

    def __str__(self):
        """Return the string form of a (number, length, span listing) tuple."""
        rows = ["\n"]
        for key in self.spans:
            rows.append(str(key) + " - " + str(self.spans[key]) + "\n")
        summary = (str(self.number), str(self.length), "".join(rows))
        return str(summary)
|
|
|
class Multiple():
    """Sentence record that keeps a list of entries per span.

    Attributes (all filled in by the code that builds the record):
    number -- sentence identifier, ``None`` until assigned
    spans  -- ``defaultdict(list)`` mapping a span tuple to its entries;
              looking up an unknown span creates an empty list for it
    length -- largest second element among the span keys, ``None`` until
              ``set_length()`` has been called
    """

    def __init__(self):
        self.number = None
        self.spans = collections.defaultdict(list)
        self.length = None

    def set_length(self):
        """Set ``length`` to the largest second element of any span key.

        Raises ValueError when no span has been recorded.
        """
        upper_bounds = [key[1] for key in self.spans]
        self.length = max(upper_bounds)

    def __str__(self):
        """Return the string form of a (number, length, span listing) tuple."""
        rows = ["\n"]
        for key in self.spans:
            rows.append(str(key) + " - " + str(self.spans[key]) + "\n")
        summary = (str(self.number), str(self.length), "".join(rows))
        return str(summary)
|
|
|
|
|
|
|
|