Spaces:
Runtime error
Runtime error
"""Encapsulates the layer of all word and punctuation symbols.

Layer 0 is the basic layer for all the UCCA annotation, as it includes the
actual words and punctuation marks found in the :class:`core`.Passage.

Layer 0 has only one type of node, :class:`Terminal`. This is a subtype of
:class:`core`.Node, and can have one of two tags: Word or Punctuation.

"""
from ucca import core | |
# ID of this layer; also the prefix of every Terminal ID
# (Terminal IDs are LAYER_ID + core.Node.ID_SEPARATOR + position).
LAYER_ID = '0'
class NodeTags:
    """Namespace of the tags a layer 0 node can carry.

    Used as a plain constant holder; it is not meant to be instantiated.
    """

    Punct = 'Punctuation'
    Word = 'Word'

    # Setting __init__ to None makes any attempt to instantiate the class
    # fail, so it can only be used through its class attributes.
    __init__ = None
# Keys of the attribute dictionary carried by every Terminal.
ATTRIB_KEYS = ('text', 'paragraph', 'paragraph_position')
class Terminal(core.Node):
    """Layer 0 Node type, represents a word or a punctuation mark.

    Terminals are :class:`core`.Node objects which represent a word or
    a punctuation mark in the :class:`core`.Passage object. They are immutable,
    as they shouldn't be changed throughout their use and have no children.
    Hence, they can be compared and hashed, unlike other core.Node subclasses.

    Attributes:
        ID: the unique ID of each Terminal is its global position in the
            Passage, e.g. ID=0.4 is the 4th Terminal in the :class:`Passage`.
        tag: from NodeTags
        layer: '0' (LAYER_ID)
        attrib: returns a copy of the attribute dictionary, so changing it
            will not affect the Terminal object
        text: text of the Terminal, whether punctuation or a word
        position: global position of the Terminal in the passage, starting at 1
        paragraph: which paragraph the Terminal belongs to, starting at 1
        para_pos: the position of the Terminal in the paragraph,
            starting at 1 (per paragraph).
        punct: whether the Terminal is a punctuation mark (boolean)

    """

    # The accessors below are read as plain attributes everywhere in this
    # module (e.g. ``self.paragraph - 1``, ``str(self.text)``), so they must
    # be properties rather than methods.

    @property
    def text(self):
        """Text of the Terminal, taken from the 'text' attribute."""
        return self.attrib['text']

    @property
    def position(self):
        """Global position of the Terminal, parsed from its ID."""
        # the format of ID is LAYER_ID + ID separator + position
        return int(self.ID[len(LAYER_ID) + len(core.Node.ID_SEPARATOR):])

    @property
    def para_pos(self):
        """Position of the Terminal inside its paragraph."""
        return self.attrib['paragraph_position']

    @property
    def paragraph(self):
        """Number of the paragraph the Terminal belongs to."""
        return self.attrib['paragraph']

    @property
    def tok(self):
        """Entry of the layer's "doc" extra data matching this Terminal.

        :return: ``layer.extra["doc"][paragraph - 1][para_pos - 1]``, or None
                 if the "doc" entry is missing or has no such index
        """
        try:
            return self.layer.extra["doc"][self.paragraph - 1][self.para_pos - 1]
        except (KeyError, IndexError):
            return None

    def get_annotation(self, attr, as_array=False):
        """Returns the annotation stored for this Terminal under attr.

        :param attr: annotation descriptor; it is called as a function and its
                     ``value`` and ``key`` members are read (presumably an
                     attribute enum member -- confirm against callers)
        :param as_array: if True, read index ``attr.value`` of :attr:`tok` and
                         pass it through ``attr``; otherwise look up
                         ``attr.key`` in the Terminal's ``extra`` dictionary
        :return: the converted value, or ``extra.get(attr.key)`` (None if absent)
        """
        return attr(self.tok[attr.value]) if as_array else self.extra.get(attr.key)

    @property
    def attrib(self):
        """A copy of the attribute dictionary, so callers cannot mutate it."""
        return self._attrib.copy()

    @property
    def punct(self):
        """Whether the Terminal is tagged as punctuation."""
        return self.tag == NodeTags.Punct

    def get_terminals(self, punct=True, *args, **kwargs):
        """Returns a list containing just this Terminal.

        :param punct: whether to include punctuation Terminals, defaults to True

        :return: a list of :class:`layer0`.Terminal objects
        """
        # extra arguments are accepted and ignored
        del args, kwargs
        return [] if self.punct and not punct else [self]

    def equals(self, other, *, ordered=False, **kwargs):
        """Equals if the Terminals are of the same Layer, tag, position & text.

        Unlike ``==``, this does not require both Terminals to share a root.

        :param other: another Terminal to equal to
        :param ordered: unused, here for API conformity.

        :return: True iff the two Terminals are equal.
        """
        return (self.layer.ID == other.layer.ID and self.text == other.text
                and self.position == other.position and self.tag == other.tag
                and self.paragraph == other.paragraph
                and self.para_pos == other.para_pos)

    def __eq__(self, other):
        """Equals if both of the same Passage, Layer, position, tag & text."""
        # Objects without a layer (None, strings, ...) can never equal a
        # Terminal; return False for them instead of raising AttributeError.
        other_layer = getattr(other, "layer", None)
        if other_layer is None or other_layer.ID != LAYER_ID:
            return False
        return (self.root == other.root and self.layer.ID == other.layer.ID
                and self.position == other.position
                and self.text == other.text and self.tag == other.tag
                and self.paragraph == other.paragraph
                and self.para_pos == other.para_pos)

    def __hash__(self):
        """Hashes the Terminals according to its ID and text."""
        return hash(self.ID + str(self.text))

    def __str__(self):
        return self.text

    # Terminal are immutable (except the extra dictionary which is
    # just a temporary playground) and have no children, so enforce it

    def add(self, *args, **kwargs):
        """Not supported: Terminals have no children."""
        raise NotImplementedError()

    def remove(self, *args, **kwargs):
        """Not supported: Terminals have no children."""
        raise NotImplementedError()
class Layer0(core.Layer):
    """Represents the :class:`Terminal` objects layer.

    Attributes:
        words: a tuple of only the words (not punctuation) Terminals, ordered
        pairs: a tuple of (position, terminal) tuples of all Terminals, ordered

    """

    def __init__(self, root, attrib=None):
        """Creates the layer with the fixed ID LAYER_ID.

        :param root: passed on to the base Layer as its root
        :param attrib: optional attribute dictionary passed on to the base Layer
        """
        super().__init__(ID=LAYER_ID, root=root, attrib=attrib)

    @property
    def words(self):
        """Tuple of all non-punctuation Terminals, in layer order."""
        return tuple(x for x in self._all if not x.punct)

    @property
    def pairs(self):
        """Tuple of (position, terminal) pairs, positions starting at 1."""
        return tuple(enumerate(self._all, start=1))

    def by_position(self, pos):
        """Returns the Terminals at the position given.

        :param pos: the position of the Terminal object

        :return: the Terminal in this position

        :raise IndexError: if the position is out of bounds
        """
        # NOTE: a non-positive pos is not rejected here; it wraps around
        # through Python's negative indexing (e.g. pos=0 yields the last one).
        return self._all[pos - 1]  # positions start at 1, not 0

    def add_terminal(self, text, punct, paragraph=1):
        """Adds the next Terminal at the next available position.

        Creates a :class:`Terminal` object with the next position, assuming that
        all positions are filled (no holes).

        :param text: the text of the Terminal
        :param punct: boolean, whether it's a punctuation mark
        :param paragraph: paragraph number, defaults to 1

        :return: the created Terminal

        :raise DuplicateIdError: if trying to add an already existing Terminal,
            caused by un-ordered Terminal positions in the layer
        """
        position = len(self._all) + 1  # we want positions to start with 1
        # Keep counting inside the paragraph of the last Terminal; restart
        # at 1 for the very first Terminal or when a new paragraph begins.
        if position > 1 and paragraph == self._all[-1].paragraph:
            para_pos = self._all[-1].para_pos + 1
        else:
            para_pos = 1
        tag = NodeTags.Punct if punct else NodeTags.Word
        return Terminal(ID="{}{}{}".format(LAYER_ID, core.Node.ID_SEPARATOR, position),
                        root=self.root, tag=tag,
                        attrib={'text': text,
                                'paragraph': paragraph,
                                'paragraph_position': para_pos})

    def copy(self, other_passage):
        """Creates a copied Layer0 object and Terminals in other_passage.

        The layer's and each Terminal's ``extra`` dictionaries are
        shallow-copied. Nothing is returned.

        :param other_passage: the Passage to copy self to
        """
        other = Layer0(root=other_passage, attrib=self.attrib.copy())
        other.extra = self.extra.copy()
        for t in self._all:
            copied = other.add_terminal(t.text, t.punct, t.paragraph)
            copied.extra = t.extra.copy()

    def docs(self, num_paragraphs=1):
        """Returns the per-paragraph list stored under ``extra["doc"]``.

        The list is created as ``[[]]`` if missing and padded with empty lists
        until it has at least num_paragraphs entries.

        :param num_paragraphs: minimal number of paragraph entries required

        :return: the list stored in ``extra["doc"]`` (not a copy)
        """
        docs = self.extra.setdefault("doc", [[]])
        while len(docs) < num_paragraphs:
            docs.append([])
        return docs

    def doc(self, paragraph):
        """Returns the ``extra["doc"]`` entry of one paragraph.

        :param paragraph: paragraph number, starting at 1

        :return: the entry for that paragraph, created empty if missing
        """
        return self.docs(paragraph)[paragraph - 1]
def is_punct(node):
    """Returns whether the unit is a layer0 punctuation (for all Units).

    :param node: any node exposing ``layer.ID``; ``punct`` is only read when
                 the node belongs to layer 0

    :return: True iff the node is in layer 0 and is punctuation
    """
    return node.layer.ID == LAYER_ID and node.punct