Spaces:
Running
Running
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
These classes represent graph elements.

Consider this "flavor" of graph representation to be a superset of
`openCypher` _labeled property graphs_ (LPG) with additional support
for probabilistic graphs.

Imposing a discipline of IRIs for node names and edge relations
helps guarantee that a view of the graph can be exported to RDF
for data quality checks, transitive closure, semantic inference,
and so on.

see copyright/license https://huggingface.co/spaces/DerwenAI/textgraphs/blob/main/README.md
"""
from dataclasses import dataclass, field | |
import typing | |
import spacy # pylint: disable=E0401 | |
from .util import EnumBase | |
###################################################################### | |
## class definitions | |
@dataclass(order=False, frozen=False)
class KGSearchHit:  # pylint: disable=R0902
    """
    A data class representing a hit from a _knowledge graph_ search.

    The `@dataclass` decorator generates the constructor from the
    annotated fields below, in declaration order; all five are required.
    """
    iri: str  # IRI of the hit
    label: str  # label of the hit
    descrip: str  # presumably a textual description -- confirm with callers
    aliases: typing.List[ str ]  # alternative names, as a list of strings
    prob: float  # presumably a probability/confidence score -- confirm range
@dataclass(order=False, frozen=False)
class LinkedEntity:  # pylint: disable=R0902
    """
    A data class representing one linked entity.

    The `@dataclass` decorator generates the constructor from the
    annotated fields below, in declaration order; only `count` has a
    default value.
    """
    span: typing.Optional[ spacy.tokens.span.Span ]  # `spaCy` span, or `None`
    iri: str  # IRI of the linked entity
    length: int  # presumably a length in tokens -- confirm with callers
    rel: str  # relation name
    prob: float  # presumably a probability/confidence score -- confirm range
    token_id: int  # presumably an index of a token -- confirm with callers
    kg_ent: typing.Optional[ KGSearchHit ]  # knowledge graph search hit, or `None`
    count: int = 1  # defaults to one
@dataclass(order=False, frozen=False)
class NounChunk:  # pylint: disable=R0902
    """
    A data class representing one noun chunk, i.e., a candidate as an extracted phrase.

    The `@dataclass` decorator generates the constructor from the
    annotated fields below, in declaration order; only `start` has a
    default value.
    """
    span: spacy.tokens.span.Span  # `spaCy` span for this chunk
    text: str  # text of the chunk
    length: int  # presumably a length in tokens -- confirm with callers
    lemma_key: str  # presumably a key built from lemmas -- confirm with callers
    unseen: bool  # NOTE(review): meaning not visible here -- confirm with callers
    sent_id: int  # presumably an index of the enclosing sentence -- confirm
    start: int = 0  # defaults to zero
class NodeEnum (EnumBase):
    """
    Enumeration of the categories which a node may belong to.
    """
    DEP = 0  # `spaCy` parse dependency
    LEM = 1  # lemmatized token
    ENT = 2  # named entity
    CHU = 3  # noun chunk
    IRI = 4  # IRI for linked entity

    # NOTE(review): `EnumBase` is defined in `.util` and not visible here;
    # if it reads `decoder` as an attribute, this needs `@property` -- confirm.
    def decoder (self) -> typing.List[ str ]:
        """
        List the short string names for the members, arranged so that
        each name sits at the index equal to its member's integer value.

        returns:
            the names `dep`, `lem`, `ent`, `chu`, `iri`
        """
        names: typing.List[ str ] = [ "dep", "lem", "ent", "chu", "iri" ]
        return names
@dataclass(order=False, frozen=False)
class Node:  # pylint: disable=R0902
    """
    A data class representing one node, i.e., an extracted phrase.

    The `@dataclass` decorator generates the constructor from the
    annotated fields below, in declaration order. The first five fields
    are required; the rest have defaults. `loc` and `entity` use a
    `default_factory` so that every instance gets its own fresh list.
    """
    node_id: int  # integer identifier for this node
    key: str  # returned by `get_name()` for `LEM` nodes
    text: str  # returned by `get_name()` for all other non-`IRI` nodes
    pos: str  # part-of-speech tag; `"VERB"` is special-cased in `get_stacked_count()`
    kind: NodeEnum  # category of this node
    span: typing.Optional[ typing.Union[ spacy.tokens.span.Span, spacy.tokens.token.Token ]] = None
    loc: typing.List[ typing.List[ int ] ] = field(default_factory = list)
    label: typing.Optional[ str ] = None
    length: int = 1
    sub_obj: bool = False
    count: int = 0
    neighbors: int = 0
    weight: float = 0.0
    entity: typing.List[ LinkedEntity ] = field(default_factory = list)
    annotated: bool = False


    def get_linked_label (
        self
        ) -> typing.Optional[ str ]:
        """
        When this node has a linked entity, return that IRI.
        Otherwise return its `label` value.

        returns:
            the IRI of the first linked entity if there is one, else `label` (which may be `None`)
        """
        if self.entity:
            return self.entity[0].iri

        return self.label


    def get_name (
        self
        ) -> str:
        """
        Return a brief name for the graphical depiction of this Node.

        returns:
            `label` for an `IRI` node, `key` for a `LEM` node, otherwise `text`
        """
        if self.kind == NodeEnum.IRI:
            # `label` is typed as optional; no check is made here that it is set
            return self.label  # type: ignore

        if self.kind == NodeEnum.LEM:
            return self.key

        return self.text


    def get_stacked_count (
        self
        ) -> int:
        """
        Return a modified count, to redact verbs and linked entities from
        the stack-rank partitions.

        returns:
            zero for verbs and `IRI` nodes, otherwise `count`
        """
        if self.pos == "VERB" or self.kind == NodeEnum.IRI:
            return 0

        return self.count


    def get_pos (
        self
        ) -> typing.Tuple[ int, int ]:
        """
        Generate a position span for `OpenNRE`.

        The span runs from `span.idx` to `span.idx + len(text) - 1`, so
        the end position is inclusive.

        returns:
            a position span needed for `OpenNRE` relation extraction
        """
        # no guard for `span` being `None`: that raises `AttributeError`
        # NOTE(review): `idx` is a `spaCy` Token attribute, while the Span
        # API names its offset `start_char` -- confirm for Span-valued nodes
        start: int = self.span.idx  # type: ignore

        return ( start, start + len(self.text) - 1, )
class RelEnum (EnumBase):
    """
    Enumeration of the kinds of relation which an edge may carry.
    """
    DEP = 0  # `spaCy` parse dependency
    CHU = 1  # `spaCy` noun chunk
    INF = 2  # `REBEL` or `OpenNRE` inferred relation
    SYN = 3  # `sense2vec` inferred synonym
    IRI = 4  # `DBPedia` or `Wikidata` linked entity

    # NOTE(review): `EnumBase` is defined in `.util` and not visible here;
    # if it reads `decoder` as an attribute, this needs `@property` -- confirm.
    def decoder (self) -> typing.List[ str ]:
        """
        List the short string names for the members, arranged so that
        each name sits at the index equal to its member's integer value.

        returns:
            the names `dep`, `chu`, `inf`, `syn`, `iri`
        """
        names: typing.List[ str ] = [ "dep", "chu", "inf", "syn", "iri" ]
        return names
@dataclass(order=False, frozen=False)
class Edge:
    """
    A data class representing an edge between two nodes.

    The `@dataclass` decorator generates the constructor from the
    annotated fields below, in declaration order; only `count` has a
    default value.
    """
    src_node: int  # integer reference to the source node -- presumably a `node_id`
    dst_node: int  # integer reference to the destination node -- presumably a `node_id`
    kind: RelEnum  # category of this relation
    rel: str  # relation name
    prob: float  # presumably a probability/confidence score -- confirm range
    count: int = 1  # defaults to one