|
""" |
|
Functions for hashing graphs to strings. |
|
Isomorphic graphs should be assigned identical hashes. |
|
For now, only Weisfeiler-Lehman hashing is implemented. |
|
""" |
|
|
|
from collections import Counter, defaultdict |
|
from hashlib import blake2b |
|
|
|
import networkx as nx |
|
|
|
__all__ = ["weisfeiler_lehman_graph_hash", "weisfeiler_lehman_subgraph_hashes"] |
|
|
|
|
|
def _hash_label(label, digest_size): |
|
return blake2b(label.encode("ascii"), digest_size=digest_size).hexdigest() |
|
|
|
|
|
def _init_node_labels(G, edge_attr, node_attr): |
|
if node_attr: |
|
return {u: str(dd[node_attr]) for u, dd in G.nodes(data=True)} |
|
elif edge_attr: |
|
return {u: "" for u in G} |
|
else: |
|
return {u: str(deg) for u, deg in G.degree()} |
|
|
|
|
|
def _neighborhood_aggregate(G, node, node_labels, edge_attr=None): |
|
""" |
|
Compute new labels for given node by aggregating |
|
the labels of each node's neighbors. |
|
""" |
|
label_list = [] |
|
for nbr in G.neighbors(node): |
|
prefix = "" if edge_attr is None else str(G[node][nbr][edge_attr]) |
|
label_list.append(prefix + node_labels[nbr]) |
|
return node_labels[node] + "".join(sorted(label_list)) |
|
|
|
|
|
@nx._dispatchable(edge_attrs={"edge_attr": None}, node_attrs="node_attr") |
|
def weisfeiler_lehman_graph_hash( |
|
G, edge_attr=None, node_attr=None, iterations=3, digest_size=16 |
|
): |
|
"""Return Weisfeiler Lehman (WL) graph hash. |
|
|
|
The function iteratively aggregates and hashes neighborhoods of each node. |
|
After each node's neighbors are hashed to obtain updated node labels, |
|
a hashed histogram of resulting labels is returned as the final hash. |
|
|
|
Hashes are identical for isomorphic graphs and strong guarantees that |
|
non-isomorphic graphs will get different hashes. See [1]_ for details. |
|
|
|
If no node or edge attributes are provided, the degree of each node |
|
is used as its initial label. |
|
Otherwise, node and/or edge labels are used to compute the hash. |
|
|
|
Parameters |
|
---------- |
|
G : graph |
|
The graph to be hashed. |
|
Can have node and/or edge attributes. Can also have no attributes. |
|
edge_attr : string, optional (default=None) |
|
The key in edge attribute dictionary to be used for hashing. |
|
If None, edge labels are ignored. |
|
node_attr: string, optional (default=None) |
|
The key in node attribute dictionary to be used for hashing. |
|
If None, and no edge_attr given, use the degrees of the nodes as labels. |
|
iterations: int, optional (default=3) |
|
Number of neighbor aggregations to perform. |
|
Should be larger for larger graphs. |
|
digest_size: int, optional (default=16) |
|
Size (in bits) of blake2b hash digest to use for hashing node labels. |
|
|
|
Returns |
|
------- |
|
h : string |
|
Hexadecimal string corresponding to hash of the input graph. |
|
|
|
Examples |
|
-------- |
|
Two graphs with edge attributes that are isomorphic, except for |
|
differences in the edge labels. |
|
|
|
>>> G1 = nx.Graph() |
|
>>> G1.add_edges_from( |
|
... [ |
|
... (1, 2, {"label": "A"}), |
|
... (2, 3, {"label": "A"}), |
|
... (3, 1, {"label": "A"}), |
|
... (1, 4, {"label": "B"}), |
|
... ] |
|
... ) |
|
>>> G2 = nx.Graph() |
|
>>> G2.add_edges_from( |
|
... [ |
|
... (5, 6, {"label": "B"}), |
|
... (6, 7, {"label": "A"}), |
|
... (7, 5, {"label": "A"}), |
|
... (7, 8, {"label": "A"}), |
|
... ] |
|
... ) |
|
|
|
Omitting the `edge_attr` option, results in identical hashes. |
|
|
|
>>> nx.weisfeiler_lehman_graph_hash(G1) |
|
'7bc4dde9a09d0b94c5097b219891d81a' |
|
>>> nx.weisfeiler_lehman_graph_hash(G2) |
|
'7bc4dde9a09d0b94c5097b219891d81a' |
|
|
|
With edge labels, the graphs are no longer assigned |
|
the same hash digest. |
|
|
|
>>> nx.weisfeiler_lehman_graph_hash(G1, edge_attr="label") |
|
'c653d85538bcf041d88c011f4f905f10' |
|
>>> nx.weisfeiler_lehman_graph_hash(G2, edge_attr="label") |
|
'3dcd84af1ca855d0eff3c978d88e7ec7' |
|
|
|
Notes |
|
----- |
|
To return the WL hashes of each subgraph of a graph, use |
|
`weisfeiler_lehman_subgraph_hashes` |
|
|
|
Similarity between hashes does not imply similarity between graphs. |
|
|
|
References |
|
---------- |
|
.. [1] Shervashidze, Nino, Pascal Schweitzer, Erik Jan Van Leeuwen, |
|
Kurt Mehlhorn, and Karsten M. Borgwardt. Weisfeiler Lehman |
|
Graph Kernels. Journal of Machine Learning Research. 2011. |
|
http://www.jmlr.org/papers/volume12/shervashidze11a/shervashidze11a.pdf |
|
|
|
See also |
|
-------- |
|
weisfeiler_lehman_subgraph_hashes |
|
""" |
|
|
|
def weisfeiler_lehman_step(G, labels, edge_attr=None): |
|
""" |
|
Apply neighborhood aggregation to each node |
|
in the graph. |
|
Computes a dictionary with labels for each node. |
|
""" |
|
new_labels = {} |
|
for node in G.nodes(): |
|
label = _neighborhood_aggregate(G, node, labels, edge_attr=edge_attr) |
|
new_labels[node] = _hash_label(label, digest_size) |
|
return new_labels |
|
|
|
|
|
node_labels = _init_node_labels(G, edge_attr, node_attr) |
|
|
|
subgraph_hash_counts = [] |
|
for _ in range(iterations): |
|
node_labels = weisfeiler_lehman_step(G, node_labels, edge_attr=edge_attr) |
|
counter = Counter(node_labels.values()) |
|
|
|
subgraph_hash_counts.extend(sorted(counter.items(), key=lambda x: x[0])) |
|
|
|
|
|
return _hash_label(str(tuple(subgraph_hash_counts)), digest_size) |
|
|
|
|
|
@nx._dispatchable(edge_attrs={"edge_attr": None}, node_attrs="node_attr") |
|
def weisfeiler_lehman_subgraph_hashes( |
|
G, |
|
edge_attr=None, |
|
node_attr=None, |
|
iterations=3, |
|
digest_size=16, |
|
include_initial_labels=False, |
|
): |
|
""" |
|
Return a dictionary of subgraph hashes by node. |
|
|
|
Dictionary keys are nodes in `G`, and values are a list of hashes. |
|
Each hash corresponds to a subgraph rooted at a given node u in `G`. |
|
Lists of subgraph hashes are sorted in increasing order of depth from |
|
their root node, with the hash at index i corresponding to a subgraph |
|
of nodes at most i edges distance from u. Thus, each list will contain |
|
`iterations` elements - a hash for a subgraph at each depth. If |
|
`include_initial_labels` is set to `True`, each list will additionally |
|
have contain a hash of the initial node label (or equivalently a |
|
subgraph of depth 0) prepended, totalling ``iterations + 1`` elements. |
|
|
|
The function iteratively aggregates and hashes neighborhoods of each node. |
|
This is achieved for each step by replacing for each node its label from |
|
the previous iteration with its hashed 1-hop neighborhood aggregate. |
|
The new node label is then appended to a list of node labels for each |
|
node. |
|
|
|
To aggregate neighborhoods for a node $u$ at each step, all labels of |
|
nodes adjacent to $u$ are concatenated. If the `edge_attr` parameter is set, |
|
labels for each neighboring node are prefixed with the value of this attribute |
|
along the connecting edge from this neighbor to node $u$. The resulting string |
|
is then hashed to compress this information into a fixed digest size. |
|
|
|
Thus, at the $i$-th iteration, nodes within $i$ hops influence any given |
|
hashed node label. We can therefore say that at depth $i$ for node $u$ |
|
we have a hash for a subgraph induced by the $i$-hop neighborhood of $u$. |
|
|
|
The output can be used to to create general Weisfeiler-Lehman graph kernels, |
|
or generate features for graphs or nodes - for example to generate 'words' in |
|
a graph as seen in the 'graph2vec' algorithm. |
|
See [1]_ & [2]_ respectively for details. |
|
|
|
Hashes are identical for isomorphic subgraphs and there exist strong |
|
guarantees that non-isomorphic graphs will get different hashes. |
|
See [1]_ for details. |
|
|
|
If no node or edge attributes are provided, the degree of each node |
|
is used as its initial label. |
|
Otherwise, node and/or edge labels are used to compute the hash. |
|
|
|
Parameters |
|
---------- |
|
G : graph |
|
The graph to be hashed. |
|
Can have node and/or edge attributes. Can also have no attributes. |
|
edge_attr : string, optional (default=None) |
|
The key in edge attribute dictionary to be used for hashing. |
|
If None, edge labels are ignored. |
|
node_attr : string, optional (default=None) |
|
The key in node attribute dictionary to be used for hashing. |
|
If None, and no edge_attr given, use the degrees of the nodes as labels. |
|
If None, and edge_attr is given, each node starts with an identical label. |
|
iterations : int, optional (default=3) |
|
Number of neighbor aggregations to perform. |
|
Should be larger for larger graphs. |
|
digest_size : int, optional (default=16) |
|
Size (in bits) of blake2b hash digest to use for hashing node labels. |
|
The default size is 16 bits. |
|
include_initial_labels : bool, optional (default=False) |
|
If True, include the hashed initial node label as the first subgraph |
|
hash for each node. |
|
|
|
Returns |
|
------- |
|
node_subgraph_hashes : dict |
|
A dictionary with each key given by a node in G, and each value given |
|
by the subgraph hashes in order of depth from the key node. |
|
|
|
Examples |
|
-------- |
|
Finding similar nodes in different graphs: |
|
|
|
>>> G1 = nx.Graph() |
|
>>> G1.add_edges_from([(1, 2), (2, 3), (2, 4), (3, 5), (4, 6), (5, 7), (6, 7)]) |
|
>>> G2 = nx.Graph() |
|
>>> G2.add_edges_from([(1, 3), (2, 3), (1, 6), (1, 5), (4, 6)]) |
|
>>> g1_hashes = nx.weisfeiler_lehman_subgraph_hashes(G1, iterations=3, digest_size=8) |
|
>>> g2_hashes = nx.weisfeiler_lehman_subgraph_hashes(G2, iterations=3, digest_size=8) |
|
|
|
Even though G1 and G2 are not isomorphic (they have different numbers of edges), |
|
the hash sequence of depth 3 for node 1 in G1 and node 5 in G2 are similar: |
|
|
|
>>> g1_hashes[1] |
|
['a93b64973cfc8897', 'db1b43ae35a1878f', '57872a7d2059c1c0'] |
|
>>> g2_hashes[5] |
|
['a93b64973cfc8897', 'db1b43ae35a1878f', '1716d2a4012fa4bc'] |
|
|
|
The first 2 WL subgraph hashes match. From this we can conclude that it's very |
|
likely the neighborhood of 2 hops around these nodes are isomorphic. |
|
|
|
However the 3-hop neighborhoods of ``G1`` and ``G2`` are not isomorphic since the |
|
3rd hashes in the lists above are not equal. |
|
|
|
These nodes may be candidates to be classified together since their local topology |
|
is similar. |
|
|
|
Notes |
|
----- |
|
To hash the full graph when subgraph hashes are not needed, use |
|
`weisfeiler_lehman_graph_hash` for efficiency. |
|
|
|
Similarity between hashes does not imply similarity between graphs. |
|
|
|
References |
|
---------- |
|
.. [1] Shervashidze, Nino, Pascal Schweitzer, Erik Jan Van Leeuwen, |
|
Kurt Mehlhorn, and Karsten M. Borgwardt. Weisfeiler Lehman |
|
Graph Kernels. Journal of Machine Learning Research. 2011. |
|
http://www.jmlr.org/papers/volume12/shervashidze11a/shervashidze11a.pdf |
|
.. [2] Annamalai Narayanan, Mahinthan Chandramohan, Rajasekar Venkatesan, |
|
Lihui Chen, Yang Liu and Shantanu Jaiswa. graph2vec: Learning |
|
Distributed Representations of Graphs. arXiv. 2017 |
|
https://arxiv.org/pdf/1707.05005.pdf |
|
|
|
See also |
|
-------- |
|
weisfeiler_lehman_graph_hash |
|
""" |
|
|
|
def weisfeiler_lehman_step(G, labels, node_subgraph_hashes, edge_attr=None): |
|
""" |
|
Apply neighborhood aggregation to each node |
|
in the graph. |
|
Computes a dictionary with labels for each node. |
|
Appends the new hashed label to the dictionary of subgraph hashes |
|
originating from and indexed by each node in G |
|
""" |
|
new_labels = {} |
|
for node in G.nodes(): |
|
label = _neighborhood_aggregate(G, node, labels, edge_attr=edge_attr) |
|
hashed_label = _hash_label(label, digest_size) |
|
new_labels[node] = hashed_label |
|
node_subgraph_hashes[node].append(hashed_label) |
|
return new_labels |
|
|
|
node_labels = _init_node_labels(G, edge_attr, node_attr) |
|
if include_initial_labels: |
|
node_subgraph_hashes = { |
|
k: [_hash_label(v, digest_size)] for k, v in node_labels.items() |
|
} |
|
else: |
|
node_subgraph_hashes = defaultdict(list) |
|
|
|
for _ in range(iterations): |
|
node_labels = weisfeiler_lehman_step( |
|
G, node_labels, node_subgraph_hashes, edge_attr |
|
) |
|
|
|
return dict(node_subgraph_hashes) |
|
|