Spaces:
Running
Running
File size: 11,887 Bytes
b200bda |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 |
"""
Functions for hashing graphs to strings.
Isomorphic graphs should be assigned identical hashes.
For now, only Weisfeiler-Lehman hashing is implemented.
"""
from collections import Counter, defaultdict
from hashlib import blake2b
import networkx as nx
__all__ = ["weisfeiler_lehman_graph_hash", "weisfeiler_lehman_subgraph_hashes"]
def _hash_label(label, digest_size):
return blake2b(label.encode("ascii"), digest_size=digest_size).hexdigest()
def _init_node_labels(G, edge_attr, node_attr):
if node_attr:
return {u: str(dd[node_attr]) for u, dd in G.nodes(data=True)}
elif edge_attr:
return {u: "" for u in G}
else:
return {u: str(deg) for u, deg in G.degree()}
def _neighborhood_aggregate(G, node, node_labels, edge_attr=None):
"""
Compute new labels for given node by aggregating
the labels of each node's neighbors.
"""
label_list = []
for nbr in G.neighbors(node):
prefix = "" if edge_attr is None else str(G[node][nbr][edge_attr])
label_list.append(prefix + node_labels[nbr])
return node_labels[node] + "".join(sorted(label_list))
@nx._dispatch(edge_attrs={"edge_attr": None}, node_attrs="node_attr")
def weisfeiler_lehman_graph_hash(
G, edge_attr=None, node_attr=None, iterations=3, digest_size=16
):
"""Return Weisfeiler Lehman (WL) graph hash.
The function iteratively aggregates and hashes neighbourhoods of each node.
After each node's neighbors are hashed to obtain updated node labels,
a hashed histogram of resulting labels is returned as the final hash.
Hashes are identical for isomorphic graphs and strong guarantees that
non-isomorphic graphs will get different hashes. See [1]_ for details.
If no node or edge attributes are provided, the degree of each node
is used as its initial label.
Otherwise, node and/or edge labels are used to compute the hash.
Parameters
----------
G: graph
The graph to be hashed.
Can have node and/or edge attributes. Can also have no attributes.
edge_attr: string, default=None
The key in edge attribute dictionary to be used for hashing.
If None, edge labels are ignored.
node_attr: string, default=None
The key in node attribute dictionary to be used for hashing.
If None, and no edge_attr given, use the degrees of the nodes as labels.
iterations: int, default=3
Number of neighbor aggregations to perform.
Should be larger for larger graphs.
digest_size: int, default=16
Size (in bits) of blake2b hash digest to use for hashing node labels.
Returns
-------
h : string
Hexadecimal string corresponding to hash of the input graph.
Examples
--------
Two graphs with edge attributes that are isomorphic, except for
differences in the edge labels.
>>> G1 = nx.Graph()
>>> G1.add_edges_from(
... [
... (1, 2, {"label": "A"}),
... (2, 3, {"label": "A"}),
... (3, 1, {"label": "A"}),
... (1, 4, {"label": "B"}),
... ]
... )
>>> G2 = nx.Graph()
>>> G2.add_edges_from(
... [
... (5, 6, {"label": "B"}),
... (6, 7, {"label": "A"}),
... (7, 5, {"label": "A"}),
... (7, 8, {"label": "A"}),
... ]
... )
Omitting the `edge_attr` option, results in identical hashes.
>>> nx.weisfeiler_lehman_graph_hash(G1)
'7bc4dde9a09d0b94c5097b219891d81a'
>>> nx.weisfeiler_lehman_graph_hash(G2)
'7bc4dde9a09d0b94c5097b219891d81a'
With edge labels, the graphs are no longer assigned
the same hash digest.
>>> nx.weisfeiler_lehman_graph_hash(G1, edge_attr="label")
'c653d85538bcf041d88c011f4f905f10'
>>> nx.weisfeiler_lehman_graph_hash(G2, edge_attr="label")
'3dcd84af1ca855d0eff3c978d88e7ec7'
Notes
-----
To return the WL hashes of each subgraph of a graph, use
`weisfeiler_lehman_subgraph_hashes`
Similarity between hashes does not imply similarity between graphs.
References
----------
.. [1] Shervashidze, Nino, Pascal Schweitzer, Erik Jan Van Leeuwen,
Kurt Mehlhorn, and Karsten M. Borgwardt. Weisfeiler Lehman
Graph Kernels. Journal of Machine Learning Research. 2011.
http://www.jmlr.org/papers/volume12/shervashidze11a/shervashidze11a.pdf
See also
--------
weisfeiler_lehman_subgraph_hashes
"""
def weisfeiler_lehman_step(G, labels, edge_attr=None):
"""
Apply neighborhood aggregation to each node
in the graph.
Computes a dictionary with labels for each node.
"""
new_labels = {}
for node in G.nodes():
label = _neighborhood_aggregate(G, node, labels, edge_attr=edge_attr)
new_labels[node] = _hash_label(label, digest_size)
return new_labels
# set initial node labels
node_labels = _init_node_labels(G, edge_attr, node_attr)
subgraph_hash_counts = []
for _ in range(iterations):
node_labels = weisfeiler_lehman_step(G, node_labels, edge_attr=edge_attr)
counter = Counter(node_labels.values())
# sort the counter, extend total counts
subgraph_hash_counts.extend(sorted(counter.items(), key=lambda x: x[0]))
# hash the final counter
return _hash_label(str(tuple(subgraph_hash_counts)), digest_size)
@nx._dispatch(edge_attrs={"edge_attr": None}, node_attrs="node_attr")
def weisfeiler_lehman_subgraph_hashes(
G, edge_attr=None, node_attr=None, iterations=3, digest_size=16
):
"""
Return a dictionary of subgraph hashes by node.
Dictionary keys are nodes in `G`, and values are a list of hashes.
Each hash corresponds to a subgraph rooted at a given node u in `G`.
Lists of subgraph hashes are sorted in increasing order of depth from
their root node, with the hash at index i corresponding to a subgraph
of nodes at most i edges distance from u. Thus, each list will contain
``iterations + 1`` elements - a hash for a subgraph at each depth, and
additionally a hash of the initial node label (or equivalently a
subgraph of depth 0)
The function iteratively aggregates and hashes neighbourhoods of each node.
This is achieved for each step by replacing for each node its label from
the previous iteration with its hashed 1-hop neighborhood aggregate.
The new node label is then appended to a list of node labels for each
node.
To aggregate neighborhoods at each step for a node $n$, all labels of
nodes adjacent to $n$ are concatenated. If the `edge_attr` parameter is set,
labels for each neighboring node are prefixed with the value of this attribute
along the connecting edge from this neighbor to node $n$. The resulting string
is then hashed to compress this information into a fixed digest size.
Thus, at the $i$-th iteration, nodes within $i$ hops influence any given
hashed node label. We can therefore say that at depth $i$ for node $n$
we have a hash for a subgraph induced by the $2i$-hop neighborhood of $n$.
The output can be used to to create general Weisfeiler-Lehman graph kernels,
or generate features for graphs or nodes - for example to generate 'words' in
a graph as seen in the 'graph2vec' algorithm.
See [1]_ & [2]_ respectively for details.
Hashes are identical for isomorphic subgraphs and there exist strong
guarantees that non-isomorphic graphs will get different hashes.
See [1]_ for details.
If no node or edge attributes are provided, the degree of each node
is used as its initial label.
Otherwise, node and/or edge labels are used to compute the hash.
Parameters
----------
G: graph
The graph to be hashed.
Can have node and/or edge attributes. Can also have no attributes.
edge_attr: string, default=None
The key in edge attribute dictionary to be used for hashing.
If None, edge labels are ignored.
node_attr: string, default=None
The key in node attribute dictionary to be used for hashing.
If None, and no edge_attr given, use the degrees of the nodes as labels.
iterations: int, default=3
Number of neighbor aggregations to perform.
Should be larger for larger graphs.
digest_size: int, default=16
Size (in bits) of blake2b hash digest to use for hashing node labels.
The default size is 16 bits
Returns
-------
node_subgraph_hashes : dict
A dictionary with each key given by a node in G, and each value given
by the subgraph hashes in order of depth from the key node.
Examples
--------
Finding similar nodes in different graphs:
>>> G1 = nx.Graph()
>>> G1.add_edges_from([
... (1, 2), (2, 3), (2, 4), (3, 5), (4, 6), (5, 7), (6, 7)
... ])
>>> G2 = nx.Graph()
>>> G2.add_edges_from([
... (1, 3), (2, 3), (1, 6), (1, 5), (4, 6)
... ])
>>> g1_hashes = nx.weisfeiler_lehman_subgraph_hashes(G1, iterations=3, digest_size=8)
>>> g2_hashes = nx.weisfeiler_lehman_subgraph_hashes(G2, iterations=3, digest_size=8)
Even though G1 and G2 are not isomorphic (they have different numbers of edges),
the hash sequence of depth 3 for node 1 in G1 and node 5 in G2 are similar:
>>> g1_hashes[1]
['a93b64973cfc8897', 'db1b43ae35a1878f', '57872a7d2059c1c0']
>>> g2_hashes[5]
['a93b64973cfc8897', 'db1b43ae35a1878f', '1716d2a4012fa4bc']
The first 2 WL subgraph hashes match. From this we can conclude that it's very
likely the neighborhood of 4 hops around these nodes are isomorphic: each
iteration aggregates 1-hop neighbourhoods meaning hashes at depth $n$ are influenced
by every node within $2n$ hops.
However the neighborhood of 6 hops is no longer isomorphic since their 3rd hash does
not match.
These nodes may be candidates to be classified together since their local topology
is similar.
Notes
-----
To hash the full graph when subgraph hashes are not needed, use
`weisfeiler_lehman_graph_hash` for efficiency.
Similarity between hashes does not imply similarity between graphs.
References
----------
.. [1] Shervashidze, Nino, Pascal Schweitzer, Erik Jan Van Leeuwen,
Kurt Mehlhorn, and Karsten M. Borgwardt. Weisfeiler Lehman
Graph Kernels. Journal of Machine Learning Research. 2011.
http://www.jmlr.org/papers/volume12/shervashidze11a/shervashidze11a.pdf
.. [2] Annamalai Narayanan, Mahinthan Chandramohan, Rajasekar Venkatesan,
Lihui Chen, Yang Liu and Shantanu Jaiswa. graph2vec: Learning
Distributed Representations of Graphs. arXiv. 2017
https://arxiv.org/pdf/1707.05005.pdf
See also
--------
weisfeiler_lehman_graph_hash
"""
def weisfeiler_lehman_step(G, labels, node_subgraph_hashes, edge_attr=None):
"""
Apply neighborhood aggregation to each node
in the graph.
Computes a dictionary with labels for each node.
Appends the new hashed label to the dictionary of subgraph hashes
originating from and indexed by each node in G
"""
new_labels = {}
for node in G.nodes():
label = _neighborhood_aggregate(G, node, labels, edge_attr=edge_attr)
hashed_label = _hash_label(label, digest_size)
new_labels[node] = hashed_label
node_subgraph_hashes[node].append(hashed_label)
return new_labels
node_labels = _init_node_labels(G, edge_attr, node_attr)
node_subgraph_hashes = defaultdict(list)
for _ in range(iterations):
node_labels = weisfeiler_lehman_step(
G, node_labels, node_subgraph_hashes, edge_attr
)
return dict(node_subgraph_hashes)
|