Upload folder using huggingface_hub

d1ceb73 verified 11 months ago

12.4 kB

	"""
	Functions for hashing graphs to strings.
	Isomorphic graphs should be assigned identical hashes.
	For now, only Weisfeiler-Lehman hashing is implemented.
	"""

	from collections import Counter, defaultdict
	from hashlib import blake2b

	import networkx as nx

	__all__ = ["weisfeiler_lehman_graph_hash", "weisfeiler_lehman_subgraph_hashes"]


	def _hash_label(label, digest_size):
	return blake2b(label.encode("ascii"), digest_size=digest_size).hexdigest()


	def _init_node_labels(G, edge_attr, node_attr):
	if node_attr:
	return {u: str(dd[node_attr]) for u, dd in G.nodes(data=True)}
	elif edge_attr:
	return {u: "" for u in G}
	else:
	return {u: str(deg) for u, deg in G.degree()}


	def _neighborhood_aggregate(G, node, node_labels, edge_attr=None):
	"""
	Compute new labels for given node by aggregating
	the labels of each node's neighbors.
	"""
	label_list = []
	for nbr in G.neighbors(node):
	prefix = "" if edge_attr is None else str(G[node][nbr][edge_attr])
	label_list.append(prefix + node_labels[nbr])
	return node_labels[node] + "".join(sorted(label_list))


	@nx._dispatchable(edge_attrs={"edge_attr": None}, node_attrs="node_attr")
	def weisfeiler_lehman_graph_hash(
	    G, edge_attr=None, node_attr=None, iterations=3, digest_size=16
	):
	    """Return Weisfeiler Lehman (WL) graph hash.

	    Starting from initial node labels, every iteration replaces each node's
	    label with a hash of its own label combined with the sorted labels of
	    its neighbors. After each iteration a sorted histogram of the resulting
	    labels is recorded; the histograms of all iterations are concatenated
	    and hashed once more to give the final graph hash.

	    Isomorphic graphs always receive the same hash, and there are strong
	    guarantees that non-isomorphic graphs receive different hashes.
	    See [1]_ for details.

	    If no node or edge attributes are provided, the degree of each node
	    is used as its initial label.
	    Otherwise, node and/or edge labels are used to compute the hash.

	    Parameters
	    ----------
	    G : graph
	        The graph to be hashed.
	        Can have node and/or edge attributes. Can also have no attributes.
	    edge_attr : string, optional (default=None)
	        The key in edge attribute dictionary to be used for hashing.
	        If None, edge labels are ignored.
	    node_attr : string, optional (default=None)
	        The key in node attribute dictionary to be used for hashing.
	        If None, and no edge_attr given, use the degrees of the nodes as labels.
	        If None, and edge_attr is given, each node starts with an identical label.
	    iterations : int, optional (default=3)
	        Number of neighbor aggregations to perform.
	        Should be larger for larger graphs.
	    digest_size : int, optional (default=16)
	        Size (in bytes) of blake2b hash digest to use for hashing node labels.
	        The returned hexadecimal string has ``2 * digest_size`` characters.

	    Returns
	    -------
	    h : string
	        Hexadecimal string corresponding to hash of the input graph.

	    Examples
	    --------
	    Two graphs with edge attributes that are isomorphic, except for
	    differences in the edge labels.

	    >>> G1 = nx.Graph()
	    >>> G1.add_edges_from(
	    ...     [
	    ...         (1, 2, {"label": "A"}),
	    ...         (2, 3, {"label": "A"}),
	    ...         (3, 1, {"label": "A"}),
	    ...         (1, 4, {"label": "B"}),
	    ...     ]
	    ... )
	    >>> G2 = nx.Graph()
	    >>> G2.add_edges_from(
	    ...     [
	    ...         (5, 6, {"label": "B"}),
	    ...         (6, 7, {"label": "A"}),
	    ...         (7, 5, {"label": "A"}),
	    ...         (7, 8, {"label": "A"}),
	    ...     ]
	    ... )

	    Omitting the `edge_attr` option results in identical hashes.

	    >>> nx.weisfeiler_lehman_graph_hash(G1)
	    '7bc4dde9a09d0b94c5097b219891d81a'
	    >>> nx.weisfeiler_lehman_graph_hash(G2)
	    '7bc4dde9a09d0b94c5097b219891d81a'

	    With edge labels, the graphs are no longer assigned
	    the same hash digest.

	    >>> nx.weisfeiler_lehman_graph_hash(G1, edge_attr="label")
	    'c653d85538bcf041d88c011f4f905f10'
	    >>> nx.weisfeiler_lehman_graph_hash(G2, edge_attr="label")
	    '3dcd84af1ca855d0eff3c978d88e7ec7'

	    Notes
	    -----
	    To return the WL hashes of each subgraph of a graph, use
	    `weisfeiler_lehman_subgraph_hashes`

	    Similarity between hashes does not imply similarity between graphs.

	    References
	    ----------
	    .. [1] Shervashidze, Nino, Pascal Schweitzer, Erik Jan Van Leeuwen,
	       Kurt Mehlhorn, and Karsten M. Borgwardt. Weisfeiler Lehman
	       Graph Kernels. Journal of Machine Learning Research. 2011.
	       http://www.jmlr.org/papers/volume12/shervashidze11a/shervashidze11a.pdf

	    See also
	    --------
	    weisfeiler_lehman_subgraph_hashes
	    """

	    def refine(current):
	        """Return the next label of every node, computed from ``current``."""
	        return {
	            v: _hash_label(
	                _neighborhood_aggregate(G, v, current, edge_attr=edge_attr),
	                digest_size,
	            )
	            for v in G.nodes()
	        }

	    labels = _init_node_labels(G, edge_attr, node_attr)

	    histogram = []
	    for _ in range(iterations):
	        labels = refine(labels)
	        # Counter keys are unique, so ordering the (label, count) pairs is
	        # the same as ordering by label alone.
	        histogram += sorted(Counter(labels.values()).items())

	    # The accumulated per-iteration histograms identify the graph.
	    return _hash_label(str(tuple(histogram)), digest_size)


	@nx._dispatchable(edge_attrs={"edge_attr": None}, node_attrs="node_attr")
	def weisfeiler_lehman_subgraph_hashes(
	    G,
	    edge_attr=None,
	    node_attr=None,
	    iterations=3,
	    digest_size=16,
	    include_initial_labels=False,
	):
	    """
	    Return a dictionary of subgraph hashes by node.

	    Dictionary keys are nodes in `G`, and values are a list of hashes.
	    Each hash corresponds to a subgraph rooted at a given node u in `G`.
	    Lists of subgraph hashes are sorted in increasing order of depth from
	    their root node, with the hash at index i corresponding to a subgraph
	    of nodes at most i edges distance from u. Thus, each list will contain
	    `iterations` elements - a hash for a subgraph at each depth. If
	    `include_initial_labels` is set to `True`, each list will additionally
	    contain a hash of the initial node label (or equivalently a
	    subgraph of depth 0) prepended, totalling ``iterations + 1`` elements.

	    The function iteratively aggregates and hashes neighborhoods of each node.
	    This is achieved for each step by replacing for each node its label from
	    the previous iteration with its hashed 1-hop neighborhood aggregate.
	    The new node label is then appended to a list of node labels for each
	    node.

	    To aggregate neighborhoods for a node $u$ at each step, all labels of
	    nodes adjacent to $u$ are concatenated. If the `edge_attr` parameter is set,
	    labels for each neighboring node are prefixed with the value of this attribute
	    along the connecting edge from this neighbor to node $u$. The resulting string
	    is then hashed to compress this information into a fixed digest size.

	    Thus, at the $i$-th iteration, nodes within $i$ hops influence any given
	    hashed node label. We can therefore say that at depth $i$ for node $u$
	    we have a hash for a subgraph induced by the $i$-hop neighborhood of $u$.

	    The output can be used to create general Weisfeiler-Lehman graph kernels,
	    or generate features for graphs or nodes - for example to generate 'words' in
	    a graph as seen in the 'graph2vec' algorithm.
	    See [1]_ & [2]_ respectively for details.

	    Hashes are identical for isomorphic subgraphs and there exist strong
	    guarantees that non-isomorphic graphs will get different hashes.
	    See [1]_ for details.

	    If no node or edge attributes are provided, the degree of each node
	    is used as its initial label.
	    Otherwise, node and/or edge labels are used to compute the hash.

	    Parameters
	    ----------
	    G : graph
	        The graph to be hashed.
	        Can have node and/or edge attributes. Can also have no attributes.
	    edge_attr : string, optional (default=None)
	        The key in edge attribute dictionary to be used for hashing.
	        If None, edge labels are ignored.
	    node_attr : string, optional (default=None)
	        The key in node attribute dictionary to be used for hashing.
	        If None, and no edge_attr given, use the degrees of the nodes as labels.
	        If None, and edge_attr is given, each node starts with an identical label.
	    iterations : int, optional (default=3)
	        Number of neighbor aggregations to perform.
	        Should be larger for larger graphs.
	    digest_size : int, optional (default=16)
	        Size (in bytes) of blake2b hash digest to use for hashing node labels.
	        The default size is 16 bytes, giving 32 hexadecimal characters per hash.
	    include_initial_labels : bool, optional (default=False)
	        If True, include the hashed initial node label as the first subgraph
	        hash for each node.

	    Returns
	    -------
	    node_subgraph_hashes : dict
	        A dictionary with each key given by a node in G, and each value given
	        by the subgraph hashes in order of depth from the key node.
	        Every node of G is present as a key; with ``iterations=0`` and
	        ``include_initial_labels=False`` each value is an empty list.

	    Examples
	    --------
	    Finding similar nodes in different graphs:

	    >>> G1 = nx.Graph()
	    >>> G1.add_edges_from([(1, 2), (2, 3), (2, 4), (3, 5), (4, 6), (5, 7), (6, 7)])
	    >>> G2 = nx.Graph()
	    >>> G2.add_edges_from([(1, 3), (2, 3), (1, 6), (1, 5), (4, 6)])
	    >>> g1_hashes = nx.weisfeiler_lehman_subgraph_hashes(G1, iterations=3, digest_size=8)
	    >>> g2_hashes = nx.weisfeiler_lehman_subgraph_hashes(G2, iterations=3, digest_size=8)

	    Even though G1 and G2 are not isomorphic (they have different numbers of edges),
	    the hash sequence of depth 3 for node 1 in G1 and node 5 in G2 are similar:

	    >>> g1_hashes[1]
	    ['a93b64973cfc8897', 'db1b43ae35a1878f', '57872a7d2059c1c0']
	    >>> g2_hashes[5]
	    ['a93b64973cfc8897', 'db1b43ae35a1878f', '1716d2a4012fa4bc']

	    The first 2 WL subgraph hashes match. From this we can conclude that it's very
	    likely the neighborhood of 2 hops around these nodes are isomorphic.

	    However the 3-hop neighborhoods of ``G1`` and ``G2`` are not isomorphic since the
	    3rd hashes in the lists above are not equal.

	    These nodes may be candidates to be classified together since their local topology
	    is similar.

	    Notes
	    -----
	    To hash the full graph when subgraph hashes are not needed, use
	    `weisfeiler_lehman_graph_hash` for efficiency.

	    Similarity between hashes does not imply similarity between graphs.

	    References
	    ----------
	    .. [1] Shervashidze, Nino, Pascal Schweitzer, Erik Jan Van Leeuwen,
	       Kurt Mehlhorn, and Karsten M. Borgwardt. Weisfeiler Lehman
	       Graph Kernels. Journal of Machine Learning Research. 2011.
	       http://www.jmlr.org/papers/volume12/shervashidze11a/shervashidze11a.pdf
	    .. [2] Annamalai Narayanan, Mahinthan Chandramohan, Rajasekar Venkatesan,
	       Lihui Chen, Yang Liu and Shantanu Jaiswal. graph2vec: Learning
	       Distributed Representations of Graphs. arXiv. 2017
	       https://arxiv.org/pdf/1707.05005.pdf

	    See also
	    --------
	    weisfeiler_lehman_graph_hash
	    """

	    def weisfeiler_lehman_step(G, labels, node_subgraph_hashes, edge_attr=None):
	        """
	        Apply neighborhood aggregation to each node
	        in the graph.
	        Computes a dictionary with labels for each node.
	        Appends the new hashed label to the dictionary of subgraph hashes
	        originating from and indexed by each node in G
	        """
	        new_labels = {}
	        for node in G.nodes():
	            label = _neighborhood_aggregate(G, node, labels, edge_attr=edge_attr)
	            hashed_label = _hash_label(label, digest_size)
	            new_labels[node] = hashed_label
	            node_subgraph_hashes[node].append(hashed_label)
	        return new_labels

	    node_labels = _init_node_labels(G, edge_attr, node_attr)
	    if include_initial_labels:
	        node_subgraph_hashes = {
	            k: [_hash_label(v, digest_size)] for k, v in node_labels.items()
	        }
	    else:
	        # Seed every node with an empty list up front so that each node of G
	        # is a key of the result even when no iteration runs
	        # (``iterations=0``); a lazily-filled mapping would come back empty.
	        node_subgraph_hashes = {k: [] for k in node_labels}

	    for _ in range(iterations):
	        node_labels = weisfeiler_lehman_step(
	            G, node_labels, node_subgraph_hashes, edge_attr
	        )

	    return node_subgraph_hashes