Source code for hypergraphx.communities.hy_sc.model
from typing import Optional, Tuple
import logging
import numpy as np
try:
from sklearn.cluster import KMeans # type: ignore
except ImportError: # pragma: no cover
KMeans = None
from hypergraphx import Hypergraph
from hypergraphx.linalg.linalg import binary_incidence_matrix, incidence_matrix
[docs]
class HySC:
"""Implementation of Hypergraph Spectral Clustering from
"Learning with hypergraphs: Clustering, classification, and embedding",
Zhou D., Huang J., Schรถlkopf B.
"""
def __init__(
self,
seed: int = 0,
inf: int = 1e40,
n_realizations: int = 10,
out_inference: bool = False,
out_folder: str = "../data/output/",
end_file: str = "_sc.dat",
) -> None:
"""Initialize the model.
Parameters
----------
seed: random seed.
inf: initial value for the log-likelihood.
n_realizations: number of realizations with different random initialization.
out_inference: flag to store the inferred parameters.
out_folder: path to store the output.
end_file: output file suffix.
"""
# Training related attributes.
self.seed = seed
self.inf = inf
self.n_realizations = n_realizations
# Output related attributes.
self.out_inference = out_inference
self.out_folder = out_folder
self.end_file = end_file
# Membership matrix
self.u: Optional[np.array] = None
[docs]
def fit(self, hypergraph: Hypergraph, K: int, weighted_L: bool = False) -> np.array:
"""Perform community detection on hypergraphs with spectral clustering.
Parameters
----------
hypergraph: the hypergraph to perform inference on.
K: number of communities.
weighted_L: flag to use the weighted Laplacian.
Returns
-------
u: hard-membership matrix of dimension (N, K).
"""
# Initialize all the parameters needed for training.
self._init_data(hypergraph=hypergraph, K=K)
# Get the Laplacian matrix.
self._extract_laplacian(weighted_L=weighted_L)
# Get eigenvalues and eigenvectors of the Laplacian matrix.
e_vals, e_vecs = self.extract_eigenvectors()
# Extract hard-memberships by applying the K-Means algorithm to the eigenvectors.
self.u = self.apply_kmeans(e_vecs.real, seed=self.seed)
# Save inferred parameter.
if self.out_inference:
self.output_results()
return self.u
def _init_data(
self,
hypergraph: Hypergraph,
K: int,
):
"""Initialize parameters for the inference.
Parameters
----------
hypergraph: the hypergraph to perform inference on.
K: number of communities.
"""
# Weighted incidence matrix.
self.incidence = incidence_matrix(hypergraph)
# Binary incidence matrix.
self.binary_incidence = binary_incidence_matrix(hypergraph)
# Number of nodes, and number of hyperedges.
self.N, self.E = self.incidence.shape
# Number of communities.
self.K = K
# Maximum observed hyperedge size.
self.D = max([len(e) for e in np.array(hypergraph.get_edges(), dtype=object)])
# Nodes' degree.
# TODO: is there a better way to get the array with the node degrees?
self.node_degree = self.binary_incidence.sum(axis=1)
# TODO: is there a better way to get the weighted degrees?
self.node_degree_weighted = self.incidence.sum(axis=1)
# Hyperedges' size.
self.hye_size = np.array(hypergraph.get_sizes())
# TODO: is there a better way to get the weighted sizes?
self.hye_size_weighted = self.incidence.sum(axis=1)
node_order = list(hypergraph.get_mapping().classes_)
self.isolates = hypergraph.isolates(node_order=node_order)
self.non_isolates = hypergraph.non_isolates(node_order=node_order)
def _extract_laplacian(self, weighted_L: bool = False) -> None:
# Check for division by zero and handle isolated nodes
invDE = np.diag(1.0 / np.where(self.hye_size == 0, 1, self.hye_size))
invDV2 = np.diag(
np.sqrt(1.0 / np.where(self.node_degree == 0, 1, self.node_degree))
)
# set to zero for isolated nodes - we check directly on self.node_degree
invDV2[self.node_degree == 0, self.node_degree == 0] = 0
if weighted_L:
dense_incidence = self.incidence.toarray()
HT = dense_incidence.T
self.L = np.eye(self.N) - invDV2 @ dense_incidence @ invDE @ HT @ invDV2
else:
dense_binary_incidence = self.binary_incidence.toarray()
HT = dense_binary_incidence.T
self.L = (
np.eye(self.N) - invDV2 @ dense_binary_incidence @ invDE @ HT @ invDV2
)
[docs]
def apply_kmeans(self, X: np.array, seed: int = 10) -> np.array:
"""Apply K-means algorithm to the eigenvectors of the Laplacian matrix.
Parameters
----------
X: matrix with eigenvectors.
seed: random seed.
Returns
-------
X_pred: membership matrix.
"""
if KMeans is None:
raise ImportError(
"HySC requires scikit-learn. Install hypergraphx with the appropriate extra "
"(e.g. `pip install hypergraphx[ml]`) or install scikit-learn manually."
)
y_pred = KMeans(
n_clusters=self.K, random_state=seed, n_init=self.n_realizations
).fit_predict(X)
X_pred = np.zeros((self.N, self.K))
for idx, i in enumerate(self.non_isolates):
X_pred[i, y_pred[idx]] = 1
return X_pred
[docs]
def output_results(self) -> None:
"""Save the results in a .npz file."""
outfile = self.out_folder + "theta" + self.end_file
np.savez_compressed(outfile + ".npz", u=self.u)
logger = logging.getLogger(__name__)
logger.info("Inferred parameters saved in: %s", outfile + ".npz")
logger.info('To load: theta=np.load(filename), then e.g. theta["u"]')