Source code for hypergraphx.communities.hy_sc.model

from typing import Optional, Tuple

import logging
import numpy as np

try:
    from sklearn.cluster import KMeans  # type: ignore
except ImportError:  # pragma: no cover
    KMeans = None

from hypergraphx import Hypergraph
from hypergraphx.linalg.linalg import binary_incidence_matrix, incidence_matrix


[docs] class HySC: """Implementation of Hypergraph Spectral Clustering from "Learning with hypergraphs: Clustering, classification, and embedding", Zhou D., Huang J., Schรถlkopf B. """ def __init__( self, seed: int = 0, inf: int = 1e40, n_realizations: int = 10, out_inference: bool = False, out_folder: str = "../data/output/", end_file: str = "_sc.dat", ) -> None: """Initialize the model. Parameters ---------- seed: random seed. inf: initial value for the log-likelihood. n_realizations: number of realizations with different random initialization. out_inference: flag to store the inferred parameters. out_folder: path to store the output. end_file: output file suffix. """ # Training related attributes. self.seed = seed self.inf = inf self.n_realizations = n_realizations # Output related attributes. self.out_inference = out_inference self.out_folder = out_folder self.end_file = end_file # Membership matrix self.u: Optional[np.array] = None
[docs] def fit(self, hypergraph: Hypergraph, K: int, weighted_L: bool = False) -> np.array: """Perform community detection on hypergraphs with spectral clustering. Parameters ---------- hypergraph: the hypergraph to perform inference on. K: number of communities. weighted_L: flag to use the weighted Laplacian. Returns ------- u: hard-membership matrix of dimension (N, K). """ # Initialize all the parameters needed for training. self._init_data(hypergraph=hypergraph, K=K) # Get the Laplacian matrix. self._extract_laplacian(weighted_L=weighted_L) # Get eigenvalues and eigenvectors of the Laplacian matrix. e_vals, e_vecs = self.extract_eigenvectors() # Extract hard-memberships by applying the K-Means algorithm to the eigenvectors. self.u = self.apply_kmeans(e_vecs.real, seed=self.seed) # Save inferred parameter. if self.out_inference: self.output_results() return self.u
def _init_data( self, hypergraph: Hypergraph, K: int, ): """Initialize parameters for the inference. Parameters ---------- hypergraph: the hypergraph to perform inference on. K: number of communities. """ # Weighted incidence matrix. self.incidence = incidence_matrix(hypergraph) # Binary incidence matrix. self.binary_incidence = binary_incidence_matrix(hypergraph) # Number of nodes, and number of hyperedges. self.N, self.E = self.incidence.shape # Number of communities. self.K = K # Maximum observed hyperedge size. self.D = max([len(e) for e in np.array(hypergraph.get_edges(), dtype=object)]) # Nodes' degree. # TODO: is there a better way to get the array with the node degrees? self.node_degree = self.binary_incidence.sum(axis=1) # TODO: is there a better way to get the weighted degrees? self.node_degree_weighted = self.incidence.sum(axis=1) # Hyperedges' size. self.hye_size = np.array(hypergraph.get_sizes()) # TODO: is there a better way to get the weighted sizes? self.hye_size_weighted = self.incidence.sum(axis=1) node_order = list(hypergraph.get_mapping().classes_) self.isolates = hypergraph.isolates(node_order=node_order) self.non_isolates = hypergraph.non_isolates(node_order=node_order) def _extract_laplacian(self, weighted_L: bool = False) -> None: # Check for division by zero and handle isolated nodes invDE = np.diag(1.0 / np.where(self.hye_size == 0, 1, self.hye_size)) invDV2 = np.diag( np.sqrt(1.0 / np.where(self.node_degree == 0, 1, self.node_degree)) ) # set to zero for isolated nodes - we check directly on self.node_degree invDV2[self.node_degree == 0, self.node_degree == 0] = 0 if weighted_L: dense_incidence = self.incidence.toarray() HT = dense_incidence.T self.L = np.eye(self.N) - invDV2 @ dense_incidence @ invDE @ HT @ invDV2 else: dense_binary_incidence = self.binary_incidence.toarray() HT = dense_binary_incidence.T self.L = ( np.eye(self.N) - invDV2 @ dense_binary_incidence @ invDE @ HT @ invDV2 )
[docs] def extract_eigenvectors(self) -> Tuple[np.array, np.array]: """Extract eigenvalues and eigenvectors of the Laplacian matrix.""" e_vals, e_vecs = np.linalg.eig(self.L[self.non_isolates][:, self.non_isolates]) sorted_indices = np.argsort(e_vals) return e_vals[sorted_indices[: self.K]], e_vecs[:, sorted_indices[1 : self.K]]
[docs] def apply_kmeans(self, X: np.array, seed: int = 10) -> np.array: """Apply K-means algorithm to the eigenvectors of the Laplacian matrix. Parameters ---------- X: matrix with eigenvectors. seed: random seed. Returns ------- X_pred: membership matrix. """ if KMeans is None: raise ImportError( "HySC requires scikit-learn. Install hypergraphx with the appropriate extra " "(e.g. `pip install hypergraphx[ml]`) or install scikit-learn manually." ) y_pred = KMeans( n_clusters=self.K, random_state=seed, n_init=self.n_realizations ).fit_predict(X) X_pred = np.zeros((self.N, self.K)) for idx, i in enumerate(self.non_isolates): X_pred[i, y_pred[idx]] = 1 return X_pred
[docs] def output_results(self) -> None: """Save the results in a .npz file.""" outfile = self.out_folder + "theta" + self.end_file np.savez_compressed(outfile + ".npz", u=self.u) logger = logging.getLogger(__name__) logger.info("Inferred parameters saved in: %s", outfile + ".npz") logger.info('To load: theta=np.load(filename), then e.g. theta["u"]')