# Copyright 2019 NeuroData (http://neurodata.io)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Implements multiview spectral clustering algorithm for data with
# multiple views.
import numpy as np
import scipy as sp
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from ..utils.utils import check_Xs
from .base import BaseCluster
from sklearn.exceptions import NotFittedError
from sklearn.metrics.pairwise import rbf_kernel, polynomial_kernel
from sklearn.neighbors import NearestNeighbors
AFFINITY_METRICS = ['rbf', 'nearest_neighbors', 'poly']
[docs]class MultiviewSpectralClustering(BaseCluster):
r'''
An implementation of multiview spectral clustering using the
basic cotraining framework as described in [#1Clu]_.
Additionally, this can be effective when the dataset naturally
contains features that are of 2 different data types, such as
continuous features and categorical features [#3Clu]_, and then the
original features are separated into two views in this way.
This algorithm can handle 2 or more views of data.
Parameters

n_clusters : int
The number of clusters
random_state : int, optional, default=None
Determines random number generation for kmeans.
info_view : int, optional, default=None
The most informative view. Must be between 0 and n_views1
If given, then the final clustering will be performed on the
designated view alone. Otherwise, the algorithm will concatenate
across all views and cluster on the result.
max_iter : int, optional, default=10
The maximum number of iterations to run the clustering
algorithm.
n_init : int, optional, default=10
The number of random initializations to use for kmeans clustering.
affinity : string, optional, default='rbf'
The affinity metric used to construct the affinity matrix. Options
include 'rbf' (radial basis function), 'nearest_neighbors', and
'poly' (polynomial)
gamma : float, optional, default=None
Kernel coefficient for rbf and polynomial kernels. If None then
gamma is computed as 1 / (2 * median(pair_wise_distances(X))^2)
for each data view X.
n_neighbors : int, optional, default=10
Only used if nearest neighbors is selected for affinity. The
number of neighbors to use for the nearest neighbors kernel.
Attributes

labels_ : arraylike, shape (n_samples)
Cluster labels for each sample in the fitted data.
embedding_ : arraylike, shape (n_samples, n_clusters)
The final spectral representation of the data to be used as input
for the KMeans clustering step.
Notes

Multiview spectral clustering adapts the spectral clustering algorithm
to applications where more than one view of data is available. This
algorithm relies on the basic assumptions of the cotraining, which are:
(a) Sufficiency: each view is sufficient for classification on its own,
(b) Compatibility: the target functions in both views predict the same
labels for cooccurring features with high probability, and (c)
Conditional independence: the views are conditionally independent given
the class labels. In contrast to multiview kmeans clustering,
multiview spectral clustering performs well on arbitrary shaped clusters,
and can therefore be readily used in applications where clusters are not
expected to be convex. However multiview spectral clustering tends to be
computationally expensive unless the similarity graph for the data is
sparse.
Multiview spectral clustering works by using the spectral embedding
from one view to constrain the similarity graph in the other view. By
iteratively applying this procedure, the clustering of the two views
tend to each other. Here we outline the algorithm for the Multiview
Spectral clustering algorithm for 2 views.

*Multiview Spectral Clustering Algorithm (for 2 views)*
Input: Similarity matrix for both views: :math:`\mathbf{K}_1, \mathbf{K}_2`
Output: Assignments to k clusters
#. Initialize: :math:`\mathbf{L}_v = \mathbf{D}_v^{1/2}
\mathbf{K}_v\mathbf{D}_v^{1/2}` for :math:`v = 1, 2`
:math:`\mathbf{U}_v^0` is an :math:`n \times k` matrix with the
top k eigenvectors of :math:`\mathbf{L}_v` for :math:`v = 1, 2`
#. For :math:`i = 1` to iter:
a. :math:`\mathbf{S}_1 = sym(\mathbf{U}_2^{i1}
{\mathbf{U}_2^{i1}}^T\mathbf{K}_1)`
b. :math:`\mathbf{S}_2 = sym(\mathbf{U}_1^{i1}
{\mathbf{U}_1^{i1}}^T\mathbf{K}_2)`
c. Use :math:`\mathbf{S}_1` and :math:`\mathbf{S}_2` as the new
graph similarities and compute the Laplacians. Solve for the
largest k eigenvectors to obtain :math:`\mathbf{U}_1^i` and
:math:`\mathbf{U}_2^i`.
#. Rownormalize :math:`\mathbf{U}_1^i` and :math:`\mathbf{U}_2^i`.
#. Form matrix :math:`\mathbf{V} = \mathbf{U}_v^i`, where :math:`v` is
believed to be the most informative view a priori. If there is no
prior knowledge on the view informativeness, matrix
:math:`\mathbf{V}` can also be set to the columnwise concatenation
of the two :math:`\mathbf{U}_v^i` s.
#. Assign example j to cluster c if the jth row of :math:`\mathbf{V}`
is assigned to cluster c by the kmeans algorithm.
References

.. [#1Clu] Abhishek Kumar and Hal DaumÃ©. A Cotraining Approach for
Multiview Spectral Clustering. In International Conference
on Machine Learning, 2011
Examples

>>> from mvlearn.datasets import load_UCImultifeature
>>> from mvlearn.cluster import MultiviewSpectralClustering
>>> from sklearn.metrics import normalized_mutual_info_score as nmi_score
>>> # Get 5class data
>>> data, labels = load_UCImultifeature(select_labeled = list(range(5)))
>>> mv_data = data[:2] # first 2 views only
>>> mv_spectral = MultiviewSpectralClustering(n_clusters=5,
... random_state=10, n_init=100)
>>> mv_clusters = mv_spectral.fit_predict(mv_data)
>>> nmi = nmi_score(labels, mv_clusters)
>>> print('{0:.3f}'.format(nmi))
0.872
'''
def __init__(self, n_clusters=2, random_state=None,
info_view=None, max_iter=10, n_init=10, affinity='rbf',
gamma=None, n_neighbors=10):
super().__init__()
self.n_clusters = n_clusters
self.random_state = random_state
self.info_view = info_view
self.max_iter = max_iter
self.n_init = n_init
self.affinity = affinity
self.gamma = gamma
self.n_neighbors = n_neighbors
self.labels_ = None
self.embedding_ = None
def _affinity_mat(self, X):
r'''
Computes the affinity matrix based on the selected
kernel type.
Parameters

X : arraylike, shape (n_samples, n_features)
The data matrix from which we will compute the
affinity matrix.
Returns

sims : arraylike, shape (n_samples, n_samples)
The resulting affinity kernel.
'''
sims = None
# If gamma is None, then compute default gamma value for this view
gamma = self.gamma
if self.gamma is None:
distances = cdist(X, X)
gamma = 1 / (2 * np.median(distances) ** 2)
# Produce the affinity matrix based on the selected kernel type
if (self.affinity == 'rbf'):
sims = rbf_kernel(X, gamma=gamma)
elif(self.affinity == 'nearest_neighbors'):
neighbor = NearestNeighbors(n_neighbors=self.n_neighbors)
neighbor.fit(X)
sims = neighbor.kneighbors_graph(X).toarray()
else:
sims = polynomial_kernel(X, gamma=gamma)
return sims
def _compute_eigs(self, X):
r'''
Computes the top several eigenvectors of the
normalized graph laplacian of a given similarity matrix.
The number of eigenvectors returned is equal to n_clusters.
Parameters

X : arraylike, shape(n_samples, n_samples)
The similarity matrix for the data in a single view.
Returns

la_eigs : arraylike, shape(n_samples, n_clusters)
The top n_cluster eigenvectors of the normalized graph
laplacian.
'''
# Compute the normalized laplacian
d_mat = np.diag(np.sum(X, axis=1))
# Double check why we take absolute value of d_mat
d_alt = np.sqrt(np.linalg.inv(np.abs(d_mat)))
laplacian = d_alt @ X @ d_alt
# Make the resulting matrix symmetric
laplacian = (laplacian + np.transpose(laplacian)) / 2.0
# Obtain the top n_cluster eigenvectors of the laplacian
u_mat, _, _ = sp.sparse.linalg.svds(laplacian, k=self.n_clusters)
la_eigs = u_mat[:, :self.n_clusters]
return la_eigs
def _param_checks(self, Xs):
r'''
Performs bulk of checks and exception handling for
inputted user parameters.
Parameters

Xs : list of arraylikes or numpy.ndarray
 Xs length: n_views
 Xs[i] shape: (n_samples, n_features_i)
This list must be of size n_views, corresponding to the number
of views of data. Each view can have a different number of
features, but they must have the same number of samples.
Returns

Xs : list of arraylikes or numpy.ndarray
 Xs length: n_views
 Xs[i] shape: (n_samples, n_features_i)
The data in the appropriate format.
'''
Xs = check_Xs(Xs)
if len(Xs) < 2:
msg = 'Xs must have at least 2 views'
raise ValueError(msg)
self._n_views = len(Xs)
if not (isinstance(self.n_clusters, int) and self.n_clusters > 0):
msg = 'n_clusters must be a positive integer'
raise ValueError(msg)
if self.random_state is not None:
msg = 'random_state must be convertible to 32 bit unsigned integer'
try:
self.random_state = int(self.random_state)
except ValueError:
raise ValueError(msg)
np.random.seed(self.random_state)
if self.info_view is not None:
if not (isinstance(self.info_view, int) and
(self. info_view >= 0 and self.info_view < self._n_views)):
msg = 'info_view must be an integer between 0 and n_clusters1'
raise ValueError(msg)
if not (isinstance(self.max_iter, int) and (self.max_iter > 0)):
msg = 'max_iter must be a positive integer'
raise ValueError(msg)
if not (isinstance(self.n_init, int) and self.n_init > 0):
msg = 'n_init must be a positive integer'
raise ValueError(msg)
if self.affinity not in AFFINITY_METRICS:
msg = 'affinity must be a valid affinity metric'
raise ValueError(msg)
if self.gamma is not None and not ((isinstance(
self.gamma, float) or isinstance(self.gamma, int))
and self.gamma > 0):
msg = 'gamma must be a positive float'
raise ValueError(msg)
if not (isinstance(self.n_neighbors, int) and self.n_neighbors > 0):
msg = 'n_neighbors must be a positive integer'
raise ValueError(msg)
return Xs
[docs] def fit(self, Xs, y=None):
r'''
Performs clustering on the multiple views of data.
Parameters

Xs : list of arraylikes or numpy.ndarray
 Xs length: n_views
 Xs[i] shape: (n_samples, n_features_i)
This list must be of size n_views, corresponding to the number
of views of data. Each view can have a different number of
features, but they must have the same number of samples.
y : Ignored
Not used, present for API consistency by convention.
Returns

self : returns an instance of self.
'''
# Perform checks on the data and inputted parameters
Xs = self._param_checks(Xs)
# Compute the similarity matrices
sims = [self._affinity_mat(X) for X in Xs]
# Initialize matrices of eigenvectors
U_mats = [self._compute_eigs(sim) for sim in sims]
# Iteratively compute new graph similarities, laplacians,
# and eigenvectors
for iter in range(self.max_iter):
# Compute the sums of the products of the spectral embeddings
# and their transposes
eig_sums = [u_mat @ np.transpose(u_mat) for u_mat in U_mats]
U_sum = np.sum(np.array(eig_sums), axis=0)
new_sims = list()
for view in range(self._n_views):
# Compute new graph similarity representation
mat1 = sims[view] @ (U_sum  eig_sums[view])
mat1 = (mat1 + np.transpose(mat1)) / 2.0
new_sims.append(mat1)
# Recompute eigenvectors
U_mats = [self._compute_eigs(sim)
for sim in new_sims]
# Row normalize
for view in range(self._n_views):
U_norm = np.linalg.norm(U_mats[view], axis=1).reshape((1, 1))
U_norm[U_norm == 0] = 1
U_mats[view] /= U_norm
# Performing kmeans clustering
kmeans = KMeans(n_clusters=self.n_clusters, n_init=self.n_init,
random_state=self.random_state)
if self.info_view is not None:
# Use a single view if one was previously designated
self.embedding_ = U_mats[self.info_view]
self.labels_ = kmeans.fit_predict(self.embedding_)
else:
# Otherwise, perform columwise concatenation across views
# and use result for clustering
self.embedding_ = np.hstack(U_mats)
self.labels_ = kmeans.fit_predict(self.embedding_)