Source code for nclustgen.TriclusterGen


from .Generator import Generator

import os
import json
import sys
import csv
import numpy as np
from sparse import concatenate, COO

# import dgl without backend info
stderr = sys.stderr
sys.stderr = open(os.devnull, 'w')
import dgl
sys.stderr = stderr

import torch as th
import networkx as nx

from com.gtric import generator as gen
from com.gtric.service import GTricService
from com.gtric.types import Background
from com.gtric.types import BackgroundType
from com.gtric.types import Contiguity
from com.gtric.types import Distribution
from com.gtric.types import PatternType
from com.gtric.types import TimeProfile
from com.gtric.types import PlaidCoherency
from com.gtric.utils import OverlappingSettings
from com.gtric.utils import TriclusterStructure
from com.gtric.utils import TriclusterPattern
from com.gtric.utils import RandomObject
from com.gtric.utils import IOUtils as io

from java.util import ArrayList

# helper function
from .utils import tensor_value_check as tvc, loader


[docs]class TriclusterGenerator(Generator):
    """
        This class provides an implementation for three-dimensional datasets with hidden triclusters.

        **Examples**

        >>> from nclustgen import TriclusterGenerator
        >>> generator = TriclusterGenerator(
        ...     dstype='NUMERIC',
        ...     patterns=[['CONSTANT', 'CONSTANT', 'CONSTANT'], ['CONSTANT', 'NONE', 'NONE']],
        ...     bktype='UNIFORM',
        ...     in_memory=True,
        ...     silence=True
        ... )
        >>> generator.get_params()
        {'X': None, 'Y': None, 'background': ['UNIFORM'], 'clusterdistribution': [['UNIFORM', 4, 4], ['UNIFORM', 4, 4],
        ['UNIFORM', 4, 4]], 'contiguity': 'NONE', 'dstype': 'NUMERIC', 'errors': (0.0, 0.0, 0.0),
        'generatedDataset': None, 'graph': None, 'in_memory': 'True', 'maxclustsperoverlappedarea': 0,
        'maxpercofoverlappingelements': 0.0, 'maxval': 10.0, 'minval': -10.0, 'missing': (0.0, 0.0), 'cuda': 3,
        'noise': (0.0, 0.0, 0.0), 'patterns': [['CONSTANT', 'CONSTANT', 'CONSTANT'], ['CONSTANT', 'NONE', 'NONE']],
        'percofoverlappingclusts': 0.0, 'percofoverlappingcolumns': 1.0, 'percofoverlappingcontexts': 1.0,
        'percofoverlappingrows': 1.0, 'plaidcoherency': 'NO_OVERLAPPING', 'realval': True, 'seed': -1,
        'silenced': False, 'time_profile': None}
        >>> x, y = generator.generate(nrows=50, ncols=100, ncontexts=5, nclusters=3)
        >>> x
        array([[[-1.29, -4.92, -2.49, ..., -9.17, -5.19,  6.66],
                [ 5.41, -3.04, -1.58, ..., -3.44,  1.99,  9.84],
                [-1.88, -9.09,  5.06, ..., -8.96, -8.4 ,  3.56],
                ...,
                [ 8.74,  4.07,  0.6 , ...,  6.73, -1.3 ,  5.  ],
                [ 3.33,  4.66, -5.72, ...,  0.55,  5.82, -3.17],
                [ 2.32, -9.29,  3.95, ...,  3.61,  3.93, -6.76]],
               [[ 4.34,  8.59, -1.96, ...,  0.88,  8.52, -7.85],
                [ 1.87, -8.59, -9.78, ...,  5.33, -7.45,  3.1 ],
                [-6.86, -3.93,  7.73, ...,  3.21,  6.54, -7.13],
                ...,
                [-5.75,  9.91, -4.76, ...,  0.94, -9.2 , -1.32],
                [ 3.11, -8.26, -2.32, ..., -5.08,  5.33,  2.52],
                [-4.18,  7.98,  8.42, ...,  4.21, -0.03, -7.51]],
               [[ 1.22, -5.69, -8.72, ...,  5.78,  8.74,  1.44],
                [ 3.41, -7.45,  7.01, ...,  8.93, -6.01,  0.18],
                [ 3.8 ,  2.92, -1.87, ..., -1.16, -3.31, -3.02],
                ...,
                [ 4.82, -9.82,  0.31, ...,  9.91, -0.45,  7.86],
                [ 7.24,  8.28, -3.13, ...,  9.12, -0.47,  6.16],
                [-6.61, -7.34, -0.56, ...,  1.41, -1.7 ,  6.22]],
               [[ 3.46,  7.85, -8.23, ...,  1.33,  2.82, -4.05],
                [-8.87, -6.42,  2.28, ...,  9.72, -1.75,  5.01],
                [-0.26, -3.25, -9.16, ..., -1.69,  6.96,  4.63],
                ...,
                [-5.36,  2.84, -2.09, ...,  0.33, -2.88,  3.43],
                [ 5.72,  1.11,  2.11, ...,  0.27, -5.95,  3.39],
                [-7.02, -3.85, -5.44, ...,  1.64, -1.24, -2.74]],
               [[-2.39, -9.27, -8.12, ..., -7.86,  7.54,  4.99],
                [ 2.06,  3.84, -2.99, ...,  4.82, -9.29, -9.23],
                [ 0.21, -5.85, -8.45, ...,  4.35, -2.69,  0.34],
                ...,
                [-0.52, -2.59,  7.63, ..., -8.07, -3.51,  2.7 ],
                [ 4.93, -1.55, -0.65, ..., -0.87,  8.53,  9.97],
                [ 8.03,  2.32, -4.76, ..., -2.03, -4.48, -5.56]]])
        >>> y
        [[[8, 16, 17, 35], [36, 55, 69, 88], [0, 2, 3, 4]], [[7, 21, 33, 35], [22, 57, 65, 75], [0, 1, 2, 4]],
        [[9, 19, 23, 27], [12, 19, 59, 72], [1, 2, 3, 4]]]
        >>> graph = generator.to_graph(x, framework='dgl', device='cpu')
        >>> graph
        Graph(num_nodes={'col': 100, 'ctx': 5, 'row': 50},
              num_edges={('col', 'elem', 'ctx'): 500, ('row', 'elem', 'col'): 5000, ('row', 'elem', 'ctx'): 250},
              metagraph=[('col', 'ctx', 'elem'), ('row', 'col', 'elem'), ('row', 'ctx', 'elem')])
        >>> generator.save(file_name='example', single_file=True)
        """

    def __init__(self, *args, **kwargs):
        """
        Initializes a generator fixed to three dimensions.

        Every positional and keyword argument is forwarded unchanged to the parent initializer, together with
        ``n=3``.
        """
        super().__init__(*args, n=3, **kwargs)

    def _initialize_seed(self):
        """
        Passes the configured seed (``self.seed``) to G-Tric's ``RandomObject.initialization``.
        """
        seed = self.seed
        RandomObject.initialization(seed)

    def _build_background(self):
        """
        Builds the ``Background`` object described by ``self.background``.

        The first entry of ``self.background`` is replaced in place by the ``BackgroundType`` member of the same
        name. A ``TypeError`` raised during that replacement is ignored and the entry is left as it is.

        Returns
        -------

        Background object
            Background built from all entries of ``self.background``.
        """
        try:
            type_name = self.background[0]
            # getattr raises TypeError when the entry is not a string
            # (presumably because it was already converted by an earlier call)
            self.background[0] = getattr(BackgroundType, type_name)
        except TypeError:
            pass

        background_args = self.background
        return Background(*background_args)

    def _build_generator(self, class_call, params, contexts_index):
        """
        Instantiates a G-Tric dataset generator.

        Parameters
        ----------

        class_call: str
            Name of the generator class to look up in ``com.gtric.generator``.
        params: list
            Positional arguments passed to the generator class.
        contexts_index: any
            Not used by this implementation.

        Returns
        -------

        Generator object
            Instance of the requested generator class (java object).
        """
        generator_class = getattr(gen, class_call)
        return generator_class(*params)

    def _build_patterns(self):
        """
        Builds the java list of tricluster patterns from ``self.patterns``.

        If ``self.time_profile`` is set, it is first replaced in place by the ``TimeProfile`` member whose name is
        the upper-cased string form of the current value.

        Returns
        -------

        ArrayList
            One ``TriclusterPattern`` per entry of ``self.patterns``, each built from the entry's ``PatternType``
            members followed by ``self.time_profile``.
        """
        if self.time_profile:
            profile_name = str(self.time_profile).upper()
            self.time_profile = getattr(TimeProfile, profile_name)

        patterns = ArrayList()

        for pattern in self.patterns:
            # pattern types of the entry, with the time profile appended as the last argument
            pattern_args = [getattr(PatternType, pattern_type) for pattern_type in pattern] + [self.time_profile]
            patterns.add(TriclusterPattern(*pattern_args))

        return patterns

    def _build_structure(self):
        """
        Builds the ``TriclusterStructure`` object from ``self.clusterdistribution`` and ``self.contiguity``.

        ``self.clusterdistribution`` is read at indices 0, 1 and 2 (rows, columns, contexts). The first element of
        each entry names a ``Distribution`` member, the remaining elements are passed on as extra arguments.

        Returns
        -------

        TriclusterStructure object
            Structure settings (java object).
        """
        structure = TriclusterStructure()

        # setters in the same order as the entries of self.clusterdistribution
        axis_setters = (
            structure.setRowsSettings,
            structure.setColumnsSettings,
            structure.setContextsSettings,
        )

        for axis, apply_settings in enumerate(axis_setters):
            settings = self.clusterdistribution[axis]
            apply_settings(getattr(Distribution, settings[0]), *settings[1:])

        contiguity = getattr(Contiguity, self.contiguity)
        structure.setContiguity(contiguity)

        return structure

    def _build_overlapping(self):
        """
        Builds the ``OverlappingSettings`` object from the generator's overlapping parameters.

        Returns
        -------

        OverlappingSettings object
            Overlapping settings (java object).
        """
        settings = OverlappingSettings()

        # plaid coherency is looked up by name
        coherency = getattr(PlaidCoherency, self.plaidcoherency)
        settings.setPlaidCoherency(coherency)

        # cluster-level limits
        settings.setPercOfOverlappingTrics(self.percofoverlappingclusts)
        settings.setMaxTricsPerOverlappedArea(self.maxclustsperoverlappedarea)
        settings.setMaxPercOfOverlappingElements(self.maxpercofoverlappingelements)

        # per-dimension limits
        settings.setPercOfOverlappingRows(self.percofoverlappingrows)
        settings.setPercOfOverlappingColumns(self.percofoverlappingcolumns)
        settings.setPercOfOverlappingContexts(self.percofoverlappingcontexts)

        return settings

    @staticmethod
    def _java_to_numpy(generatedDataset):

        """
        Converts a generated Dataset (java object) into a dense numpy array.

        Parameters
        ----------

        generatedDataset: Dataset object
            Generated dataset (java object).

        Returns
        -------

        numpy array
            Generated dataset as numpy array.
            Shape: (ncontexts, nrows, ncols)

        """

        n_contexts = generatedDataset.getNumContexts()
        n_rows = generatedDataset.getNumRows()
        n_cols = generatedDataset.getNumCols()

        # request all rows of the dataset in a single text dump
        raw = str(io.matrixToStringColOriented(generatedDataset, n_rows, 0, False))

        parsed = []
        for line in raw.split('\n'):
            # the first tab-separated field of each line is skipped, the rest are values
            values = [tvc(field) for field in line.split('\t')[1:]]
            # cut the line's values into one piece per context
            parsed.append(np.array_split(values, n_contexts))

        # the entry produced by the text after the last newline is discarded
        dense = np.array(parsed[:-1])

        # NOTE(review): the context axis is moved first with reshape, not a transpose -- confirm this matches
        # the layout emitted by matrixToStringColOriented
        return dense.reshape((n_contexts, n_rows, n_cols))

    @staticmethod
    def _java_to_sparse(generatedDataset):

        """
        Extracts sparse tensor from Dataset object.

        The dataset is read from the java object in chunks of rows (about a tenth of the rows per chunk), each
        chunk is converted to a COO tensor and the chunks are concatenated along the row axis. Rows that do not
        fill a whole chunk are read one at a time, so datasets of any number of rows are fully converted.

        Parameters
        ----------

        generatedDataset: Dataset object
            Generated dataset (java object).

        Returns
        -------

        COO tensor
            Generated dataset as COO tensor.

            **Shape**: (ncontexts, nrows, ncols)

        """

        n_rows = generatedDataset.getNumRows()
        n_contexts = generatedDataset.getNumContexts()
        n_cols = generatedDataset.getNumCols()

        def _load_chunk(size, step):
            # reads `size` rows at chunk index `step` and returns them as a (ncontexts, size, ncols) COO tensor
            raw = str(io.matrixToStringColOriented(generatedDataset, size, step, False))

            chunk = COO.from_numpy(np.array(
                [np.array_split([tvc(val) for val in row.split('\t')[1:]], n_contexts)
                 for row in raw.split('\n')][:-1]
            ))

            # NOTE(review): reshape (not a transpose) moves the context axis first -- confirm this matches the
            # layout emitted by matrixToStringColOriented
            return chunk.reshape((n_contexts, size, n_cols))

        # keep at least one row per chunk: with fewer than 10 rows the chunk size would be 0 and the
        # chunk count below would divide by zero
        threshold = max(1, n_rows // 10)
        full_chunks = n_rows // threshold

        tensors = [_load_chunk(threshold, step) for step in range(full_chunks)]

        # rows left over when n_rows is not a multiple of threshold; with a chunk size of 1 the chunk index
        # equals the row index (assumes chunk `step` starts at row step * size, as the chunked loop does)
        tensors.extend(_load_chunk(1, row) for row in range(full_chunks * threshold, n_rows))

        return concatenate(tensors, axis=1)

    @staticmethod
    def _dense_to_dgl(x, device, cuda=0, nclusters=1, clust_init='zeros'):

        """
        Builds a tripartite dgl heterograph (row, col and ctx nodes) from a numpy array.

        Every element of the array contributes one edge to each of the relations row-col, row-ctx and col-ctx,
        weighted (edge feature ``'w'``) by the element's value.

        Parameters
        ----------

        x: numpy array
            Data array.
        device: {'cpu', 'gpu'}
            Type of device for storing the tensor.
        cuda: int, default 0
            Index of cuda device to use. Only used if device=='gpu'.
        nclusters: int, default 1
            Number of clusters to be initialized in graph.
        clust_init: str or function, default 'zeros'
            Function to initialize clusters. If string it should be a function available in torch. Else it should point
            to a function with inputs in form (shape, dtype).

        Returns
        -------

        heterograph object
            numpy array as tripartite dgl graph.

            **Shape**: (nrows + ncols + ncontexts, nrows * ncols * ncontexts * 3)
        """

        init_clusters = loader(th, clust_init)

        # one [row, col, ctx, value] record per element of x
        records = []
        for z, ctx in enumerate(x):
            for i, row in enumerate(ctx):
                for j, elem in enumerate(row):
                    records.append([i, j, z, elem])

        # transpose so that each field becomes one row of the tensor
        fields = th.tensor(records).T
        row_ids, col_ids, ctx_ids, weights = fields[0], fields[1], fields[2], fields[3]

        # (relation, source ids, destination ids)
        relations = (
            (('row', 'elem', 'col'), row_ids, col_ids),
            (('row', 'elem', 'ctx'), row_ids, ctx_ids),
            (('col', 'elem', 'ctx'), col_ids, ctx_ids),
        )

        # create graph
        G = dgl.heterograph({relation: (src.int(), dst.int()) for relation, src, dst in relations})

        # set weights
        for relation, _, _ in relations:
            G.edges[relation].data['w'] = weights.float()

        # set cluster members; node types are listed in the order of the axes of x
        for n, axis in enumerate(['ctx', 'row', 'col']):
            n_nodes = x.shape[n]
            for i in range(nclusters):
                G.nodes[axis].data[i] = init_clusters(n_nodes, dtype=th.bool)

        if device == 'gpu':
            G = G.to('cuda:{}'.format(cuda))

        return G

    @staticmethod
    def _dense_to_networkx(x, **kwargs):

        """
        Extracts a tripartite networkx graph from numpy array

        Nodes are named ``'row-<i>'``, ``'col-<j>'`` and ``'ctx-<z>'``. Every element of the array contributes
        three weighted edges (row-col, row-ctx and col-ctx), each carrying the element's value as its weight.

        Parameters
        ----------

        x: numpy array
            Data array.
        **kwargs: any, default None
            Additional keywords have no effect but might be accepted for compatibility.

        Returns
        -------

        Graph object
            numpy array as tripartite networkx graph.

            **Shape**: (nrows + ncols + ncontexts, nrows * ncols * ncontexts * 3)

        """

        def _element_edges():
            # yields the three (u, v, weight) edges of every element of x
            for z, ctx in enumerate(x):
                ctx_node = 'ctx-{}'.format(z)
                for i, row in enumerate(ctx):
                    row_node = 'row-{}'.format(i)
                    for j, elem in enumerate(row):
                        col_node = 'col-{}'.format(j)
                        yield row_node, col_node, elem
                        yield row_node, ctx_node, elem
                        yield col_node, ctx_node, elem

        G = nx.MultiGraph()

        # edges are handed over as plain tuples: packing (str, str, value) triples into a numpy array coerces
        # the whole array to a string dtype, which turned every weight into a string
        G.add_weighted_edges_from(_element_edges())

        return G

[docs]    def save(self, extension='default', file_name='example', path=None, single_file=None, **kwargs):

        """
        Saves data files to chosen path.

        Parameters
        ----------

        extension: {'default', 'csv'}, default 'default'
            Extension of saved data file. If default, uses Java class default. Else it returns a data file per context.
        file_name: str, default 'example_dataset'
            Saved files prefix.
        path: str, default None
            Path to save files. If None then files are saved in the current working directory.
        single_file: Bool, default None.
            If False dataset is saved in multiple data files. If None then if the dataset's size is larger then 10**5
            it defaults to False, else True. Only used if extension=='default'.
        **kwargs: any, default None
            Additional keywords that are passed on.

        Examples
        --------

        >>> generator = TriclusterGenerator(silence=True)
        >>> generator.generate()
        >>> generator.save(file_name='TricFiles', single_file=False)
        >>> generator.save(extension='csv', file_name='TricFiles', delimiter=';')

        """

        if path is None:
            path = os.getcwd() + '/'

        self._start_silencing()

        if extension == 'csv':
            # check if dense exists
            if self.generatedDataset is None:
                raise AttributeError('No generated dataset exists. '
                                     'Data must first be generated using the .generate() method.')

            elif self.X is None:
                _, _ = self.to_tensor(in_memory=False)

            elif isinstance(self.X, COO):
                self.X = self._java_to_numpy(self.generatedDataset)

            # save data
            for i, arr in enumerate(self.X):
                np.savetxt('{}_dataset_ctx{}.csv'.format(os.path.join(path, file_name), i), arr, fmt="%d", **kwargs)

            # save json

            with open('{}_cluster_data.json'.format(os.path.join(path, file_name)), 'w') as outfile:
                json.dump(self.cluster_info, outfile)

            # save txt
            with open('{}_cluster_data.txt'.format(os.path.join(path, file_name)), 'w') as outfile:
                outfile.write(str(self.generatedDataset.getTricsInfo()))

        else:

            serv = GTricService()

            serv.setPath(path)
            serv.setSingleFileOutput(self._asses_memory(single_file, gends=self.generatedDataset))
            serv.saveResult(self.generatedDataset, file_name + '_cluster_data', file_name + '_dataset')

        self._stop_silencing()


class TriclusterGeneratorbyConfig(TriclusterGenerator):

    """
    This class initializes the generator via configuration file.

    **Examples**

    >>> from nclustgen import TriclusterGeneratorbyConfig
    >>> generator = TriclusterGeneratorbyConfig('example.json')
    >>> x, y = generator.generate(nrows=50, ncols=100, ncontexts=4, nclusters=2)
    >>> x
    array([[[ 3.94, -7.62, -2.68, ..., -1.66,  4.41, -3.8 ],
            [-2.27, -7.19, -3.42, ...,  7.19, -2.9 , -6.03],
            [-8.91, -9.46, -7.98, ..., -0.78, -7.66, -4.96],
            ...,
            [-7.93,  9.79,  2.95, ...,  2.01,  7.99,  6.15],
            [-4.25, -3.81, -1.43, ..., -0.61, -5.36, -8.09],
            [ 0.4 , -5.36, -3.68, ...,  8.5 ,  6.8 , -7.34]],
           [[ 0.62, -1.18, -3.07, ...,  0.23, -8.38,  2.96],
            [ 6.37,  4.63,  6.15, ...,  9.13,  9.6 ,  9.5 ],
            [-5.33,  0.15,  1.65, ...,  5.73, -4.64, -6.47],
            ...,
            [ 9.16,  4.75,  3.06, ...,  3.76, -3.09, -6.96],
            [ 3.6 ,  5.54, -0.2 , ...,  1.09,  9.23, -0.62],
            [ 2.68, -6.15, -8.99, ...,  8.65,  9.89,  7.63]],
           [[ 0.55, -1.03,  6.35, ...,  3.88,  5.96, -6.52],
            [-0.71,  7.99,  2.56, ..., -7.15,  0.33,  7.9 ],
            [ 0.86,  2.99,  3.69, ...,  1.57, -5.23,  4.59],
            ...,
            [ 4.2 ,  4.03, -9.11, ...,  5.28,  6.09,  1.19],
            [-0.31,  7.71,  7.57, ..., -3.57, -9.67, -9.89],
            [ 6.55,  4.69, -9.96, ..., -8.9 ,  7.31, -0.13]]])

    """

    def __init__(self, file_path=None):

        """
        **Parameters**

        file_path: str, default None
            Determines the path to the configuration file. If None then no parameters are passed to class.
            The file must contain a JSON object; its keys are passed to the parent class as keyword arguments.
        """

        params = {}

        if file_path:
            # the context manager closes the file even when the JSON cannot be parsed
            with open(file_path) as config_file:
                params = json.load(config_file)

        super().__init__(**params)