Source code for nclustgen.TriclusterGen


from .Generator import Generator

import os
import json
import sys
import csv
import numpy as np
from sparse import concatenate, COO

# import dgl without backend info
stderr = sys.stderr
sys.stderr = open(os.devnull, 'w')
import dgl
sys.stderr = stderr

import torch as th
import networkx as nx

from com.gtric import generator as gen
from com.gtric.service import GTricService
from com.gtric.types import Background
from com.gtric.types import BackgroundType
from com.gtric.types import Contiguity
from com.gtric.types import Distribution
from com.gtric.types import PatternType
from com.gtric.types import TimeProfile
from com.gtric.types import PlaidCoherency
from com.gtric.utils import OverlappingSettings
from com.gtric.utils import TriclusterStructure
from com.gtric.utils import TriclusterPattern
from com.gtric.utils import RandomObject
from com.gtric.utils import IOUtils as io

from java.util import ArrayList

# helper function
from .utils import tensor_value_check as tvc, loader


[docs]class TriclusterGenerator(Generator): """ This class provides an implementation for three-dimensional datasets with hidden triclusters. **Examples** >>> from nclustgen import TriclusterGenerator >>> generator = TriclusterGenerator( ... dstype='NUMERIC', ... patterns=[['CONSTANT', 'CONSTANT', 'CONSTANT'], ['CONSTANT', 'NONE', 'NONE']], ... bktype='UNIFORM', ... in_memory=True, ... silence=True ... ) >>> generator.get_params() {'X': None, 'Y': None, 'background': ['UNIFORM'], 'clusterdistribution': [['UNIFORM', 4, 4], ['UNIFORM', 4, 4], ['UNIFORM', 4, 4]], 'contiguity': 'NONE', 'dstype': 'NUMERIC', 'errors': (0.0, 0.0, 0.0), 'generatedDataset': None, 'graph': None, 'in_memory': 'True', 'maxclustsperoverlappedarea': 0, 'maxpercofoverlappingelements': 0.0, 'maxval': 10.0, 'minval': -10.0, 'missing': (0.0, 0.0), 'cuda': 3, 'noise': (0.0, 0.0, 0.0), 'patterns': [['CONSTANT', 'CONSTANT', 'CONSTANT'], ['CONSTANT', 'NONE', 'NONE']], 'percofoverlappingclusts': 0.0, 'percofoverlappingcolumns': 1.0, 'percofoverlappingcontexts': 1.0, 'percofoverlappingrows': 1.0, 'plaidcoherency': 'NO_OVERLAPPING', 'realval': True, 'seed': -1, 'silenced': False, 'time_profile': None} >>> x, y = generator.generate(nrows=50, ncols=100, ncontexts=5, nclusters=3) >>> x array([[[-1.29, -4.92, -2.49, ..., -9.17, -5.19, 6.66], [ 5.41, -3.04, -1.58, ..., -3.44, 1.99, 9.84], [-1.88, -9.09, 5.06, ..., -8.96, -8.4 , 3.56], ..., [ 8.74, 4.07, 0.6 , ..., 6.73, -1.3 , 5. ], [ 3.33, 4.66, -5.72, ..., 0.55, 5.82, -3.17], [ 2.32, -9.29, 3.95, ..., 3.61, 3.93, -6.76]], [[ 4.34, 8.59, -1.96, ..., 0.88, 8.52, -7.85], [ 1.87, -8.59, -9.78, ..., 5.33, -7.45, 3.1 ], [-6.86, -3.93, 7.73, ..., 3.21, 6.54, -7.13], ..., [-5.75, 9.91, -4.76, ..., 0.94, -9.2 , -1.32], [ 3.11, -8.26, -2.32, ..., -5.08, 5.33, 2.52], [-4.18, 7.98, 8.42, ..., 4.21, -0.03, -7.51]], [[ 1.22, -5.69, -8.72, ..., 5.78, 8.74, 1.44], [ 3.41, -7.45, 7.01, ..., 8.93, -6.01, 0.18], [ 3.8 , 2.92, -1.87, ..., -1.16, -3.31, -3.02], ..., [ 4.82, -9.82, 0.31, ..., 9.91, -0.45, 7.86], [ 7.24, 8.28, -3.13, ..., 9.12, -0.47, 6.16], [-6.61, -7.34, -0.56, ..., 1.41, -1.7 , 6.22]], [[ 3.46, 7.85, -8.23, ..., 1.33, 2.82, -4.05], [-8.87, -6.42, 2.28, ..., 9.72, -1.75, 5.01], [-0.26, -3.25, -9.16, ..., -1.69, 6.96, 4.63], ..., [-5.36, 2.84, -2.09, ..., 0.33, -2.88, 3.43], [ 5.72, 1.11, 2.11, ..., 0.27, -5.95, 3.39], [-7.02, -3.85, -5.44, ..., 1.64, -1.24, -2.74]], [[-2.39, -9.27, -8.12, ..., -7.86, 7.54, 4.99], [ 2.06, 3.84, -2.99, ..., 4.82, -9.29, -9.23], [ 0.21, -5.85, -8.45, ..., 4.35, -2.69, 0.34], ..., [-0.52, -2.59, 7.63, ..., -8.07, -3.51, 2.7 ], [ 4.93, -1.55, -0.65, ..., -0.87, 8.53, 9.97], [ 8.03, 2.32, -4.76, ..., -2.03, -4.48, -5.56]]]) >>> y [[[8, 16, 17, 35], [36, 55, 69, 88], [0, 2, 3, 4]], [[7, 21, 33, 35], [22, 57, 65, 75], [0, 1, 2, 4]], [[9, 19, 23, 27], [12, 19, 59, 72], [1, 2, 3, 4]]] >>> graph = generator.to_graph(x, framework='dgl', device='cpu') >>> graph Graph(num_nodes={'col': 100, 'ctx': 5, 'row': 50}, num_edges={('col', 'elem', 'ctx'): 500, ('row', 'elem', 'col'): 5000, ('row', 'elem', 'ctx'): 250}, metagraph=[('col', 'ctx', 'elem'), ('row', 'col', 'elem'), ('row', 'ctx', 'elem')]) >>> generator.save(file_name='example', single_file=True) """ def __init__(self, *args, **kwargs): super().__init__(n=3, *args, **kwargs) def _initialize_seed(self): RandomObject.initialization(self.seed) def _build_background(self): try: self.background[0] = getattr(BackgroundType, self.background[0]) except TypeError: pass return Background(*self.background) def _build_generator(self, class_call, params, contexts_index): return getattr(gen, class_call)(*params) def _build_patterns(self): patterns = ArrayList() if self.time_profile: self.time_profile = getattr(TimeProfile, str(self.time_profile).upper()) [patterns.add( TriclusterPattern(*[getattr(PatternType, pattern_type) for pattern_type in pattern] + [self.time_profile]) ) for pattern in self.patterns] return patterns def _build_structure(self): structure = TriclusterStructure() structure.setRowsSettings( getattr(Distribution, self.clusterdistribution[0][0]), *self.clusterdistribution[0][1:] ) structure.setColumnsSettings( getattr(Distribution, self.clusterdistribution[1][0]), *self.clusterdistribution[1][1:] ) structure.setContextsSettings( getattr(Distribution, self.clusterdistribution[2][0]), *self.clusterdistribution[2][1:] ) structure.setContiguity(getattr(Contiguity, self.contiguity)) return structure def _build_overlapping(self): overlapping = OverlappingSettings() overlapping.setPlaidCoherency(getattr(PlaidCoherency, self.plaidcoherency)) overlapping.setPercOfOverlappingTrics(self.percofoverlappingclusts) overlapping.setMaxTricsPerOverlappedArea(self.maxclustsperoverlappedarea) overlapping.setMaxPercOfOverlappingElements(self.maxpercofoverlappingelements) overlapping.setPercOfOverlappingRows(self.percofoverlappingrows) overlapping.setPercOfOverlappingColumns(self.percofoverlappingcolumns) overlapping.setPercOfOverlappingContexts(self.percofoverlappingcontexts) return overlapping @staticmethod def _java_to_numpy(generatedDataset): """ Extracts numpy array from Dataset object. Parameters ---------- generatedDataset: Dataset object Generated dataset (java object). Returns ------- numpy array Generated dataset as numpy array. Shape: (ncontexts, nrows, ncols) """ tensor = str(io.matrixToStringColOriented(generatedDataset, generatedDataset.getNumRows(), 0, False)) tensor = np.array( [np.array_split([tvc(val) for val in row.split('\t')[1:]], generatedDataset.getNumContexts()) for row in tensor.split('\n')][:-1] ) return tensor.reshape( (generatedDataset.getNumContexts(), generatedDataset.getNumRows(), generatedDataset.getNumCols()) ) @staticmethod def _java_to_sparse(generatedDataset): """ Extracts sparce tensor from Dataset object. Parameters ---------- generatedDataset: Dataset object Generated dataset (java object). Returns ------- COO tensor Generated dataset as COO tensor. **Shape**: (ncontexts, nrows, ncols) """ threshold = int(generatedDataset.getNumRows() / 10) steps = [i for i in range(int(generatedDataset.getNumRows() / threshold))] tensors = [] for step in steps: tensor = str(io.matrixToStringColOriented(generatedDataset, threshold, step, False)) tensor = COO.from_numpy(np.array( [np.array_split([tvc(val) for val in row.split('\t')[1:]], generatedDataset.getNumContexts()) for row in tensor.split('\n')][:-1] )) tensor = tensor.reshape((generatedDataset.getNumContexts(), threshold, generatedDataset.getNumCols())) tensors.append(tensor) return concatenate(tensors, axis=1) @staticmethod def _dense_to_dgl(x, device, cuda=0, nclusters=1, clust_init='zeros'): """ Extracts a tripartite dgl graph from a numpy array Parameters ---------- x: numpy array Data array. device: {'cpu', 'gpu'} Type of device for storing the tensor. cuda: int, default 0 Index of cuda device to use. Only used if device==True. nclusters: int, default 1 Number of clusters to be initialized in graph. clust_init: str or function, default 'zeros' Function to initialize clusters. If string it should be a function available in torch. Else it should point to a function with inputs in form (shape, dtype). Returns ------- heterograph object numpy array as tripartite dgl graph. **Shape**: (nrows + ncols + ncontexts, nrows * ncols * ncontexts * 3) """ # set (u,v) clust_init = loader(th, clust_init) tensor = th.tensor( [[i, j, z, elem] for z, ctx in enumerate(x) for i, row in enumerate(ctx) for j, elem in enumerate(row)] ).T graph_data = { ('row', 'elem', 'col'): (tensor[0].int(), tensor[1].int()), ('row', 'elem', 'ctx'): (tensor[0].int(), tensor[2].int()), ('col', 'elem', 'ctx'): (tensor[1].int(), tensor[2].int()), } # create graph G = dgl.heterograph(graph_data) # set weights G.edges[('row', 'elem', 'col')].data['w'] = tensor[3].float() G.edges[('row', 'elem', 'ctx')].data['w'] = tensor[3].float() G.edges[('col', 'elem', 'ctx')].data['w'] = tensor[3].float() # set cluster members for n, axis in enumerate(['ctx', 'row', 'col']): for i in range(nclusters): G.nodes[axis].data[i] = clust_init(x.shape[n], dtype=th.bool) if device == 'gpu': G = G.to('cuda:{}'.format(cuda)) return G @staticmethod def _dense_to_networkx(x, **kwargs): """ Extracts a tripartite networkx graph from numpy array Parameters ---------- x: numpy array Data array. **kwargs: any, default None Additional keywords have no effect but might be accepted for compatibility. Returns ------- Graph object numpy array as tripartite networkx graph. **Shape**: (nrows + ncols + ncontexts, nrows * ncols * ncontexts * 3) """ G = nx.MultiGraph() edges = np.array( [[('row-{}'.format(i), 'col-{}'.format(j), elem), ('row-{}'.format(i), 'ctx-{}'.format(z), elem), ('col-{}'.format(j), 'ctx-{}'.format(z), elem)] for z, ctx in enumerate(x) for i, row in enumerate(ctx) for j, elem in enumerate(row)] ) # reshape from (elements, n, edge) to (edges, edge) edges = edges.reshape(edges.shape[0] * edges.shape[1], edges.shape[2]) G.add_weighted_edges_from(edges) return G
[docs] def save(self, extension='default', file_name='example', path=None, single_file=None, **kwargs): """ Saves data files to chosen path. Parameters ---------- extension: {'default', 'csv'}, default 'default' Extension of saved data file. If default, uses Java class default. Else it returns a data file per context. file_name: str, default 'example_dataset' Saved files prefix. path: str, default None Path to save files. If None then files are saved in the current working directory. single_file: Bool, default None. If False dataset is saved in multiple data files. If None then if the dataset's size is larger then 10**5 it defaults to False, else True. Only used if extension=='default'. **kwargs: any, default None Additional keywords that are passed on. Examples -------- >>> generator = TriclusterGenerator(silence=True) >>> generator.generate() >>> generator.save(file_name='TricFiles', single_file=False) >>> generator.save(extension='csv', file_name='TricFiles', delimiter=';') """ if path is None: path = os.getcwd() + '/' self._start_silencing() if extension == 'csv': # check if dense exists if self.generatedDataset is None: raise AttributeError('No generated dataset exists. ' 'Data must first be generated using the .generate() method.') elif self.X is None: _, _ = self.to_tensor(in_memory=False) elif isinstance(self.X, COO): self.X = self._java_to_numpy(self.generatedDataset) # save data for i, arr in enumerate(self.X): np.savetxt('{}_dataset_ctx{}.csv'.format(os.path.join(path, file_name), i), arr, fmt="%d", **kwargs) # save json with open('{}_cluster_data.json'.format(os.path.join(path, file_name)), 'w') as outfile: json.dump(self.cluster_info, outfile) # save txt with open('{}_cluster_data.txt'.format(os.path.join(path, file_name)), 'w') as outfile: outfile.write(str(self.generatedDataset.getTricsInfo())) else: serv = GTricService() serv.setPath(path) serv.setSingleFileOutput(self._asses_memory(single_file, gends=self.generatedDataset)) serv.saveResult(self.generatedDataset, file_name + '_cluster_data', file_name + '_dataset') self._stop_silencing()
[docs]class TriclusterGeneratorbyConfig(TriclusterGenerator): """ This class initializes the generator via configuration file. **Examples** >>> from nclustgen import TriclusterGeneratorbyConfig >>> generator = TriclusterGeneratorbyConfig('example.json') >>> x, y = generator.generate(nrows=50, ncols=100, ncontexte=4, nclusters=2) >>> x array([[[ 3.94, -7.62, -2.68, ..., -1.66, 4.41, -3.8 ], [-2.27, -7.19, -3.42, ..., 7.19, -2.9 , -6.03], [-8.91, -9.46, -7.98, ..., -0.78, -7.66, -4.96], ..., [-7.93, 9.79, 2.95, ..., 2.01, 7.99, 6.15], [-4.25, -3.81, -1.43, ..., -0.61, -5.36, -8.09], [ 0.4 , -5.36, -3.68, ..., 8.5 , 6.8 , -7.34]], [[ 0.62, -1.18, -3.07, ..., 0.23, -8.38, 2.96], [ 6.37, 4.63, 6.15, ..., 9.13, 9.6 , 9.5 ], [-5.33, 0.15, 1.65, ..., 5.73, -4.64, -6.47], ..., [ 9.16, 4.75, 3.06, ..., 3.76, -3.09, -6.96], [ 3.6 , 5.54, -0.2 , ..., 1.09, 9.23, -0.62], [ 2.68, -6.15, -8.99, ..., 8.65, 9.89, 7.63]], [[ 0.55, -1.03, 6.35, ..., 3.88, 5.96, -6.52], [-0.71, 7.99, 2.56, ..., -7.15, 0.33, 7.9 ], [ 0.86, 2.99, 3.69, ..., 1.57, -5.23, 4.59], ..., [ 4.2 , 4.03, -9.11, ..., 5.28, 6.09, 1.19], [-0.31, 7.71, 7.57, ..., -3.57, -9.67, -9.89], [ 6.55, 4.69, -9.96, ..., -8.9 , 7.31, -0.13]]]) """ def __init__(self, file_path=None): """ **Parameters** file_path: str, default None Determines the path to the configuration file. If None then no parameters are passed to class. """ if file_path: f = open(file_path, ) params = json.load(f) f.close() super().__init__(**params) else: super().__init__()