Source code for janggu.model

"""Janggu - deep learning for genomics"""

import copy
import hashlib
import logging
import os
import re
import time

import h5py
import keras
import keras.backend as K
import tensorflow as tf
import numpy as np
from keras.callbacks import CSVLogger
from keras.callbacks import LambdaCallback
from keras.models import Model
from keras.models import load_model
from keras.utils import Sequence
from keras.utils import plot_model
from progress.bar import Bar
from pybedtools import BedTool

from janggu.data import split_train_test
from janggu.data.coverage import Cover
from janggu.data.dna import Bioseq
from janggu.data.data import JangguSequence
from janggu.data.data import _data_props
from janggu.data.dna import VariantStreamer
from janggu.data.genomic_indexer import GenomicIndexer
from janggu.evaluation import get_scorer
from janggu.layers import Complement
from janggu.layers import DnaConv2D
from janggu.layers import LocalAveragePooling2D
from janggu.layers import Reverse
from janggu.utils import _get_output_root_directory
from janggu.utils import _to_list

JANGGU_LAYERS = {'Reverse': Reverse,
                 'Complement': Complement,
                 'LocalAveragePooling2D': LocalAveragePooling2D,
                 'DnaConv2D': DnaConv2D}
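
# Illustrative sketch (not part of the original module): the janggu-specific
# layers above are merged into keras' ``custom_objects`` whenever a stored
# model is reloaded, e.g.
#
#   from keras.models import load_model
#   custom_objects = {'MyLayer': MyLayer}   # hypothetical user-defined layer
#   custom_objects.update(JANGGU_LAYERS)
#   model = load_model('model.h5', custom_objects=custom_objects)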


def _convert_data(kerasmodel, data, layer):
    """Converts different data formats to
    {name:values} dict.

    keras support different data formats, e.g. np.array,
    list(np.array) or dict(key: np.array).
    This function normalizes all dataset to the dictionary
    style usage internally. This simplifies the compatibility
    checks at various places.

    Parameters
    ----------
    kerasmodel : keras.Model
        A keras Model object.
    data : Dataset, np.array, list or dict
        Dataset.
    layer : str
        Indication as to whether data is used as input or output.
        :code:`layer='output_layers'` or 'input_layers'.
    """
    # If we deal with Dataset, we convert it to a Dictionary
    # which is directly interpretable by keras
    if hasattr(data, "name") and hasattr(data, "shape"):
        # if it is a janggu.data.Dataset we get here
        c_data = {data.name: data}
    elif not hasattr(data, "name") and hasattr(data, "shape"):
        # if data is a numpy array we get here
        c_data = {kerasmodel.get_config()[layer][0][0]: data}
    elif isinstance(data, list):
        if hasattr(data[0], "name"):
            # if data is a list(Dataset) we get here
            c_data = {datum.name: datum for datum in data}
        else:
            # if data is a list(np.array) we get here
            c_data = {name[0]: datum for name, datum in
                      zip(kerasmodel.get_config()[layer],
                          data)}
    elif isinstance(data, dict):
        # if data is a dict, it can just be passed through
        c_data = data
    else:
        raise ValueError('Unknown data type: {}'.format(type(data)))

    return c_data
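
# Illustrative sketch (assumes a keras model whose single input layer is named
# 'ip'; not part of the original module): all of the following calls would
# yield the same {'ip': array} dictionary.
#
#   x = np.random.random((100, 10))
#   _convert_data(kerasmodel, x, 'input_layers')           # np.array
#   _convert_data(kerasmodel, [x], 'input_layers')         # list(np.array)
#   _convert_data(kerasmodel, {'ip': x}, 'input_layers')   # dict passed through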


class Janggu(object):
    """Janggu class

    The class :class:`Janggu` maintains a :class:`keras.models.Model`
    object, that is an instance of a neural network.
    Furthermore, to the outside, Janggu behaves similarly to
    :class:`keras.models.Model` which allows you to create,
    fit, and evaluate the model.

    Parameters
    ----------
    inputs : Input or list(Input)
        Input layer or list of Inputs as defined by keras.
        See https://keras.io/.
    outputs : Layer or list(Layer)
        Output layer or list of outputs. See https://keras.io/.
    name : str
        Name of the model.

    Examples
    --------

    Define a Janggu object similar to keras.models.Model
    using Input and Output layers.

    .. code-block:: python

      from keras.layers import Input
      from keras.layers import Dense

      from janggu import Janggu

      # Define neural network layers using keras
      in_ = Input(shape=(10,), name='ip')
      layer = Dense(3)(in_)
      output = Dense(1, activation='sigmoid', name='out')(layer)

      # Instantiate a model.
      model = Janggu(inputs=in_, outputs=output, name='test_model')
      model.summary()
    """

    timer = None
    _name = None

    def __init__(self, inputs, outputs, name=None):

        self.kerasmodel = Model(inputs, outputs, name='janggu')

        if not name:
            hasher = hashlib.md5()
            hasher.update(self.kerasmodel.to_json().encode('utf-8'))
            name = hasher.hexdigest()
            print("Generated model-id: '{}'".format(name))

        if hasattr(outputs, '__len__') and len(outputs) > 1:
            total_output = K.sum([K.sum(o, axis=-1) for o in
                                  self.kerasmodel.output], axis=-1)
        else:
            total_output = K.sum(self.kerasmodel.output, axis=-1)

        kinp = self.kerasmodel.input
        kinp = _to_list(kinp)

        if tf.__version__[0] == '1':
            grad = K.gradients(total_output, self.kerasmodel.input)
            self._influence = K.function(kinp, grad)
        else:
            def _influence(inputs):
                tfinput = [tf.cast(inp, tf.float32) for inp in inputs]
                with tf.GradientTape() as tape:
                    tape.watch(tfinput)
                    pred = self.kerasmodel(tfinput)
                    if hasattr(outputs, '__len__') and len(outputs) > 1:
                        total_output = K.sum([K.sum(o, axis=-1) for o in pred],
                                             axis=-1)
                    else:
                        total_output = K.sum(pred, axis=-1)
                grad = tape.gradient(total_output, tfinput)
                return [g.numpy() for g in grad]

            self._influence = _influence

        self.name = name

        self.outputdir = _get_output_root_directory()

        if not os.path.exists(self.outputdir):  # pragma: no cover
            # this is excluded from unit tests, because the testing
            # framework always provides a directory
            os.makedirs(self.outputdir)

        if not os.path.exists(os.path.join(self.outputdir, 'logs')):
            os.makedirs(os.path.join(self.outputdir, 'logs'))

        logfile = os.path.join(self.outputdir, 'logs', 'janggu.log')

        self.logger = logging.getLogger(self.name)

        logging.basicConfig(filename=logfile,
                            level=logging.DEBUG,
                            format='%(asctime)s:%(name)s:%(message)s',
                            datefmt='%m/%d/%Y %H:%M:%S')
        self.logger.info("Model Summary:")
        self.kerasmodel.summary(print_fn=self.logger.info)

    @classmethod
    def create_by_name(cls, name, custom_objects=None):
        """Creates a Janggu object by name.

        This option is used to load a pre-trained model.

        Parameters
        ----------
        name : str
            Name of the model.
        custom_objects : dict or None
            This allows loading of custom layers using load_model.
            All janggu specific layers are automatically included as
            custom_objects. Default: None

        Examples
        --------

        .. code-block:: python

          in_ = Input(shape=(10,), name='ip')
          layer = Dense(3)(in_)
          output = Dense(1, activation='sigmoid', name='out')(layer)

          # Instantiate a model.
          model = Janggu(inputs=in_, outputs=output, name='test_model')

          # saves the model to <janggu_results>/models
          model.save()

          # remove the original model
          del model

          # reload the model
          model = Janggu.create_by_name('test_model')
        """
        path = cls._storage_path(name, _get_output_root_directory())

        custom_objects = {} if not custom_objects else custom_objects

        custom_objects.update(JANGGU_LAYERS)

        model = load_model(path, custom_objects=custom_objects)
        return cls(model.inputs, model.outputs, name)

    @property
    def name(self):
        """Model name"""
        return self._name

    @name.setter
    def name(self, name):
        if not isinstance(name, str):
            raise Exception("Name must be a string.")
        self._name = name

    def save(self, filename=None, overwrite=True, show_shapes=True):
        """Saves the model.

        Parameters
        ----------
        filename : str
            Filename of the stored model. Default: None.
        overwrite : bool
            Overwrite a stored model. Default: True.
        show_shapes : bool
            Show layer shapes in the model plot that is stored
            alongside the model. Default: True.
        """
        if not filename:  # pragma: no cover
            filename = self._storage_path(self.name, self.outputdir)

        plotname = os.path.splitext(filename)[0] + '.png'
        try:
            plot_model(self.kerasmodel, to_file=plotname,
                       show_shapes=show_shapes)
        except Exception:  # pragma: no cover pylint: disable=broad-except
            # if graphviz is not installed on the system.
            self.logger.exception('plot_model failed, continue nevertheless: ')
        self.logger.info("Save model %s", filename)
        self.kerasmodel.save(filename, overwrite)

    def summary(self):
        """Prints the model definition."""
        self.kerasmodel.summary()

    @classmethod
    def create(cls, template, modelparams=None, inputs=None,
               outputs=None, name=None):
        """Janggu constructor method.

        This method instantiates a Janggu model with
        model template and parameters. It also allows to
        automatically infer and extend the correct input and
        output layers for the network.

        Parameters
        ----------
        template : function
            Python function that defines a model template of a neural network.
            The function signature must adhere to the signature
            :code:`template(inputs, inputp, outputp, modelparams)`
            and is expected to return
            :code:`(input_tensor, output_tensor)` of the neural network.
        modelparams : list or tuple or None
            Additional model parameters that are passed along to template
            upon creation of the neural network. For instance,
            this could contain number of neurons at each layer.
            Default: None.
        inputs : Dataset, list(Dataset) or None
            Input datasets from which the input layer shapes
            should be derived.
            Use this option together with the :code:`inputlayer` decorator
            (see Example below).
        outputs : Dataset, list(Dataset) or None
            Output datasets from which the output layer shapes
            should be derived.
            Use this option together with :code:`outputdense` or
            :code:`outputconv` decorators (see Example below).
        name : str or None
            Model name. If None, a unique model name is generated
            based on the model configuration and network architecture.

        Examples
        --------

        Specify a model using a model template and parameters:

        .. code-block:: python

          def test_manual_model(inputs, inp, oup, params):
              in_ = Input(shape=(10,), name='ip')
              layer = Dense(params)(in_)
              output = Dense(1, activation='sigmoid', name='out')(layer)
              return in_, output

          # Defines the same model by invoking the definition function
          # and the create constructor.
          model = Janggu.create(template=test_manual_model, modelparams=3)
          model.summary()

        Specify a model using automatic input and output layer determination.
        That is, only the model body needs to be specified:

        .. code-block:: python

          import numpy as np
          from janggu import Janggu
          from janggu import inputlayer, outputdense
          from janggu.data import Array

          # Some random data which you would like to use as input for the
          # model.
          DATA = Array('ip', np.random.random((1000, 10)))
          LABELS = Array('out', np.random.randint(2, size=(1000, 1)))

          # The decorators inputlayer and outputdense
          # extract the layer shapes and append the respective layers
          # to the network
          # so that only the model body remains to be specified.
          # Note that the decorator order matters.
          # inputlayer must be specified before outputdense.
          @inputlayer
          @outputdense(activation='sigmoid')
          def test_inferred_model(inputs, inp, oup, params):
              with inputs.use('ip') as in_:
                  # the with block allows for easy
                  # access of a specific named input.
                  output = Dense(params)(in_)
              return in_, output

          # create a model.
          model = Janggu.create(template=test_inferred_model, modelparams=3,
                                name='test_model',
                                inputs=DATA, outputs=LABELS)
        """

        model = create_model(template, modelparams, inputs, outputs)
        return cls(model.inputs, model.outputs, name)

    def compile(self, *args, **kargs):
        # loss_weights=None, sample_weight_mode=None,
        # weighted_metrics=None, target_tensors=None):
        """Model compilation.

        This method delegates the compilation to
        keras.models.Model.compile. See also https://keras.io/models/model/

        Examples
        --------

        .. code:: python

           model.compile(optimizer='adadelta', loss='binary_crossentropy')

        """
        self.kerasmodel.compile(*args, **kargs)

    def fit(self,  # pylint: disable=too-many-locals
            inputs=None,
            outputs=None,
            batch_size=None,
            epochs=1,
            verbose=1,
            callbacks=None,
            validation_data=None,
            shuffle=True,
            class_weight=None,
            sample_weight=None,
            initial_epoch=0,
            steps_per_epoch=None,
            use_multiprocessing=False,
            workers=1):
        """Model fitting.

        This method is used to fit a given model.
        Most of the parameters are directly delegated to the
        fit_generator of the keras model.

        Parameters
        ----------
        inputs : :code:`Dataset`, list(Dataset) or Sequence (keras.utils.Sequence)
            Input Dataset or Sequence to use for fitting the model.
        outputs : :code:`Dataset`, list(Dataset) or None
            Output Dataset containing the training targets. If a Sequence
            is used for inputs, outputs will have no effect.
        batch_size : int or None
            Batch size. If set to None a batch size of 32 is used.
        epochs : int
            Number of epochs. Default: 1.
        verbose : int
            Verbosity level. See https://keras.io.
        callbacks : List(keras.callbacks.Callback)
            Callbacks to be applied during training. See https://keras.io/callbacks
        validation_data : tuple, Sequence or None
            Validation data can be a tuple (input_dataset, output_dataset),
            or (input_dataset, output_dataset, sample_weights) or
            a keras.utils.Sequence instance or a list of validation chromosomes.
            The latter choice only works when using Cover and Bioseq datasets.
            This allows you to train on a dedicated set of chromosomes
            and to validate the performance on respective heldout chromosomes.
            If None, validation is not applied.
        shuffle : boolean
            Shuffle batches. Default: True.
        class_weight : dict
            Class weights. See https://keras.io.
        sample_weight : np.array or None
            Sample weights. See https://keras.io.
        initial_epoch : int
            Initial epoch at which to start training.
        steps_per_epoch : int, None.
            Number of steps per epoch. If None, this value is determined from
            the dataset size and the batch_size.
        use_multiprocessing : boolean
            Whether to use multiprocessing. See https://keras.io. Default: False.
        workers : int
            Number of workers to use in multiprocessing mode. Default: 1.

        Examples
        --------

        .. code-block:: python

          model.fit(DATA, LABELS)

        """
        if not isinstance(inputs, Sequence):
            inputs = _convert_data(self.kerasmodel, inputs, 'input_layers')
            outputs = _convert_data(self.kerasmodel, outputs, 'output_layers')

        hyper_params = {
            'epochs': epochs,
            'batch_size': batch_size,
            'shuffle': shuffle,
            'class_weight': class_weight,
            'initial_epoch': initial_epoch,
            'steps_per_epoch': steps_per_epoch,
            'use_multiprocessing': use_multiprocessing,
            'workers': workers
        }

        self.logger.info('Fit: %s', self.name)
        if isinstance(inputs, Sequence):
            self.logger.info('using custom Sequence')
        else:
            self.logger.info("Input:")
            self.__dim_logging(inputs)
            self.logger.info("Output:")
            self.__dim_logging(outputs)
        self.timer = time.time()
        history = None
        self.logger.info("Hyper-parameters:")
        for par_ in hyper_params:
            self.logger.info('%s: %s', par_, str(hyper_params[par_]))

        callbacks = [] if callbacks is None else callbacks

        callbacks.append(LambdaCallback(on_epoch_end=lambda epoch, logs: self.logger.info(
            "epoch %s: %s",
            epoch + 1,
            ' '.join(["{}=".format(k) +
                      ('{:.4f}' if abs(logs[k]) > 1e-3
                       else '{:.4e}').format(logs[k]) for k in logs]))))

        if not os.path.exists(os.path.join(self.outputdir, 'evaluation')):
            os.mkdir(os.path.join(self.outputdir, 'evaluation'))
        if not os.path.exists(os.path.join(self.outputdir, 'evaluation', self.name)):
            os.mkdir(os.path.join(self.outputdir, 'evaluation', self.name))

        callbacks.append(CSVLogger(os.path.join(self.outputdir,
                                                'evaluation',
                                                self.name,
                                                'training.log')))

        if not batch_size:
            batch_size = 32

        if isinstance(inputs, Sequence):
            # input could be a sequence
            jseq = inputs
        else:
            jseq = JangguSequence(inputs, outputs, sample_weight,
                                  batch_size, shuffle=shuffle)

        if isinstance(validation_data, tuple):
            valinputs = _convert_data(self.kerasmodel, validation_data[0],
                                      'input_layers')
            valoutputs = _convert_data(self.kerasmodel, validation_data[1],
                                       'output_layers')
            sweights = validation_data[2] if len(validation_data) == 3 else None
            valjseq = JangguSequence(valinputs, valoutputs, sweights,
                                     batch_size, shuffle=False)
        elif isinstance(validation_data, Sequence):
            valjseq = validation_data
        elif isinstance(validation_data, list) and isinstance(validation_data[0], str):
            # if the validation data is a list of chromosomes that should
            # be used as validation dataset we end up here.
            # This is only possible, however, if all input and output datasets
            # are Cover or Bioseq datasets.
            if not all(hasattr(datum, 'gindexer')
                       for datum in
                       [jseq.inputs[k] for k in jseq.inputs] +
                       [jseq.outputs[k] for k in jseq.outputs]):
                raise ValueError("Not all datasets are Cover or Bioseq datasets"
                                 " which is required for this option.")

            # then split the original dataset into training and validation set.
            train, val = split_train_test((jseq.inputs, jseq.outputs),
                                          validation_data)

            traininp, trainoup = train
            valinp, valoup = val

            self.logger.info("Split in training and validation set:")
            self.logger.info("Training-Input:")
            self.__dim_logging(traininp)
            self.logger.info("Training-Output:")
            self.__dim_logging(trainoup)
            self.logger.info("Validation-Input:")
            self.__dim_logging(valinp)
            self.logger.info("Validation-Output:")
            self.__dim_logging(valoup)
            jseq = JangguSequence(_convert_data(self.kerasmodel, traininp,
                                                'input_layers'),
                                  _convert_data(self.kerasmodel, trainoup,
                                                'output_layers'),
                                  sample_weights=None,
                                  batch_size=jseq.batch_size,
                                  shuffle=jseq.shuffle)

            valjseq = JangguSequence(_convert_data(self.kerasmodel, valinp,
                                                   'input_layers'),
                                     _convert_data(self.kerasmodel, valoup,
                                                   'output_layers'),
                                     sample_weights=None,
                                     batch_size=jseq.batch_size,
                                     shuffle=False)

        else:
            valjseq = None

        try:
            history = self.kerasmodel.fit_generator(
                jseq,
                epochs=epochs,
                validation_data=valjseq,
                class_weight=class_weight,
                initial_epoch=initial_epoch,
                shuffle=shuffle,
                use_multiprocessing=use_multiprocessing,
                max_queue_size=50,
                workers=workers,
                verbose=verbose,
                callbacks=callbacks)
        except Exception:  # pragma: no cover
            # ignore the linter warning, the exception
            # is reraised anyways.
            self.logger.exception('fit_generator failed:')
            raise

        self.logger.info('#' * 40)
        for k in history.history:
            self.logger.info('%s: %f', k, history.history[k][-1])
        self.logger.info('#' * 40)

        self.save()
        self._save_hyper(hyper_params)

        self.logger.info("Training finished after %1.3f s",
                         time.time() - self.timer)
        return history
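
    # Illustrative sketch (assumes DNA and LABELS are Cover/Bioseq datasets
    # with a gindexer; chromosome names are hypothetical; not part of the
    # original module): passing a list of chromosome names as validation_data
    # trains on the remaining chromosomes and validates on the listed ones.
    #
    #   model.fit(DNA, LABELS, epochs=30,
    #             validation_data=['chr2', 'chr3'])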

    def predict(self, inputs,  # pylint: disable=too-many-locals
                batch_size=None,
                verbose=0,
                steps=None,
                layername=None,
                datatags=None,
                callbacks=None,
                use_multiprocessing=False,
                workers=1):
        """Performs a prediction.

        This method predicts the targets.
        All of the parameters are directly delegated to the
        predict_generator of the keras model.
        See https://keras.io/models/model/#methods.

        Parameters
        ----------
        inputs : :code:`Dataset`, list(Dataset) or Sequence (keras.utils.Sequence)
            Input Dataset or Sequence to use for the prediction.
        batch_size : int or None
            Batch size. If set to None a batch size of 32 is used.
        verbose : int
            Verbosity level. See https://keras.io.
        steps : int, None.
            Number of predict steps. If None, this value is determined from
            the dataset size and the batch_size.
        layername : str or None
            Layername for which the prediction should be performed. If None,
            the output layer will be used automatically.
        datatags : list(str) or None
            Tags to annotate the evaluation results. Default: None.
        callbacks : List(:code:`Scorer`)
            Scorer instances to be applied on the predictions.
        use_multiprocessing : boolean
            Whether to use multiprocessing for the prediction. Default: False.
        workers : int
            Number of workers to use. Default: 1.

        Examples
        --------

        .. code-block:: python

          model.predict(DATA)

        """
        if not isinstance(inputs, Sequence):
            inputs = _convert_data(self.kerasmodel, inputs, 'input_layers')

        self.logger.info('Predict: %s', self.name)
        if isinstance(inputs, Sequence):
            self.logger.info('using custom Sequence')
        else:
            self.logger.info("Input:")
            self.__dim_logging(inputs)
        self.timer = time.time()

        # if a desired layername is specified, the features
        # will be predicted.
        if layername:
            model = Janggu(self.kerasmodel.input,
                           self.kerasmodel.get_layer(layername).output,
                           name=self.name)
        else:
            model = self

        if not batch_size:
            batch_size = 32

        if isinstance(inputs, Sequence):
            jseq = inputs
        else:
            jseq = JangguSequence(inputs, None, None, batch_size=batch_size)

        try:
            preds = model.kerasmodel.predict_generator(
                jseq,
                steps=steps,
                use_multiprocessing=use_multiprocessing,
                workers=workers,
                verbose=verbose)
        except Exception:  # pragma: no cover
            self.logger.exception('predict_generator failed:')
            raise

        prd = _convert_data(model.kerasmodel, preds, 'output_layers')
        if layername is not None:
            # no need to set an extra datatag.
            # if layername is present, it will be added to the tags
            if datatags is None:
                datatags = [layername]
            else:
                datatags.append(layername)
        for callback in callbacks or []:
            callback.score(model, prd, datatags=datatags)
        return preds
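
    # Illustrative sketch (the layer name 'motif_conv' is hypothetical; not
    # part of the original module): with ``layername`` the prediction is taken
    # from an intermediate layer rather than the output layer, e.g. to inspect
    # learned features.
    #
    #   features = model.predict(DNA, layername='motif_conv')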

    def evaluate(self, inputs=None, outputs=None,  # pylint: disable=too-many-locals
                 batch_size=None,
                 sample_weight=None,
                 steps=None,
                 datatags=None,
                 callbacks=None,
                 use_multiprocessing=False,
                 workers=1):
        """Evaluates the performance.

        This method is used to evaluate a given model.
        All of the parameters are directly delegated to the
        evaluate_generator of the keras model.
        See https://keras.io/models/model/#methods.

        Parameters
        ----------
        inputs : :code:`Dataset`, list(Dataset) or Sequence (keras.utils.Sequence)
            Input Dataset or Sequence to use for evaluating the model.
        outputs : :code:`Dataset`, list(Dataset) or None
            Output Dataset containing the training targets. If a Sequence
            is used for inputs, outputs will have no effect.
        batch_size : int or None
            Batch size. If set to None a batch size of 32 is used.
        sample_weight : np.array or None
            Sample weights. See https://keras.io.
        steps : int, None.
            Number of predict steps. If None, this value is determined from
            the dataset size and the batch_size.
        datatags : list(str) or None
            Tags to annotate the evaluation results. Default: None.
        callbacks : List(:code:`Scorer` or str)
            Scorer instances to be applied on the predictions. Furthermore,
            commonly used scoring metrics can be added by name, including
            'roc', 'auroc', 'prc', 'auprc' for evaluating binary classification
            applications and 'cor' (for Pearson's correlation), 'mae', 'mse'
            and 'var_explained' for regression applications.
        use_multiprocessing : boolean
            Whether to use multiprocessing for the prediction. Default: False.
        workers : int
            Number of workers to use. Default: 1.

        Examples
        --------

        .. code-block:: python

          model.evaluate(DATA, LABELS)

          # binary classification evaluation with callbacks
          model.evaluate(DATA, LABELS, callbacks=['auprc', 'auroc'])

        """
        self.logger.info('Evaluate: %s', self.name)
        if isinstance(inputs, Sequence):
            inputs_ = _convert_data(self.kerasmodel, inputs.inputs, 'input_layers')
            outputs_ = _convert_data(self.kerasmodel, inputs.outputs, 'output_layers')
            self.logger.info('Using custom Sequence.')
            self.logger.info("Input:")
            self.__dim_logging(inputs_)
            self.logger.info("Output:")
            self.__dim_logging(outputs_)
        else:
            inputs_ = _convert_data(self.kerasmodel, inputs, 'input_layers')
            outputs_ = _convert_data(self.kerasmodel, outputs, 'output_layers')
            self.logger.info("Input:")
            self.__dim_logging(inputs_)
            self.logger.info("Output:")
            self.__dim_logging(outputs_)
        self.timer = time.time()

        if not batch_size:
            batch_size = 32

        if isinstance(inputs, Sequence):
            jseq = inputs
        else:
            jseq = JangguSequence(inputs_, outputs_, sample_weight,
                                  batch_size=batch_size)

        try:
            values = self.kerasmodel.evaluate_generator(
                jseq,
                steps=steps,
                use_multiprocessing=use_multiprocessing,
                workers=workers)
        except Exception:  # pragma: no cover
            self.logger.exception('evaluate_generator failed:')
            raise

        self.logger.info('#' * 40)

        values = _to_list(values)

        for i, value in enumerate(values):
            self.logger.info('%s: %f', self.kerasmodel.metrics_names[i], value)
        self.logger.info('#' * 40)

        self.logger.info("Evaluation finished in %1.3f s",
                         time.time() - self.timer)

        preds = self.kerasmodel.predict_generator(
            jseq, steps=steps,
            use_multiprocessing=use_multiprocessing,
            workers=workers)
        preds = _convert_data(self.kerasmodel, preds, 'output_layers')

        for callback in callbacks or []:
            callback = get_scorer(callback)
            callback.score(self, preds, outputs=outputs_, datatags=datatags)
        return values

    def predict_variant_effect(self,  # pylint: disable=too-many-locals
                               bioseq,
                               variants,
                               conditions,
                               output_folder,
                               condition_filter=None,
                               batch_size=None,
                               annotation=None,
                               ignore_reference_match=False,
                               order=1):
        """Predicts variant effects.

        See help(predict_variant_effect) for a description of the parameters.

        Examples
        --------

        .. code-block:: python

          # Evaluate all variants and all conditions (outputs)
          model.predict_variant_effect(DATA, VARIANTS, CONDITIONS,
                                       'vcfoutput')

          # Evaluate all variants and a subset of conditions
          # (Ctcf output labels)
          model.predict_variant_effect(DATA, VARIANTS, CONDITIONS,
                                       'vcfoutput_subset',
                                       condition_filter='Ctcf')

        """

        return predict_variant_effect(self.kerasmodel,
                                      bioseq, variants, conditions,
                                      output_folder,
                                      condition_filter=condition_filter,
                                      batch_size=batch_size,
                                      annotation=annotation,
                                      ignore_reference_match=ignore_reference_match,
                                      order=order)

    def __dim_logging(self, data):
        if isinstance(data, dict):
            for key in data:
                self.logger.info("\t%s: %s", key, data[key].shape)

        if hasattr(data, "shape"):
            data = [data]

        if isinstance(data, list):
            for datum in data:
                self.logger.info("\t%s", datum.shape)

    def get_config(self):
        """Get model config."""
        return self.kerasmodel.get_config()

    @staticmethod
    def _storage_path(name, outputdir):
        """Returns the path to the model storage file."""
        if not os.path.exists(os.path.join(outputdir, "models")):
            os.mkdir(os.path.join(outputdir, "models"))
        filename = os.path.join(outputdir, 'models', '{}.h5'.format(name))
        return filename

    def _save_hyper(self, hyper_params, filename=None):
        """This method attaches the hyper parameters that were used
        to train the model to the hdf5 file.

        This method is supposed to be used after model training
        from within the fit method.
        It attaches the hyper parameters, e.g. epochs, batch_size, etc.
        to the hdf5 file that contains the model weights.
        The hyper parameters are added as attributes to the
        group 'model_weights'.

        Parameters
        ----------
        hyper_params : dict
            Dictionary that contains the hyper parameters.
        filename : str
            Filename of the hdf5 file. This file must already exist.
            So save has to be called before.
        """
        if not filename:  # pragma: no cover
            filename = self._storage_path(self.name, self.outputdir)

        content = h5py.File(filename, 'r+')
        weights = content['model_weights']
        for key in hyper_params:
            if hyper_params[key]:
                weights.attrs[key] = str(hyper_params[key])
        content.close()
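
# Illustrative sketch (the file path is hypothetical; not part of the original
# module): the hyper-parameters stored by _save_hyper can be inspected directly
# with h5py, since they are attached as attributes of the 'model_weights' group.
#
#   import h5py
#   with h5py.File('<janggu_results>/models/test_model.h5', 'r') as f:
#       print(dict(f['model_weights'].attrs))   # e.g. {'epochs': '30', ...}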

def model_from_json(json_string, custom_objects=None):
    """Wrapping keras.models.model_from_json"""
    custom_objects = {} if custom_objects is None else custom_objects
    custom_objects.update(JANGGU_LAYERS)

    return keras.models.model_from_json(json_string, custom_objects)


def model_from_yaml(yaml_string, custom_objects=None):
    """Wrapping keras.models.model_from_yaml"""
    custom_objects = {} if custom_objects is None else custom_objects
    custom_objects.update(JANGGU_LAYERS)

    return keras.models.model_from_yaml(yaml_string, custom_objects)


def create_model(template, modelparams=None, inputs=None,
                 outputs=None, name=None):
    """Constructor method.

    This method instantiates a keras model with
    model template and parameters. It also allows to
    automatically infer and extend the correct input and
    output layers for the network.

    Parameters
    ----------
    template : function
        Python function that defines a model template of a neural network.
        The function signature must adhere to the signature
        :code:`template(inputs, inputp, outputp, modelparams)`
        and is expected to return
        :code:`(input_tensor, output_tensor)` of the neural network.
    modelparams : list or tuple or None
        Additional model parameters that are passed along to template
        upon creation of the neural network. For instance,
        this could contain number of neurons at each layer.
        Default: None.
    inputs : Dataset, list(Dataset) or None
        Input datasets from which the input layer shapes should be derived.
        Use this option together with the :code:`inputlayer` decorator
        (see Example below).
    outputs : Dataset, list(Dataset) or None
        Output datasets from which the output layer shapes should be derived.
        Use this option together with :code:`outputdense` or :code:`outputconv`
        decorators (see Example below).
    name : str or None
        Model name. If None, a unique model name is generated
        based on the model configuration and network architecture.

    Examples
    --------

    Specify a model using a model template and parameters:

    .. code-block:: python

      def test_manual_model(inputs, inp, oup, params):
          in_ = Input(shape=(10,), name='ip')
          layer = Dense(params)(in_)
          output = Dense(1, activation='sigmoid', name='out')(layer)
          return in_, output

      # Defines the same model by invoking the definition function
      # and the create constructor.
      model = create_model(template=test_manual_model, modelparams=3)
      model.summary()

    Specify a model using automatic input and output layer determination.
    That is, only the model body needs to be specified:

    .. code-block:: python

      import numpy as np
      from janggu import inputlayer, outputdense
      from janggu.data import Array

      # Some random data which you would like to use as input for the
      # model.
      DATA = Array('ip', np.random.random((1000, 10)))
      LABELS = Array('out', np.random.randint(2, size=(1000, 1)))

      # The decorators inputlayer and outputdense
      # extract the layer shapes and append the respective layers
      # to the network
      # so that only the model body remains to be specified.
      # Note that the decorator order matters.
      # inputlayer must be specified before outputdense.
      @inputlayer
      @outputdense(activation='sigmoid')
      def test_inferred_model(inputs, inp, oup, params):
          with inputs.use('ip') as in_:
              # the with block allows for easy
              # access of a specific named input.
              output = Dense(params)(in_)
          return in_, output

      # create a model.
      model = create_model(template=test_inferred_model, modelparams=3,
                           name='test_model',
                           inputs=DATA, outputs=LABELS)
    """

    inputs_ = _data_props(inputs) if inputs else None
    outputs_ = _data_props(outputs) if outputs else None

    input_tensors, output_tensors = template(None, inputs_,
                                             outputs_,
                                             modelparams)

    model = Model(inputs=input_tensors, outputs=output_tensors, name=name)

    return model

def input_attribution(model, inputs,  # pylint: disable=too-many-locals
                      chrom=None, start=None, end=None,
                      idx=None):
    """Evaluates the integrated gradients method on the input coverage tracks.

    This allows to attribute feature importance values to the prediction
    scores. Integrated gradients have been introduced in
    Sundararajan, Taly and Yan, Axiomatic Attribution for Deep Networks.
    PMLR 70, 2017.

    The method can either be called by specifying the region of interest
    directly via chrom, start and end. Alternatively, it is possible to
    specify the region index, for example, the n^th region of the dataset.

    Parameters
    ----------
    model : Janggu
        Janggu model wrapper
    inputs : :code:`Dataset`, list(Dataset)
        Input Dataset.
    chrom : str or None
        Chromosome name.
    start : int or None
        Region start.
    end : int or None
        Region end.
    idx : int or None
        The index of the i^th sequence to use for the attribution.
        If idx is set, chrom, start and end are ignored.

    Examples
    --------

    .. code-block:: python

      # Suppose DATA is a Bioseq or Cover object
      # To query the input feature importance of a specific genomic region
      # use
      input_attribution(model, DATA, chrom='chr1', start=start, end=end)

      # To query the input feature importance using a specific index
      input_attribution(model, DATA, idx=0)

    """

    inputs = _to_list(inputs)

    if idx is not None:
        iv = inputs[0].gindexer[idx]
        chrom, start, end = iv.chrom, iv.start, iv.end

    output_chrom, output_start, output_end = chrom, start, end

    # store original gindexer
    gindexers_save = [ip.gindexer for ip in inputs]

    # create new indexers ranging only over the selected region
    # if chrom, start, end was supplied retrieve the respective indices
    index_list = [gi.idx_by_region(include=output_chrom,
                                   start=output_start,
                                   end=output_end) for gi in gindexers_save]

    # first construct the union of indices
    index_set = set()
    for idx_list_el in index_list:
        index_set = index_set | set(idx_list_el)

    # only keep the indices that remain across all inputs.
    # indices that are only present in some of the inputs are discarded.
    for idx_list_el in index_list:
        index_set = index_set & set(idx_list_el)

    idxs = list(index_set)
    idxs.sort()

    subgindexers = [copy.copy(gi) for gi in gindexers_save]
    for subgi in subgindexers:
        subgi.chrs = [subgi.chrs[i] for i in idxs]
        subgi.starts = [subgi.starts[i] for i in idxs]
        subgi.ends = [subgi.ends[i] for i in idxs]
        subgi.strand = [subgi.strand[i] for i in idxs]

    # assign it to the input datasets temporarily
    for inp, _ in enumerate(inputs):
        inputs[inp].gindexer = subgindexers[inp]

    try:
        # allocate arrays
        output = [np.zeros((1, output_end - output_start,
                            inp.shape[-2], inp.shape[-1])) for inp in inputs]
        resols = [inp.garray.resolution for inp in inputs]

        for igi in range(len(inputs[0])):

            # current influence
            influence = [np.zeros((1,) + inp.shape[1:]) for inp in inputs]

            # get influence for current window with integrated gradient
            x_in = [inp[igi] for inp in inputs]
            for step in range(1, 51):
                grad = model._influence([x * step / 50 for x in x_in])
                for iinp, inp in enumerate(x_in):
                    for idim, _ in np.ndenumerate(inp):
                        influence[iinp][idim] += (x_in[iinp][idim] / 50) * grad[iinp][idim]

            # scale length to nucleotide resolution
            influence = [np.repeat(influence[i], resols[i],
                                   axis=1) for i, _ in enumerate(inputs)]

            for iout in range(len(output)):
                if influence[iout].shape[1] < inputs[iout].gindexer[igi].length:
                    order = inputs[iout].gindexer[igi].length - influence[iout].shape[1]
                else:
                    order = 0

                # incrementally add the influence results into the output
                # array for all subwindows in the genomic indexer
                if output_start < inputs[iout].gindexer[igi].start:
                    ostart = inputs[iout].gindexer[igi].start - output_start
                    lstart = 0
                else:
                    ostart = 0
                    lstart = output_start - inputs[iout].gindexer[igi].start

                if output_end > inputs[iout].gindexer[igi].end:
                    oend = inputs[iout].gindexer[igi].end - output_start
                    lend = inputs[iout].gindexer[igi].end - inputs[iout].gindexer[igi].start
                else:
                    oend = output_end - output_start
                    lend = output_end - inputs[iout].gindexer[igi].start

                # for mutually overlapping positions, we employ a heuristic
                # that keeps the maximum influence over the overlapping
                # intervals spanning the position
                m = np.zeros((2,) + (1, inputs[iout].gindexer[igi].length, )
                             + influence[iout].shape[2:],
                             dtype=influence[iout].dtype)

                m[0][:, lstart:lend, :, :] = output[iout][:, (ostart):(oend), :, :]
                m[1][:, lstart:(lend - order), :, :] = \
                    influence[iout][:, lstart:(lend - order), :, :]
                m = np.abs(m).max(axis=0)
                m = m[:, lstart:lend, :, :]
                output[iout][:, ostart:oend, :, :] = m

        for iout in range(len(output)):
            # finally wrap the output up as coverage track
            output[iout] = Cover.create_from_array(
                'attr_' + inputs[iout].name,
                output[iout],
                GenomicIndexer.create_from_region(
                    chrom, start, end, '.',
                    binsize=end - start,
                    stepsize=1, flank=0),
                conditions=inputs[iout].conditions)

        for inp, _ in enumerate(inputs):
            # restore the initial genomic indexers
            inputs[inp].gindexer = gindexers_save[inp]

    except Exception:  # pragma: no cover
        model.logger.exception('_influence failed:')
        raise

    return output
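
# The attribution loop above approximates integrated gradients with a 50-step
# Riemann sum against a zero baseline. A compact sketch of the same computation
# for a single input array x and gradient function grad_fn (names are
# illustrative, not part of the original module):
#
#   steps = 50
#   ig = sum(grad_fn(x * s / steps) for s in range(1, steps + 1)) * (x / steps)
#   # approximates (x - 0) * integral over alpha in [0, 1] of grad(x * alpha)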

def predict_variant_effect(model,  # pylint: disable=too-many-locals
                           bioseq, variants, conditions,
                           output_folder,
                           condition_filter=None,
                           batch_size=None,
                           annotation=None,
                           ignore_reference_match=False,
                           order=1):
    """Predicts variant effects.

    Parameters
    ----------
    model : :code:`keras.Model`
        A keras model.
    bioseq : :code:`Bioseq` or str
        Bioseq container containing a reference genome or
        the reference genome file in fasta format.
        If a Bioseq object is used, it has to be loaded with
        store_whole_genome=True. This may be faster for large amounts
        of variants. Consider using cache=True as well, such that the genome
        only needs to be loaded once, and reloaded from the cache if needed.
        Alternatively, a string pointing to the reference genome sequence in
        FASTA format can be supplied from which the sequence context
        is extracted.
    variants : str
        File name of a VCF file containing the variants under study.
    conditions : list(str)
        Condition labels for each output prediction.
    output_folder : str
        The method produces an hdf5 and a bed file as output.
        The bed-file contains the variant positions while the hdf5 file
        contains the reference and alternative variant scores for
        each output feature.
    condition_filter : str or None
        Regular expression filter on which conditions should be evaluated.
        If None, all output conditions will be returned.
    batch_size : int, None.
        Batch size. If None, a batch_size of 128 is used.
    annotation : BedTool object or None
        BedTool holding feature annotation e.g. gene annotation.
        The annotation may be used to perform strand-specific variant effect
        predictions. Each variant is intersected with the annotation
        in order to derive the correct strandedness. If variants do not
        overlap with an annotation feature or for missing annotation, the
        forward strand is used.
    ignore_reference_match : boolean
        Whether to ignore mismatches between the reference sequence and the
        reference base in the VCF file. If False, the variant will be skipped
        over and only matching positions are processed.
        Otherwise all variants will be processed. Default: False.
    order : int
        Order of the DNA sequence representation. Only relevant if the option
        bioseq is pointing to a reference genome file. If bioseq is already
        a Bioseq object, this option will be ignored. Default: 1.

    Returns
    -------
    tuple
        Tuple containing the output filenames: an hdf5 and a bed file.

    Examples
    --------

    .. code-block:: python

      # Evaluate all variants and all conditions (outputs)
      predict_variant_effect(kerasmodel, DATA, VARIANTS, CONDITIONS,
                             'vcfoutput')

      # Evaluate all variants and a subset of conditions (Ctcf output labels)
      predict_variant_effect(kerasmodel, DATA, VARIANTS, CONDITIONS,
                             'vcfoutput_subset',
                             condition_filter='Ctcf')

    """
    if batch_size is None:
        batch_size = 128

    def _get_input_layer_shape(layer):
        if isinstance(layer.input_shape, list):
            return layer.input_shape[0]
        return layer.input_shape

    if isinstance(bioseq, Bioseq):
        order = bioseq.garray.order

    if len(model.inputs) > 1:
        raise ValueError('Only one input layer supported for predict_variant_effect.')

    binsize = _get_input_layer_shape(model.layers[0])[1] + order - 1

    # the network might output arbitrarily many outputs.
    # With the filter option it is possible to
    # restrict the analysis to certain features.
    if condition_filter is None:
        conditions = [(idx, cond) for idx, cond in enumerate(conditions)]
    else:
        conditions = [(idx, cond) for idx, cond in enumerate(conditions)
                      if hasattr(re.search(condition_filter, cond), 'start')]

    icond = [el[0] for el in conditions]

    local_model = model

    if len(conditions) != model.output_shape[-1]:
        raise ValueError("The number of conditions does not match with the "
                         "number of network output units. [{}!={}]".format(
                             len(conditions), model.output_shape[-1]))

    # get number of variants
    variantsstream = VariantStreamer(bioseq, variants, binsize, batch_size,
                                     annotation=annotation,
                                     ignore_reference_match=ignore_reference_match,
                                     order=order)
    nvariants = variantsstream.get_variant_count()

    h5file = h5py.File(os.path.join(output_folder, 'scores.hdf5'), 'w')

    h5file.create_dataset('labels', (len(conditions),),
                          dtype=h5py.special_dtype(vlen=str),
                          data=np.array([c[-1] for c in conditions],
                                        dtype=h5py.special_dtype(vlen=str)))

    refscore = h5file.create_dataset('refscore',
                                     (nvariants, len(conditions)),
                                     dtype='float16')
    altscore = h5file.create_dataset('altscore',
                                     (nvariants, len(conditions)),
                                     dtype='float16')
    diffscore = h5file.create_dataset('diffscore',
                                      (nvariants, len(conditions)),
                                      dtype='float16')
    logodds = h5file.create_dataset('logoddsscore',
                                    (nvariants, len(conditions)),
                                    dtype='float16')

    bar = Bar('Parsing {}: '.format(variants),
              max=int(np.ceil(nvariants / float(batch_size))))

    chromlist = []
    poslist = []
    vnamelist = []
    reflist = []
    altlist = []

    ibatch = 0
    # read variants file
    for names, chroms, poss, ra, aa, reference, alternative in variantsstream.flow():
        bar.next()
        if reference.shape[0] <= 0:
            # reached the end of the file
            break
        ref_score = local_model.predict_on_batch(reference)
        alt_score = local_model.predict_on_batch(alternative)

        chromlist += chroms
        poslist += poss
        vnamelist += names
        reflist += ra
        altlist += aa

        refscore[ibatch:(ibatch + ref_score.shape[0])] = \
            ref_score[:, icond].astype('float16')
        altscore[ibatch:(ibatch + ref_score.shape[0])] = \
            alt_score[:, icond].astype('float16')
        diffscore[ibatch:(ibatch + ref_score.shape[0])] = \
            alt_score[:, icond].astype('float16') - ref_score[:, icond].astype('float16')
        logodds[ibatch:(ibatch + ref_score.shape[0])] = \
            np.log(alt_score[:, icond].astype('float16') /
                   ref_score[:, icond].astype('float16') + 1e-7)

        ibatch += ref_score.shape[0]

    # form large string
    BedTool('\n'.join(['{} {} {} {}_{}>{}'.format(chrom, start, start + 1,
                                                  name, ref, alt)
                       for chrom, start, name, ref, alt in
                       zip(chromlist, poslist, vnamelist, reflist, altlist)]),
            from_string=True).saveas(os.path.join(output_folder, 'snps.bed.gz'))

    bar.finish()

    h5file.close()
    return (os.path.join(output_folder, 'scores.hdf5'),
            os.path.join(output_folder, 'snps.bed.gz'))
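
# Illustrative sketch (paths are hypothetical; not part of the original
# module): reading back the variant effect scores produced above.
#
#   import h5py
#   with h5py.File('vcfoutput/scores.hdf5', 'r') as f:
#       labels = f['labels'][:]       # condition names
#       diff = f['diffscore'][:]      # alt - ref prediction scores per variant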