"""Janggu - deep learning for genomics"""
import copy
import hashlib
import logging
import os
import re
import time
import h5py
import keras
import keras.backend as K
import tensorflow as tf
import numpy as np
from keras.callbacks import CSVLogger
from keras.callbacks import LambdaCallback
from keras.models import Model
from keras.models import load_model
from keras.utils import Sequence
from keras.utils import plot_model
from progress.bar import Bar
from pybedtools import BedTool
from janggu.data import split_train_test
from janggu.data.coverage import Cover
from janggu.data.dna import Bioseq
from janggu.data.data import JangguSequence
from janggu.data.data import _data_props
from janggu.data.dna import VariantStreamer
from janggu.data.genomic_indexer import GenomicIndexer
from janggu.evaluation import get_scorer
from janggu.layers import Complement
from janggu.layers import DnaConv2D
from janggu.layers import LocalAveragePooling2D
from janggu.layers import Reverse
from janggu.utils import _get_output_root_directory
from janggu.utils import _to_list
JANGGU_LAYERS = {'Reverse': Reverse,
'Complement': Complement,
'LocalAveragePooling2D': LocalAveragePooling2D,
'DnaConv2D': DnaConv2D}
def _convert_data(kerasmodel, data, layer):
"""Converts different data formats to
{name:values} dict.
keras support different data formats, e.g. np.array,
list(np.array) or dict(key: np.array).
This function normalizes all dataset to the dictionary
style usage internally. This simplifies the compatibility
checks at various places.
Parameters
----------
kerasmodel : keras.Model
A keras Model object.
data : Dataset, np.array, list or dict
Dataset.
layer : str
Indication as to whether data is used as input or output.
:code:`layer='output_layers'` or 'input_layers'.
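    Examples
    --------
    A minimal sketch of the normalization, assuming a compiled keras
    model whose only input layer is named 'ip':
    .. code-block:: python
        import numpy as np
        # a bare numpy array is keyed by the model's input layer name
        c_data = _convert_data(model, np.random.random((100, 10)),
                               'input_layers')
        # c_data == {'ip': array of shape (100, 10)}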
"""
# If we deal with Dataset, we convert it to a Dictionary
# which is directly interpretable by keras
if hasattr(data, "name") and hasattr(data, "shape"):
# if it is a janggu.data.Dataset we get here
c_data = {data.name: data}
elif not hasattr(data, "name") and hasattr(data, "shape"):
# if data is a numpy array we get here
c_data = {kerasmodel.get_config()[layer][0][0]: data}
elif isinstance(data, list):
if hasattr(data[0], "name"):
# if data is a list(Dataset) we get here
c_data = {datum.name: datum for datum in data}
else:
# if data is a list(np.array) we get here
c_data = {name[0]: datum for name, datum in
zip(kerasmodel.get_config()[layer],
data)}
elif isinstance(data, dict):
# if data is a dict, it can just be passed through
c_data = data
else:
raise ValueError('Unknown data type: {}'.format(type(data)))
return c_data
class Janggu(object):
"""Janggu class
    The class :class:`Janggu` maintains a :class:`keras.models.Model`
    object, that is, an instance of a neural network.
    To the outside, Janggu behaves similarly to
    :class:`keras.models.Model`, which allows you to create,
    fit, and evaluate the model.
Parameters
-----------
inputs : Input or list(Input)
Input layer or list of Inputs as defined by keras.
See https://keras.io/.
outputs : Layer or list(Layer)
Output layer or list of outputs. See https://keras.io/.
name : str
Name of the model.
Examples
--------
Define a Janggu object similar to keras.models.Model
using Input and Output layers.
.. code-block:: python
from keras.layers import Input
from keras.layers import Dense
from janggu import Janggu
# Define neural network layers using keras
in_ = Input(shape=(10,), name='ip')
layer = Dense(3)(in_)
output = Dense(1, activation='sigmoid', name='out')(layer)
# Instantiate a model.
model = Janggu(inputs=in_, outputs=output, name='test_model')
model.summary()
"""
timer = None
_name = None
def __init__(self, inputs, outputs, name=None):
self.kerasmodel = Model(inputs, outputs, name='janggu')
if not name:
hasher = hashlib.md5()
hasher.update(self.kerasmodel.to_json().encode('utf-8'))
name = hasher.hexdigest()
print("Generated model-id: '{}'".format(name))
        if isinstance(outputs, (list, tuple)) and len(outputs) > 1:
total_output = K.sum([K.sum(o, axis=-1) for o in self.kerasmodel.output], axis=-1)
else:
total_output = K.sum(self.kerasmodel.output, axis=-1)
kinp = self.kerasmodel.input
kinp = _to_list(kinp)
if tf.__version__[0] == '1':
grad = K.gradients(total_output, self.kerasmodel.input)
self._influence = K.function(kinp, grad)
else:
def _influence(inputs):
tfinput = [tf.cast(inp, tf.float32) for inp in inputs]
with tf.GradientTape() as tape:
tape.watch(tfinput)
pred = self.kerasmodel(tfinput)
                if isinstance(outputs, (list, tuple)) and len(outputs) > 1:
total_output = K.sum([K.sum(o, axis=-1) for o in pred], axis=-1)
else:
total_output = K.sum(pred, axis=-1)
grad = tape.gradient(total_output, tfinput)
return [g.numpy() for g in grad]
self._influence = _influence
self.name = name
self.outputdir = _get_output_root_directory()
if not os.path.exists(self.outputdir): # pragma: no cover
# this is excluded from unit tests, because the testing
# framework always provides a directory
os.makedirs(self.outputdir)
if not os.path.exists(os.path.join(self.outputdir, 'logs')):
os.makedirs(os.path.join(self.outputdir, 'logs'))
logfile = os.path.join(self.outputdir, 'logs', 'janggu.log')
self.logger = logging.getLogger(self.name)
logging.basicConfig(filename=logfile,
level=logging.DEBUG,
format='%(asctime)s:%(name)s:%(message)s',
datefmt='%m/%d/%Y %H:%M:%S')
self.logger.info("Model Summary:")
self.kerasmodel.summary(print_fn=self.logger.info)
    @classmethod
def create_by_name(cls, name, custom_objects=None):
"""Creates a Janggu object by name.
This option is used to load a pre-trained model.
Parameters
----------
name : str
Name of the model.
custom_objects : dict or None
This allows loading of custom layers using load_model.
All janggu specific layers are automatically included as custom_objects.
Default: None
Examples
--------
.. code-block:: python
in_ = Input(shape=(10,), name='ip')
layer = Dense(3)(in_)
output = Dense(1, activation='sigmoid', name='out')(layer)
# Instantiate a model.
model = Janggu(inputs=in_, outputs=output, name='test_model')
# saves the model to <janggu_results>/models
model.save()
# remove the original model
del model
# reload the model
model = Janggu.create_by_name('test_model')
"""
path = cls._storage_path(name, _get_output_root_directory())
custom_objects = {} if not custom_objects else custom_objects
custom_objects.update(JANGGU_LAYERS)
model = load_model(path, custom_objects=custom_objects)
return cls(model.inputs, model.outputs, name)
@property
def name(self):
"""Model name"""
return self._name
@name.setter
def name(self, name):
if not isinstance(name, str):
raise Exception("Name must be a string.")
self._name = name
    def save(self, filename=None, overwrite=True, show_shapes=True):
"""Saves the model.
Parameters
----------
        filename : str
            Filename of the stored model. Default: None.
        overwrite : bool
            Overwrite a stored model. Default: True.
        show_shapes : bool
            Whether to render layer shapes in the saved network figure.
            Default: True.
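        Examples
        --------
        A minimal usage sketch; without a filename, the model is stored
        under <janggu_results>/models/<modelname>.h5:
        .. code-block:: python
            model.save()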
"""
if not filename: # pragma: no cover
filename = self._storage_path(self.name, self.outputdir)
plotname = os.path.splitext(filename)[0] + '.png'
try:
plot_model(self.kerasmodel, to_file=plotname, show_shapes=show_shapes)
except Exception: # pragma: no cover pylint: disable=broad-except
# if graphviz is not installed on the system.
self.logger.exception('plot_model failed, continue nevertheless: ')
self.logger.info("Save model %s", filename)
self.kerasmodel.save(filename, overwrite)
    def summary(self):
"""Prints the model definition."""
self.kerasmodel.summary()
    @classmethod
def create(cls, template, modelparams=None, inputs=None,
outputs=None, name=None):
"""Janggu constructor method.
        This method instantiates a Janggu model from a
        model template and parameters. It can also
        automatically infer and append the correct
        input and output layers to the network.
Parameters
-----------
template : function
Python function that defines a model template of a neural network.
The function signature must adhere to the signature
            :code:`template(inputs, inputp, outputp, modelparams)`
and is expected to return
:code:`(input_tensor, output_tensor)` of the neural network.
modelparams : list or tuple or None
Additional model parameters that are passed along to template
upon creation of the neural network. For instance,
this could contain number of neurons at each layer.
Default: None.
inputs : Dataset, list(Dataset) or None
Input datasets from which the input layer shapes should be derived.
Use this option together with the :code:`inputlayer` decorator (see Example below).
outputs : Dataset, list(Dataset) or None
Output datasets from which the output layer shapes should be derived.
            Use this option together with :code:`outputdense` or :code:`outputconv`
decorators (see Example below).
name : str or None
Model name. If None, a unique model name is generated
based on the model configuration and network architecture.
Examples
--------
Specify a model using a model template and parameters:
.. code-block:: python
def test_manual_model(inputs, inp, oup, params):
in_ = Input(shape=(10,), name='ip')
layer = Dense(params)(in_)
                output = Dense(1, activation='sigmoid', name='out')(layer)
return in_, output
# Defines the same model by invoking the definition function
# and the create constructor.
model = Janggu.create(template=test_manual_model, modelparams=3)
model.summary()
Specify a model using automatic input and output layer determination.
That is, only the model body needs to be specified:
.. code-block:: python
import numpy as np
from janggu import Janggu
from janggu import inputlayer, outputdense
from janggu.data import Array
# Some random data which you would like to use as input for the
# model.
DATA = Array('ip', np.random.random((1000, 10)))
LABELS = Array('out', np.random.randint(2, size=(1000, 1)))
# The decorators inputlayer and outputdense
# extract the layer shapes and append the respective layers
# to the network
# so that only the model body remains to be specified.
            # Note that the decorator order matters.
# inputlayer must be specified before outputdense.
@inputlayer
@outputdense(activation='sigmoid')
def test_inferred_model(inputs, inp, oup, params):
with inputs.use('ip') as in_:
# the with block allows for easy
# access of a specific named input.
output = Dense(params)(in_)
return in_, output
# create a model.
model = Janggu.create(template=test_inferred_model, modelparams=3,
name='test_model',
inputs=DATA,
outputs=LABELS)
"""
model = create_model(template, modelparams, inputs, outputs)
return cls(model.inputs, model.outputs, name)
    def compile(self, *args, **kargs):
#loss_weights=None, sample_weight_mode=None,
#weighted_metrics=None, target_tensors=None):
"""Model compilation.
This method delegates the compilation to
keras.models.Model.compile. See also https://keras.io/models/model/
Examples
--------
.. code:: python
model.compile(optimizer='adadelta', loss='binary_crossentropy')
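        Since all arguments are delegated to keras, further keras options,
        e.g. metrics, can be supplied in the same way:
        .. code-block:: python
            model.compile(optimizer='adadelta', loss='binary_crossentropy',
                          metrics=['acc'])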
"""
self.kerasmodel.compile(*args, **kargs)
    def fit(self, # pylint: disable=too-many-locals
inputs=None,
outputs=None,
batch_size=None,
epochs=1,
verbose=1,
callbacks=None,
validation_data=None,
shuffle=True,
class_weight=None,
sample_weight=None,
initial_epoch=0,
steps_per_epoch=None,
use_multiprocessing=False,
workers=1):
"""Model fitting.
This method is used to fit a given model.
        Most of the parameters are directly delegated to
        fit_generator of the keras model.
Parameters
----------
inputs : :code:`Dataset`, list(Dataset) or Sequence (keras.utils.Sequence)
Input Dataset or Sequence to use for fitting the model.
outputs : :code:`Dataset`, list(Dataset) or None
Output Dataset containing the training targets. If a Sequence
is used for inputs, outputs will have no effect.
batch_size : int or None
Batch size. If set to None a batch size of 32 is used.
epochs : int
Number of epochs. Default: 1.
verbose : int
Verbosity level. See https://keras.io.
callbacks : List(keras.callbacks.Callback)
Callbacks to be applied during training. See https://keras.io/callbacks
validation_data : tuple, Sequence or None
Validation data can be a tuple (input_dataset, output_dataset),
or (input_dataset, output_dataset, sample_weights) or
            a keras.utils.Sequence instance or a list of validation chromosomes.
            The latter choice only works when using Cover and Bioseq datasets.
This allows you to train on a dedicated set of chromosomes
and to validate the performance on respective heldout chromosomes.
If None, validation is not applied.
shuffle : boolean
shuffle batches. Default: True.
class_weight : dict
Class weights. See https://keras.io.
sample_weight : np.array or None
Sample weights. See https://keras.io.
initial_epoch : int
Initial epoch at which to start training.
steps_per_epoch : int, None.
Number of steps per epoch. If None, this value is determined from
the dataset size and the batch_size.
use_multiprocessing : boolean
Whether to use multiprocessing. See https://keras.io. Default: False.
workers : int
Number of workers to use in multiprocessing mode. Default: 1.
Examples
--------
.. code-block:: python
model.fit(DATA, LABELS)
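        Validation on held-out chromosomes, assuming that all inputs and
        outputs are Cover or Bioseq datasets (DNA and LABELS are
        hypothetical placeholders here):
        .. code-block:: python
            model.fit(DNA, LABELS, validation_data=['chr1', 'chr2'])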
"""
if not isinstance(inputs, Sequence):
inputs = _convert_data(self.kerasmodel, inputs, 'input_layers')
outputs = _convert_data(self.kerasmodel, outputs, 'output_layers')
hyper_params = {
'epochs': epochs,
'batch_size': batch_size,
'shuffle': shuffle,
'class_weight': class_weight,
'initial_epoch': initial_epoch,
'steps_per_epoch': steps_per_epoch,
'use_multiprocessing': use_multiprocessing,
'workers': workers
}
self.logger.info('Fit: %s', self.name)
if isinstance(inputs, Sequence):
self.logger.info('using custom Sequence')
else:
self.logger.info("Input:")
self.__dim_logging(inputs)
self.logger.info("Output:")
self.__dim_logging(outputs)
self.timer = time.time()
history = None
self.logger.info("Hyper-parameters:")
for par_ in hyper_params:
self.logger.info('%s: %s', par_, str(hyper_params[par_]))
callbacks = [] if callbacks is None else callbacks
callbacks.append(LambdaCallback(on_epoch_end=lambda epoch, logs: self.logger.info(
"epoch %s: %s",
epoch + 1,
' '.join(["{}=".format(k) +
('{:.4f}' if
abs(logs[k]) > 1e-3
else '{:.4e}').format(logs[k]) for k in logs]))))
if not os.path.exists(os.path.join(self.outputdir, 'evaluation')):
os.mkdir(os.path.join(self.outputdir, 'evaluation'))
if not os.path.exists(os.path.join(self.outputdir, 'evaluation', self.name)):
os.mkdir(os.path.join(self.outputdir, 'evaluation', self.name))
callbacks.append(CSVLogger(os.path.join(self.outputdir,
'evaluation',
self.name,
'training.log')))
if not batch_size:
batch_size = 32
if isinstance(inputs, Sequence):
# input could be a sequence
jseq = inputs
else:
jseq = JangguSequence(inputs, outputs, sample_weight, batch_size, shuffle=shuffle)
if isinstance(validation_data, tuple):
valinputs = _convert_data(self.kerasmodel, validation_data[0],
'input_layers')
valoutputs = _convert_data(self.kerasmodel, validation_data[1],
'output_layers')
sweights = validation_data[2] if len(validation_data) == 3 else None
valjseq = JangguSequence(valinputs, valoutputs, sweights, batch_size, shuffle=False)
elif isinstance(validation_data, Sequence):
valjseq = validation_data
elif isinstance(validation_data, list) and isinstance(validation_data[0], str):
# if the validation data is a list of chromosomes that should
# be used as validation dataset we end up here.
# This is only possible, however, if all input and output datasets
# are Cover or Bioseq dataset.
if not all(hasattr(datum, 'gindexer') \
for datum in [jseq.inputs[k] for k in jseq.inputs] +
[jseq.outputs[k] for k in jseq.outputs]):
raise ValueError("Not all dataset are Cover or Bioseq dataset"
" which is required for this options.")
# then split the original dataset into training and validation set.
train, val = split_train_test((jseq.inputs, jseq.outputs), validation_data)
traininp, trainoup = train
valinp, valoup = val
self.logger.info("Split in training and validation set:")
self.logger.info("Training-Input:")
self.__dim_logging(traininp)
self.logger.info("Training-Output:")
self.__dim_logging(trainoup)
self.logger.info("Validation-Input:")
self.__dim_logging(valinp)
self.logger.info("Validation-Output:")
self.__dim_logging(valoup)
jseq = JangguSequence(_convert_data(self.kerasmodel, traininp,
'input_layers'),
_convert_data(self.kerasmodel, trainoup,
'output_layers'),
sample_weights=None, batch_size=jseq.batch_size, shuffle=jseq.shuffle)
valjseq = JangguSequence(_convert_data(self.kerasmodel, valinp,
'input_layers'),
_convert_data(self.kerasmodel, valoup,
'output_layers'),
sample_weights=None, batch_size=jseq.batch_size, shuffle=False)
else:
valjseq = None
try:
history = self.kerasmodel.fit_generator(
jseq,
epochs=epochs,
validation_data=valjseq,
class_weight=class_weight,
initial_epoch=initial_epoch,
shuffle=shuffle,
use_multiprocessing=use_multiprocessing,
max_queue_size=50,
workers=workers,
verbose=verbose,
callbacks=callbacks)
except Exception: # pragma: no cover
# ignore the linter warning, the exception
# is reraised anyways.
self.logger.exception('fit_generator failed:')
raise
self.logger.info('#' * 40)
for k in history.history:
self.logger.info('%s: %f', k, history.history[k][-1])
self.logger.info('#' * 40)
self.save()
self._save_hyper(hyper_params)
self.logger.info("Training finished after %1.3f s",
time.time() - self.timer)
return history
    def predict(self, inputs, # pylint: disable=too-many-locals
batch_size=None,
verbose=0,
steps=None,
layername=None,
datatags=None,
callbacks=None,
use_multiprocessing=False,
workers=1):
"""Performs a prediction.
This method predicts the targets.
        All of the parameters are directly delegated to
        predict_generator of the keras model.
See https://keras.io/models/model/#methods.
Parameters
----------
inputs : :code:`Dataset`, list(Dataset) or Sequence (keras.utils.Sequence)
Input Dataset or Sequence to use for fitting the model.
batch_size : int or None
Batch size. If set to None a batch size of 32 is used.
verbose : int
Verbosity level. See https://keras.io.
steps : int, None.
Number of predict steps. If None, this value is determined from
the dataset size and the batch_size.
layername : str or None
Layername for which the prediction should be performed. If None,
the output layer will be used automatically.
datatags : list(str) or None
Tags to annotate the evaluation results. Default: None.
callbacks : List(:code:`Scorer`)
Scorer instances to be applied on the predictions.
use_multiprocessing : boolean
Whether to use multiprocessing for the prediction. Default: False.
workers : int
Number of workers to use. Default: 1.
Examples
--------
.. code-block:: python
model.predict(DATA)
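        Predicting the activities of an intermediate layer, assuming the
        network contains a layer named 'dense_1':
        .. code-block:: python
            features = model.predict(DATA, layername='dense_1')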
"""
if not isinstance(inputs, Sequence):
inputs = _convert_data(self.kerasmodel, inputs, 'input_layers')
self.logger.info('Predict: %s', self.name)
if isinstance(inputs, Sequence):
self.logger.info('using custom Sequence')
else:
self.logger.info("Input:")
self.__dim_logging(inputs)
self.timer = time.time()
# if a desired layername is specified, the features
# will be predicted.
if layername:
model = Janggu(self.kerasmodel.input,
self.kerasmodel.get_layer(layername).output,
name=self.name)
else:
model = self
if not batch_size:
batch_size = 32
if isinstance(inputs, Sequence):
jseq = inputs
else:
jseq = JangguSequence(inputs, None, None, batch_size=batch_size)
try:
preds = model.kerasmodel.predict_generator(
jseq,
steps=steps,
use_multiprocessing=use_multiprocessing,
workers=workers,
verbose=verbose)
except Exception: # pragma: no cover
self.logger.exception('predict_generator failed:')
raise
prd = _convert_data(model.kerasmodel, preds, 'output_layers')
if layername is not None:
# no need to set an extra datatag.
# if layername is present, it will be added to the tags
if datatags is None:
datatags = [layername]
else:
datatags.append(layername)
for callback in callbacks or []:
callback.score(model, prd, datatags=datatags)
return preds
    def evaluate(self, inputs=None, outputs=None, # pylint: disable=too-many-locals
batch_size=None,
sample_weight=None,
steps=None,
datatags=None,
callbacks=None,
use_multiprocessing=False,
workers=1):
"""Evaluates the performance.
This method is used to evaluate a given model.
        All of the parameters are directly delegated to
        evaluate_generator of the keras model.
See https://keras.io/models/model/#methods.
Parameters
----------
inputs : :code:`Dataset`, list(Dataset) or Sequence (keras.utils.Sequence)
Input Dataset or Sequence to use for evaluating the model.
outputs : :code:`Dataset`, list(Dataset) or None
Output Dataset containing the training targets. If a Sequence
is used for inputs, outputs will have no effect.
batch_size : int or None
Batch size. If set to None a batch size of 32 is used.
sample_weight : np.array or None
Sample weights. See https://keras.io.
steps : int, None.
Number of predict steps. If None, this value is determined from
the dataset size and the batch_size.
datatags : list(str) or None
Tags to annotate the evaluation results. Default: None.
callbacks : List(:code:`Scorer` or str)
Scorer instances to be applied on the predictions. Furthermore,
commonly used scoring metrics can be added by name, including
'roc', 'auroc', 'prc', 'auprc' for evaluating binary classification
applications and 'cor' (for Pearson's correlation), 'mae', 'mse'
and 'var_explained' for regression applications.
use_multiprocessing : boolean
Whether to use multiprocessing for the prediction. Default: False.
workers : int
Number of workers to use. Default: 1.
Examples
--------
.. code-block:: python
model.evaluate(DATA, LABELS)
# binary classification evaluation with callbacks
            model.evaluate(DATA, LABELS, callbacks=['auprc', 'auroc'])
"""
self.logger.info('Evaluate: %s', self.name)
if isinstance(inputs, Sequence):
inputs_ = _convert_data(self.kerasmodel, inputs.inputs, 'input_layers')
outputs_ = _convert_data(self.kerasmodel, inputs.outputs, 'output_layers')
self.logger.info('Using custom Sequence.')
self.logger.info("Input:")
self.__dim_logging(inputs_)
self.logger.info("Output:")
self.__dim_logging(outputs_)
else:
inputs_ = _convert_data(self.kerasmodel, inputs, 'input_layers')
outputs_ = _convert_data(self.kerasmodel, outputs, 'output_layers')
self.logger.info("Input:")
self.__dim_logging(inputs_)
self.logger.info("Output:")
self.__dim_logging(outputs_)
self.timer = time.time()
if not batch_size:
batch_size = 32
if isinstance(inputs, Sequence):
jseq = inputs
else:
jseq = JangguSequence(inputs_, outputs_, sample_weight, batch_size=batch_size)
try:
values = self.kerasmodel.evaluate_generator(
jseq,
steps=steps,
use_multiprocessing=use_multiprocessing,
workers=workers)
except Exception: # pragma: no cover
self.logger.exception('evaluate_generator failed:')
raise
self.logger.info('#' * 40)
values = _to_list(values)
for i, value in enumerate(values):
self.logger.info('%s: %f', self.kerasmodel.metrics_names[i], value)
self.logger.info('#' * 40)
self.logger.info("Evaluation finished in %1.3f s",
time.time() - self.timer)
preds = self.kerasmodel.predict_generator(jseq, steps=steps,
use_multiprocessing=use_multiprocessing,
workers=workers)
preds = _convert_data(self.kerasmodel, preds, 'output_layers')
for callback in callbacks or []:
callback = get_scorer(callback)
callback.score(self, preds, outputs=outputs_, datatags=datatags)
return values
    def predict_variant_effect(self, # pylint: disable=too-many-locals
bioseq,
variants,
conditions,
output_folder,
condition_filter=None,
batch_size=None,
annotation=None,
ignore_reference_match=False,
order=1):
"""Evaluates the performance.
see help(predict_variant_effect) for a description of
the parameters
Examples
--------
.. code-block:: python
# Evaluate all variants and all conditions (outputs)
model.predict_variant_effect(DATA, VARIANTS, CONDITIONS,
'vcfoutput')
# Evaluate all variants and a subset of conditions (Ctcf output labels)
            model.predict_variant_effect(DATA, VARIANTS, CONDITIONS,
                                         'vcfoutput_subset',
                                         condition_filter='Ctcf')
"""
return predict_variant_effect(self.kerasmodel,
bioseq,
variants,
conditions,
output_folder,
condition_filter=condition_filter,
batch_size=batch_size,
annotation=annotation,
                                      ignore_reference_match=ignore_reference_match,
                                      order=order)
def __dim_logging(self, data):
if isinstance(data, dict):
for key in data:
self.logger.info("\t%s: %s", key, data[key].shape)
if hasattr(data, "shape"):
data = [data]
if isinstance(data, list):
for datum in data:
self.logger.info("\t%s", datum.shape)
    def get_config(self):
"""Get model config."""
return self.kerasmodel.get_config()
@staticmethod
def _storage_path(name, outputdir):
"""Returns the path to the model storage file."""
if not os.path.exists(os.path.join(outputdir, "models")):
os.mkdir(os.path.join(outputdir, "models"))
filename = os.path.join(outputdir, 'models', '{}.h5'.format(name))
return filename
def _save_hyper(self, hyper_params, filename=None):
"""This method attaches the hyper parameters that were used
to train the model to the hdf5 file.
This method is supposed to be used after model training
from within the fit method.
        It attaches the hyper parameters, e.g. epochs, batch_size, etc.,
to the hdf5 file that contains the model weights.
The hyper parameters are added as attributes to the
group 'model_weights'.
Parameters
----------
        hyper_params : dict
Dictionary that contains the hyper parameters.
filename : str
Filename of the hdf5 file. This file must already exist.
So save has to be called before.
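        Examples
        --------
        A sketch of how the stored hyper parameters can be inspected
        again; the path placeholder corresponds to the default model
        storage location:
        .. code-block:: python
            import h5py
            with h5py.File('<janggu_results>/models/<name>.h5', 'r') as f:
                print(dict(f['model_weights'].attrs))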
"""
if not filename: # pragma: no cover
filename = self._storage_path(self.name, self.outputdir)
content = h5py.File(filename, 'r+')
weights = content['model_weights']
for key in hyper_params:
if hyper_params[key]:
weights.attrs[key] = str(hyper_params[key])
content.close()
def model_from_json(json_string, custom_objects=None):
"""Wrapping keras.models.model_from_json"""
custom_objects = {} if custom_objects is None else custom_objects
custom_objects.update(JANGGU_LAYERS)
return keras.models.model_from_json(json_string, custom_objects)
def model_from_yaml(yaml_string, custom_objects=None):
"""Wrapping keras.models.model_from_yaml"""
custom_objects = {} if custom_objects is None else custom_objects
custom_objects.update(JANGGU_LAYERS)
return keras.models.model_from_yaml(yaml_string, custom_objects)
def create_model(template, modelparams=None, inputs=None,
outputs=None, name=None):
"""Constructor method.
    This method instantiates a keras model from a
    model template and parameters. It can also
    automatically infer and append the correct
    input and output layers to the network.
Parameters
-----------
template : function
Python function that defines a model template of a neural network.
The function signature must adhere to the signature
:code:`template(inputs, inputp, outputp, modelparams)`
and is expected to return
:code:`(input_tensor, output_tensor)` of the neural network.
modelparams : list or tuple or None
Additional model parameters that are passed along to template
upon creation of the neural network. For instance,
this could contain number of neurons at each layer.
Default: None.
inputs : Dataset, list(Dataset) or None
Input datasets from which the input layer shapes should be derived.
Use this option together with the :code:`inputlayer` decorator (see Example below).
outputs : Dataset, list(Dataset) or None
Output datasets from which the output layer shapes should be derived.
        Use this option together with :code:`outputdense` or :code:`outputconv`
decorators (see Example below).
name : str or None
Model name. If None, a unique model name is generated
based on the model configuration and network architecture.
Examples
--------
Specify a model using a model template and parameters:
.. code-block:: python
def test_manual_model(inputs, inp, oup, params):
in_ = Input(shape=(10,), name='ip')
layer = Dense(params)(in_)
            output = Dense(1, activation='sigmoid', name='out')(layer)
return in_, output
# Defines the same model by invoking the definition function
# and the create constructor.
model = create_model(template=test_manual_model, modelparams=3)
model.summary()
Specify a model using automatic input and output layer determination.
That is, only the model body needs to be specified:
.. code-block:: python
import numpy as np
from janggu import inputlayer, outputdense
from janggu.data import Array
# Some random data which you would like to use as input for the
# model.
DATA = Array('ip', np.random.random((1000, 10)))
LABELS = Array('out', np.random.randint(2, size=(1000, 1)))
# The decorators inputlayer and outputdense
# extract the layer shapes and append the respective layers
# to the network
# so that only the model body remains to be specified.
        # Note that the decorator order matters.
# inputlayer must be specified before outputdense.
@inputlayer
@outputdense(activation='sigmoid')
def test_inferred_model(inputs, inp, oup, params):
with inputs.use('ip') as in_:
# the with block allows for easy
# access of a specific named input.
output = Dense(params)(in_)
return in_, output
# create a model.
model = create_model(template=test_inferred_model, modelparams=3,
name='test_model',
                             inputs=DATA,
                             outputs=LABELS)
"""
inputs_ = _data_props(inputs) if inputs else None
outputs_ = _data_props(outputs) if outputs else None
input_tensors, output_tensors = template(None, inputs_,
outputs_, modelparams)
model = Model(inputs=input_tensors, outputs=output_tensors, name=name)
return model
def predict_variant_effect(model, # pylint: disable=too-many-locals
bioseq,
variants,
conditions,
output_folder,
condition_filter=None,
batch_size=None,
annotation=None,
ignore_reference_match=False,
order=1):
"""Evaluates the performance.
Parameters
----------
model : :code:`keras.Model`
A keras model
bioseq : :code:`Bioseq` or str
Bioseq container containing a reference genome or the reference genome file
in fasta format.
If a Bioseq object is used, it has to be loaded with store_whole_genome=True.
This may be faster for large amounts of variants.
Consider using cache=True as well, such that the genome only needs to be loaded
once, and reloaded from the cache if needed.
Alternatively, a string pointing to the reference genome sequence in FASTA
format can be supplied from which the sequence context is extracted.
variants : str
        File name of a VCF file containing the variants under study.
conditions : list(str)
Condition labels for each output prediction.
output_folder : str
The method produces an hdf5 and a bed file as output.
The bed-file contains the variant positions while the
hdf5 file contains the reference and alternative variant scores
for each output feature.
condition_filter : str or None
Regular expression filter on which conditions should be evaluated.
If None, all output conditions will be returned.
batch_size : int, None.
Batch size. If None, a batch_size of 128 is used.
annotation : BedTool object or None
BedTool holding feature annotation e.g. gene annotation.
The annotation may be used to perform strand-specific variant effect
predictions. Each variant is intersected with the annotation
in order to derive the correct strandedness. If variants
        do not overlap with any annotation feature, or if no annotation
        is supplied, the forward strand is used.
ignore_reference_match : boolean
        Whether to ignore mismatches between the reference sequence and
        the reference base in the VCF file. If False, mismatching variants
        will be skipped and only matching positions are processed.
        Otherwise all variants will be processed. Default: False.
order : int
Order of the DNA sequence representation. Only relevant if the option
bioseq is pointing to a reference genome file. If bioseq is already a
        Bioseq object, this option will be ignored. Default: 1.
Returns
-------
tuple:
Tuple containing the output filenames: an hdf5 and a bed file.
Examples
--------
.. code-block:: python
# Evaluate all variants and all conditions (outputs)
predict_variant_effect(kerasmodel, DATA, VARIANTS, CONDITIONS,
'vcfoutput')
        # Evaluate all variants and a subset of conditions (Ctcf output labels)
        predict_variant_effect(kerasmodel, DATA, VARIANTS, CONDITIONS,
                               'vcfoutput_subset',
                               condition_filter='Ctcf')
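    A sketch of how the resulting scores can be inspected afterwards,
    assuming the output folder 'vcfoutput' from the call above:
    .. code-block:: python
        import os
        import h5py
        with h5py.File(os.path.join('vcfoutput', 'scores.hdf5'), 'r') as f:
            labels = f['labels'][:]
            # difference between alternative and reference allele scores
            diff = f['diffscore'][:]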
"""
if batch_size is None:
batch_size = 128
def _get_input_layer_shape(layer):
if isinstance(layer.input_shape, list):
return layer.input_shape[0]
return layer.input_shape
if isinstance(bioseq, Bioseq):
order = bioseq.garray.order
if len(model.inputs) > 1:
raise ValueError('Only one input layer supported for predict_variant_effect.')
binsize = _get_input_layer_shape(model.layers[0])[1] + order - 1
    # the network might have arbitrarily many
    # outputs. Validate the supplied condition labels
    # against the network output before filtering.
    if len(conditions) != model.output_shape[-1]:
        raise ValueError("The number of conditions does not match with the "
                         "number of network output units. [{}!={}]".format(
                             len(conditions),
                             model.output_shape[-1]))
    # With the filter option it is possible to
    # restrict the analysis to certain features.
    if condition_filter is None:
        conditions = [(idx, cond) for idx, cond in enumerate(conditions)]
    else:
        conditions = [(idx, cond) for idx, cond in enumerate(conditions)
                      if re.search(condition_filter, cond) is not None]
    icond = [el[0] for el in conditions]
    local_model = model
# get number of variants
variantsstream = VariantStreamer(bioseq, variants, binsize, batch_size,
annotation=annotation,
ignore_reference_match=ignore_reference_match,
order=order)
nvariants = variantsstream.get_variant_count()
h5file = h5py.File(os.path.join(output_folder, 'scores.hdf5'), 'w')
h5file.create_dataset('labels', (len(conditions),),
dtype=h5py.special_dtype(vlen=str),
data=np.array([c[-1] for c in conditions],
dtype=h5py.special_dtype(vlen=str)))
refscore = h5file.create_dataset('refscore', (nvariants, len(conditions)),
dtype='float16')
altscore = h5file.create_dataset('altscore', (nvariants, len(conditions)),
dtype='float16')
diffscore = h5file.create_dataset('diffscore', (nvariants, len(conditions)),
dtype='float16')
logodds = h5file.create_dataset('logoddsscore', (nvariants, len(conditions)),
dtype='float16')
bar = Bar('Parsing {}: '.format(variants), max=int(np.ceil(nvariants/float(batch_size))))
chromlist = []
poslist = []
vnamelist = []
reflist = []
altlist = []
ibatch = 0
# read variants file
for names, chroms, poss, ra, aa, reference, alternative in variantsstream.flow():
bar.next()
if reference.shape[0] <= 0:
# reached the end of the file
break
ref_score = local_model.predict_on_batch(reference)
alt_score = local_model.predict_on_batch(alternative)
chromlist += chroms
poslist += poss
vnamelist += names
reflist += ra
altlist += aa
refscore[ibatch:(ibatch + ref_score.shape[0])] = ref_score[:, icond].astype('float16')
altscore[ibatch:(ibatch + ref_score.shape[0])] = alt_score[:, icond].astype('float16')
diffscore[ibatch:(ibatch + ref_score.shape[0])] = \
alt_score[:, icond].astype('float16') - ref_score[:, icond].astype('float16')
logodds[ibatch:(ibatch + ref_score.shape[0])] = \
np.log(alt_score[:, icond].astype('float16')/
ref_score[:, icond].astype('float16') + 1e-7)
ibatch += ref_score.shape[0]
    # form one large BED-formatted string
BedTool('\n'.join(['{} {} {} {}_{}>{}'.format(chrom, start, start+1, name, ref, alt)
for chrom, start, name, ref, alt in zip(chromlist,
poslist,
vnamelist, reflist, altlist)]),
from_string=True).saveas(os.path.join(output_folder, 'snps.bed.gz'))
bar.finish()
h5file.close()
return (os.path.join(output_folder, 'scores.hdf5'),
os.path.join(output_folder, 'snps.bed.gz'))