Deep learning has significantly improved the expressiveness of learned representations. However, current research still falls short of understanding why and how these representations work and cannot reliably predict when they fail. Moreover, the different characteristics of our physical world are commonly intermingled in the data, making it difficult to study them individually. We introduce novel paradigms for disentangling multiple object characteristics and present interpretable models that translate arbitrary network representations into semantically meaningful concepts. We also obtain disentangled generative models that explain their latent representations by synthesis while allowing individual object characteristics to be altered independently.
2022
Eulig, Elias; Ommer, Björn; Kachelrieß, Marc
Reconstructing invariances of CT image denoising networks using invertible neural networks Proceedings Article
In: Stayman, Joseph Webster (Ed.): 7th International Conference on Image Formation in X-Ray Computed Tomography, pp. 123040S, SPIE (International Society for Optics and Photonics), 2022.
@inproceedings{10.1117/12.2647170,
title = {Reconstructing invariances of CT image denoising networks using invertible neural networks},
author = {Elias Eulig and Björn Ommer and Marc Kachelrieß},
editor = {Joseph Webster Stayman},
url = {https://doi.org/10.1117/12.2647170},
doi = {10.1117/12.2647170},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
booktitle = {7th International Conference on Image Formation in X-Ray Computed Tomography},
volume = {12304},
pages = {123040S},
publisher = {SPIE},
organization = {International Society for Optics and Photonics},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2021
Islam, Md Amirul; Kowal, Matthew; Esser, Patrick; Jia, Sen; Ommer, Björn; Derpanis, Konstantinos G; Bruce, Neil
Shape or Texture: Understanding Discriminative Features in CNNs Conference
International Conference on Learning Representations (ICLR), 2021.
@conference{7031,
title = {Shape or Texture: Understanding Discriminative Features in CNNs},
author = {Md Amirul Islam and Matthew Kowal and Patrick Esser and Sen Jia and Björn Ommer and Konstantinos G Derpanis and Neil Bruce},
url = {https://arxiv.org/abs/2101.11604},
year = {2021},
date = {2021-01-01},
urldate = {2021-01-01},
booktitle = {International Conference on Learning Representations (ICLR)},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
2020
Esser, Patrick; Rombach, Robin; Ommer, Björn
A Disentangling Invertible Interpretation Network for Explaining Latent Representations Conference
Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2020.
@conference{6932,
title = {A Disentangling Invertible Interpretation Network for Explaining Latent Representations},
author = {Patrick Esser and Robin Rombach and Björn Ommer},
url = {https://compvis.github.io/iin/
https://arxiv.org/abs/2004.13166},
year = {2020},
date = {2020-01-01},
urldate = {2020-01-01},
booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
abstract = {Neural networks have greatly boosted performance in computer vision by learning powerful representations of input data. The drawback of end-to-end training for maximal overall performance is black-box models whose hidden representations lack interpretability: since distributed coding is optimal for latent layers to improve their robustness, attributing meaning to parts of a hidden feature vector or to individual neurons is hindered. We formulate interpretation as a translation of hidden representations onto semantic concepts that are comprehensible to the user. The mapping between both domains has to be bijective so that semantic modifications in the target domain correctly alter the original representation. The proposed invertible interpretation network can be transparently applied on top of existing architectures with no need to modify or retrain them. Consequently, we translate an original representation to an equivalent yet interpretable one and backwards without affecting the expressiveness and performance of the original. The invertible interpretation network disentangles the hidden representation into separate, semantically meaningful concepts. Moreover, we present an efficient approach to define semantic concepts by only sketching two images, and also an unsupervised strategy. Experimental evaluation demonstrates the wide applicability to interpretation of existing classification and image generation networks as well as to semantically guided image manipulation.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
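The mechanism behind this paper — an invertible translation T between a frozen network's latent z and a factored representation whose chunks correspond to semantic concepts — can be illustrated with a single affine coupling block. This is a minimal sketch, not the authors' released code (see the project page for that); the latent size, the two-factor split, and the tiny subnetwork are placeholder assumptions, and a real IIN stacks several such blocks with permutations in between.

```python
import torch
import torch.nn as nn

class AffineCoupling(nn.Module):
    """One invertible block: z = [za, zb] -> [za, zb * exp(s(za)) + t(za)]."""
    def __init__(self, dim, hidden=128):
        super().__init__()
        half = dim // 2
        self.net = nn.Sequential(nn.Linear(half, hidden), nn.ReLU(),
                                 nn.Linear(hidden, 2 * half))

    def forward(self, z):
        za, zb = z.chunk(2, dim=1)
        s, t = self.net(za).chunk(2, dim=1)
        return torch.cat([za, zb * torch.exp(s) + t], dim=1)

    def inverse(self, y):
        ya, yb = y.chunk(2, dim=1)
        s, t = self.net(ya).chunk(2, dim=1)
        return torch.cat([ya, (yb - t) * torch.exp(-s)], dim=1)

dim = 64                      # placeholder size of the frozen network's latent
T = AffineCoupling(dim)
z = torch.randn(8, dim)       # stands in for a pretrained model's hidden code
z_tilde = T(z)                # translated, interpretable representation
factor_a, factor_b = z_tilde.chunk(2, dim=1)  # semantic factors after training
assert torch.allclose(T.inverse(z_tilde), z, atol=1e-5)  # bijective by construction
```

Because T is bijective, editing factor_a or factor_b and mapping back through T.inverse yields a valid representation for the original network, which is what makes semantic modification possible without retraining.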
Esser, Patrick; Rombach, Robin; Ommer, Björn
A Note on Data Biases in Generative Models Conference
NeurIPS 2020 Workshop on Machine Learning for Creativity and Design, 2020.
@conference{7025,
title = {A Note on Data Biases in Generative Models},
author = {Patrick Esser and Robin Rombach and Björn Ommer},
url = {https://neurips2020creativity.github.io/
https://arxiv.org/abs/2012.02516},
year = {2020},
date = {2020-01-01},
urldate = {2020-01-01},
booktitle = {NeurIPS 2020 Workshop on Machine Learning for Creativity and Design},
abstract = {It is tempting to think that machines are less prone to unfairness and prejudice. However, machine learning approaches compute their outputs based on data. While biases can enter at any stage of the development pipeline, models are particularly prone to mirroring the biases of the datasets they are trained on and therefore do not necessarily reflect truths about the world but, primarily, truths about the data. To raise awareness about the relationship between modern algorithms and the data that shape them, we use a conditional invertible neural network to disentangle the dataset-specific information from the information which is shared across different datasets. In this way, we can project the same image onto different datasets, thereby revealing their inherent biases. We use this methodology to (i) investigate the impact of dataset quality on the performance of generative models, (ii) show how societal biases of datasets are replicated by generative models, and (iii) present creative applications through unpaired transfer between diverse datasets such as photographs, oil portraits, and anime.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
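The projection step described in the abstract — strip one dataset's specifics, then re-impose another's — can be sketched with a dataset-conditioned coupling block. Everything below is a placeholder assumption (latent size, embedding-based conditioning, dataset ids); the paper's conditional INN is substantially deeper and operates on features of pretrained models.

```python
import torch
import torch.nn as nn

class CondCoupling(nn.Module):
    """Affine coupling whose scale/shift additionally depend on a dataset id."""
    def __init__(self, dim, n_datasets, hidden=128):
        super().__init__()
        half = dim // 2
        self.embed = nn.Embedding(n_datasets, half)
        self.net = nn.Sequential(nn.Linear(2 * half, hidden), nn.ReLU(),
                                 nn.Linear(hidden, 2 * half))

    def _stats(self, za, c):
        return self.net(torch.cat([za, self.embed(c)], dim=1)).chunk(2, dim=1)

    def forward(self, z, c):
        za, zb = z.chunk(2, dim=1)
        s, t = self._stats(za, c)
        return torch.cat([za, zb * torch.exp(s) + t], dim=1)

    def inverse(self, y, c):
        ya, yb = y.chunk(2, dim=1)
        s, t = self._stats(ya, c)
        return torch.cat([ya, (yb - t) * torch.exp(-s)], dim=1)

flow = CondCoupling(dim=64, n_datasets=3)
z = torch.randn(4, 64)                   # image code from a shared encoder
src = torch.zeros(4, dtype=torch.long)   # e.g. photographs
dst = torch.ones(4, dtype=torch.long)    # e.g. oil portraits
shared = flow(z, src)                    # remove source-dataset specifics
z_projected = flow.inverse(shared, dst)  # re-impose target-dataset specifics
```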
Rombach, Robin; Esser, Patrick; Ommer, Björn
Making Sense of CNNs: Interpreting Deep Representations & Their Invariances with INNs Conference
IEEE European Conference on Computer Vision (ECCV), 2020.
@conference{6997,
title = {Making Sense of CNNs: Interpreting Deep Representations & Their Invariances with INNs},
author = {Robin Rombach and Patrick Esser and Björn Ommer},
url = {https://compvis.github.io/invariances/
https://arxiv.org/pdf/2008.01777.pdf},
year = {2020},
date = {2020-01-01},
urldate = {2020-01-01},
booktitle = {IEEE European Conference on Computer Vision (ECCV)},
abstract = {To tackle increasingly complex tasks, it has become an essential ability of neural networks to learn abstract representations. These task-specific representations and, particularly, the invariances they capture turn neural networks into black box models that lack interpretability. To open such a black box, it is, therefore, crucial to uncover the different semantic concepts a model has learned as well as those that it has learned to be invariant to. We present an approach based on INNs that (i) recovers the task-specific, learned invariances by disentangling the remaining factor of variation in the data and that (ii) invertibly transforms these recovered invariances combined with the model representation into an equally expressive one with accessible semantic concepts. As a consequence, neural network representations become understandable by providing the means to (i) expose their semantic meaning, (ii) semantically modify a representation, and (iii) visualize individual learned semantic concepts and invariances. Our invertible approach significantly extends the abilities to understand black box models by enabling post-hoc interpretations of state-of-the-art networks without compromising their performance.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
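At inference time the recipe in the abstract amounts to: take a frozen feature z, sample the invariances v the network discarded, and decode (z, v) back to an image; repeating the sampling visualizes what the network ignores. The sketch below is a heavily simplified stand-in: a diagonal Gaussian replaces the paper's invertible flow, the decoder is a placeholder linear layer, and all dimensions are assumed.

```python
import torch
import torch.nn as nn

Z_DIM, V_DIM = 512, 128       # assumed sizes: frozen feature z, invariances v

class InvarianceSampler(nn.Module):
    """Samples v ~ p(v | z); a diagonal Gaussian replaces the paper's INN here."""
    def __init__(self):
        super().__init__()
        self.to_stats = nn.Linear(Z_DIM, 2 * V_DIM)

    def forward(self, z):
        mu, logvar = self.to_stats(z).chunk(2, dim=1)
        return mu + torch.exp(0.5 * logvar) * torch.randn_like(mu)

decoder = nn.Linear(Z_DIM + V_DIM, 3 * 32 * 32)  # placeholder image decoder
sampler = InvarianceSampler()

z = torch.randn(1, Z_DIM)     # feature of one input, taken from a frozen CNN
# Same z, different sampled invariances: everything that varies across these
# images is something the original network is invariant to.
samples = [decoder(torch.cat([z, sampler(z)], dim=1)).view(3, 32, 32)
           for _ in range(5)]
```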
Rombach, Robin; Esser, Patrick; Ommer, Björn
Network Fusion for Content Creation with Conditional INNs Conference
CVPRW 2020 (AI for Content Creation), 2020.
@conference{7012,
title = {Network Fusion for Content Creation with Conditional INNs},
author = {Robin Rombach and Patrick Esser and Björn Ommer},
url = {https://compvis.github.io/network-fusion/
https://arxiv.org/abs/2005.13580},
year = {2020},
date = {2020-01-01},
urldate = {2020-01-01},
booktitle = {CVPRW 2020 (AI for Content Creation)},
abstract = {Artificial Intelligence for Content Creation has the potential to reduce the amount of manual content creation work significantly. While automation of laborious work is welcome, it is only useful if it allows users to control aspects of the creative process when desired. Furthermore, widespread adoption of semi-automatic content creation depends on low barriers regarding the expertise, computational budget and time required to obtain results and experiment with new techniques. With state-of-the-art approaches relying on task-specific models, multi-GPU setups and weeks of training time, we must find ways to reuse and recombine them to meet these requirements. Instead of designing and training methods for controllable content creation from scratch, we thus present a method to repurpose powerful, existing models for new tasks, even though they have never been designed for them. We formulate this problem as a translation between expert models, which includes common content creation scenarios, such as text-to-image and image-to-image translation, as a special case. As this translation is ambiguous, we learn a generative model of hidden representations of one expert conditioned on hidden representations of the other expert. Working on the level of hidden representations makes optimal use of the computational effort that went into the training of the expert model to produce these efficient, low-dimensional representations. Experiments demonstrate that our approach can translate from BERT, a state-of-the-art expert for text, to BigGAN, a state-of-the-art expert for images, to enable text-to-image generation, which neither of the experts can perform on its own. Additional experiments show the wide applicability of our approach across different conditional image synthesis tasks and improvements over existing methods for image modifications.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
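Since the translation between experts is ambiguous, the paper learns a generative model p(z_target | z_source) over hidden representations while both experts stay frozen. A toy version of that training objective is sketched below, with a conditional Gaussian standing in for the conditional INN and random tensors standing in for the experts' features; all dimensions are invented.

```python
import torch
import torch.nn as nn

TXT_DIM, IMG_DIM = 768, 128   # invented sizes for the two experts' latents

class Translator(nn.Module):
    """Conditional Gaussian p(z_img | z_txt); a stand-in for the conditional INN."""
    def __init__(self):
        super().__init__()
        self.net = nn.Linear(TXT_DIM, 2 * IMG_DIM)

    def nll(self, z_img, z_txt):
        mu, logvar = self.net(z_txt).chunk(2, dim=1)
        # negative log-likelihood of z_img under N(mu, diag(exp(logvar))), up to a constant
        return 0.5 * ((z_img - mu) ** 2 / logvar.exp() + logvar).sum(dim=1).mean()

translator = Translator()
opt = torch.optim.Adam(translator.parameters(), lr=1e-4)
for _ in range(3):                       # toy loop on random stand-in features
    z_txt = torch.randn(16, TXT_DIM)     # would come from the frozen text expert
    z_img = torch.randn(16, IMG_DIM)     # would come from the frozen image expert
    loss = translator.nll(z_img, z_txt)
    opt.zero_grad(); loss.backward(); opt.step()
```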
Rombach, Robin; Esser, Patrick; Ommer, Björn
Network-to-Network Translation with Conditional Invertible Neural Networks Conference
Neural Information Processing Systems (NeurIPS) (Oral), 2020.
@conference{7011,
title = {Network-to-Network Translation with Conditional Invertible Neural Networks},
author = {Robin Rombach and Patrick Esser and Björn Ommer},
url = {https://compvis.github.io/net2net/
https://arxiv.org/abs/2005.13580},
year = {2020},
date = {2020-01-01},
urldate = {2020-01-01},
booktitle = {Neural Information Processing Systems (NeurIPS) (Oral)},
abstract = {Combining stimuli from diverse modalities into a coherent perception is a striking feat of intelligence of evolved brains. This work seeks its analogy in deep learning models and aims to establish relations between existing networks by faithfully combining the representations of these different domains. Therefore, we seek a model that can relate between different existing representations by learning a conditionally invertible mapping between them. The network demonstrates this capability by (i) providing generic transfer between diverse domains, (ii) enabling controlled content synthesis by allowing modification in other domains, and (iii) facilitating diagnosis of existing representations by translating them into an easily accessible domain. Our domain transfer network can translate between fixed representations without having to learn or finetune them. This allows users to utilize various existing domain-specific expert models from the literature that had been trained with extensive computational resources. Experiments on diverse conditional image synthesis tasks, competitive image modification results and experiments on image-to-image and text-to-image generation demonstrate the generic applicability of our approach. In particular, we translate between BERT and BigGAN, state-of-the-art text and image models, to provide text-to-image generation, which neither expert can perform on its own.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
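The resulting inference pipeline is just data flow between frozen models: encode with one expert, translate the latent (together with a sampled residual, since the mapping is one-to-many), and decode with the other expert. The sketch below shows only that wiring; the linear layers are hypothetical stand-ins for BERT, the translator, and BigGAN, and no real model APIs are used.

```python
import torch
import torch.nn as nn

# Hypothetical stand-ins; none of these are real BERT/BigGAN APIs.
text_expert = nn.Linear(100, 768).eval()           # frozen text encoder
image_expert = nn.Linear(128, 3 * 64 * 64).eval()  # frozen image generator
translate = nn.Linear(768 + 32, 128)               # trained translator; the extra
                                                   # 32-dim noise makes it one-to-many

with torch.no_grad():
    z_txt = text_expert(torch.randn(1, 100))       # placeholder embedded sentence
    images = []
    for _ in range(3):                             # same text, several images
        residual = torch.randn(1, 32)              # samples the ambiguous part
        z_img = translate(torch.cat([z_txt, residual], dim=1))
        images.append(image_expert(z_img).view(3, 64, 64))
```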
2019
Kotovenko, Dmytro; Sanakoyeu, Artsiom; Lang, Sabine; Ommer, Björn
Content and Style Disentanglement for Artistic Style Transfer Conference
Proceedings of the Intl. Conf. on Computer Vision (ICCV), 2019.
@conference{6322,
title = {Content and Style Disentanglement for Artistic Style Transfer},
author = {Dmytro Kotovenko and Artsiom Sanakoyeu and Sabine Lang and Björn Ommer},
url = {https://compvis.github.io/content-style-disentangled-ST/
https://compvis.github.io/content-style-disentangled-ST/paper.pdf},
year = {2019},
date = {2019-01-01},
urldate = {2019-01-01},
booktitle = {Proceedings of the Intl. Conf. on Computer Vision (ICCV)},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Lorenz, Dominik; Bereska, Leonard; Milbich, Timo; Ommer, Björn
Unsupervised Part-Based Disentangling of Object Shape and Appearance Conference
Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (Oral + Best paper finalist: top 45 / 5160 submissions), 2019.
@conference{6301,
title = {Unsupervised Part-Based Disentangling of Object Shape and Appearance},
author = {Dominik Lorenz and Leonard Bereska and Timo Milbich and Björn Ommer},
url = {https://compvis.github.io/unsupervised-disentangling/
https://arxiv.org/abs/1903.06946},
year = {2019},
date = {2019-01-01},
urldate = {2019-01-01},
booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (Oral + Best paper finalist: top 45 / 5160 submissions)},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Esser, Patrick; Haux, Johannes; Ommer, Björn
Unsupervised Robust Disentangling of Latent Characteristics for Image Synthesis Conference
Proceedings of the Intl. Conf. on Computer Vision (ICCV), 2019.
@conference{6323,
title = {Unsupervised Robust Disentangling of Latent Characteristics for Image Synthesis},
author = {Patrick Esser and Johannes Haux and Björn Ommer},
url = {https://compvis.github.io/robust-disentangling/
https://arxiv.org/abs/1910.10223},
year = {2019},
date = {2019-01-01},
urldate = {2019-01-01},
booktitle = {Proceedings of the Intl. Conf. on Computer Vision (ICCV)},
abstract = {Deep generative models come with the promise to learn an explainable representation for visual objects that allows image sampling, synthesis, and selective modification. The main challenge is to learn to properly model the independent latent characteristics of an object, especially its appearance and pose. We present a novel approach that learns disentangled representations of these characteristics and explains them individually. Training requires only pairs of images depicting the same object appearance, but no pose annotations. We propose an additional classifier that estimates the minimal amount of regularization required to enforce disentanglement. Thus both representations together can completely explain an image while being independent of each other. Previous methods based on adversarial approaches fail to enforce this independence, while methods based on variational approaches lead to uninformative representations. In experiments on diverse object categories, the approach successfully recombines pose and appearance to reconstruct and retarget novel synthesized images. We achieve significant improvements over state-of-the-art methods which utilize the same level of supervision, and reach performances comparable to those of pose-supervised approaches. However, we can handle the vast body of articulated object classes for which no pose models/annotations are available.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
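The supervision signal is easy to see in code: given a pair (x1, x2) showing the same appearance in two different poses, reconstruct x2 from x1's appearance code and x2's pose code, which forces the two encoders to discard each other's information. The sketch below uses hypothetical linear encoders over flattened images and omits the paper's classifier that adaptively estimates the required regularization.

```python
import torch
import torch.nn as nn

D = 3 * 64 * 64                 # flattened images, purely illustrative
appearance_enc = nn.Linear(D, 64)
pose_enc = nn.Linear(D, 64)
decoder = nn.Linear(128, D)

# x1, x2: a pair showing the same object appearance in two different poses.
x1, x2 = torch.randn(8, D), torch.randn(8, D)

# Reconstruct x2 from x1's appearance and x2's pose; this only works if the
# appearance encoder drops pose information and vice versa.
x2_hat = decoder(torch.cat([appearance_enc(x1), pose_enc(x2)], dim=1))
loss = (x2_hat - x2).pow(2).mean()
loss.backward()
```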
2018
Esser, Patrick; Sutter, Ekaterina; Ommer, Björn
A Variational U-Net for Conditional Appearance and Shape Generation Conference
Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (short Oral), 2018.
@conference{6249,
title = {A Variational U-Net for Conditional Appearance and Shape Generation},
author = {Patrick Esser and Ekaterina Sutter and Björn Ommer},
url = {https://compvis.github.io/vunet/
https://arxiv.org/abs/1804.04694},
year = {2018},
date = {2018-01-01},
urldate = {2018-01-02},
booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (short Oral)},
abstract = {Deep generative models have demonstrated great performance in image synthesis. However, results deteriorate in the case of spatial deformations, since they generate images of objects directly rather than modeling the intricate interplay of their inherent shape and appearance. We present a conditional U-Net for shape-guided image generation, conditioned on the output of a variational autoencoder for appearance. The approach is trained end-to-end on images, without requiring samples of the same object with varying pose or appearance. Experiments show that the model enables conditional image generation and transfer. As a result, either shape or appearance can be retained from a query image while the other is freely altered. Moreover, appearance can be sampled thanks to its stochastic latent representation, while shape is preserved. In quantitative and qualitative experiments on COCO, DeepFashion, shoes, Market-1501 and handbags, the approach demonstrates significant improvements over the state-of-the-art.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
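The generation step can be sketched in a few lines: a VAE encoder produces a stochastic appearance code from one image, and a shape-conditioned network renders that appearance in the pose given by another input. Below, linear layers stand in for the actual encoder and U-Net, and the flattened-image dimensions are invented; only the conditioning pattern matches the paper.

```python
import torch
import torch.nn as nn

D = 3 * 64 * 64                 # flattened image size, invented
APP = 32                        # appearance latent size, invented

enc = nn.Linear(D, 2 * APP)     # VAE encoder for appearance
unet = nn.Linear(D + APP, D)    # stands in for the shape-conditioned U-Net

def generate(x_appearance, y_shape):
    """Keep the appearance of one image while imposing another shape estimate."""
    mu, logvar = enc(x_appearance).chunk(2, dim=1)
    z = mu + torch.exp(0.5 * logvar) * torch.randn_like(mu)  # reparameterization
    return unet(torch.cat([y_shape, z], dim=1))

x = torch.randn(1, D)           # query image providing appearance
y = torch.randn(1, D)           # shape estimate (e.g. rendered edges/keypoints)
x_new = generate(x, y)          # same appearance, new pose; resampling z varies appearance
```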