Skip to content

Module kerod.core.sampling_ops

Method to subsample minibatches by balancing positives and negatives.

Subsamples minibatches based on a pre-specified positive fraction in range [0,1]. The class presumes there are many more negatives than positive examples: if the desired sample_size cannot be achieved with the pre-specified positive fraction, it fills the rest with negative examples. If this is not sufficient for obtaining the desired sample_size, it returns fewer examples.

The main function to call is Subsample(self, indicator, labels). For convenience one can also call SubsampleWeights(self, weights, labels) which is defined in the minibatch_sampler base class.

When is_static is True, it implements a method that guarantees static shapes. It also ensures the length of output of the subsample is always sample_size, even when number of examples set to True in indicator is less than sample_size.

View Source
# Copyright 2017 The TensorFlow Authors and modified by Emilien Garreau. All Rights Reserved.

#

# Licensed under the Apache License, Version 2.0 (the "License");

# you may not use this file except in compliance with the License.

# You may obtain a copy of the License at

#

#     http://www.apache.org/licenses/LICENSE-2.0

#

# Unless required by applicable law or agreed to in writing, software

# distributed under the License is distributed on an "AS IS" BASIS,

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

# See the License for the specific language governing permissions and

# limitations under the License.

# ==============================================================================

"""Method to subsample minibatches by balancing positives and negatives.

Subsamples minibatches based on a pre-specified positive fraction in range

[0,1]. The class presumes there are many more negatives than positive examples:

if the desired sample_size cannot be achieved with the pre-specified positive

fraction, it fills the rest with negative examples. If this is not sufficient

for obtaining the desired sample_size, it returns fewer examples.

The main function to call is Subsample(self, indicator, labels). For convenience

one can also call SubsampleWeights(self, weights, labels) which is defined in

the minibatch_sampler base class.

When is_static is True, it implements a method that guarantees static shapes.

It also ensures the length of output of the subsample is always sample_size, even

when number of examples set to True in indicator is less than sample_size.

"""

import tensorflow as tf

from kerod.utils import ops

def subsample_indicator(indicator, num_samples):

    """Subsample indicator vector.

    Given a boolean indicator vector with M elements set to `True`, the function

    assigns all but `num_samples` of these previously `True` elements to

    `False`. If `num_samples` is greater than M, the original indicator vector

    is returned.

    Arguments:

    - *indicator*: a 1-dimensional boolean tensor indicating which elements

        are allowed to be sampled and which are not.

    - *num_samples*: int32 scalar tensor

    Returns:

    A boolean tensor with the same shape as input (indicator) tensor

    """

    indices = tf.where(indicator)

    indices = tf.random.shuffle(indices)

    indices = tf.reshape(indices, [-1])

    num_samples = tf.minimum(tf.size(indices), num_samples)

    selected_indices = tf.slice(indices, [0], tf.reshape(num_samples, [1]))

    selected_indicator = ops.indices_to_dense_vector(selected_indices, tf.shape(indicator)[0])

    return tf.equal(selected_indicator, 1)

def sample_balanced_positive_negative(indicator, sample_size, labels, positive_fraction=0.5):

    """Subsamples minibatches to a desired balance of positives and negatives.

    Arguments:

    - *indicator*: boolean tensor of shape [N] whose True entries can be sampled.

    - *sample_size*: desired batch size. If None, keeps all positive samples and

        randomly selects negative samples so that the positive sample fraction

        matches positive_fraction.

    - *labels*: boolean tensor of shape [N] denoting positive(=True) and negative

        (=False) examples.

    - *positive_fraction*: desired fraction of positive examples (scalar in [0,1])

        in the batch.

    Returns:

    *sampled_idx_indicator*: boolean tensor of shape [N], True for entries which are sampled.

    """

    negative_idx = tf.logical_not(labels)

    positive_idx = tf.logical_and(labels, indicator)

    negative_idx = tf.logical_and(negative_idx, indicator)

    # Sample positive and negative samples separately

    if sample_size is None:

        max_num_pos = tf.reduce_sum(tf.cast(positive_idx, dtype=tf.int32))

    else:

        max_num_pos = int(positive_fraction * sample_size)

    sampled_pos_idx = subsample_indicator(positive_idx, max_num_pos)

    num_sampled_pos = tf.reduce_sum(tf.cast(sampled_pos_idx, tf.int32))

    if sample_size is None:

        negative_positive_ratio = (1 - positive_fraction) / positive_fraction

        max_num_neg = tf.cast(negative_positive_ratio * tf.cast(num_sampled_pos, dtype=tf.float32),

                              dtype=tf.int32)

    else:

        max_num_neg = sample_size - num_sampled_pos

    sampled_neg_idx = subsample_indicator(negative_idx, max_num_neg)

    return tf.logical_or(sampled_pos_idx, sampled_neg_idx)

def batch_sample_balanced_positive_negative(indicators,

                                            sample_size,

                                            labels,

                                            positive_fraction=0.5,

                                            dtype=tf.float32):

    """Subsamples minibatches to a desired balance of positives and negatives.

    Arguments:

    - *indicator*: boolean tensor of shape [batch_size, N] whose True entries can be sampled.

    - *sample_size*: desired batch size. If None, keeps all positive samples and

        randomly selects negative samples so that the positive sample fraction

        matches positive_fraction.

    - *labels*: boolean tensor of shape [batch_size, N] denoting positive(=True) and negative

        (=False) examples.

    - *positive_fraction*: desired fraction of positive examples (scalar in [0,1])

        in the batch.

    Returns:

    A boolean tensor of shape [M, N], True for entries which are sampled.

    """

    def _minibatch_subsample_fn(inputs):

        indicators, targets = inputs

        return sample_balanced_positive_negative(tf.cast(indicators, tf.bool),

                                                 sample_size,

                                                 tf.cast(targets, tf.bool),

                                                 positive_fraction=positive_fraction)

    return tf.cast(tf.map_fn(_minibatch_subsample_fn, [indicators, labels],

                             dtype=tf.bool,

                             parallel_iterations=16,

                             back_prop=True),

                   dtype=dtype)

Functions

batch_sample_balanced_positive_negative

def batch_sample_balanced_positive_negative(
    indicators,
    sample_size,
    labels,
    positive_fraction=0.5,
    dtype=tf.float32
)

Subsamples minibatches to a desired balance of positives and negatives.

Arguments:

  • indicator: boolean tensor of shape [batch_size, N] whose True entries can be sampled.
  • sample_size: desired batch size. If None, keeps all positive samples and randomly selects negative samples so that the positive sample fraction matches positive_fraction.
  • labels: boolean tensor of shape [batch_size, N] denoting positive(=True) and negative (=False) examples.
  • positive_fraction: desired fraction of positive examples (scalar in [0,1]) in the batch.

Returns:

A boolean tensor of shape [M, N], True for entries which are sampled.

View Source
def batch_sample_balanced_positive_negative(indicators,

                                            sample_size,

                                            labels,

                                            positive_fraction=0.5,

                                            dtype=tf.float32):

    """Subsamples minibatches to a desired balance of positives and negatives.

    Arguments:

    - *indicator*: boolean tensor of shape [batch_size, N] whose True entries can be sampled.

    - *sample_size*: desired batch size. If None, keeps all positive samples and

        randomly selects negative samples so that the positive sample fraction

        matches positive_fraction.

    - *labels*: boolean tensor of shape [batch_size, N] denoting positive(=True) and negative

        (=False) examples.

    - *positive_fraction*: desired fraction of positive examples (scalar in [0,1])

        in the batch.

    Returns:

    A boolean tensor of shape [M, N], True for entries which are sampled.

    """

    def _minibatch_subsample_fn(inputs):

        indicators, targets = inputs

        return sample_balanced_positive_negative(tf.cast(indicators, tf.bool),

                                                 sample_size,

                                                 tf.cast(targets, tf.bool),

                                                 positive_fraction=positive_fraction)

    return tf.cast(tf.map_fn(_minibatch_subsample_fn, [indicators, labels],

                             dtype=tf.bool,

                             parallel_iterations=16,

                             back_prop=True),

                   dtype=dtype)

sample_balanced_positive_negative

def sample_balanced_positive_negative(
    indicator,
    sample_size,
    labels,
    positive_fraction=0.5
)

Subsamples minibatches to a desired balance of positives and negatives.

Arguments:

  • indicator: boolean tensor of shape [N] whose True entries can be sampled.
  • sample_size: desired batch size. If None, keeps all positive samples and randomly selects negative samples so that the positive sample fraction matches positive_fraction.
  • labels: boolean tensor of shape [N] denoting positive(=True) and negative (=False) examples.
  • positive_fraction: desired fraction of positive examples (scalar in [0,1]) in the batch.

Returns:

sampled_idx_indicator: boolean tensor of shape [N], True for entries which are sampled.

View Source
def sample_balanced_positive_negative(indicator, sample_size, labels, positive_fraction=0.5):

    """Subsamples minibatches to a desired balance of positives and negatives.

    Arguments:

    - *indicator*: boolean tensor of shape [N] whose True entries can be sampled.

    - *sample_size*: desired batch size. If None, keeps all positive samples and

        randomly selects negative samples so that the positive sample fraction

        matches positive_fraction.

    - *labels*: boolean tensor of shape [N] denoting positive(=True) and negative

        (=False) examples.

    - *positive_fraction*: desired fraction of positive examples (scalar in [0,1])

        in the batch.

    Returns:

    *sampled_idx_indicator*: boolean tensor of shape [N], True for entries which are sampled.

    """

    negative_idx = tf.logical_not(labels)

    positive_idx = tf.logical_and(labels, indicator)

    negative_idx = tf.logical_and(negative_idx, indicator)

    # Sample positive and negative samples separately

    if sample_size is None:

        max_num_pos = tf.reduce_sum(tf.cast(positive_idx, dtype=tf.int32))

    else:

        max_num_pos = int(positive_fraction * sample_size)

    sampled_pos_idx = subsample_indicator(positive_idx, max_num_pos)

    num_sampled_pos = tf.reduce_sum(tf.cast(sampled_pos_idx, tf.int32))

    if sample_size is None:

        negative_positive_ratio = (1 - positive_fraction) / positive_fraction

        max_num_neg = tf.cast(negative_positive_ratio * tf.cast(num_sampled_pos, dtype=tf.float32),

                              dtype=tf.int32)

    else:

        max_num_neg = sample_size - num_sampled_pos

    sampled_neg_idx = subsample_indicator(negative_idx, max_num_neg)

    return tf.logical_or(sampled_pos_idx, sampled_neg_idx)

subsample_indicator

def subsample_indicator(
    indicator,
    num_samples
)

Subsample indicator vector.

Given a boolean indicator vector with M elements set to True, the function assigns all but num_samples of these previously True elements to False. If num_samples is greater than M, the original indicator vector is returned.

Arguments: - indicator: a 1-dimensional boolean tensor indicating which elements are allowed to be sampled and which are not.

  • num_samples: int32 scalar tensor

Returns:

A boolean tensor with the same shape as input (indicator) tensor

View Source
def subsample_indicator(indicator, num_samples):

    """Subsample indicator vector.

    Given a boolean indicator vector with M elements set to `True`, the function

    assigns all but `num_samples` of these previously `True` elements to

    `False`. If `num_samples` is greater than M, the original indicator vector

    is returned.

    Arguments:

    - *indicator*: a 1-dimensional boolean tensor indicating which elements

        are allowed to be sampled and which are not.

    - *num_samples*: int32 scalar tensor

    Returns:

    A boolean tensor with the same shape as input (indicator) tensor

    """

    indices = tf.where(indicator)

    indices = tf.random.shuffle(indices)

    indices = tf.reshape(indices, [-1])

    num_samples = tf.minimum(tf.size(indices), num_samples)

    selected_indices = tf.slice(indices, [0], tf.reshape(num_samples, [1]))

    selected_indicator = ops.indices_to_dense_vector(selected_indices, tf.shape(indicator)[0])

    return tf.equal(selected_indicator, 1)