Module kerod.dataset.preprocessing
View Source
import tensorflow as tf
from kerod.core import constants
from kerod.core.standard_fields import BoxField, DatasetField
from kerod.dataset.utils import filter_crowded_boxes, filter_bad_area
from kerod.dataset import augmentation as aug
def resize_to_min_dim(image, short_edge_length, max_dimension):
    """Resize an image so that its smallest dimension matches `short_edge_length`, maintaining
    the aspect ratio. If one of the image dimensions exceeds `max_dimension` after resizing,
    the image is scaled so that its largest dimension equals `max_dimension`.
    Arguments:
    - *image*: A tensor of shape [height, width, channels].
    - *short_edge_length*: Minimum image dimension.
    - *max_dimension*: If the largest side exceeds this value after resizing, `max_dimension`
    is used to compute the resizing ratio instead.
    Returns:
    - *resized_image*: The input image resized in float32 with the aspect ratio preserved.
    Raises:
    ValueError: If `max_dimension` is above `kerod.core.constants.MAX_IMAGE_DIMENSION`
    """
    if max_dimension > constants.MAX_IMAGE_DIMENSION:
        raise ValueError(
            f"The max_dimension must be less than or equal to {constants.MAX_IMAGE_DIMENSION}")
    shape = tf.shape(image)
    height = tf.cast(shape[0], tf.float32)
    width = tf.cast(shape[1], tf.float32)
    im_size_min = tf.minimum(height, width)
    im_size_max = tf.maximum(height, width)
    scale = short_edge_length / im_size_min
    # Prevent the biggest axis from being more than max_dimension
    if tf.math.round(scale * im_size_max) > max_dimension:
        scale = max_dimension / im_size_max
    target_height = tf.cast(height * scale, dtype=tf.int32)
    target_width = tf.cast(width * scale, dtype=tf.int32)
    return tf.image.resize(tf.expand_dims(image, axis=0),
                           size=[target_height, target_width],
                           method=tf.image.ResizeMethod.BILINEAR)[0]
def preprocess(inputs, bgr=True, horizontal_flip=True, random_crop_size=None, padded_mask=False):
    """This operation performs classical preprocessing for localization datasets:
    - COCO
    - Pascal Voc
    You can easily download those datasets using [tensorflow datasets](https://www.tensorflow.org/datasets/catalog/overview).
    Arguments:
    - *inputs*: Either a [FeaturesDict](https://www.tensorflow.org/datasets/api_docs/python/tfds/features/FeaturesDict) or a dict,
    but it should have the following structure:
    ```python
    inputs = FeaturesDict({
        'image': Image(shape=(None, None, 3), dtype=tf.uint8),
        'objects': Sequence({
            'area': Tensor(shape=(), dtype=tf.int64),  # area
            'bbox': BBoxFeature(shape=(4,), dtype=tf.float32),  # The values are between 0 and 1
            'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=80),
        }),
    })
    ```
    - *bgr*: Convert your input image to BGR (od.model.faster_rcnn.FasterRcnnFPNResnet50 needs it).
    `tf.image.decode_image` opens images in RGB, whereas OpenCV opens them in BGR by default.
    - *horizontal_flip*: Activate the random horizontal flip.
    - *random_crop_size*: 1-D tensor whose size matches the rank of `image` (e.g. (400, 600, 0)).
    - *padded_mask*: If set to true, return a mask of ones with the shape of the image. After
    padding, it indicates which parts come from the original image.
    Returns:
    - *inputs*:
    1. image: A 3D tensor of float32 and shape [None, None, 3]
    2. image_informations: A 1D tensor of float32 containing (height, width), the shape of the
    image without any padding. It is useful when followed by a `padded_batch` operation;
    the models need this information in order to clip the boxes to the proper dimensions.
    3. images_padding_mask: If padded_mask is set to true, a 2D tensor of int8 and shape [None, None].
    Mask of the image; if padding is performed, it indicates where the original image was.
    - *ground_truths*:
    1. BoxField.BOXES: A tensor of shape [num_boxes, (y1, x1, y2, x2)] scaled to the image dimensions
    2. BoxField.LABELS: A tensor of shape [num_boxes]
    3. BoxField.NUM_BOXES: A tensor of shape [1]. It is useful to unpad the data in the case of batched training
    4. BoxField.WEIGHTS: A tensor of shape [num_boxes] filled with ones
    """
    image = inputs['image'][:, :, ::-1] if bgr else inputs['image']
    image = tf.cast(image, tf.float32)
    targets = inputs['objects']
    if horizontal_flip:
        image, targets[BoxField.BOXES] = aug.random_horizontal_flip(image, targets[BoxField.BOXES])
    if random_crop_size is not None:
        if tf.shape(image)[0] < random_crop_size[0] or tf.shape(image)[1] < random_crop_size[1]:
            image = resize_to_min_dim(image, max(random_crop_size), 1333.0)
        image, targets = aug.random_random_crop(image, random_crop_size, targets)
    if 'is_crowd' in targets:
        targets = filter_crowded_boxes(targets)
    targets = filter_bad_area(targets)
    image = resize_to_min_dim(image, 800.0, 1333.0)
    image_information = tf.cast(tf.shape(image)[:2], dtype=tf.float32)
    inputs = {DatasetField.IMAGES: image, DatasetField.IMAGES_INFO: image_information}
    if padded_mask:
        inputs[DatasetField.IMAGES_PMASK] = tf.ones((tf.shape(image)[0], tf.shape(image)[1]),
                                                    dtype=tf.int8)
    ground_truths = {
        BoxField.BOXES: targets[BoxField.BOXES] * tf.tile(image_information[tf.newaxis], [1, 2]),
        BoxField.LABELS: tf.cast(targets[BoxField.LABELS], tf.int32),
        BoxField.NUM_BOXES: tf.shape(targets[BoxField.LABELS]),
        BoxField.WEIGHTS: tf.fill(tf.shape(targets[BoxField.LABELS]), 1.0)
    }
    return inputs, ground_truths
def expand_dims_for_single_batch(inputs, ground_truths):
    """In order to train your model you need to add a batch dimension to the output of the
    `preprocess` function. For a batch of size one, this method is faster than `batch` or
    `padded_batch`:
    - `expand_dims`:
    ```python
    ds_train = tfds.load(name="voc", split="train", shuffle_files=True)
    ds_train = ds_train.map(preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_train = ds_train.map(expand_dims_for_single_batch, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ```
    > Execution time: 0.002636657891998766
    - `batch`:
    ```python
    ds_train = tfds.load(name="voc", split="train", shuffle_files=True)
    ds_train = ds_train.map(preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_train = ds_train.batch(1)
    ```
    > Execution time: 0.004332915792008862
    - `padded_batch`:
    ```python
    ds_train = tfds.load(name="voc", split="train", shuffle_files=True)
    ds_train = ds_train.map(preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_train = ds_train.padded_batch(batch_size, padded_shapes=padded_shapes)
    ```
    > Execution time: 0.0055130551019974515
    Returns:
    - *inputs*:
    1. DatasetField.IMAGES: A 4D tensor of float32 and shape [1, None, None, 3]
    2. DatasetField.IMAGES_INFO: A 2D tensor of float32 and shape [1, 2] containing
    (height, width), the shape of the image without any padding. It is useful when followed
    by a `padded_batch` operation; the models need this information in order to clip the
    boxes to the proper dimensions.
    - *ground_truths*:
    1. BoxField.BOXES: A tensor of shape [1, num_boxes, (y1, x1, y2, x2)] scaled to the image dimensions
    2. BoxField.LABELS: A tensor of shape [1, num_boxes]
    3. BoxField.NUM_BOXES: A tensor of shape [1, 1]. It is useful to unpad the data in the case of batched training
    4. BoxField.WEIGHTS: A tensor of shape [1, num_boxes]
    """
    inputs = {
        DatasetField.IMAGES: inputs[DatasetField.IMAGES][None],
        DatasetField.IMAGES_INFO: inputs[DatasetField.IMAGES_INFO][None]
    }
    ground_truths = {
        BoxField.BOXES: ground_truths[BoxField.BOXES][None],
        BoxField.LABELS: ground_truths[BoxField.LABELS][None],
        BoxField.NUM_BOXES: ground_truths[BoxField.NUM_BOXES][None],
        BoxField.WEIGHTS: ground_truths[BoxField.WEIGHTS][None]
    }
    return inputs, ground_truths
Functions
expand_dims_for_single_batch
```python
def expand_dims_for_single_batch(
    inputs,
    ground_truths
)
```
In order to train your model you need to add a batch dimension to the output of the `preprocess` function. For a batch of size one, this method is faster than `batch` or `padded_batch`:

- `expand_dims`:

```python
ds_train = tfds.load(name="voc", split="train", shuffle_files=True)
ds_train = ds_train.map(preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
ds_train = ds_train.map(expand_dims_for_single_batch, num_parallel_calls=tf.data.experimental.AUTOTUNE)
```

> Execution time: 0.002636657891998766

- `batch`:

```python
ds_train = tfds.load(name="voc", split="train", shuffle_files=True)
ds_train = ds_train.map(preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
ds_train = ds_train.batch(1)
```

> Execution time: 0.004332915792008862

- `padded_batch`:

```python
ds_train = tfds.load(name="voc", split="train", shuffle_files=True)
ds_train = ds_train.map(preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
ds_train = ds_train.padded_batch(batch_size, padded_shapes=padded_shapes)
```

> Execution time: 0.0055130551019974515

Returns:

- *inputs*:
  1. DatasetField.IMAGES: A 4D tensor of float32 and shape [1, None, None, 3]
  2. DatasetField.IMAGES_INFO: A 2D tensor of float32 and shape [1, 2] containing (height, width), the shape of the image without any padding. It is useful when followed by a `padded_batch` operation; the models need this information in order to clip the boxes to the proper dimensions.
- *ground_truths*:
  1. BoxField.BOXES: A tensor of shape [1, num_boxes, (y1, x1, y2, x2)] scaled to the image dimensions
  2. BoxField.LABELS: A tensor of shape [1, num_boxes]
  3. BoxField.NUM_BOXES: A tensor of shape [1, 1]. It is useful to unpad the data in the case of batched training
  4. BoxField.WEIGHTS: A tensor of shape [1, num_boxes]
View Source
def expand_dims_for_single_batch(inputs, ground_truths):
    """In order to train your model you need to add a batch dimension to the output of the
    `preprocess` function. For a batch of size one, this method is faster than `batch` or
    `padded_batch`:
    - `expand_dims`:
    ```python
    ds_train = tfds.load(name="voc", split="train", shuffle_files=True)
    ds_train = ds_train.map(preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_train = ds_train.map(expand_dims_for_single_batch, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ```
    > Execution time: 0.002636657891998766
    - `batch`:
    ```python
    ds_train = tfds.load(name="voc", split="train", shuffle_files=True)
    ds_train = ds_train.map(preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_train = ds_train.batch(1)
    ```
    > Execution time: 0.004332915792008862
    - `padded_batch`:
    ```python
    ds_train = tfds.load(name="voc", split="train", shuffle_files=True)
    ds_train = ds_train.map(preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_train = ds_train.padded_batch(batch_size, padded_shapes=padded_shapes)
    ```
    > Execution time: 0.0055130551019974515
    Returns:
    - *inputs*:
    1. DatasetField.IMAGES: A 4D tensor of float32 and shape [1, None, None, 3]
    2. DatasetField.IMAGES_INFO: A 2D tensor of float32 and shape [1, 2] containing
    (height, width), the shape of the image without any padding. It is useful when followed
    by a `padded_batch` operation; the models need this information in order to clip the
    boxes to the proper dimensions.
    - *ground_truths*:
    1. BoxField.BOXES: A tensor of shape [1, num_boxes, (y1, x1, y2, x2)] scaled to the image dimensions
    2. BoxField.LABELS: A tensor of shape [1, num_boxes]
    3. BoxField.NUM_BOXES: A tensor of shape [1, 1]. It is useful to unpad the data in the case of batched training
    4. BoxField.WEIGHTS: A tensor of shape [1, num_boxes]
    """
    inputs = {
        DatasetField.IMAGES: inputs[DatasetField.IMAGES][None],
        DatasetField.IMAGES_INFO: inputs[DatasetField.IMAGES_INFO][None]
    }
    ground_truths = {
        BoxField.BOXES: ground_truths[BoxField.BOXES][None],
        BoxField.LABELS: ground_truths[BoxField.LABELS][None],
        BoxField.NUM_BOXES: ground_truths[BoxField.NUM_BOXES][None],
        BoxField.WEIGHTS: ground_truths[BoxField.WEIGHTS][None]
    }
    return inputs, ground_truths
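As a quick illustration of the shapes involved, here is a minimal sketch with hypothetical tensor values (the dummy image size, boxes, and labels are made up for the example):

```python
import tensorflow as tf
from kerod.core.standard_fields import BoxField, DatasetField
from kerod.dataset.preprocessing import expand_dims_for_single_batch

# Hypothetical unbatched sample, mimicking the output of `preprocess`:
# a 600x800 image with two ground-truth boxes.
inputs = {
    DatasetField.IMAGES: tf.zeros((600, 800, 3), tf.float32),
    DatasetField.IMAGES_INFO: tf.constant([600.0, 800.0]),
}
ground_truths = {
    BoxField.BOXES: tf.constant([[10., 10., 100., 100.], [50., 50., 200., 200.]]),
    BoxField.LABELS: tf.constant([1, 2], tf.int32),
    BoxField.NUM_BOXES: tf.constant([2], tf.int32),
    BoxField.WEIGHTS: tf.constant([1.0, 1.0]),
}

batched_inputs, batched_gt = expand_dims_for_single_batch(inputs, ground_truths)
print(batched_inputs[DatasetField.IMAGES].shape)  # (1, 600, 800, 3)
print(batched_gt[BoxField.BOXES].shape)           # (1, 2, 4)
print(batched_gt[BoxField.NUM_BOXES].shape)       # (1, 1)
```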
preprocess
```python
def preprocess(
    inputs,
    bgr=True,
    horizontal_flip=True,
    random_crop_size=None,
    padded_mask=False
)
```
This operation performs classical preprocessing for localization datasets:

- COCO
- Pascal Voc

You can easily download those datasets using [tensorflow datasets](https://www.tensorflow.org/datasets/catalog/overview).

Arguments:

- *inputs*: Either a [FeaturesDict](https://www.tensorflow.org/datasets/api_docs/python/tfds/features/FeaturesDict) or a dict, but it should have the following structure:

```python
inputs = FeaturesDict({
    'image': Image(shape=(None, None, 3), dtype=tf.uint8),
    'objects': Sequence({
        'area': Tensor(shape=(), dtype=tf.int64),  # area
        'bbox': BBoxFeature(shape=(4,), dtype=tf.float32),  # The values are between 0 and 1
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=80),
    }),
})
```

- *bgr*: Convert your input image to BGR (od.model.faster_rcnn.FasterRcnnFPNResnet50 needs it). `tf.image.decode_image` opens images in RGB, whereas OpenCV opens them in BGR by default.
- *horizontal_flip*: Activate the random horizontal flip.
- *random_crop_size*: 1-D tensor whose size matches the rank of `image` (e.g. (400, 600, 0)).
- *padded_mask*: If set to true, return a mask of ones with the shape of the image. After padding, it indicates which parts come from the original image.

Returns:

- *inputs*:
  1. image: A 3D tensor of float32 and shape [None, None, 3]
  2. image_informations: A 1D tensor of float32 containing (height, width), the shape of the image without any padding. It is useful when followed by a `padded_batch` operation; the models need this information in order to clip the boxes to the proper dimensions.
  3. images_padding_mask: If padded_mask is set to true, a 2D tensor of int8 and shape [None, None]. Mask of the image; if padding is performed, it indicates where the original image was.
- *ground_truths*:
  1. BoxField.BOXES: A tensor of shape [num_boxes, (y1, x1, y2, x2)] scaled to the image dimensions
  2. BoxField.LABELS: A tensor of shape [num_boxes]
  3. BoxField.NUM_BOXES: A tensor of shape [1]. It is useful to unpad the data in the case of batched training
  4. BoxField.WEIGHTS: A tensor of shape [num_boxes] filled with ones
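Since `preprocess` accepts a plain dict as well as a FeaturesDict, a direct call can be sketched as below. This is a minimal, hypothetical sample; it assumes the filtering utilities accept just the fields shown (real tfds samples also carry 'area' and 'is_crowd'):

```python
import tensorflow as tf
from kerod.core.standard_fields import BoxField
from kerod.dataset.preprocessing import preprocess

# Hypothetical 480x640 dummy sample following the structure documented above.
sample = {
    'image': tf.zeros((480, 640, 3), tf.uint8),
    'objects': {
        BoxField.BOXES: tf.constant([[0.1, 0.1, 0.5, 0.5]]),  # normalized [y1, x1, y2, x2]
        BoxField.LABELS: tf.constant([3], tf.int64),
    },
}
inputs, ground_truths = preprocess(sample, horizontal_flip=False)
# The boxes are now expressed in pixels of the resized image.
print(ground_truths[BoxField.BOXES])
```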
View Source
def preprocess(inputs, bgr=True, horizontal_flip=True, random_crop_size=None, padded_mask=False):
    """This operation performs classical preprocessing for localization datasets:
    - COCO
    - Pascal Voc
    You can easily download those datasets using [tensorflow datasets](https://www.tensorflow.org/datasets/catalog/overview).
    Arguments:
    - *inputs*: Either a [FeaturesDict](https://www.tensorflow.org/datasets/api_docs/python/tfds/features/FeaturesDict) or a dict,
    but it should have the following structure:
    ```python
    inputs = FeaturesDict({
        'image': Image(shape=(None, None, 3), dtype=tf.uint8),
        'objects': Sequence({
            'area': Tensor(shape=(), dtype=tf.int64),  # area
            'bbox': BBoxFeature(shape=(4,), dtype=tf.float32),  # The values are between 0 and 1
            'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=80),
        }),
    })
    ```
    - *bgr*: Convert your input image to BGR (od.model.faster_rcnn.FasterRcnnFPNResnet50 needs it).
    `tf.image.decode_image` opens images in RGB, whereas OpenCV opens them in BGR by default.
    - *horizontal_flip*: Activate the random horizontal flip.
    - *random_crop_size*: 1-D tensor whose size matches the rank of `image` (e.g. (400, 600, 0)).
    - *padded_mask*: If set to true, return a mask of ones with the shape of the image. After
    padding, it indicates which parts come from the original image.
    Returns:
    - *inputs*:
    1. image: A 3D tensor of float32 and shape [None, None, 3]
    2. image_informations: A 1D tensor of float32 containing (height, width), the shape of the
    image without any padding. It is useful when followed by a `padded_batch` operation;
    the models need this information in order to clip the boxes to the proper dimensions.
    3. images_padding_mask: If padded_mask is set to true, a 2D tensor of int8 and shape [None, None].
    Mask of the image; if padding is performed, it indicates where the original image was.
    - *ground_truths*:
    1. BoxField.BOXES: A tensor of shape [num_boxes, (y1, x1, y2, x2)] scaled to the image dimensions
    2. BoxField.LABELS: A tensor of shape [num_boxes]
    3. BoxField.NUM_BOXES: A tensor of shape [1]. It is useful to unpad the data in the case of batched training
    4. BoxField.WEIGHTS: A tensor of shape [num_boxes] filled with ones
    """
    image = inputs['image'][:, :, ::-1] if bgr else inputs['image']
    image = tf.cast(image, tf.float32)
    targets = inputs['objects']
    if horizontal_flip:
        image, targets[BoxField.BOXES] = aug.random_horizontal_flip(image, targets[BoxField.BOXES])
    if random_crop_size is not None:
        if tf.shape(image)[0] < random_crop_size[0] or tf.shape(image)[1] < random_crop_size[1]:
            image = resize_to_min_dim(image, max(random_crop_size), 1333.0)
        image, targets = aug.random_random_crop(image, random_crop_size, targets)
    if 'is_crowd' in targets:
        targets = filter_crowded_boxes(targets)
    targets = filter_bad_area(targets)
    image = resize_to_min_dim(image, 800.0, 1333.0)
    image_information = tf.cast(tf.shape(image)[:2], dtype=tf.float32)
    inputs = {DatasetField.IMAGES: image, DatasetField.IMAGES_INFO: image_information}
    if padded_mask:
        inputs[DatasetField.IMAGES_PMASK] = tf.ones((tf.shape(image)[0], tf.shape(image)[1]),
                                                    dtype=tf.int8)
    ground_truths = {
        BoxField.BOXES: targets[BoxField.BOXES] * tf.tile(image_information[tf.newaxis], [1, 2]),
        BoxField.LABELS: tf.cast(targets[BoxField.LABELS], tf.int32),
        BoxField.NUM_BOXES: tf.shape(targets[BoxField.LABELS]),
        BoxField.WEIGHTS: tf.fill(tf.shape(targets[BoxField.LABELS]), 1.0)
    }
    return inputs, ground_truths
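For reference, here is a sketch of a padded-batch training pipeline built on `preprocess`. The `padded_shapes` below are assumptions derived from the shapes documented above, not values prescribed by kerod:

```python
import tensorflow as tf
import tensorflow_datasets as tfds
from kerod.core.standard_fields import BoxField, DatasetField
from kerod.dataset.preprocessing import preprocess

ds_train = tfds.load(name="voc", split="train", shuffle_files=True)
ds_train = ds_train.map(preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Assumed padded shapes: image sizes and box counts vary per sample and are
# padded to the batch maximum; IMAGES_INFO and NUM_BOXES have fixed shapes.
padded_shapes = (
    {DatasetField.IMAGES: [None, None, 3], DatasetField.IMAGES_INFO: [2]},
    {
        BoxField.BOXES: [None, 4],
        BoxField.LABELS: [None],
        BoxField.NUM_BOXES: [1],
        BoxField.WEIGHTS: [None],
    },
)
ds_train = ds_train.padded_batch(2, padded_shapes=padded_shapes)
```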
resize_to_min_dim
```python
def resize_to_min_dim(
    image,
    short_edge_length,
    max_dimension
)
```
Resize an image so that its smallest dimension matches `short_edge_length`, maintaining the aspect ratio.

If one of the image dimensions exceeds `max_dimension` after resizing, the image is scaled so that its largest dimension equals `max_dimension`.

Arguments:

- *image*: A tensor of shape [height, width, channels].
- *short_edge_length*: Minimum image dimension.
- *max_dimension*: If the largest side exceeds this value after resizing, `max_dimension` is used to compute the resizing ratio instead.

Returns:

- *resized_image*: The input image resized in float32 with the aspect ratio preserved.

Raises:

ValueError: If `max_dimension` is above `kerod.core.constants.MAX_IMAGE_DIMENSION`
View Source
def resize_to_min_dim(image, short_edge_length, max_dimension):
    """Resize an image so that its smallest dimension matches `short_edge_length`, maintaining
    the aspect ratio. If one of the image dimensions exceeds `max_dimension` after resizing,
    the image is scaled so that its largest dimension equals `max_dimension`.
    Arguments:
    - *image*: A tensor of shape [height, width, channels].
    - *short_edge_length*: Minimum image dimension.
    - *max_dimension*: If the largest side exceeds this value after resizing, `max_dimension`
    is used to compute the resizing ratio instead.
    Returns:
    - *resized_image*: The input image resized in float32 with the aspect ratio preserved.
    Raises:
    ValueError: If `max_dimension` is above `kerod.core.constants.MAX_IMAGE_DIMENSION`
    """
    if max_dimension > constants.MAX_IMAGE_DIMENSION:
        raise ValueError(
            f"The max_dimension must be less than or equal to {constants.MAX_IMAGE_DIMENSION}")
    shape = tf.shape(image)
    height = tf.cast(shape[0], tf.float32)
    width = tf.cast(shape[1], tf.float32)
    im_size_min = tf.minimum(height, width)
    im_size_max = tf.maximum(height, width)
    scale = short_edge_length / im_size_min
    # Prevent the biggest axis from being more than max_dimension
    if tf.math.round(scale * im_size_max) > max_dimension:
        scale = max_dimension / im_size_max
    target_height = tf.cast(height * scale, dtype=tf.int32)
    target_width = tf.cast(width * scale, dtype=tf.int32)
    return tf.image.resize(tf.expand_dims(image, axis=0),
                           size=[target_height, target_width],
                           method=tf.image.ResizeMethod.BILINEAR)[0]
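To make the scaling rule concrete, here is a small worked example with arbitrary input sizes (output sizes are approximate, since the scale is computed in float32 and truncated to int32):

```python
import tensorflow as tf
from kerod.dataset.preprocessing import resize_to_min_dim

# Short side 600 -> scale = 800 / 600 ≈ 1.333; the long side would become
# 1200 * 1.333 = 1600 > 1333, so the scale falls back to 1333 / 1200 ≈ 1.111.
capped = resize_to_min_dim(tf.zeros((600, 1200, 3)), 800.0, 1333.0)
print(capped.shape)  # about (666, 1333, 3)

# Short side 600 -> scale ≈ 1.333; the long side becomes 800 * 1.333 ≈ 1067,
# which stays under 1333, so no cap is applied.
uncapped = resize_to_min_dim(tf.zeros((600, 800, 3)), 800.0, 1333.0)
print(uncapped.shape)  # about (800, 1066, 3)
```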