"""Image-centric preprocessing ops.
|
|
|
|
All preprocessing ops should return a data processing functors. A data
|
|
is represented as a dictionary of (TF) tensors. The functors output a modified
|
|
dictionary.
|
|
|
|
The key named "image" is commonly used for the image, and is a 3D tensor of
|
|
shape (height x width x channels).
|
|
"""

from big_vision.pp import utils
from big_vision.pp.registry import Registry

import tensorflow as tf
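
# Ops are usually instantiated by name via the registry, e.g. (an illustrative
# sketch; see big_vision.pp.registry for the exact lookup API):
#
#   decode_fn = Registry.lookup("preprocess_ops.decode")()
#   features = decode_fn({"image": encoded_jpeg_bytes})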


@Registry.register("preprocess_ops.decode")
@utils.InKeyOutKey()
def get_decode(channels=3, precise=False):
  """Decode an encoded image string, see tf.io.decode_image.

  Args:
    channels: see tf.io.decode_image.
    precise: if False, use default TF image decoding algorithm.
      If True, change DCT method for JPEG decoding to match PIL/cv2/PyTorch.
      See also (internal link) for a concrete example.

  Returns:
    The decoded image.
  """

  def _decode(image):
    if precise:
      return tf.image.decode_jpeg(
          image, channels=channels, dct_method="INTEGER_ACCURATE")
    else:
      return tf.io.decode_image(
          image, channels=channels, expand_animations=False)

  return _decode


@Registry.register("preprocess_ops.resize")
@utils.InKeyOutKey()
def get_resize(size, method="bilinear", antialias=False):
  """Resizes image to a given size.

  Args:
    size: either an integer H, where H is both the new height and width of the
      resized image, or a list or tuple [H, W] of integers, where H and W are
      the new image's height and width respectively.
    method: resize method, see tf.image.resize docs for options.
    antialias: see tf.image.resize. Ideally set to True for all new configs.

  Returns:
    A function for resizing an image.
  """
  size = utils.maybe_repeat(size, 2)

  def _resize(image):
    """Resizes image to a given size."""
    # Resize in float space, then clip to the input dtype's value range and
    # cast back, so that e.g. a uint8 image comes out as uint8 again.
    dtype = image.dtype
    tf_dtype = tf.type_spec_from_value(image).dtype
    image = tf.image.resize(image, size, method=method, antialias=antialias)
    return tf.cast(tf.clip_by_value(image, tf_dtype.min, tf_dtype.max), dtype)

  return _resize
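
# For example (illustrative sketch), resizing preserves the input dtype:
#
#   out = get_resize(224)({"image": tf.zeros([480, 640, 3], tf.uint8)})
#   # out["image"] has shape (224, 224, 3) and dtype uint8.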


def _resize_factor(image, factor, method="area", antialias=True):
  """Resizes the image by a (float) `factor`, keeping the aspect ratio fixed."""
  h, w = tf.shape(image)[0], tf.shape(image)[1]

  h = tf.cast(tf.round(tf.cast(h, tf.float32) * factor), tf.int32)
  w = tf.cast(tf.round(tf.cast(w, tf.float32) * factor), tf.int32)

  dtype = image.dtype
  tf_dtype = tf.type_spec_from_value(image).dtype
  image = tf.image.resize(image, (h, w), method=method, antialias=antialias)
  return tf.cast(tf.clip_by_value(image, tf_dtype.min, tf_dtype.max), dtype)


@Registry.register("preprocess_ops.resize_small")
@utils.InKeyOutKey()
def get_resize_small(smaller_size, method="area", antialias=False):
  """Resizes the smaller side to `smaller_size`, keeping the aspect ratio.

  Args:
    smaller_size: an integer that represents the new size of the smaller side
      of an input image.
    method: the resize method. `area` is a meaningful, bwd-compat default.
    antialias: see tf.image.resize. Ideally set to True for all new configs.

  Returns:
    A function, that resizes an image and preserves its aspect ratio.

  Note:
    backwards-compat for "area"+antialias tested here:
    (internal link)
  """

  def _resize_small(image):
    h, w = tf.shape(image)[0], tf.shape(image)[1]
    factor = (
        tf.cast(smaller_size, tf.float32) /
        tf.cast(tf.minimum(h, w), tf.float32))
    return _resize_factor(image, factor, method=method, antialias=antialias)

  return _resize_small
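
# A common evaluation pipeline (illustrative sketch): resize the short side,
# then take a central crop:
#
#   data = get_resize_small(256)(data)
#   data = get_central_crop(224)(data)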


@Registry.register("preprocess_ops.resize_long")
@utils.InKeyOutKey()
def get_resize_long(longer_size, method="area", antialias=True):
  """Resizes the longer side to `longer_size`, keeping the aspect ratio.

  Args:
    longer_size: an integer that represents the new size of the longer side of
      an input image.
    method: the resize method. `area` is a meaningful, bwd-compat default.
    antialias: see tf.image.resize. Ideally set to True for all new configs.

  Returns:
    A function, that resizes an image and preserves its aspect ratio.
  """

  def _resize_long(image):
    h, w = tf.shape(image)[0], tf.shape(image)[1]
    factor = (
        tf.cast(longer_size, tf.float32) /
        tf.cast(tf.maximum(h, w), tf.float32))
    return _resize_factor(image, factor, method=method, antialias=antialias)

  return _resize_long


@Registry.register("preprocess_ops.inception_crop")
@utils.InKeyOutKey()
def get_inception_crop(size=None, area_min=5, area_max=100,
                       method="bilinear", antialias=False):
  """Makes inception-style image crop.

  Inception-style crop is a random image crop (its size and aspect ratio are
  random) that was used for training Inception models, see
  https://www.cs.unc.edu/~wliu/papers/GoogLeNet.pdf.

  Args:
    size: Resize image to [size, size] after crop.
    area_min: minimal crop area, as a percentage of the image area.
    area_max: maximal crop area, as a percentage of the image area.
    method: resize method, see tf.image.resize docs for options.
    antialias: see tf.image.resize. Ideally set to True for all new configs.

  Returns:
    A function, that applies inception crop.
  """

  def _inception_crop(image):
    begin, crop_size, _ = tf.image.sample_distorted_bounding_box(
        tf.shape(image),
        tf.zeros([0, 0, 4], tf.float32),
        area_range=(area_min / 100, area_max / 100),
        min_object_covered=0,
        use_image_if_no_bounding_boxes=True)
    crop = tf.slice(image, begin, crop_size)
    # The crop above loses the static channel dimension; restore it manually.
    crop.set_shape([None, None, image.shape[-1]])
    if size:
      crop = get_resize(size, method, antialias)({"image": crop})["image"]
    return crop

  return _inception_crop
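
# Typical training-time usage (illustrative sketch): random crop plus flip:
#
#   data = get_inception_crop(224)(data)
#   data = get_random_flip_lr()(data)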


@Registry.register("preprocess_ops.decode_jpeg_and_inception_crop")
@utils.InKeyOutKey()
def get_decode_jpeg_and_inception_crop(size=None, area_min=5, area_max=100,
                                       ratio_min=0.75, ratio_max=1.33,
                                       method="bilinear", antialias=False):
  """Decode jpeg string and make inception-style image crop.

  Inception-style crop is a random image crop (its size and aspect ratio are
  random) that was used for training Inception models, see
  https://www.cs.unc.edu/~wliu/papers/GoogLeNet.pdf.

  Args:
    size: Resize image to [size, size] after crop.
    area_min: minimal crop area, as a percentage of the image area.
    area_max: maximal crop area, as a percentage of the image area.
    ratio_min: minimal aspect ratio.
    ratio_max: maximal aspect ratio.
    method: resize method, see tf.image.resize docs for options.
    antialias: see tf.image.resize. Ideally set to True for all new configs.

  Returns:
    A function, that applies inception crop.
  """

  def _inception_crop(image_data):
    shape = tf.image.extract_jpeg_shape(image_data)
    begin, crop_size, _ = tf.image.sample_distorted_bounding_box(
        shape,
        tf.zeros([0, 0, 4], tf.float32),
        area_range=(area_min / 100, area_max / 100),
        aspect_ratio_range=(ratio_min, ratio_max),
        min_object_covered=0,
        use_image_if_no_bounding_boxes=True)

    # decode_and_crop_jpeg only decodes the part of the JPEG that falls inside
    # the crop window, which is faster than decoding the full image first.
    offset_y, offset_x, _ = tf.unstack(begin)
    target_height, target_width, _ = tf.unstack(crop_size)
    crop_window = tf.stack([offset_y, offset_x, target_height, target_width])
    image = tf.image.decode_and_crop_jpeg(image_data, crop_window, channels=3)

    if size:
      image = get_resize(size, method, antialias)({"image": image})["image"]

    return image

  return _inception_crop


@Registry.register("preprocess_ops.random_crop")
@utils.InKeyOutKey()
def get_random_crop(crop_size):
  """Makes a random crop of a given size.

  Args:
    crop_size: either an integer H, where H is both the height and width of the
      random crop, or a list or tuple [H, W] of integers, where H and W are
      height and width of the random crop respectively.

  Returns:
    A function, that applies random crop.
  """
  crop_size = utils.maybe_repeat(crop_size, 2)

  def _crop(image):
    return tf.image.random_crop(image, (*crop_size, image.shape[-1]))

  return _crop


@Registry.register("preprocess_ops.central_crop")
@utils.InKeyOutKey()
def get_central_crop(crop_size=None):
  """Makes central crop of a given size.

  Args:
    crop_size: either an integer H, where H is both the height and width of the
      central crop, or a list or tuple [H, W] of integers, where H and W are
      height and width of the central crop respectively. If `crop_size` is not
      specified, then the largest possible center crop will be taken.

  Returns:
    A function, that applies central crop.
  """
  if crop_size:
    crop_size = utils.maybe_repeat(crop_size, 2)

  def _crop(image):
    if crop_size:
      h, w = crop_size[0], crop_size[1]
    else:
      h = w = tf.minimum(tf.shape(image)[0], tf.shape(image)[1])
    dy = (tf.shape(image)[0] - h) // 2
    dx = (tf.shape(image)[1] - w) // 2
    return tf.image.crop_to_bounding_box(image, dy, dx, h, w)

  return _crop
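
# Without `crop_size`, the largest possible center square is taken, e.g. a
# 480x640 image becomes 480x480 (illustrative sketch):
#
#   data = get_central_crop()(data)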


@Registry.register("preprocess_ops.flip_lr")
@utils.InKeyOutKey()
def get_random_flip_lr():
  """Flips an image horizontally with probability 50%."""

  def _random_flip_lr_pp(image):
    return tf.image.random_flip_left_right(image)

  return _random_flip_lr_pp


@Registry.register("preprocess_ops.vgg_value_range")
@utils.InKeyOutKey()
def get_vgg_value_range(
    mean=(0.485 * 255, 0.456 * 255, 0.406 * 255),
    std=(0.229 * 255, 0.224 * 255, 0.225 * 255),
):
  """VGG-style preprocessing, subtracts mean and divides by stddev.

  This preprocessing is very common for ImageNet pre-trained models since VGG,
  and to this day is the standard for models coming from most PyTorch
  codebases.

  Args:
    mean: Tuple of values to be subtracted. Defaults to widespread VGG values.
    std: Tuple of values to be divided by. Defaults to widespread VGG values.

  Returns:
    A function to rescale the values.
  """
  mean = tf.constant(mean, tf.float32)
  std = tf.constant(std, tf.float32)

  def _vgg_value_range(image):
    return (tf.cast(image, tf.float32) - mean) / std

  return _vgg_value_range
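
# Worked example of the arithmetic above: a red-channel value of 128 maps to
# (128 - 0.485 * 255) / (0.229 * 255) ≈ 0.074.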


@Registry.register("preprocess_ops.clip_value_range")
@utils.InKeyOutKey()
def get_clip_value_range():
  """Like vgg_value_range, but with the normalization constants of OpenAI's
  CLIP models (scaled to the [0, 255] input range)."""
  mean = (0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255)
  std = (0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255)

  def _clip_value_range(image):
    return (tf.cast(image, tf.float32) - mean) / std

  return _clip_value_range


@Registry.register("preprocess_ops.convert_to_video")
@utils.InKeyOutKey()
def get_convert_to_video(num_frames):
  """Converts an image to a video with zero padded frames.

  Args:
    num_frames: total number of frames that the video should have.

  Returns:
    A function for converting an image to a video.
  """

  def _convert_to_video(image):
    # Put the image as frame 0 and append `num_frames - 1` all-zero frames.
    return tf.pad(
        tf.expand_dims(image, axis=0),
        [[0, num_frames - 1], [0, 0], [0, 0], [0, 0]],
    )

  return _convert_to_video
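
# E.g. a (224, 224, 3) image becomes a (num_frames, 224, 224, 3) video whose
# first frame is the image and whose remaining frames are all zeros.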
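

if __name__ == "__main__":
  # Minimal smoke-test sketch of the functor convention (assumes this module
  # lives inside the big_vision repo, so the imports above resolve).
  data = {"image": tf.zeros([32, 48, 3], tf.uint8)}
  data = get_resize_small(16)(data)   # Shorter side -> 16, i.e. (16, 24, 3).
  data = get_central_crop(16)(data)   # Center crop -> (16, 16, 3).
  data = get_vgg_value_range()(data)  # uint8 -> normalized float32.
  print(data["image"].shape, data["image"].dtype)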