"""Image-centric preprocessing ops.
|
|
|
|
All preprocessing ops should return a data processing functors. A data
|
|
is represented as a dictionary of (TF) tensors. The functors output a modified
|
|
dictionary.
|
|
|
|
The key named "image" is commonly used for the image, and is a 3D tensor of
|
|
shape (height x width x channels).
|
|
"""

from big_vision.pp import utils
from big_vision.pp.registry import Registry

import tensorflow as tf
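
# Ops are usually instantiated by name via the registry, e.g. (an illustrative
# sketch; see big_vision.pp.registry for the exact lookup API):
#
#   decode_fn = Registry.lookup("preprocess_ops.decode")()
#   features = decode_fn({"image": encoded_jpeg_bytes})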


@Registry.register("preprocess_ops.decode")
@utils.InKeyOutKey()
def get_decode(channels=3, precise=False):
  """Decode an encoded image string, see tf.io.decode_image.

  Args:
    channels: see tf.io.decode_image.
    precise: if False, use default TF image decoding algorithm.
      If True, change DCT method for JPEG decoding to match PIL/cv2/PyTorch.
      See also (internal link) for a concrete example.

  Returns:
    The decoded image.
  """

  def _decode(image):
    if precise:
      return tf.image.decode_jpeg(
          image, channels=channels, dct_method="INTEGER_ACCURATE")
    else:
      return tf.io.decode_image(
          image, channels=channels, expand_animations=False)

  return _decode


@Registry.register("preprocess_ops.resize")
@utils.InKeyOutKey()
def get_resize(size, method="bilinear", antialias=False):
  """Resizes image to a given size.

  Args:
    size: either an integer H, where H is both the new height and width of the
      resized image, or a list or tuple [H, W] of integers, where H and W are
      the new image's height and width respectively.
    method: resize method, see tf.image.resize docs for options.
    antialias: see tf.image.resize. Ideally set to True for all new configs.

  Returns:
    A function for resizing an image.
  """
  size = utils.maybe_repeat(size, 2)

  def _resize(image):
    """Resizes image to a given size."""
    # Resize in float space, then clip to the input dtype's value range and
    # cast back, so that e.g. a uint8 image comes out as uint8 again.
    dtype = image.dtype
    tf_dtype = tf.type_spec_from_value(image).dtype
    image = tf.image.resize(image, size, method=method, antialias=antialias)
    return tf.cast(tf.clip_by_value(image, tf_dtype.min, tf_dtype.max), dtype)

  return _resize
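
# For example (illustrative sketch), resizing preserves the input dtype:
#
#   out = get_resize(224)({"image": tf.zeros([480, 640, 3], tf.uint8)})
#   # out["image"] has shape (224, 224, 3) and dtype uint8.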


def _resize_factor(image, factor, method="area", antialias=True):
  """Resizes the image by a (float) `factor`, keeping the aspect ratio fixed."""
  h, w = tf.shape(image)[0], tf.shape(image)[1]

  h = tf.cast(tf.round(tf.cast(h, tf.float32) * factor), tf.int32)
  w = tf.cast(tf.round(tf.cast(w, tf.float32) * factor), tf.int32)

  dtype = image.dtype
  tf_dtype = tf.type_spec_from_value(image).dtype
  image = tf.image.resize(image, (h, w), method=method, antialias=antialias)
  return tf.cast(tf.clip_by_value(image, tf_dtype.min, tf_dtype.max), dtype)


@Registry.register("preprocess_ops.resize_small")
@utils.InKeyOutKey()
def get_resize_small(smaller_size, method="area", antialias=False):
  """Resizes the smaller side to `smaller_size`, keeping the aspect ratio.

  Args:
    smaller_size: an integer that represents the new size of the smaller side
      of an input image.
    method: the resize method. `area` is a meaningful, bwd-compat default.
    antialias: see tf.image.resize. Ideally set to True for all new configs.

  Returns:
    A function, that resizes an image and preserves its aspect ratio.

  Note:
    backwards-compat for "area"+antialias tested here:
    (internal link)
  """

  def _resize_small(image):
    h, w = tf.shape(image)[0], tf.shape(image)[1]
    factor = (
        tf.cast(smaller_size, tf.float32) /
        tf.cast(tf.minimum(h, w), tf.float32))
    return _resize_factor(image, factor, method=method, antialias=antialias)

  return _resize_small
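
# A common evaluation pipeline (illustrative sketch): resize the short side,
# then take a central crop:
#
#   data = get_resize_small(256)(data)
#   data = get_central_crop(224)(data)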


@Registry.register("preprocess_ops.resize_long")
@utils.InKeyOutKey()
def get_resize_long(longer_size, method="area", antialias=True):
  """Resizes the longer side to `longer_size`, keeping the aspect ratio.

  Args:
    longer_size: an integer that represents the new size of the longer side of
      an input image.
    method: the resize method. `area` is a meaningful, bwd-compat default.
    antialias: see tf.image.resize. Ideally set to True for all new configs.

  Returns:
    A function, that resizes an image and preserves its aspect ratio.
  """

  def _resize_long(image):
    h, w = tf.shape(image)[0], tf.shape(image)[1]
    factor = (
        tf.cast(longer_size, tf.float32) /
        tf.cast(tf.maximum(h, w), tf.float32))
    return _resize_factor(image, factor, method=method, antialias=antialias)

  return _resize_long


@Registry.register("preprocess_ops.inception_crop")
@utils.InKeyOutKey()
def get_inception_crop(size=None, area_min=5, area_max=100,
                       method="bilinear", antialias=False):
  """Makes inception-style image crop.

  Inception-style crop is a random image crop (its size and aspect ratio are
  random) that was used for training Inception models, see
  https://www.cs.unc.edu/~wliu/papers/GoogLeNet.pdf.

  Args:
    size: Resize image to [size, size] after crop.
    area_min: minimal crop area, as a percentage of the image area.
    area_max: maximal crop area, as a percentage of the image area.
    method: resize method, see tf.image.resize docs for options.
    antialias: see tf.image.resize. Ideally set to True for all new configs.

  Returns:
    A function, that applies inception crop.
  """

  def _inception_crop(image):
    begin, crop_size, _ = tf.image.sample_distorted_bounding_box(
        tf.shape(image),
        tf.zeros([0, 0, 4], tf.float32),
        area_range=(area_min / 100, area_max / 100),
        min_object_covered=0,
        use_image_if_no_bounding_boxes=True)
    crop = tf.slice(image, begin, crop_size)
    # The crop above loses the static channel dimension; restore it manually.
    crop.set_shape([None, None, image.shape[-1]])
    if size:
      crop = get_resize(size, method, antialias)({"image": crop})["image"]
    return crop

  return _inception_crop
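
# Typical training-time usage (illustrative sketch): random crop plus flip:
#
#   data = get_inception_crop(224)(data)
#   data = get_random_flip_lr()(data)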


@Registry.register("preprocess_ops.decode_jpeg_and_inception_crop")
@utils.InKeyOutKey()
def get_decode_jpeg_and_inception_crop(size=None, area_min=5, area_max=100,
                                       ratio_min=0.75, ratio_max=1.33,
                                       method="bilinear", antialias=False):
  """Decode jpeg string and make inception-style image crop.

  Inception-style crop is a random image crop (its size and aspect ratio are
  random) that was used for training Inception models, see
  https://www.cs.unc.edu/~wliu/papers/GoogLeNet.pdf.

  Args:
    size: Resize image to [size, size] after crop.
    area_min: minimal crop area, as a percentage of the image area.
    area_max: maximal crop area, as a percentage of the image area.
    ratio_min: minimal aspect ratio.
    ratio_max: maximal aspect ratio.
    method: resize method, see tf.image.resize docs for options.
    antialias: see tf.image.resize. Ideally set to True for all new configs.

  Returns:
    A function, that applies inception crop.
  """

  def _inception_crop(image_data):
    shape = tf.image.extract_jpeg_shape(image_data)
    begin, crop_size, _ = tf.image.sample_distorted_bounding_box(
        shape,
        tf.zeros([0, 0, 4], tf.float32),
        area_range=(area_min / 100, area_max / 100),
        aspect_ratio_range=(ratio_min, ratio_max),
        min_object_covered=0,
        use_image_if_no_bounding_boxes=True)

    # decode_and_crop_jpeg only decodes the part of the JPEG that falls inside
    # the crop window, which is faster than decoding the full image first.
    offset_y, offset_x, _ = tf.unstack(begin)
    target_height, target_width, _ = tf.unstack(crop_size)
    crop_window = tf.stack([offset_y, offset_x, target_height, target_width])
    image = tf.image.decode_and_crop_jpeg(image_data, crop_window, channels=3)

    if size:
      image = get_resize(size, method, antialias)({"image": image})["image"]

    return image

  return _inception_crop


@Registry.register("preprocess_ops.random_crop")
@utils.InKeyOutKey()
def get_random_crop(crop_size):
  """Makes a random crop of a given size.

  Args:
    crop_size: either an integer H, where H is both the height and width of the
      random crop, or a list or tuple [H, W] of integers, where H and W are
      height and width of the random crop respectively.

  Returns:
    A function, that applies random crop.
  """
  crop_size = utils.maybe_repeat(crop_size, 2)

  def _crop(image):
    return tf.image.random_crop(image, (*crop_size, image.shape[-1]))

  return _crop


@Registry.register("preprocess_ops.central_crop")
@utils.InKeyOutKey()
def get_central_crop(crop_size=None):
  """Makes central crop of a given size.

  Args:
    crop_size: either an integer H, where H is both the height and width of the
      central crop, or a list or tuple [H, W] of integers, where H and W are
      height and width of the central crop respectively. If `crop_size` is not
      specified, then the largest possible center crop will be taken.

  Returns:
    A function, that applies central crop.
  """
  if crop_size:
    crop_size = utils.maybe_repeat(crop_size, 2)

  def _crop(image):
    if crop_size:
      h, w = crop_size[0], crop_size[1]
    else:
      h = w = tf.minimum(tf.shape(image)[0], tf.shape(image)[1])
    dy = (tf.shape(image)[0] - h) // 2
    dx = (tf.shape(image)[1] - w) // 2
    return tf.image.crop_to_bounding_box(image, dy, dx, h, w)

  return _crop
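
# Without `crop_size`, the largest possible center square is taken, e.g. a
# 480x640 image becomes 480x480 (illustrative sketch):
#
#   data = get_central_crop()(data)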


@Registry.register("preprocess_ops.flip_lr")
@utils.InKeyOutKey()
def get_random_flip_lr():
  """Flips an image horizontally with probability 50%."""

  def _random_flip_lr_pp(image):
    return tf.image.random_flip_left_right(image)

  return _random_flip_lr_pp


@Registry.register("preprocess_ops.vgg_value_range")
@utils.InKeyOutKey()
def get_vgg_value_range(
    mean=(0.485 * 255, 0.456 * 255, 0.406 * 255),
    std=(0.229 * 255, 0.224 * 255, 0.225 * 255),
):
  """VGG-style preprocessing, subtracts mean and divides by stddev.

  This preprocessing is very common for ImageNet pre-trained models since VGG,
  and to this day is the standard for models coming from most PyTorch
  codebases.

  Args:
    mean: Tuple of values to be subtracted. Defaults to widespread VGG values.
    std: Tuple of values to be divided by. Defaults to widespread VGG values.

  Returns:
    A function to rescale the values.
  """
  mean = tf.constant(mean, tf.float32)
  std = tf.constant(std, tf.float32)

  def _vgg_value_range(image):
    return (tf.cast(image, tf.float32) - mean) / std

  return _vgg_value_range
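
# Worked example of the arithmetic above: a red-channel value of 128 maps to
# (128 - 0.485 * 255) / (0.229 * 255) ≈ 0.074.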


@Registry.register("preprocess_ops.clip_value_range")
@utils.InKeyOutKey()
def get_clip_value_range():
  """Like vgg_value_range, but with the normalization constants of OpenAI's
  CLIP models (scaled to the [0, 255] input range)."""
  mean = (0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255)
  std = (0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255)

  def _clip_value_range(image):
    return (tf.cast(image, tf.float32) - mean) / std

  return _clip_value_range


@Registry.register("preprocess_ops.convert_to_video")
@utils.InKeyOutKey()
def get_convert_to_video(num_frames):
  """Converts an image to a video with zero padded frames.

  Args:
    num_frames: total number of frames that the video should have.

  Returns:
    A function for converting an image to a video.
  """

  def _convert_to_video(image):
    # Put the image as frame 0 and append `num_frames - 1` all-zero frames.
    return tf.pad(
        tf.expand_dims(image, axis=0),
        [[0, num_frames - 1], [0, 0], [0, 0], [0, 0]],
    )

  return _convert_to_video
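
# E.g. a (224, 224, 3) image becomes a (num_frames, 224, 224, 3) video whose
# first frame is the image and whose remaining frames are all zeros.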
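

if __name__ == "__main__":
  # Minimal smoke-test sketch of the functor convention (assumes this module
  # lives inside the big_vision repo, so the imports above resolve).
  data = {"image": tf.zeros([32, 48, 3], tf.uint8)}
  data = get_resize_small(16)(data)   # Shorter side -> 16, i.e. (16, 24, 3).
  data = get_central_crop(16)(data)   # Center crop -> (16, 16, 3).
  data = get_vgg_value_range()(data)  # uint8 -> normalized float32.
  print(data["image"].shape, data["image"].dtype)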