"""Utils for few-shot evaluation."""
import functools

import big_vision.datasets.core as ds_core
import big_vision.input_pipeline as input_pipeline
import big_vision.pp.builder as pp_builder
import big_vision.utils as u
import jax
import jax.numpy as jnp
from jax.sharding import NamedSharding as Sharding
from jax.sharding import PartitionSpec as P
import numpy as np
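
# Extra constant input feature appended to x in _precompute_cache below: its
# learned weight acts as a bias, and making the constant large means the L2
# penalty on that weight is effectively divided by BIAS_CONSTANT**2, leaving
# the bias almost unregularized.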
BIAS_CONSTANT = 100.0

# Marks this evaluator as implementing the jit-based evaluator API.
API = "jit"
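
# The solvers below are jit-compiled for CPU: few-shot feature matrices are
# small, and solving there keeps the accelerators free.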
@u.jit_cpu(static_argnums=(2,))
def _precompute_cache(x, y, num_classes):
  """Cache quantities to speed-up the computation of L2-regularized least-sq."""
  # Whiten the features using training-set statistics.
  mean = jnp.mean(x, axis=0, keepdims=True)
  std = jnp.std(x, axis=0, keepdims=True) + 1e-5
  x = (x - mean) / std

  # Add a constant feature for the bias, large so it is almost unregularized.
  x = jnp.pad(x, ((0, 0), (0, 1)), constant_values=BIAS_CONSTANT)

  # Turn labels into {-1, +1} one-vs-rest regression targets.
  y = 2.0 * jax.nn.one_hot(y, num_classes) - 1.0

  num_points, dim = x.shape
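  # The L2-regularized least-squares solution is
  #   w = (X^T X + l2_reg * I)^-1 X^T Y
  # or, via the identity (X^T X + a*I)^-1 X^T = X^T (X X^T + a*I)^-1,
  #   w = X^T (X X^T + l2_reg * I)^-1 Y.
  # Eigendecomposing whichever Gram matrix is smaller (dim x dim vs.
  # num_points x num_points) pays the O(min(N, D)^3) cost once; afterwards,
  # solving for any l2_reg only needs matrix products (see _eig_fewshot_acc_fn).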
  if num_points >= dim:
    # Primal form: eigendecompose the (dim, dim) matrix X^T X.
    eigs, q = jnp.linalg.eigh(x.T @ x)
    rhs = q.T @ (x.T @ y)
    lhs = q
  else:
    # Dual (kernel) form: eigendecompose the (num_points, num_points) X X^T.
    eigs, q = jnp.linalg.eigh(x @ x.T)
    rhs = q.T @ y
    lhs = x.T @ q

  cache = {
      "eigs": eigs,
      "rhs": rhs,
      "lhs": lhs,
      "mean": mean,
      "std": std,
  }
  return cache


@u.jit_cpu()
def _eig_fewshot_acc_fn(cache, x_test, y_test, l2_reg):
  """Computes accuracy on (x_test, y_test) for the cached least-squares fit."""
  # Apply the whitening statistics and bias feature from the training set.
  x_test = (x_test - cache["mean"]) / cache["std"]
  x_test = jnp.pad(x_test, ((0, 0), (0, 1)), constant_values=BIAS_CONSTANT)

  rhs = cache["rhs"]
  lhs = cache["lhs"]
  eigs = cache["eigs"]
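  # Rescale the cached eigenbasis by 1 / (eig + l2_reg); combined with lhs and
  # rhs this reconstructs w = (X^T X + l2_reg * I)^-1 X^T Y without repeating
  # the eigendecomposition.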
  scaling = 1.0 / (eigs + l2_reg * jnp.ones_like(eigs))
  scaling = scaling.reshape((1, -1))
  w = (lhs * scaling) @ rhs

  preds = jnp.argmax(x_test @ w, axis=1)
  return jnp.mean(preds == y_test)


class Evaluator:
  """Class for few-shot evaluation."""

  def __init__(self, predict_fn, batch_size,
               datasets, shots, l2_reg,
               pp_train, pp_eval, display_first,
               representation_layer=None, num_seeds=3,
               label_key="label", mask_key="_mask", data_dir=None, *,
               devices):
    self.datasets = datasets
    self.shots = shots
    self.l2_reg = l2_reg
    self.batch_size = batch_size
    self.pp_tr = pp_train
    self.pp_te = pp_eval
    self.display_first = display_first
    self._datasets = {}  # Lazily-loaded datasets, keyed by (name, splits).
    self._repr = {}  # Cached representations, reset at the start of run().
    self.num_seeds = num_seeds
    self.label_key = label_key
    self.mask_key = mask_key
    self.data_dir = data_dir
    self.devices = devices
    self.mesh = jax.sharding.Mesh(devices, ("devices",))
    self.repr_fn = self.get_representation_fn(
        predict_fn, representation_layer)

  def get_representation_fn(self, predict_fn, representation_layer):
    # Fully replicate the outputs (empty PartitionSpec) so they can be
    # fetched whole on the host by _get_repr.
    @functools.partial(jax.jit, out_shardings=Sharding(self.mesh, P()))
    def _repr_fn(train_state, batch, labels, mask):
      zimg, *_, out = predict_fn(train_state, batch)
      if representation_layer is not None:
        rep = u.tree_get(out, representation_layer)
      else:
        rep = zimg
      return rep, labels, mask
    return _repr_fn

  def _get_dataset(self, dataset, train_split, test_split):
    """Lazily loads and caches the given dataset splits."""
    key = (dataset, train_split, test_split)
    try:
      return self._datasets[key]
    except KeyError:
      train_data = ds_core.get(
          name=dataset, split=train_split, data_dir=self.data_dir)
      test_data = ds_core.get(
          name=dataset, split=test_split, data_dir=self.data_dir)
      train_ds, batches_tr = input_pipeline.make_for_inference(
          train_data.get_tfdata(ordered=True),
          num_ex_per_process=train_data.num_examples_per_process(),
          batch_size=self.batch_size,
          preprocess_fn=pp_builder.get_preprocess_fn(self.pp_tr))
      test_ds, batches_te = input_pipeline.make_for_inference(
          test_data.get_tfdata(ordered=True),
          num_ex_per_process=test_data.num_examples_per_process(),
          batch_size=self.batch_size,
          preprocess_fn=pp_builder.get_preprocess_fn(self.pp_te))

      num_classes = train_data.builder.info.features[self.label_key].num_classes
      return self._datasets.setdefault(
          key, (train_ds, batches_tr, test_ds, batches_te, num_classes))

  def _get_repr(self, params, data, steps):
    """Compute representations for the whole dataset."""
    pre_logits_list = []
    labels_list = []
    for batch, _ in zip(
        input_pipeline.start_global(data, self.devices, 0), range(steps)):
      labels, mask = batch.pop(self.label_key), batch.pop(self.mask_key)
      pre_logits, labels, mask = jax.device_get(self.repr_fn(
          params, batch, labels, mask))
      # Drop padding examples that were added to fill the final batch.
      mask = mask.astype(bool)
      pre_logits_list.append(pre_logits[mask])
      labels_list.append(labels[mask])
    pre_logits = np.concatenate(pre_logits_list, axis=0)
    labels = np.concatenate(labels_list, axis=0)
    return pre_logits, labels

  def compute_fewshot_metrics(self, train_state, seed,
                              dataset, train_split, test_split):
    """Compute few-shot metrics on one dataset."""
    if dataset in self._repr:
      repr_train, labels_train, repr_test, labels_test, num_classes = (
          self._repr[dataset])
    else:
      train_ds, steps_tr, test_ds, steps_te, num_classes = self._get_dataset(
          dataset, train_split, test_split)
      repr_train, labels_train = self._get_repr(train_state, train_ds, steps_tr)
      repr_test, labels_test = self._get_repr(train_state, test_ds, steps_te)
      self._repr[dataset] = (repr_train, labels_train,
                             repr_test, labels_test,
                             num_classes)

    # Deterministic per-seed shuffle of the example indices of each class.
    rng = np.random.default_rng(seed)
    class_indices = [rng.permutation(np.where(labels_train == cls_i)[0])
                     for cls_i in range(num_classes)]

    results = {}
    for shots in self.shots:
      # The first `shots` shuffled examples of each class form the episode.
      all_idx = [indices[:shots] for indices in class_indices]
      all_idx = np.concatenate(all_idx, axis=0)
      x = u.put_cpu(repr_train[all_idx])
      y = u.put_cpu(labels_train[all_idx])
      repr_test, labels_test = u.put_cpu((repr_test, labels_test))
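      # Both solver calls run on CPU (u.jit_cpu / u.put_cpu), keeping the
      # accelerators free while training continues.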
      cache = _precompute_cache(x, y, num_classes)
      acc = _eig_fewshot_acc_fn(
          cache, repr_test, labels_test, u.put_cpu(self.l2_reg))
      results[shots] = jax.device_get(acc)

    return results

  def run(self, train_state):
    """New API executed in terms of old API."""
    self._repr = {}
    for seed in range(self.num_seeds):
      for name, dataset_args in self.datasets.items():
        result = self.compute_fewshot_metrics(train_state, seed, *dataset_args)
        for shots, v in result.items():
          prefix = "a/" if (name, shots) in self.display_first else "z/"
          suffix = f"-seed-{seed}"
          yield f"{prefix}{name}_{shots}shot{suffix}", v
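
# Example wiring (illustrative sketch only): the dataset name, splits, and pp
# strings below are hypothetical placeholders, not part of this module.
#
#   fewshot = Evaluator(
#       predict_fn=predict_fn,  # must return (zimg, ..., out) as used above
#       batch_size=256,
#       datasets={"imagenet": ("imagenet2012", "train[:10000]", "validation")},
#       shots=(1, 5, 10, 25),
#       l2_reg=2.0 ** 10,
#       pp_train="decode|resize(224)|value_range(-1,1)|keep('image','label')",
#       pp_eval="decode|resize(224)|value_range(-1,1)|keep('image','label')",
#       display_first=[("imagenet", 5)],
#       devices=jax.devices(),
#   )
#   for metric_name, value in fewshot.run(train_state):
#     print(metric_name, value)  # e.g. ("a/imagenet_5shot-seed-0", <accuracy>)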