ydshieh committed
Commit eabb817 · 1 Parent(s): a897ce1

make block_size 0 work

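With `block_size = 0`, the script now tokenizes captions and extracts image features in a single cached `.map` pass (the new `preprocess_fn`), and the data loader simply iterates over the already-processed dataset; with a non-zero `block_size`, only tokenization happens up front and feature extraction still runs block by block inside the loader, as before. Below is a minimal sketch of how the `remove_columns` expression added in the diff behaves in both modes (column names here are illustrative, not taken from the script):

# Illustrative sketch of the remove_columns comprehension introduced in this commit.
column_names = ["image_path", "caption"]
image_column = "image_path"

def columns_to_remove(block_size: int) -> list:
    # Same comprehension as in the diff: when block_size is 0 the `or` clause is always
    # true, so every column (including the image path) is dropped, because pixel values
    # are already produced by preprocess_fn. With block_size > 0 the image path is kept
    # for the per-block feature extraction inside the data loader.
    return [x for x in column_names if x != image_column or not block_size]

print(columns_to_remove(block_size=0))    # ['image_path', 'caption']
print(columns_to_remove(block_size=256))  # ['caption']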
Files changed (1)
run_image_captioning_flax.py  +58 -34
run_image_captioning_flax.py CHANGED
@@ -785,6 +785,14 @@ def main():
 
         return model_inputs
 
+    def preprocess_fn(examples, max_target_length):
+
+        model_inputs = {}
+        model_inputs.update(tokenization_fn(examples, max_target_length))
+        model_inputs.update(feature_extraction_fn(model_inputs))
+
+        return model_inputs
+
     features = datasets.Features(
         {
             "pixel_values": datasets.Array3D(
@@ -801,6 +809,10 @@ def main():
         }
     )
 
+    function_kwarg = preprocess_fn if not training_args.block_size else tokenization_fn
+    features_kwarg = features if not training_args.block_size else None
+    remove_columns_kwarg = [x for x in column_names if x != image_column or not training_args.block_size]
+
     if training_args.do_train:
         if "train" not in dataset:
             raise ValueError("--do_train requires a train dataset")
@@ -810,15 +822,18 @@ def main():
         if data_args.max_train_samples is not None:
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
         train_dataset = train_dataset.map(
-            tokenization_fn,
+            function=function_kwarg,
             batched=True,
             num_proc=data_args.preprocessing_num_workers,
             # kept image paths
-            remove_columns=[x for x in column_names if x != image_column],
+            remove_columns=remove_columns_kwarg,
             load_from_cache_file=not data_args.overwrite_cache,
             desc=f"Running tokenizer on train dataset",
             fn_kwargs={"max_target_length": data_args.max_target_length},
+            features=features_kwarg,
         )
+        if not training_args.block_size:
+            train_dataset = train_dataset.with_format("numpy")
 
     if training_args.do_eval:
         if "validation" not in dataset:
@@ -829,15 +844,18 @@ def main():
         if data_args.max_eval_samples is not None:
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
         eval_dataset = eval_dataset.map(
-            tokenization_fn,
+            function=function_kwarg,
             batched=True,
             num_proc=data_args.preprocessing_num_workers,
             # kept image paths
-            remove_columns=[x for x in column_names if x != image_column],
+            remove_columns=remove_columns_kwarg,
             load_from_cache_file=not data_args.overwrite_cache,
             desc=f"Running tokenizer on validation dataset",
             fn_kwargs={"max_target_length": data_args.val_max_target_length},
+            features=features_kwarg,
         )
+        if not training_args.block_size:
+            eval_dataset = eval_dataset.with_format("numpy")
 
     if training_args.do_predict:
         if "test" not in dataset:
@@ -848,15 +866,18 @@ def main():
         if data_args.max_predict_samples is not None:
             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
         predict_dataset = predict_dataset.map(
-            tokenization_fn,
+            function=function_kwarg,
             batched=True,
             num_proc=data_args.preprocessing_num_workers,
             # kept image paths
-            remove_columns=[x for x in column_names if x != image_column],
+            remove_columns=remove_columns_kwarg,
             load_from_cache_file=not data_args.overwrite_cache,
             desc=f"Running tokenizer on prediction dataset",
             fn_kwargs={"max_target_length": data_args.val_max_target_length},
+            features=features_kwarg,
         )
+        if not training_args.block_size:
+            predict_dataset = predict_dataset.with_format("numpy")
 
     # Split the dataset into several chunks - each chunk is processed (.map) without cache to create a
     # data loader separately (in a sequential order).
@@ -894,46 +915,49 @@ def main():
         split: str = ""
     ):
 
-        if not block_size:
-            block_size = len(ds)
-
-        steps_per_block = block_size // batch_size
-        num_examples = len(ds)
-        steps = num_examples // batch_size
-        num_splits = steps // steps_per_block + int(steps % steps_per_block > 0)
-
         if shuffle:
             indices = jax.random.permutation(rng, len(ds))
             indices = np.asarray(indices)
         else:
             indices = np.arange(len(ds))
 
+        _block_size = len(ds) if not block_size else block_size
+
+        steps_per_block = _block_size // batch_size
+        num_examples = len(ds)
+        steps = num_examples // batch_size
+        num_splits = steps // steps_per_block + int(steps % steps_per_block > 0)
+
         for idx in range(num_splits):
 
-            start_idx = block_size * idx
-            end_idx = block_size * (idx + 1)
+            if not block_size:
+                _ds = ds
+            else:
 
-            selected_indices = indices[start_idx:end_idx]
+                start_idx = block_size * idx
+                end_idx = block_size * (idx + 1)
 
-            _ds = ds.select(selected_indices)
+                selected_indices = indices[start_idx:end_idx]
 
-            names = {
-                "train": "train",
-                "valid": "validation",
-                "test": "prediction",
-            }
+                _ds = ds.select(selected_indices)
 
-            _ds = _ds.map(
-                feature_extraction_fn,
-                batched=True,
-                num_proc=data_args.preprocessing_num_workers,
-                remove_columns=[image_column],
-                load_from_cache_file=not data_args.overwrite_cache,
-                features=features,
-                keep_in_memory=keep_in_memory,
-                desc=f"Running feature extraction on {names[split]} dataset".replace("  ", " "),
-            )
-            _ds = _ds.with_format("numpy")
+                names = {
+                    "train": "train",
+                    "valid": "validation",
+                    "test": "prediction",
+                }
+
+                _ds = _ds.map(
+                    feature_extraction_fn,
+                    batched=True,
+                    num_proc=data_args.preprocessing_num_workers,
+                    remove_columns=[image_column],
+                    load_from_cache_file=not data_args.overwrite_cache,
+                    features=features,
+                    keep_in_memory=keep_in_memory,
+                    desc=f"Running feature extraction on {names[split]} dataset".replace("  ", " "),
+                )
+                _ds = _ds.with_format("numpy")
 
         # No need to shuffle here
         loader = data_loader(rng, _ds, batch_size=batch_size, shuffle=False)
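For reference, a rough standalone sketch of the split arithmetic in the block-wise data loader after this change (the function name and the numbers are illustrative; the real loader also shuffles indices, selects each slice from a datasets.Dataset, and runs feature extraction per block):

# Illustrative sketch: block_size = 0 falls back to the dataset length, so the loop
# body runs exactly once and uses the dataset as-is instead of selecting and
# re-processing a slice for every block.
def plan_blocks(num_examples: int, batch_size: int, block_size: int):
    _block_size = num_examples if not block_size else block_size
    steps_per_block = _block_size // batch_size
    steps = num_examples // batch_size
    num_splits = steps // steps_per_block + int(steps % steps_per_block > 0)
    for idx in range(num_splits):
        if not block_size:
            yield (0, num_examples)  # single block: the whole, already preprocessed dataset
        else:
            yield (block_size * idx, block_size * (idx + 1))  # half-open index range

print(list(plan_blocks(num_examples=1000, batch_size=8, block_size=256)))
# [(0, 256), (256, 512), (512, 768), (768, 1024)] -> the last range is clipped when slicing indices
print(list(plan_blocks(num_examples=1000, batch_size=8, block_size=0)))
# [(0, 1000)] -> one pass, no per-epoch feature-extraction map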