WanImageProcessor / README.md
YiYiXu's picture
Update README.md
d0034f7 verified

This repo contains the default image preprocessing code for the Wan 2.2 5B I2V pipeline.

This will load an image from a URL, then resize and center-crop it:

# Load the image-preprocessing modular pipeline from the Hub.
# NOTE(review): requires `ModularPipeline` to be in scope — presumably
# `from diffusers.modular_pipelines import ModularPipeline`; confirm the import path.
# NOTE(review): `trust_remote_code=True` executes Python fetched from the repo —
# only use with repos you trust.
image_processor = ModularPipeline.from_pretrained("YiYiXu/WanImageProcessor", trust_remote_code=True)
# `max_area` caps the output pixel count; `output="processed_image"` selects
# which pipeline output to return.
image = image_processor(
    image="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/wan_i2v_input.JPG",
    max_area=1280*704, output="processed_image")

Under the hood, it does the following:

# copied from https://github.com/Wan-Video/Wan2.2/blob/388807310646ed5f318a99f8e8d9ad28c5b65373/wan/utils/utils.py#L136
def best_output_size(w: int, h: int, dw: int, dh: int, expected_area: int) -> tuple[int, int]:
    """Return an output ``(width, height)`` close to the input aspect ratio.

    Both dimensions are floored to multiples of ``dw``/``dh`` respectively, and
    the resulting area never exceeds ``expected_area``. Two candidates are
    built (width-first and height-first rounding) and the one whose aspect
    ratio deviates least from ``w / h`` wins.

    NOTE(review): code is kept byte-identical to the upstream Wan2.2 copy.
    NOTE(review): the ``assert`` checks vanish under ``python -O``; and for
    extreme aspect ratios ``ow1`` (or ``oh2``) can floor to 0, which would
    raise ``ZeroDivisionError`` below — confirm inputs are moderate ratios.
    """
    # float output size: ideal real-valued dimensions at the exact input ratio
    ratio = w / h
    ow = (expected_area * ratio)**0.5
    oh = expected_area / ow

    # process width first: floor width to a multiple of dw, then derive the
    # largest dh-multiple height that keeps the area within budget
    ow1 = int(ow // dw * dw)
    oh1 = int(expected_area / ow1 // dh * dh)
    assert ow1 % dw == 0 and oh1 % dh == 0 and ow1 * oh1 <= expected_area
    ratio1 = ow1 / oh1

    # process height first: symmetric candidate, flooring height before width
    oh2 = int(oh // dh * dh)
    ow2 = int(expected_area / oh2 // dw * dw)
    assert oh2 % dh == 0 and ow2 % dw == 0 and ow2 * oh2 <= expected_area
    ratio2 = ow2 / oh2

    # compare ratios: max(a/b, b/a) >= 1 measures relative deviation from the
    # target ratio regardless of direction; keep the closer candidate
    if max(ratio / ratio1, ratio1 / ratio) < max(ratio / ratio2,
                                                 ratio2 / ratio):
        return ow1, oh1
    else:
        return ow2, oh2

from diffusers.utils import load_image
# FIX: the original snippet used `Image.LANCZOS` without importing PIL.Image,
# which raises NameError when run as shown.
from PIL import Image

# Load the input and force 3-channel RGB (drops alpha / palette modes).
# NOTE(review): `block_state` and `pipe` come from the surrounding
# modular-pipeline context and are not defined in this snippet — confirm
# against the caller.
image = load_image(block_state.image).convert("RGB")
max_area = 1280*704  # target pixel budget (width * height)

ih, iw = image.height, image.width
# Output dims must be multiples of patch_size * vae_scale_factor so the
# image maps onto a whole number of transformer patches after VAE encoding.
dh, dw = pipe.transformer.config.patch_size[1] * pipe.vae_scale_factor_spatial, pipe.transformer.config.patch_size[2] * pipe.vae_scale_factor_spatial
ow, oh = best_output_size(iw, ih, dw, dh, max_area)

# Scale with `max` so the resized image covers the target box in BOTH
# dimensions; the overshoot on one axis is removed by the center-crop below.
scale = max(ow / iw, oh / ih)
resized_image = image.resize((round(iw * scale), round(ih * scale)), Image.LANCZOS)

# center-crop to exactly (ow, oh)
x1 = (resized_image.width - ow) // 2
y1 = (resized_image.height - oh) // 2
image = resized_image.crop((x1, y1, x1 + ow, y1 + oh))