|
import argparse, torch |
|
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer |
|
from PIL import Image |
|
|
|
def main():
    """Caption a single image with the ViT-GPT2 captioning model and print it.

    Command-line arguments:
        --image       Path to the input image file (required).
        --max_length  Maximum generated sequence length in tokens (default: 20).
    """
    args = _parse_args()

    model_id = "nlpconnect/vit-gpt2-image-captioning"
    # Prefer GPU when available; everything (model + inputs) goes to this device.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model, feature_extractor, tokenizer = _load_pipeline(model_id, device)
    caption = _caption_image(
        args.image, model, feature_extractor, tokenizer, device, args.max_length
    )
    print(caption)


def _parse_args():
    """Parse and return the script's command-line arguments."""
    parser = argparse.ArgumentParser(
        description="Generate a caption for an image using ViT-GPT2."
    )
    parser.add_argument(
        "--image", type=str, required=True, help="Path to the input image file."
    )
    parser.add_argument(
        "--max_length",
        type=int,
        default=20,
        help="Maximum generated sequence length in tokens.",
    )
    return parser.parse_args()


def _load_pipeline(model_id, device):
    """Load the captioning model, image processor, and tokenizer for *model_id*.

    Returns a (model, feature_extractor, tokenizer) tuple; the model is moved
    to *device* and put in eval mode.
    """
    model = VisionEncoderDecoderModel.from_pretrained(model_id).to(device)
    # from_pretrained already returns eval mode; being explicit costs nothing.
    model.eval()
    feature_extractor = ViTImageProcessor.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    return model, feature_extractor, tokenizer


def _caption_image(image_path, model, feature_extractor, tokenizer, device, max_length):
    """Preprocess the image at *image_path*, run generation, and return the caption."""
    # Use a context manager so the underlying file handle is closed promptly
    # (the original left the PIL file open). Convert to RGB so grayscale/RGBA
    # inputs don't break the processor.
    with Image.open(image_path) as im:
        img = im.convert("RGB")
    pixel_values = (
        feature_extractor(images=[img], return_tensors="pt").pixel_values.to(device)
    )

    # Disable autograd bookkeeping for inference.
    with torch.no_grad():
        output_ids = model.generate(pixel_values, max_length=max_length)[0]
    return tokenizer.decode(output_ids, skip_special_tokens=True)
|
|
|
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":

    main()
|
|