{
  "_valid_processor_keys": [
    "images",
    "do_resize",
    "size",
    "resample",
    "do_center_crop",
    "crop_size",
    "do_rescale",
    "rescale_factor",
    "do_normalize",
    "image_mean",
    "image_std",
    "do_convert_rgb",
    "return_tensors",
    "data_format",
    "input_data_format"
  ],
  "auto_map": {
    "AutoProcessor": "processing_florence2.Florence2Processor"
  },
  "crop_size": {
    "height": 768,
    "width": 768
  },
  "do_center_crop": false,
  "do_convert_rgb": null,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "CLIPImageProcessor",
  "image_seq_length": 577,
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "processor_class": "Florence2Processor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 768,
    "width": 768
  },
  "tasks_answer_post_processing_type": {
    "<OCR>": "pure_text",
    "<OCR_WITH_REGION>": "ocr",
    "<CAPTION>": "pure_text",
    "<DETAILED_CAPTION>": "pure_text",
    "<MORE_DETAILED_CAPTION>": "pure_text",
    "<OD>": "description_with_bboxes",
    "<DENSE_REGION_CAPTION>": "description_with_bboxes",
    "<CAPTION_TO_PHRASE_GROUNDING>": "phrase_grounding",
    "<REFERRING_EXPRESSION_SEGMENTATION>": "polygons",
    "<REGION_TO_SEGMENTATION>": "polygons",
    "<OPEN_VOCABULARY_DETECTION>": "description_with_bboxes_or_polygons",
    "<REGION_TO_CATEGORY>": "pure_text",
    "<REGION_TO_DESCRIPTION>": "pure_text",
    "<REGION_TO_OCR>": "pure_text",
    "<REGION_PROPOSAL>": "bboxes"
  },
  "task_prompts_without_inputs": {
    "<OCR>": "What is the text in the image?",
    "<OCR_WITH_REGION>": "What is the text in the image, with regions?",
    "<CAPTION>": "What does the image describe?",
    "<DETAILED_CAPTION>": "Describe in detail what is shown in the image.",
    "<MORE_DETAILED_CAPTION>": "Describe with a paragraph what is shown in the image.",
    "<OD>": "Locate the objects with category name in the image.",
    "<DENSE_REGION_CAPTION>": "Locate the objects in the image, with their descriptions.",
    "<REGION_PROPOSAL>": "Locate the region proposals in the image."
  },
  "task_prompts_with_input": {
    "<CAPTION_TO_PHRASE_GROUNDING>": "Locate the phrases in the caption: {input}",
    "<REFERRING_EXPRESSION_SEGMENTATION>": "Locate {input} in the image with mask",
    "<REGION_TO_SEGMENTATION>": "What is the polygon mask of region {input}",
    "<OPEN_VOCABULARY_DETECTION>": "Locate {input} in the image.",
    "<REGION_TO_CATEGORY>": "What is the region {input}?",
    "<REGION_TO_DESCRIPTION>": "What does the region {input} describe?",
    "<REGION_TO_OCR>": "What text is in the region {input}?"
  }
}