Upload folder using huggingface_hub

Files changed:
- README.md (+7, -7)
- modeling_intern_vit.py (+6, -12)

README.md (CHANGED)
@@ -178,7 +178,7 @@ model = AutoModel.from_pretrained(
     trust_remote_code=True).eval().cuda()
 tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
-generation_config = dict(max_new_tokens=1024, do_sample=False)
+generation_config = dict(max_new_tokens=1024, do_sample=True)
 question = 'Hello, who are you?'
 response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
 print(f'User: {question}')
@@ -209,7 +209,7 @@ image_processor = CLIPImageProcessor.from_pretrained(path)
 image = Image.open('./examples/image2.jpg').resize((448, 448))
 pixel_values = image_processor(images=image, return_tensors='pt').pixel_values.to(torch.bfloat16).cuda()
 
-generation_config = dict(max_new_tokens=1024, do_sample=False)
+generation_config = dict(max_new_tokens=1024, do_sample=True)
 question = '<image>\nPlease describe the image shortly.'
 response = model.chat(tokenizer, pixel_values, question, generation_config)
 print(f'User: {question}')
@@ -235,7 +235,7 @@ image_processor = CLIPImageProcessor.from_pretrained(path)
 image = Image.open('./examples/image2.jpg').resize((448, 448))
 pixel_values = image_processor(images=image, return_tensors='pt').pixel_values.to(torch.bfloat16).cuda()
 
-generation_config = dict(max_new_tokens=1024, do_sample=False)
+generation_config = dict(max_new_tokens=1024, do_sample=True)
 question = '<image>\nPlease describe the image in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
 print(f'User: {question}')
@@ -271,7 +271,7 @@ image2 = Image.open('./examples/image2.jpg').resize((448, 448))
 pixel_values2 = image_processor(images=image2, return_tensors='pt').pixel_values.to(torch.bfloat16).cuda()
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 
-generation_config = dict(max_new_tokens=1024, do_sample=False)
+generation_config = dict(max_new_tokens=1024, do_sample=True)
 question = '<image>\nDescribe the two images in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                history=None, return_history=True)
@@ -310,7 +310,7 @@ pixel_values2 = image_processor(images=image2, return_tensors='pt').pixel_values
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
 
-generation_config = dict(max_new_tokens=1024, do_sample=False)
+generation_config = dict(max_new_tokens=1024, do_sample=True)
 question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list, history=None, return_history=True)
@@ -347,7 +347,7 @@ pixel_values2 = image_processor(images=image2, return_tensors='pt').pixel_values
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
 
-generation_config = dict(max_new_tokens=1024, do_sample=False)
+generation_config = dict(max_new_tokens=1024, do_sample=True)
 questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
 responses = model.batch_chat(tokenizer, pixel_values,
                              num_patches_list=num_patches_list,
@@ -409,7 +409,7 @@ model = AutoModel.from_pretrained(
     trust_remote_code=True).eval().cuda()
 tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
-generation_config = dict(max_new_tokens=1024, do_sample=False)
+generation_config = dict(max_new_tokens=1024, do_sample=True)
 
 video_path = './examples/red-panda.mp4'
 pixel_values, num_patches_list = load_video(video_path, num_segments=8)
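Note: the only functional change in README.md is that every example's generation_config now sets do_sample=True, so transformers' generate() samples from the output distribution instead of decoding greedily. Other sampling knobs can be passed through the same dict; a minimal sketch, assuming the model's chat()/batch_chat() forward these keys to generate() as in the README examples (temperature and top_p below are illustrative additions, not part of this commit):

# Illustrative variation on the README's generation_config. do_sample=True is
# what this commit sets; temperature and top_p are assumed extras, honored only
# if the remote-code chat() forwards them to generate().
generation_config = dict(
    max_new_tokens=1024,  # cap on generated tokens, as in the README
    do_sample=True,       # sample instead of greedy decoding
    temperature=0.7,      # assumed knob: sharpness of the sampling distribution
    top_p=0.9,            # assumed knob: nucleus-sampling cutoff
)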
    	
modeling_intern_vit.py (CHANGED)
@@ -20,18 +20,12 @@ from transformers.utils import logging
 from .configuration_intern_vit import InternVisionConfig
 
 try:
-    try:  # v1
-        from flash_attn.flash_attn_interface import \
-            flash_attn_unpadded_qkvpacked_func
-    except:  # v2
-        from flash_attn.flash_attn_interface import \
-            flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
-
     from flash_attn.bert_padding import pad_input, unpad_input
-
+    from flash_attn.flash_attn_interface import \
+        flash_attn_varlen_qkvpacked_func
     has_flash_attn = True
 except:
-    print('FlashAttention is not installed.')
+    print('FlashAttention2 is not installed.')
     has_flash_attn = False
 
 logger = logging.get_logger(__name__)
@@ -74,7 +68,7 @@ class FlashAttention(nn.Module):
                 max_s = seqlen
                 cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
                                           device=qkv.device)
-                output = flash_attn_unpadded_qkvpacked_func(
+                output = flash_attn_varlen_qkvpacked_func(
                     qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
                     softmax_scale=self.softmax_scale, causal=causal
                 )
@@ -84,7 +78,7 @@ class FlashAttention(nn.Module):
                 x = rearrange(qkv, 'b s three h d -> b s (three h d)')
                 x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
                 x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
-                output_unpad = flash_attn_unpadded_qkvpacked_func(
+                output_unpad = flash_attn_varlen_qkvpacked_func(
                     x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
                     softmax_scale=self.softmax_scale, causal=causal
                 )
@@ -93,7 +87,7 @@ class FlashAttention(nn.Module):
                                    'b s (h d) -> b s h d', h=nheads)
         else:
             assert max_s is not None
-            output = flash_attn_unpadded_qkvpacked_func(
+            output = flash_attn_varlen_qkvpacked_func(
                 qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
                 softmax_scale=self.softmax_scale, causal=causal
             )
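Note: this change drops the flash-attn v1 fallback (flash_attn_unpadded_qkvpacked_func) and calls the v2 variable-length packed-QKV kernel directly. A minimal sketch of that call pattern, matching the shapes used in FlashAttention.forward above; it assumes flash-attn 2.x is installed and a CUDA device with bf16 tensors is available:

import torch

try:
    from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
    has_flash_attn = True
except ImportError:
    has_flash_attn = False

if has_flash_attn:
    batch_size, seqlen, num_heads, head_dim = 2, 16, 8, 64
    # Packed QKV for all tokens of all sequences: (total_tokens, 3, heads, head_dim).
    qkv = torch.randn(batch_size * seqlen, 3, num_heads, head_dim,
                      dtype=torch.bfloat16, device='cuda')
    # Cumulative sequence boundaries, e.g. [0, 16, 32] for two length-16 sequences.
    cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen,
                              dtype=torch.int32, device='cuda')
    out = flash_attn_varlen_qkvpacked_func(qkv, cu_seqlens, seqlen, 0.0,
                                           softmax_scale=None, causal=False)
    # out has shape (total_tokens, num_heads, head_dim), like the output in forward().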

