Upload folder using huggingface_hub
Files changed:
- README.md: +1 -1
- modeling_intern_vit.py: +6 -12
README.md
CHANGED

@@ -30,7 +30,7 @@ LMDeploy supports the following NVIDIA GPU for W4A16 inference:
 Before proceeding with the quantization and inference, please ensure that lmdeploy is installed.
 
 ```shell
-pip install lmdeploy
+pip install lmdeploy==0.5.3
 ```
 
 This article comprises the following sections:
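
The only substantive change to the README is pinning lmdeploy to 0.5.3. As a quick sanity check that the pinned version is the one actually in use before running W4A16 inference, something like the following sketch can help; the quantized model path is a placeholder, and the AWQ/TurboMind settings follow the usual lmdeploy 0.5.x pipeline API rather than anything specific to this repository:

```python
# Minimal sketch, assuming lmdeploy==0.5.3 is installed; the model path is a placeholder.
import lmdeploy
from lmdeploy import pipeline, TurbomindEngineConfig

# Confirm the pinned version is the one actually importable.
assert lmdeploy.__version__ == '0.5.3', lmdeploy.__version__

# W4A16 (AWQ) checkpoints are typically served with model_format='awq' on the TurboMind backend.
pipe = pipeline('path/to/w4a16-quantized-model',
                backend_config=TurbomindEngineConfig(model_format='awq'))
responses = pipe(['Hello!'])
print(responses)
```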
modeling_intern_vit.py
CHANGED

@@ -20,18 +20,12 @@ from transformers.utils import logging
 from .configuration_intern_vit import InternVisionConfig
 
 try:
-    try:  # v1
-        from flash_attn.flash_attn_interface import \
-            flash_attn_unpadded_qkvpacked_func
-    except:  # v2
-        from flash_attn.flash_attn_interface import \
-            flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
-
     from flash_attn.bert_padding import pad_input, unpad_input
-
+    from flash_attn.flash_attn_interface import \
+        flash_attn_varlen_qkvpacked_func
     has_flash_attn = True
 except:
-    print('FlashAttention is not installed.')
+    print('FlashAttention2 is not installed.')
     has_flash_attn = False
 
 logger = logging.get_logger(__name__)

@@ -74,7 +68,7 @@ class FlashAttention(nn.Module):
             max_s = seqlen
             cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
                                       device=qkv.device)
-            output = flash_attn_unpadded_qkvpacked_func(
+            output = flash_attn_varlen_qkvpacked_func(
                 qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
                 softmax_scale=self.softmax_scale, causal=causal
             )

@@ -84,7 +78,7 @@ class FlashAttention(nn.Module):
             x = rearrange(qkv, 'b s three h d -> b s (three h d)')
             x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
             x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
-            output_unpad = flash_attn_unpadded_qkvpacked_func(
+            output_unpad = flash_attn_varlen_qkvpacked_func(
                 x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
                 softmax_scale=self.softmax_scale, causal=causal
             )

@@ -93,7 +87,7 @@ class FlashAttention(nn.Module):
                                'b s (h d) -> b s h d', h=nheads)
         else:
             assert max_s is not None
-            output = flash_attn_unpadded_qkvpacked_func(
+            output = flash_attn_varlen_qkvpacked_func(
                 qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
                 softmax_scale=self.softmax_scale, causal=causal
             )
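
The edits to `modeling_intern_vit.py` drop the flash-attn v1 fallback: v2 renamed `flash_attn_unpadded_qkvpacked_func` to `flash_attn_varlen_qkvpacked_func`, and the import and all three call sites now use the v2 name directly. The standalone sketch below (not part of the repository) mirrors the padding-free call pattern from `FlashAttention.forward`, assuming a CUDA device and a flash-attn 2.x build:

```python
# Illustration of the flash-attn 2.x varlen call adopted above; assumes CUDA + flash-attn 2.x.
import torch
from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func

batch_size, seqlen, nheads, headdim = 2, 16, 8, 64
# Packed QKV flattened over the batch: (batch_size * seqlen, 3, nheads, headdim); fp16/bf16 required.
qkv = torch.randn(batch_size * seqlen, 3, nheads, headdim,
                  dtype=torch.float16, device='cuda')
# Cumulative sequence lengths, built the same way as in the hunk above.
cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen,
                          dtype=torch.int32, device=qkv.device)
output = flash_attn_varlen_qkvpacked_func(
    qkv, cu_seqlens, seqlen, 0.0,  # dropout_p = 0.0 outside training
    softmax_scale=None, causal=False
)
print(output.shape)  # (batch_size * seqlen, nheads, headdim)
```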