Gengzigang committed
Commit 3831a1e · 1 Parent(s): 9d797ea

add

Files changed: modeling_clip.py (+108 -60)

modeling_clip.py CHANGED
@@ -39,7 +39,6 @@ from transformers.utils import (
 )
 from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
 
-
 if is_flash_attn_2_available():
     from transformers.modeling_flash_attention_utils import _flash_attention_forward
 
@@ -603,16 +602,15 @@ class CLIPPreTrainedModel(PreTrainedModel):
             fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
             nn.init.normal_(module.fc1.weight, std=fc_std)
             nn.init.normal_(module.fc2.weight, std=in_proj_std)
-        elif isinstance(module, CLIPModel):
-            pass
+        elif isinstance(module, LLM2CLIPModel):
             # nn.init.normal_(
             #     module.text_projection.weight,
             #     std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
             # )
-
-
-
-
+            nn.init.normal_(
+                module.visual_projection.weight,
+                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
+            )
         elif isinstance(module, CLIPVisionModelWithProjection):
             nn.init.normal_(
                 module.visual_projection.weight,
@@ -1112,80 +1110,97 @@ class CLIPVisionModel(CLIPPreTrainedModel):
 
 
 @add_start_docstrings(CLIP_START_DOCSTRING)
-class CLIPModel(CLIPPreTrainedModel):
+class LLM2CLIPModel(CLIPPreTrainedModel):
     config_class = CLIPConfig
     _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer", "CLIPVisionEmbeddings"]
 
     def __init__(self, config: CLIPConfig):
         super().__init__(config)
+        # if not isinstance(config.text_config, CLIPTextConfig):
+        #     raise TypeError(
+        #         "config.text_config is expected to be of type CLIPTextConfig but is of type"
+        #         f" {type(config.text_config)}."
+        #     )
+
         if not isinstance(config.vision_config, CLIPVisionConfig):
             raise TypeError(
                 "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
                 f" {type(config.vision_config)}."
             )
 
+        # text_config = config.text_config
         vision_config = config.vision_config
 
         self.projection_dim = config.projection_dim
+        # self.text_embed_dim = text_config.hidden_size
         self.vision_embed_dim = vision_config.hidden_size
+
+        adapter = LLM2CLIP_Adapter()
+        self.text_adapter = adapter
+
+        # text_model = CLIPTextModel._from_config(text_config, attn_implementation=config._attn_implementation)
+        # self.text_model = text_model.text_model
 
         vision_model = CLIPVisionModel._from_config(vision_config, attn_implementation=config._attn_implementation)
        self.vision_model = vision_model.vision_model
 
-
-
-        self.visual_projection = nn.Parameter(scale * torch.randn(self.vision_embed_dim, self.projection_dim))
+        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
+        # self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
         self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
 
         # Initialize weights and apply final processing
         self.post_init()
-        [... old lines 1142-1188 removed here; their content is not rendered in this diff view ...]
+
+    def get_text_features(self, inputs):
+        #TODO: make this more flexible and configurable
+        return self.text_adapter(inputs)
+
+    # @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+    # def get_text_features(
+    #     self,
+    #     input_ids: Optional[torch.Tensor] = None,
+    #     attention_mask: Optional[torch.Tensor] = None,
+    #     position_ids: Optional[torch.Tensor] = None,
+    #     output_attentions: Optional[bool] = None,
+    #     output_hidden_states: Optional[bool] = None,
+    #     return_dict: Optional[bool] = None,
+    # ) -> torch.FloatTensor:
+    #     r"""
+    #     Returns:
+    #         text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
+    #         applying the projection layer to the pooled output of [`CLIPTextModel`].
+
+    #     Examples:
+
+    #     ```python
+    #     >>> from transformers import AutoTokenizer, CLIPModel
+
+    #     >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+    #     >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+    #     >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+    #     >>> text_features = model.get_text_features(**inputs)
+    #     ```"""
+    #     # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+    #     output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+    #     output_hidden_states = (
+    #         output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+    #     )
+    #     return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    #     text_outputs = self.text_model(
+    #         input_ids=input_ids,
+    #         attention_mask=attention_mask,
+    #         position_ids=position_ids,
+    #         output_attentions=output_attentions,
+    #         output_hidden_states=output_hidden_states,
+    #         return_dict=return_dict,
+    #     )
+
+    #     pooled_output = text_outputs[1]
+    #     text_features = self.text_projection(pooled_output)
+
+    #     return text_features
 
     @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
     def get_image_features(
@@ -1232,7 +1247,7 @@ class CLIPModel(CLIPPreTrainedModel):
         )
 
         pooled_output = vision_outputs[1]  # pooled_output
-        image_features =
+        image_features = self.visual_projection(pooled_output)
 
         return image_features
 
@@ -1413,7 +1428,40 @@ class CLIPTextModelWithProjection(CLIPPreTrainedModel):
             attentions=text_outputs.attentions,
         )
 
-
+class LinearBlock(nn.Module):
+    def __init__(self, dim, expansion_factor=4, dropout=0.,norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.fn = nn.Sequential(
+            nn.Linear(dim, int(expansion_factor * dim)),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(int(expansion_factor * dim), dim),
+        )
+        self.ln = norm_layer(dim)
+
+    def forward(self, x):
+        return x + self.fn(self.ln(x))
+
+class LLM2CLIP_Adapter(nn.Module):
+    def __init__(self):
+        super().__init__()
+        #TODO: make this more flexible and configurable
+        # hard-coded values from the LLM2CLIP model
+        text_embedding_dim = 4096
+        expansion_factor = 2
+        adaptor_num_layers = 4
+        proj_bias = True
+        output_dim = 1280
+        self.adaptor = nn.Sequential(
+            *[LinearBlock(text_embedding_dim, expansion_factor) for _ in range(adaptor_num_layers)],
+            nn.LayerNorm(text_embedding_dim),
+            nn.Linear(text_embedding_dim, output_dim, bias=proj_bias),
+        )
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = torch.nn.functional.normalize(hidden_states, p=2, dim=1)
+        hidden_states = self.adaptor(hidden_states)
+        return hidden_states
+
 @add_start_docstrings(
     """
     CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output).
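
For orientation, the sketch below (not part of the commit) mirrors what the newly added adapter computes: a batch of LLM sentence embeddings of width 4096 is L2-normalized, passed through four pre-norm residual MLP blocks, and projected to a 1280-dimensional vector that stands in for CLIP text features. The class name ResidualMLPBlock is an illustrative stand-in for the commit's LinearBlock; the dimensions and depth are the hard-coded values from the diff.

import torch
import torch.nn as nn


class ResidualMLPBlock(nn.Module):
    # Same structure as LinearBlock in the commit: pre-LayerNorm, two-layer MLP, residual add.
    def __init__(self, dim, expansion_factor=2, dropout=0.0):
        super().__init__()
        self.ln = nn.LayerNorm(dim)
        self.fn = nn.Sequential(
            nn.Linear(dim, int(expansion_factor * dim)),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(int(expansion_factor * dim), dim),
        )

    def forward(self, x):
        return x + self.fn(self.ln(x))


# Hard-coded values taken from LLM2CLIP_Adapter in the diff.
text_embedding_dim, output_dim, num_blocks = 4096, 1280, 4
adapter = nn.Sequential(
    *[ResidualMLPBlock(text_embedding_dim, expansion_factor=2) for _ in range(num_blocks)],
    nn.LayerNorm(text_embedding_dim),
    nn.Linear(text_embedding_dim, output_dim, bias=True),
)

llm_embeddings = torch.randn(2, text_embedding_dim)  # stand-in for LLM sentence embeddings
features = adapter(nn.functional.normalize(llm_embeddings, p=2, dim=1))
print(features.shape)  # torch.Size([2, 1280])

Because every block maps 4096 back to 4096 and only the final linear layer changes the width, the adapter is a small head compared to the LLM that produces its inputs.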