Update README.md
README.md CHANGED

@@ -32,15 +32,23 @@ Truncate to 77 tokens
 tensor([[0.16484, 0.0749, 0.1618, 0.0774]], device='cuda:0') 📉
 ```
 # 👇
-# Option 2
+# Option 2, proper integration: 💖 RECOMMENDED 💖
 
-
-
+- ### Solution for implementation of 248 tokens / thanks [@kk3dmax](https://huggingface.co/zer0int/LongCLIP-GmP-ViT-L-14/discussions/3) 🤗
+- Obtain a full example script using this solution for Flux.1 inference on [my GitHub](https://github.com/zer0int/CLIP-txt2img-diffusers-scripts)
 
-
-
-
-
+```
+model_id = "zer0int/LongCLIP-GmP-ViT-L-14"
+config = CLIPConfig.from_pretrained(model_id)
+config.text_config.max_position_embeddings = 248
+clip_model = CLIPModel.from_pretrained(model_id, torch_dtype=dtype, config=config)
+clip_processor = CLIPProcessor.from_pretrained(model_id, padding="max_length", max_length=248)
+
+pipe.tokenizer = clip_processor.tokenizer  # Replace with the CLIP tokenizer
+pipe.text_encoder = clip_model.text_model  # Replace with the CLIP text encoder
+pipe.tokenizer_max_length = 248
+pipe.text_encoder.dtype = torch.bfloat16
+```
 
 ```
 # Resulting Cosine Similarities for 248 tokens padded: