Update pipeline.py

pipeline.py (CHANGED: +47 -34)
@@ -17,6 +17,7 @@
 # Here is the AGPL-3.0 license https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/master/LICENSE.txt
 
 import inspect
+from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Union
 
 import paddle
@@ -30,18 +31,25 @@ from ppdiffusers.pipelines.stable_diffusion.safety_checker import (
     StableDiffusionSafetyChecker,
 )
 from ppdiffusers.schedulers import KarrasDiffusionSchedulers
-from ppdiffusers.utils import (logging, randn_tensor,
-                               safetensors_load, smart_load,
-                               torch_load)
+from ppdiffusers.utils import (
+    logging,
+    randn_tensor,
+    safetensors_load,
+    smart_load,
+    torch_load,
+)
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
+
 @paddle.no_grad()
-def load_lora(pipeline,
-              state_dict: dict,
-              LORA_PREFIX_UNET: str = "lora_unet",
-              LORA_PREFIX_TEXT_ENCODER: str = "lora_te",
-              ratio: float = 1.0):
+def load_lora(
+    pipeline,
+    state_dict: dict,
+    LORA_PREFIX_UNET: str = "lora_unet",
+    LORA_PREFIX_TEXT_ENCODER: str = "lora_te",
+    ratio: float = 1.0,
+):
     ratio = float(ratio)
     visited = []
     for key in state_dict:
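Note: a minimal usage sketch for the reformatted signature, not part of this commit. The checkpoint path and the 0.8 ratio are illustrative, and `smart_load` (imported from ppdiffusers.utils above) is assumed to accept a local file path:

    # Hedged usage sketch: apply a LoRA checkpoint to an existing pipeline.
    state_dict = smart_load("./loras/example.safetensors")  # path is illustrative
    pipe = load_lora(
        pipe,        # an already-built WebUIStableDiffusionPipeline
        state_dict,
        ratio=0.8,   # 1.0 merges the LoRA at full strength
    )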
@@ -49,8 +57,7 @@ def load_lora(pipeline,
             continue
 
         if "text" in key:
-            tmp_layer_infos = key.split(".")[0].split(
-                LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_")
+            tmp_layer_infos = key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_")
             hf_to_ppnlp = {
                 "encoder": "transformer",
                 "fc1": "linear1",
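For intuition, the split above turns a typical kohya-style text-encoder key (this example key is illustrative, not from the commit) into its layer-path tokens:

    key = "lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight"
    tmp_layer_infos = key.split(".")[0].split("lora_te" + "_")[-1].split("_")
    # -> ["text", "model", "encoder", "layers", "0", "mlp", "fc1"]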
@@ -58,12 +65,12 @@ def load_lora(
             }
             layer_infos = []
             for layer_info in tmp_layer_infos:
-                if layer_info == "mlp": continue
+                if layer_info == "mlp":
+                    continue
                 layer_infos.append(hf_to_ppnlp.get(layer_info, layer_info))
             curr_layer: paddle.nn.Linear = pipeline.text_encoder
         else:
-            layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET +
-                                                  "_")[-1].split("_")
+            layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET + "_")[-1].split("_")
             curr_layer: paddle.nn.Linear = pipeline.unet
 
         temp_name = layer_infos.pop(0)
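Continuing that example, the loop skips the synthetic "mlp" token and renames HF module names to their PaddleNLP equivalents; the `"fc2": "linear2"` entry is an assumption here, since the diff context truncates the dict:

    hf_to_ppnlp = {"encoder": "transformer", "fc1": "linear1", "fc2": "linear2"}
    layer_infos = []
    for layer_info in ["text", "model", "encoder", "layers", "0", "mlp", "fc1"]:
        if layer_info == "mlp":
            continue
        layer_infos.append(hf_to_ppnlp.get(layer_info, layer_info))
    # layer_infos == ["text", "model", "transformer", "layers", "0", "linear1"]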
@@ -82,14 +89,9 @@ def load_lora(
         else:
             temp_name = layer_infos.pop(0)
 
-        triplet_keys = [
-            key,
-            key.replace("lora_down", "lora_up"),
-            key.replace("lora_down.weight", "alpha")
-        ]
+        triplet_keys = [key, key.replace("lora_down", "lora_up"), key.replace("lora_down.weight", "alpha")]
         dtype: paddle.dtype = curr_layer.weight.dtype
-        weight_down: paddle.Tensor = state_dict[triplet_keys[0]].cast(
-            dtype)
+        weight_down: paddle.Tensor = state_dict[triplet_keys[0]].cast(dtype)
         weight_up: paddle.Tensor = state_dict[triplet_keys[1]].cast(dtype)
         rank: float = float(weight_down.shape[0])
         if triplet_keys[2] in state_dict:
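Each `lora_down` key is paired with its `lora_up` and `alpha` companions. The lines hidden between the hunks presumably set `scale = alpha / rank` when an alpha entry exists (the usual kohya convention) and 1.0 otherwise. A minimal sketch of the resulting linear-layer delta, with illustrative shapes:

    import paddle

    rank, n_in, n_out = 4, 320, 320           # illustrative sizes
    weight_down = paddle.randn([rank, n_in])  # "lora_down"
    weight_up = paddle.randn([n_out, rank])   # "lora_up"
    alpha = 4.0
    scale = alpha / float(weight_down.shape[0])
    # Low-rank update; .T matches paddle's [in, out] Linear weight layout.
    delta_w = paddle.matmul(weight_up, weight_down).T * scale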
@@ -100,31 +102,37 @@ def load_lora(
 
         if not hasattr(curr_layer, "backup_weights"):
             curr_layer.backup_weights = curr_layer.weight.clone()
-
+
         if len(weight_down.shape) == 4:
             if weight_down.shape[2:4] == [1, 1]:
                 # conv2d 1x1
                 curr_layer.weight.copy_(
-                    curr_layer.weight +
-                    ratio * paddle.matmul(
-                        weight_up.squeeze([-1, -2]),
-                        weight_down.squeeze([-1, -2])).unsqueeze([-1, -2]) * scale, True)
+                    curr_layer.weight
+                    + ratio
+                    * paddle.matmul(weight_up.squeeze([-1, -2]), weight_down.squeeze([-1, -2])).unsqueeze([-1, -2])
+                    * scale,
+                    True,
+                )
             else:
                 # conv2d 3x3
                 curr_layer.weight.copy_(
-                    curr_layer.weight +
-                    ratio * paddle.nn.functional.conv2d(
-                        weight_down.transpose([1, 0, 2, 3]), weight_up).transpose([1, 0, 2, 3]) * scale, True)
+                    curr_layer.weight
+                    + ratio
+                    * paddle.nn.functional.conv2d(weight_down.transpose([1, 0, 2, 3]), weight_up).transpose(
+                        [1, 0, 2, 3]
+                    )
+                    * scale,
+                    True,
+                )
         else:
             # linear
-            curr_layer.weight.copy_(
-                curr_layer.weight +
-                ratio * paddle.matmul(weight_up, weight_down).T * scale, True)
+            curr_layer.weight.copy_(curr_layer.weight + ratio * paddle.matmul(weight_up, weight_down).T * scale, True)
 
         # update visited list
         visited.extend(triplet_keys)
     return pipeline
 
+
 class WebUIStableDiffusionPipeline(DiffusionPipeline):
     r"""
     Pipeline for text-to-image generation using Stable Diffusion.
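The three branches merge the low-rank pair back into a full weight. A shape sketch (sizes illustrative, not from the commit) showing why the two conv branches are equivalent to the matmul form:

    import paddle

    # 1x1 kernels are really matrices: squeeze the trailing 1x1 dims,
    # matmul, then restore them.
    up = paddle.randn([640, 4, 1, 1])    # lora_up:   [out, rank, 1, 1]
    down = paddle.randn([4, 320, 1, 1])  # lora_down: [rank, in, 1, 1]
    delta = paddle.matmul(up.squeeze([-1, -2]), down.squeeze([-1, -2])).unsqueeze([-1, -2])
    # delta.shape == [640, 320, 1, 1]

    # 3x3: convolving the transposed lora_down "as images" with the 1x1
    # lora_up kernel composes the factors into one [out, in, 3, 3] kernel.
    up3 = paddle.randn([640, 4, 1, 1])   # lora_up:   [out, rank, 1, 1]
    down3 = paddle.randn([4, 320, 3, 3]) # lora_down: [rank, in, 3, 3]
    delta3 = paddle.nn.functional.conv2d(down3.transpose([1, 0, 2, 3]), up3).transpose([1, 0, 2, 3])
    # delta3.shape == [640, 320, 3, 3]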
@@ -399,7 +407,7 @@ class WebUIStableDiffusionPipeline(DiffusionPipeline):
         callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
         callback_steps: Optional[int] = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        clip_skip: int = 0,
+        clip_skip: int = 1,
         lora_dir: str = "./loras",
     ):
         r"""
@@ -452,7 +460,9 @@ class WebUIStableDiffusionPipeline(DiffusionPipeline):
                 `self.processor` in
                 [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
             clip_skip (`int`, *optional*, defaults to 0):
-                CLIP_stop_at_last_layers, if clip_skip
+                CLIP_stop_at_last_layers, if clip_skip <= 1, we will use the last_hidden_state from text_encoder.
+            lora_dir (`str`, *optional*):
+                Path to lora which we want to load.
         Examples:
 
         Returns:
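Note: the docstring still says "defaults to 0" although the signature above now defaults to 1. The described behavior is webui's CLIP_stop_at_last_layers; a hedged sketch of the usual semantics, where the transformers-style `output_hidden_states` API is an assumption rather than code from this commit:

    outputs = text_encoder(input_ids, output_hidden_states=True)
    if clip_skip <= 1:
        embeddings = outputs.last_hidden_state
    else:
        # clip_skip=2 stops one layer early, etc.; webui then re-applies
        # the final layer norm to the earlier hidden state.
        embeddings = outputs.hidden_states[-clip_skip]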
@@ -554,7 +564,9 @@ class WebUIStableDiffusionPipeline(DiffusionPipeline):
                     cross_attention_kwargs=cross_attention_kwargs,
                 ).sample
                 noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                noise_pred = noise_pred_uncond + weight * guidance_scale * (noise_pred_text - noise_pred_uncond)
+                noise_pred = noise_pred_uncond + weight * guidance_scale * (
+                    noise_pred_text - noise_pred_uncond
+                )
             else:
                 noise_pred = self.unet(
                     latent_model_input,
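The reflowed line is standard classifier-free guidance, eps = eps_uncond + weight * guidance_scale * (eps_text - eps_uncond), where `weight` is this pipeline's extra per-prompt multiplier on top of the usual guidance scale. A tiny numeric check:

    import paddle

    guidance_scale, weight = 7.5, 1.0
    noise_pred_uncond = paddle.zeros([1, 4, 8, 8])
    noise_pred_text = paddle.ones([1, 4, 8, 8])
    noise_pred = noise_pred_uncond + weight * guidance_scale * (
        noise_pred_text - noise_pred_uncond
    )
    # every element of noise_pred == 7.5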
@@ -616,6 +628,7 @@ class WebUIStableDiffusionPipeline(DiffusionPipeline):
                 sub_layer.weight.copy_(sub_layer.backup_weights, True)
         self.weights_has_changed = False
 
+
 # clip.py
 import math
 from collections import namedtuple