Commit 4fe892a7 authored by kijai

fix quality

duh
Parent ffae2711
@@ -21,6 +21,7 @@ from ..modules.pose_net import PoseNet
from comfy.utils import ProgressBar
import comfy.model_management as mm
+from comfy.clip_vision import clip_preprocess
offload_device = mm.unet_offload_device()
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -126,25 +127,28 @@ class MimicMotionPipeline(DiffusionPipeline):
        do_classifier_free_guidance: bool):
        dtype = next(self.image_encoder.parameters()).dtype
-        if not isinstance(image, torch.Tensor):
-            image = self.image_processor.pil_to_numpy(image)
-            image = self.image_processor.numpy_to_pt(image)
-            # We normalize the image before resizing to match the original implementation.
-            # Then we unnormalize it after resizing.
-            image = image * 2.0 - 1.0
-            image = _resize_with_antialiasing(image, (224, 224))
-            image = (image + 1.0) / 2.0
-            # Normalize the image for CLIP input
-            image = self.feature_extractor(
-                images=image,
-                do_normalize=True,
-                do_center_crop=False,
-                do_resize=False,
-                do_rescale=False,
-                return_tensors="pt",
-            ).pixel_values
+        # if not isinstance(image, torch.Tensor):
+        #     image = self.image_processor.pil_to_numpy(image)
+        #     image = self.image_processor.numpy_to_pt(image)
+        #     # We normalize the image before resizing to match the original implementation.
+        #     # Then we unnormalize it after resizing.
+        #     image = image * 2.0 - 1.0
+        #     image = _resize_with_antialiasing(image, (224, 224))
+        #     image = (image + 1.0) / 2.0
+        #     # Normalize the image for CLIP input
+        #     image = self.feature_extractor(
+        #         images=image,
+        #         do_normalize=True,
+        #         do_center_crop=False,
+        #         do_resize=False,
+        #         do_rescale=False,
+        #         return_tensors="pt",
+        #     ).pixel_values
+        image = image.permute(0, 2, 3, 1)
+        image = clip_preprocess(image.clone(), 224)
        image = image.to(device=device, dtype=dtype)
        self.image_encoder.to(device)
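For reference, `clip_preprocess` from `comfy.clip_vision` expects channel-last input (hence the `permute`) and folds together the resize and CLIP mean/std normalization that the removed block did by hand. A minimal sketch of roughly equivalent manual steps, assuming the standard CLIP normalization constants and ignoring the aspect-preserving center crop the real helper may perform (`manual_clip_preprocess` is an illustrative name):

```python
import torch
import torch.nn.functional as F

# Standard CLIP normalization constants (assumption: clip_preprocess
# applies these internally, as in OpenAI's CLIP preprocessing).
CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
CLIP_STD = [0.26862954, 0.26130258, 0.27577711]

def manual_clip_preprocess(image: torch.Tensor, size: int = 224) -> torch.Tensor:
    """Illustrative stand-in: image is BHWC float in [0, 1]."""
    image = image.movedim(-1, 1)  # BHWC -> BCHW for interpolation
    image = F.interpolate(image, size=(size, size), mode="bicubic", antialias=True)
    mean = image.new_tensor(CLIP_MEAN).view(1, 3, 1, 1)
    std = image.new_tensor(CLIP_STD).view(1, 3, 1, 1)
    return (image - mean) / std
```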
@@ -159,6 +163,7 @@ class MimicMotionPipeline(DiffusionPipeline):
        if do_classifier_free_guidance:
            negative_image_embeddings = torch.zeros_like(image_embeddings)
+            #negative_image_embeddings = torch.randn_like(image_embeddings)
            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
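For context, the zeroed tensor acts as the unconditional image embedding for classifier-free guidance: the UNet runs on a doubled batch and the two predictions are blended. A minimal sketch of that blend, assuming the usual [unconditional, conditional] batch layout (`apply_cfg` and `guidance_scale` are illustrative names):

```python
import torch

def apply_cfg(noise_pred: torch.Tensor, guidance_scale: float) -> torch.Tensor:
    # The batch stacks [unconditional, conditional] predictions along dim 0.
    noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
    return noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
```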
@@ -638,7 +643,7 @@ class MimicMotionPipeline(DiffusionPipeline):
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
                    latents = callback_outputs.pop("latents", latents)
+        self.unet.to(offload_device)
        if not output_type == "latent":
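The `self.unet.to(offload_device)` call returns the UNet to the offload device once denoising is done, the usual ComfyUI VRAM-saving pattern. A minimal sketch of the idea, assuming `mm.get_torch_device()` returns the compute device (`run_with_offload` and `run_fn` are illustrative):

```python
import comfy.model_management as mm

def run_with_offload(unet, run_fn):
    # Move weights on-device for the compute-heavy phase,
    # then back to the offload device (typically CPU) to free VRAM.
    device = mm.get_torch_device()
    offload_device = mm.unet_offload_device()
    unet.to(device)
    try:
        return run_fn(unet)
    finally:
        unet.to(offload_device)
```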
......
@@ -7,6 +7,8 @@ import folder_paths
import comfy.model_management as mm
import comfy.utils
+from comfy.clip_vision import clip_preprocess
from diffusers.models import AutoencoderKLTemporalDecoder
from diffusers.schedulers import EulerDiscreteScheduler
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
@@ -183,21 +185,22 @@ class MimicMotionSampler:
        ref_image = ref_image.permute(0, 3, 1, 2)
        pose_images = pose_images.permute(0, 3, 1, 2)
-        if ref_image.shape[1:3] != (224, 224):
-            ref_img = comfy.utils.common_upscale(ref_image, 224, 224, "lanczos", "disabled")
-        else:
-            ref_img = ref_image
+        # if ref_image.shape[1:3] != (224, 224):
+        #     #ref_img = comfy.utils.common_upscale(ref_image, 224, 224, "lanczos", "disabled")
+        #     ref_img = clip_preprocess(ref_image, 224)
+        # else:
+        #     ref_img = ref_image
        pose_images = pose_images * 2 - 1
-        ref_img = ref_img.to(device).to(dtype)
+        ref_image = ref_image.to(device).to(dtype)
        pose_images = pose_images.to(device).to(dtype)
        generator = torch.Generator(device=device)
        generator.manual_seed(seed)
        frames = pipeline(
-            ref_img,
+            ref_image,
            image_pose=pose_images,
            num_frames=B,
            tile_size=context_size,
......
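Net effect on the node side: the Lanczos pre-scale to 224×224 is dropped and the full-resolution reference image is handed to the pipeline, which now applies `clip_preprocess` itself (first hunk above). A rough sketch of the tensor conventions involved, with the shapes as assumptions:

```python
import torch

# ComfyUI IMAGE tensors are BHWC float32 in [0, 1].
ref_image = torch.rand(1, 512, 512, 3)     # single reference frame
pose_images = torch.rand(16, 512, 512, 3)  # 16 pose frames

ref_image = ref_image.permute(0, 3, 1, 2)      # BHWC -> BCHW
pose_images = pose_images.permute(0, 3, 1, 2)
pose_images = pose_images * 2 - 1              # poses rescaled to [-1, 1]
# ref_image stays in [0, 1]; CLIP resizing/normalization now happens
# inside the pipeline via clip_preprocess, not in the node.
```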