Commit 4fe892a7 authored by kijai

fix quality

duh
Parent ffae2711
@@ -21,6 +21,7 @@ from ..modules.pose_net import PoseNet
 from comfy.utils import ProgressBar
 import comfy.model_management as mm
+from comfy.clip_vision import clip_preprocess
 offload_device = mm.unet_offload_device()
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
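For context, `comfy.model_management` supplies the device-placement helpers this commit relies on: `unet_offload_device()` picks where idle weights should be parked (typically the CPU). A minimal sketch of the load/offload pattern the pipeline follows, using a stand-in module instead of the real UNet (requires a ComfyUI environment):

import torch
import comfy.model_management as mm  # only available inside ComfyUI

device = mm.get_torch_device()             # execution device, e.g. cuda:0
offload_device = mm.unet_offload_device()  # parking device for idle weights

unet = torch.nn.Conv2d(4, 4, 3, padding=1)  # stand-in for the real UNet
unet.to(device)                             # load before the sampling loop
# ... run the denoising loop ...
unet.to(offload_device)                     # free VRAM once sampling is done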
@@ -126,25 +127,28 @@ class MimicMotionPipeline(DiffusionPipeline):
             do_classifier_free_guidance: bool):
         dtype = next(self.image_encoder.parameters()).dtype
-        if not isinstance(image, torch.Tensor):
-            image = self.image_processor.pil_to_numpy(image)
-            image = self.image_processor.numpy_to_pt(image)
-            # We normalize the image before resizing to match with the original implementation.
-            # Then we unnormalize it after resizing.
-            image = image * 2.0 - 1.0
-            image = _resize_with_antialiasing(image, (224, 224))
-            image = (image + 1.0) / 2.0
-            # Normalize the image for CLIP input
-            image = self.feature_extractor(
-                images=image,
-                do_normalize=True,
-                do_center_crop=False,
-                do_resize=False,
-                do_rescale=False,
-                return_tensors="pt",
-            ).pixel_values
+        # if not isinstance(image, torch.Tensor):
+        #     image = self.image_processor.pil_to_numpy(image)
+        #     image = self.image_processor.numpy_to_pt(image)
+        #     # We normalize the image before resizing to match with the original implementation.
+        #     # Then we unnormalize it after resizing.
+        #     image = image * 2.0 - 1.0
+        #     image = _resize_with_antialiasing(image, (224, 224))
+        #     image = (image + 1.0) / 2.0
+        #     # Normalize the image for CLIP input
+        #     image = self.feature_extractor(
+        #         images=image,
+        #         do_normalize=True,
+        #         do_center_crop=False,
+        #         do_resize=False,
+        #         do_rescale=False,
+        #         return_tensors="pt",
+        #     ).pixel_values
+        image = image.permute(0, 2, 3, 1)
+        image = clip_preprocess(image.clone(), 224)
         image = image.to(device=device, dtype=dtype)
         self.image_encoder.to(device)
@@ -159,6 +163,7 @@ class MimicMotionPipeline(DiffusionPipeline):
         if do_classifier_free_guidance:
             negative_image_embeddings = torch.zeros_like(image_embeddings)
+            #negative_image_embeddings = torch.randn_like(image_embeddings)
             # For classifier free guidance, we need to do two forward passes.
             # Here we concatenate the unconditional and text embeddings into a single batch
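The newly added commented-out `randn_like` line preserves an experiment with random rather than zeroed unconditional embeddings. The surrounding code follows the standard classifier-free-guidance pattern the comments describe; here is a toy, self-contained sketch of that combine (all shapes and the `guidance_scale` value are illustrative, not taken from this repo):

import torch

# Zeroed unconditional embeddings are batched with the conditional ones,
# so one UNet forward pass serves both branches.
image_embeddings = torch.randn(1, 1, 1024)          # stand-in CLIP embedding
negative_image_embeddings = torch.zeros_like(image_embeddings)
encoder_hidden_states = torch.cat([negative_image_embeddings, image_embeddings])

# Stand-in for the batched UNet output (unconditional first, conditional second):
noise_pred = torch.randn(2, 4, 8, 8)
noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
guidance_scale = 2.0
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)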
@@ -638,7 +643,7 @@ class MimicMotionPipeline(DiffusionPipeline):
                     callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
                     latents = callback_outputs.pop("latents", latents)
         self.unet.to(offload_device)
         if not output_type == "latent":
...
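The main change in this file swaps the diffusers resize-and-feature-extractor path for ComfyUI's `clip_preprocess`. As a rough approximation of what that helper does (a sketch, not the ComfyUI source; resize and crop details can differ between versions): it takes a BHWC float image in [0, 1], scales the short side to the target size, center-crops, and normalizes with the standard OpenAI CLIP mean/std, returning a BCHW tensor ready for the vision encoder:

import torch
import torch.nn.functional as F

# Standard OpenAI CLIP normalization constants.
CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
CLIP_STD = [0.26862954, 0.26130258, 0.27577711]

def clip_preprocess_sketch(image: torch.Tensor, size: int = 224) -> torch.Tensor:
    # BHWC in [0, 1] -> BCHW for interpolation.
    image = image.movedim(-1, 1)
    b, c, h, w = image.shape
    # Scale the short side to `size`, then center-crop to size x size.
    scale = size / min(h, w)
    image = F.interpolate(image, size=(round(h * scale), round(w * scale)),
                          mode="bicubic", antialias=True)
    h, w = image.shape[2], image.shape[3]
    top, left = (h - size) // 2, (w - size) // 2
    image = image[:, :, top:top + size, left:left + size]
    # Normalize with CLIP mean/std; output is BCHW.
    mean = torch.tensor(CLIP_MEAN, dtype=image.dtype).view(1, -1, 1, 1)
    std = torch.tensor(CLIP_STD, dtype=image.dtype).view(1, -1, 1, 1)
    return (image - mean) / std

Compared with the removed lanczos resize to a square 224x224, this preserves the reference image's aspect ratio before cropping, which is presumably the "quality" fix the commit message refers to.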
@@ -7,6 +7,8 @@ import folder_paths
 import comfy.model_management as mm
 import comfy.utils
+from comfy.clip_vision import clip_preprocess
 from diffusers.models import AutoencoderKLTemporalDecoder
 from diffusers.schedulers import EulerDiscreteScheduler
 from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
@@ -183,21 +185,22 @@ class MimicMotionSampler:
         ref_image = ref_image.permute(0, 3, 1, 2)
         pose_images = pose_images.permute(0, 3, 1, 2)
-        if ref_image.shape[1:3] != (224, 224):
-            ref_img = comfy.utils.common_upscale(ref_image, 224, 224, "lanczos", "disabled")
-        else:
-            ref_img = ref_image
+        # if ref_image.shape[1:3] != (224, 224):
+        #     #ref_img = comfy.utils.common_upscale(ref_image, 224, 224, "lanczos", "disabled")
+        #     ref_img = clip_preprocess(ref_image, 224)
+        # else:
+        #     ref_img = ref_image
         pose_images = pose_images * 2 - 1
-        ref_img = ref_img.to(device).to(dtype)
+        ref_image = ref_image.to(device).to(dtype)
         pose_images = pose_images.to(device).to(dtype)
         generator = torch.Generator(device=device)
         generator.manual_seed(seed)
         frames = pipeline(
-            ref_img,
+            ref_image,
             image_pose=pose_images,
             num_frames=B,
             tile_size = context_size,
...
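On the node side the resize branch is dropped entirely: the sampler now passes the raw `ref_image` through and lets the pipeline's CLIP-encoding step (first file above) do the preprocessing. A hypothetical shape trace of that handoff, assuming a 512x512 ComfyUI IMAGE input and a ComfyUI environment for the import:

import torch
from comfy.clip_vision import clip_preprocess  # requires a ComfyUI environment

ref_image = torch.rand(1, 512, 512, 3)       # ComfyUI IMAGE tensor: BHWC in [0, 1]
ref_image = ref_image.permute(0, 3, 1, 2)    # node: BHWC -> BCHW, (1, 3, 512, 512)

# Inside the pipeline's image-encoding step:
image = ref_image.permute(0, 2, 3, 1)        # back to BHWC, (1, 512, 512, 3)
image = clip_preprocess(image.clone(), 224)  # BCHW, (1, 3, 224, 224), CLIP-normalized
print(image.shape)                           # torch.Size([1, 3, 224, 224])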