提交 b546cfc3 作者: kijai

bigger update

上级 4f5929a8
{
"last_node_id": 24,
"last_link_id": 54,
"last_node_id": 51,
"last_link_id": 128,
"nodes": [
{
"id": 9,
"type": "GetImageSizeAndCount",
"id": 42,
"type": "MimicMotionGetPoses",
"pos": [
841,
663
327,
702
],
"size": {
"0": 210,
"1": 86
"0": 330,
"1": 126
},
"flags": {},
"order": 7,
"order": 9,
"mode": 0,
"inputs": [
{
"name": "image",
"name": "ref_image",
"type": "IMAGE",
"link": 53
"link": 110
},
{
"name": "pose_images",
"type": "IMAGE",
"link": 111
}
],
"outputs": [
{
"name": "image",
"name": "poses_with_ref",
"type": "IMAGE",
"links": [
8
112,
114
],
"shape": 3,
"slot_index": 0
},
{
"name": "512 width",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "768 height",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "32 count",
"type": "INT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "GetImageSizeAndCount"
}
},
{
"id": 2,
"type": "DownloadAndLoadMimicMotionModel",
"pos": [
352,
352
],
"size": {
"0": 315,
"1": 82
},
"flags": {},
"order": 0,
"mode": 0,
"outputs": [
{
"name": "mimic_pipeline",
"type": "MIMICPIPE",
"name": "pose_images",
"type": "IMAGE",
"links": [
1
113
],
"shape": 3
"shape": 3,
"slot_index": 1
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadMimicMotionModel"
"Node name for S&R": "MimicMotionGetPoses"
},
"widgets_values": [
"MimicMotion-fp16.safetensors",
"fp16"
true,
true,
true
]
},
{
"id": 12,
"type": "PrepImageForClipVision",
"id": 5,
"type": "VHS_LoadVideo",
"pos": [
359,
498
-402,
787
],
"size": [
235.1999969482422,
658.5777723524305
],
"size": {
"0": 315,
"1": 106
},
"flags": {},
"order": 3,
"order": 0,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 12
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
}
],
"outputs": [
......@@ -114,282 +84,257 @@
"name": "IMAGE",
"type": "IMAGE",
"links": [
13
86
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "PrepImageForClipVision"
},
"widgets_values": [
"LANCZOS",
"center",
0
]
},
{
"id": 3,
"type": "LoadImage",
"pos": [
-8,
449
],
"size": {
"0": 315,
"1": 314.0000305175781
},
"flags": {},
"order": 1,
"mode": 0,
"outputs": [
},
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
12,
36
],
"shape": 3,
"slot_index": 0
"name": "frame_count",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "MASK",
"type": "MASK",
"name": "audio",
"type": "VHS_AUDIO",
"links": null,
"shape": 3
},
{
"name": "video_info",
"type": "VHS_VIDEOINFO",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "LoadImage"
"Node name for S&R": "VHS_LoadVideo"
},
"widgets_values": [
"demo1 (1).jpg",
"image"
]
"widgets_values": {
"video": "pose1.mp4",
"force_rate": 0,
"force_size": "Disabled",
"custom_width": 512,
"custom_height": 512,
"frame_load_cap": 16,
"skip_first_frames": 0,
"select_every_nth": 3,
"choose video to upload": "image",
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"frame_load_cap": 16,
"skip_first_frames": 0,
"force_rate": 0,
"filename": "pose1.mp4",
"type": "input",
"format": "video/mp4",
"select_every_nth": 3
}
}
}
},
{
"id": 17,
"type": "ImageConcatMulti",
"id": 35,
"type": "ImageResizeKJ",
"pos": [
1752,
883
-75,
781
],
"size": {
"0": 210,
"1": 170
"0": 315,
"1": 242
},
"flags": {},
"order": 11,
"order": 7,
"mode": 0,
"inputs": [
{
"name": "image_1",
"name": "image",
"type": "IMAGE",
"link": 49
"link": 86
},
{
"name": "image_2",
"name": "get_image_size",
"type": "IMAGE",
"link": 54
"link": null
},
{
"name": "image_3",
"type": "IMAGE",
"link": 51
"name": "width_input",
"type": "INT",
"link": 88,
"widget": {
"name": "width_input"
}
},
{
"name": "height_input",
"type": "INT",
"link": 89,
"widget": {
"name": "height_input"
}
}
],
"outputs": [
{
"name": "images",
"name": "IMAGE",
"type": "IMAGE",
"links": [
35
111
],
"shape": 3,
"slot_index": 0
},
{
"name": "width",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "height",
"type": "INT",
"links": null,
"shape": 3
}
],
"properties": {},
"properties": {
"Node name for S&R": "ImageResizeKJ"
},
"widgets_values": [
3,
"right",
576,
1024,
"lanczos",
false,
null
64,
0,
0
]
},
{
"id": 19,
"type": "VHS_SplitImages",
"id": 9,
"type": "GetImageSizeAndCount",
"pos": [
1356,
763
826,
505
],
"size": {
"0": 315,
"1": 118
"0": 210,
"1": 86
},
"flags": {},
"order": 10,
"mode": 0,
"inputs": [
{
"name": "images",
"name": "image",
"type": "IMAGE",
"link": 25
"link": 112
}
],
"outputs": [
{
"name": "IMAGE_A",
"name": "image",
"type": "IMAGE",
"links": null,
"shape": 3
"links": [
92
],
"shape": 3,
"slot_index": 0
},
{
"name": "A_count",
"name": "576 width",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "IMAGE_B",
"type": "IMAGE",
"links": [
51
],
"shape": 3,
"slot_index": 2
"name": "1024 height",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "B_count",
"name": "17 count",
"type": "INT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_SplitImages"
},
"widgets_values": {
"split_index": 1
"Node name for S&R": "GetImageSizeAndCount"
}
},
{
"id": 1,
"type": "MimicMotionSampler",
"id": 37,
"type": "VHS_VideoCombine",
"pos": [
1165,
426
723,
819
],
"size": [
440,
978.6666666666666
],
"size": {
"0": 307.6666259765625,
"1": 290
},
"flags": {},
"order": 9,
"order": 11,
"mode": 0,
"inputs": [
{
"name": "mimic_pipeline",
"type": "MIMICPIPE",
"link": 1,
"slot_index": 0
},
{
"name": "ref_image",
"name": "images",
"type": "IMAGE",
"link": 13,
"slot_index": 1
"link": 114
},
{
"name": "pose_images",
"type": "IMAGE",
"link": 8
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
25
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "MimicMotionSampler"
},
"widgets_values": [
25,
2,
2,
123,
"fixed",
15,
0.02,
0.02
]
},
{
"id": 6,
"type": "DWPreprocessor",
"pos": [
385,
854
],
"size": {
"0": 315,
"1": 198
},
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
"name": "audio",
"type": "VHS_AUDIO",
"link": null
},
{
"name": "image",
"type": "IMAGE",
"link": 5
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
42
],
"shape": 3,
"slot_index": 0
},
{
"name": "POSE_KEYPOINT",
"type": "POSE_KEYPOINT",
"name": "Filenames",
"type": "VHS_FILENAMES",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "DWPreprocessor"
"Node name for S&R": "VHS_VideoCombine"
},
"widgets_values": [
"enable",
"enable",
"enable",
512,
"yolox_l.torchscript.pt",
"dw-ll_ucoco_384_bs5.torchscript.pt"
]
"widgets_values": {
"frame_rate": 8,
"loop_count": 0,
"filename_prefix": "MimicPose",
"format": "image/webp",
"pingpong": false,
"save_output": false,
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"filename": "MimicPose_00001.webp",
"subfolder": "",
"type": "temp",
"format": "image/webp"
}
}
}
},
{
"id": 23,
"id": 28,
"type": "ImageResizeKJ",
"pos": [
784,
976
-71,
481
],
"size": {
"0": 315,
......@@ -402,7 +347,7 @@
{
"name": "image",
"type": "IMAGE",
"link": 42
"link": 61
},
{
"name": "get_image_size",
......@@ -431,8 +376,8 @@
"name": "IMAGE",
"type": "IMAGE",
"links": [
44,
53
95,
110
],
"shape": 3,
"slot_index": 0
......@@ -440,184 +385,207 @@
{
"name": "width",
"type": "INT",
"links": null,
"shape": 3
"links": [
88
],
"shape": 3,
"slot_index": 1
},
{
"name": "height",
"type": "INT",
"links": null,
"shape": 3
"links": [
89
],
"shape": 3,
"slot_index": 2
}
],
"properties": {
"Node name for S&R": "ImageResizeKJ"
},
"widgets_values": [
512,
768,
"nearest-exact",
576,
1024,
"lanczos",
false,
8,
64,
0,
0
]
},
{
"id": 21,
"type": "ImageResizeKJ",
"id": 49,
"type": "PreviewImage",
"pos": [
1212,
1157
290,
251
],
"size": [
210,
246
],
"size": {
"0": 315,
"1": 242
},
"flags": {},
"order": 8,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 36
},
{
"name": "get_image_size",
"name": "images",
"type": "IMAGE",
"link": 37
},
{
"name": "width_input",
"type": "INT",
"link": null,
"widget": {
"name": "width_input"
}
},
{
"name": "height_input",
"type": "INT",
"link": null,
"widget": {
"name": "height_input"
}
"link": 125
}
],
"properties": {
"Node name for S&R": "PreviewImage"
}
},
{
"id": 2,
"type": "DownloadAndLoadMimicMotionModel",
"pos": [
764,
229
],
"size": {
"0": 315,
"1": 82
},
"flags": {},
"order": 1,
"mode": 0,
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"name": "mimic_pipeline",
"type": "MIMICPIPE",
"links": [
49
1
],
"shape": 3,
"slot_index": 0
},
{
"name": "width",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "height",
"type": "INT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "ImageResizeKJ"
"Node name for S&R": "DownloadAndLoadMimicMotionModel"
},
"widgets_values": [
512,
512,
"nearest-exact",
false,
2,
0,
0
"MimicMotion-fp16.safetensors",
"fp16"
]
},
{
"id": 20,
"type": "VHS_SplitImages",
"id": 50,
"type": "Note",
"pos": [
281,
104
],
"size": [
293.73750640869093,
101.2688590393065
],
"flags": {},
"order": 2,
"mode": 0,
"properties": {
"text": ""
},
"widgets_values": [
"ref_image needs to be 224x224 for clip_vision\ndefault seems to be just resizing, you can try other methods like cropping as well"
],
"color": "#432",
"bgcolor": "#653"
},
{
"id": 51,
"type": "Note",
"pos": [
779,
105
],
"size": [
290.12339111328083,
69.71562744140618
],
"flags": {},
"order": 3,
"mode": 0,
"properties": {
"text": ""
},
"widgets_values": [
"Downloads MimicMotion model and fp16 version of SVD XT 1.1"
],
"color": "#432",
"bgcolor": "#653"
},
{
"id": 17,
"type": "ImageConcatMulti",
"pos": [
1181,
974
1186,
774
],
"size": {
"0": 315,
"1": 118
"0": 210,
"1": 170
},
"flags": {},
"order": 6,
"order": 13,
"mode": 0,
"inputs": [
{
"name": "images",
"name": "image_1",
"type": "IMAGE",
"link": 95
},
{
"name": "image_2",
"type": "IMAGE",
"link": 113
},
{
"name": "image_3",
"type": "IMAGE",
"link": 44
"link": 99
}
],
"outputs": [
{
"name": "IMAGE_A",
"type": "IMAGE",
"links": null,
"shape": 3
},
{
"name": "A_count",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "IMAGE_B",
"name": "images",
"type": "IMAGE",
"links": [
37,
54
93
],
"shape": 3,
"slot_index": 2
},
{
"name": "B_count",
"type": "INT",
"links": null,
"shape": 3
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "VHS_SplitImages"
},
"widgets_values": {
"split_index": 1
}
"properties": {},
"widgets_values": [
3,
"right",
false,
null
]
},
{
"id": 16,
"type": "VHS_VideoCombine",
"pos": [
2010,
480
1452,
196
],
"size": [
1004.800048828125,
796.4000244140625
1530.494967759278,
1199.1081290425352
],
"flags": {},
"order": 12,
"order": 14,
"mode": 0,
"inputs": [
{
"name": "images",
"type": "IMAGE",
"link": 35
"link": 93
},
{
"name": "audio",
......@@ -644,44 +612,163 @@
"widgets_values": {
"frame_rate": 8,
"loop_count": 0,
"filename_prefix": "AnimateDiff",
"filename_prefix": "MimicMotion",
"format": "video/h264-mp4",
"pix_fmt": "yuv420p",
"crf": 19,
"save_metadata": true,
"pingpong": false,
"save_output": true,
"save_output": false,
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"filename": "AnimateDiff_00027.mp4",
"filename": "MimicMotion_00001.mp4",
"subfolder": "",
"type": "output",
"type": "temp",
"format": "video/h264-mp4"
}
}
}
},
{
"id": 5,
"type": "VHS_LoadVideo",
"id": 1,
"type": "MimicMotionSampler",
"pos": [
1101,
419
],
"size": {
"0": 307.6666259765625,
"1": 290
},
"flags": {},
"order": 12,
"mode": 0,
"inputs": [
{
"name": "mimic_pipeline",
"type": "MIMICPIPE",
"link": 1,
"slot_index": 0
},
{
"name": "ref_image",
"type": "IMAGE",
"link": 128,
"slot_index": 1
},
{
"name": "pose_images",
"type": "IMAGE",
"link": 92
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
99
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "MimicMotionSampler"
},
"widgets_values": [
25,
851
2,
2,
123,
"fixed",
15,
0.02,
true
]
},
{
"id": 3,
"type": "LoadImage",
"pos": [
-393,
311
],
"size": {
"0": 213.0849151611328,
"1": 410.70074462890625
},
"flags": {},
"order": 4,
"mode": 0,
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
61,
124
],
"shape": 3,
"slot_index": 0
},
{
"name": "MASK",
"type": "MASK",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "LoadImage"
},
"widgets_values": [
"demo1.jpg",
"image"
]
},
{
"id": 48,
"type": "ImageResizeKJ",
"pos": [
-57,
170
],
"size": [
235.1999969482422,
658.5777723524305
315,
242
],
"flags": {},
"order": 2,
"order": 6,
"mode": 0,
"inputs": [
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"name": "image",
"type": "IMAGE",
"link": 124
},
{
"name": "get_image_size",
"type": "IMAGE",
"link": null
},
{
"name": "width_input",
"type": "INT",
"link": null,
"widget": {
"name": "width_input"
}
},
{
"name": "height_input",
"type": "INT",
"link": null,
"widget": {
"name": "height_input"
}
}
],
"outputs": [
......@@ -689,57 +776,39 @@
"name": "IMAGE",
"type": "IMAGE",
"links": [
5
125,
128
],
"shape": 3,
"slot_index": 0
},
{
"name": "frame_count",
"name": "width",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "audio",
"type": "VHS_AUDIO",
"links": null,
"shape": 3
"links": [],
"shape": 3,
"slot_index": 1
},
{
"name": "video_info",
"type": "VHS_VIDEOINFO",
"links": null,
"shape": 3
"name": "height",
"type": "INT",
"links": [],
"shape": 3,
"slot_index": 2
}
],
"properties": {
"Node name for S&R": "VHS_LoadVideo"
"Node name for S&R": "ImageResizeKJ"
},
"widgets_values": {
"video": "pose1.mp4",
"force_rate": 0,
"force_size": "Disabled",
"custom_width": 512,
"custom_height": 512,
"frame_load_cap": 16,
"skip_first_frames": 0,
"select_every_nth": 2,
"choose video to upload": "image",
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"frame_load_cap": 16,
"skip_first_frames": 0,
"force_rate": 0,
"filename": "pose1.mp4",
"type": "input",
"format": "video/mp4",
"select_every_nth": 2
}
}
}
"widgets_values": [
224,
224,
"lanczos",
false,
64,
0,
0
]
}
],
"links": [
......@@ -752,114 +821,130 @@
"MIMICPIPE"
],
[
5,
5,
61,
3,
0,
6,
28,
0,
"IMAGE"
],
[
8,
9,
86,
5,
0,
35,
0,
"IMAGE"
],
[
88,
28,
1,
35,
2,
"IMAGE"
"INT"
],
[
12,
89,
28,
2,
35,
3,
0,
12,
0,
"IMAGE"
"INT"
],
[
13,
12,
92,
9,
0,
1,
1,
2,
"IMAGE"
],
[
25,
1,
93,
17,
0,
19,
16,
0,
"IMAGE"
],
[
35,
95,
28,
0,
17,
0,
16,
"IMAGE"
],
[
99,
1,
0,
17,
2,
"IMAGE"
],
[
36,
3,
110,
28,
0,
21,
42,
0,
"IMAGE"
],
[
37,
20,
2,
21,
111,
35,
0,
42,
1,
"IMAGE"
],
[
112,
42,
6,
0,
23,
9,
0,
"IMAGE"
],
[
44,
23,
0,
20,
0,
113,
42,
1,
17,
1,
"IMAGE"
],
[
49,
21,
114,
42,
0,
17,
37,
0,
"IMAGE"
],
[
51,
19,
2,
17,
2,
124,
3,
0,
48,
0,
"IMAGE"
],
[
53,
23,
125,
48,
0,
9,
49,
0,
"IMAGE"
],
[
54,
20,
2,
17,
128,
48,
0,
1,
1,
"IMAGE"
]
......@@ -868,10 +953,10 @@
"config": {},
"extra": {
"ds": {
"scale": 0.7513148009015777,
"scale": 0.5644739300537774,
"offset": {
"0": 52.74061584472656,
"1": -197.62571716308594
"0": 738.7613525390625,
"1": 31.221477508544922
}
}
},
......
......@@ -20,6 +20,8 @@ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
from ..modules.pose_net import PoseNet
from comfy.utils import ProgressBar
import comfy.model_management as mm
offload_device = mm.unet_offload_device()
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
......@@ -145,8 +147,10 @@ class MimicMotionPipeline(DiffusionPipeline):
).pixel_values
image = image.to(device=device, dtype=dtype)
self.image_encoder.to(device)
image_embeddings = self.image_encoder(image).image_embeds
image_embeddings = image_embeddings.unsqueeze(1)
self.image_encoder.to(offload_device)
# duplicate image embeddings for each generation per prompt, using mps friendly method
bs_embed, seq_len, _ = image_embeddings.shape
......@@ -189,7 +193,9 @@ class MimicMotionPipeline(DiffusionPipeline):
do_classifier_free_guidance: bool,
):
image = image.to(device=device)
self.vae.to(device)
image_latents = self.vae.encode(image).latent_dist.mode()
self.vae.to(offload_device)
if do_classifier_free_guidance:
negative_image_latents = torch.zeros_like(image_latents)
......@@ -256,7 +262,10 @@ class MimicMotionPipeline(DiffusionPipeline):
# we only pass num_frames_in if it's expected
decode_kwargs["num_frames"] = num_frames_in
self.vae.to(latents.device)
frame = self.vae.decode(latents[i: i + decode_chunk_size], **decode_kwargs).sample
self.vae.to(offload_device)
frames.append(frame.cpu())
frames = torch.cat(frames, dim=0)
......@@ -568,6 +577,8 @@ class MimicMotionPipeline(DiffusionPipeline):
self._guidance_scale = guidance_scale
# 8. Denoising loop
self.unet.to(device)
self._num_timesteps = len(timesteps)
pose_latents = einops.rearrange(pose_latents, '(b f) c h w -> b f c h w', f=num_frames)
indices = [[0, *range(i + 1, min(i + tile_size, num_frames))] for i in
......@@ -627,6 +638,8 @@ class MimicMotionPipeline(DiffusionPipeline):
callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
latents = callback_outputs.pop("latents", latents)
self.unet.to(offload_device)
if not output_type == "latent":
# cast back to fp16 if needed
......
import os
from omegaconf import OmegaConf
import torch
import torch.nn.functional as F
import sys
import numpy as np
import gc
script_directory = os.path.dirname(os.path.abspath(__file__))
sys.path.append(script_directory)
from einops import repeat
import folder_paths
import comfy.model_management as mm
import comfy.utils
from contextlib import nullcontext
try:
from accelerate import init_empty_weights
is_accelerate_available = True
except:
pass
from mimicmotion.pipelines.pipeline_mimicmotion import MimicMotionPipeline
from diffusers.models import AutoencoderKLTemporalDecoder
from diffusers.schedulers import EulerDiscreteScheduler
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
script_directory = os.path.dirname(os.path.abspath(__file__))
sys.path.append(script_directory)
from mimicmotion.pipelines.pipeline_mimicmotion import MimicMotionPipeline
from mimicmotion.modules.unet import UNetSpatioTemporalConditionModel
from mimicmotion.modules.pose_net import PoseNet
......@@ -65,6 +55,7 @@ class DownloadAndLoadMimicMotionModel:
], {
"default": 'fp16'
}),
},
}
......@@ -77,6 +68,8 @@ class DownloadAndLoadMimicMotionModel:
device = mm.get_torch_device()
mm.soft_empty_cache()
dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision]
pbar = comfy.utils.ProgressBar(3)
download_path = os.path.join(folder_paths.models_dir, "mimicmotion")
model_path = os.path.join(download_path, model)
......@@ -89,13 +82,20 @@ class DownloadAndLoadMimicMotionModel:
local_dir=download_path,
local_dir_use_symlinks=False)
ckpt_base_name = os.path.basename(model_path)
print(f"Loading model from: {model_path}")
pbar.update(1)
svd_path = os.path.join(folder_paths.models_dir, "diffusers", "stable-video-diffusion-img2vid-xt-1-1")
if not os.path.exists(svd_path):
raise ValueError(f"Please download stable-video-diffusion-img2vid-xt-1-1 to {svd_path}")
#raise ValueError(f"Please download stable-video-diffusion-img2vid-xt-1-1 to {svd_path}")
print(f"Downloading SVD model to: {model_path}")
from huggingface_hub import snapshot_download
snapshot_download(repo_id="vdo/stable-video-diffusion-img2vid-xt-1-1",
allow_patterns=[f"*.json", "*fp16*"],
local_dir=svd_path,
local_dir_use_symlinks=False)
pbar.update(1)
mimicmotion_models = MimicMotionModel(svd_path).to(device=device).eval()
mimicmotion_models.load_state_dict(comfy.utils.load_torch_file(model_path), strict=False)
......@@ -108,16 +108,18 @@ class DownloadAndLoadMimicMotionModel:
feature_extractor=mimicmotion_models.feature_extractor,
pose_net=mimicmotion_models.pose_net,
)
pipeline.unet.to(dtype)
pipeline.pose_net.to(dtype)
pipeline.vae.to(dtype)
pipeline.image_encoder.to(dtype)
pipeline.pose_net.to(dtype)
pipeline.pose_net.to(dtype)
mimic_model = {
'pipeline': pipeline,
'dtype': dtype
}
pbar.update(1)
return (mimic_model,)
class MimicMotionSampler:
......@@ -151,16 +153,26 @@ class MimicMotionSampler:
pipeline = mimic_pipeline['pipeline']
B, H, W, C = pose_images.shape
ref_image = ref_image.permute(0, 3, 1, 2).to(device).to(dtype)
pose_images = pose_images.permute(0, 3, 1, 2).to(device).to(dtype)
ref_image = ref_image * 2 - 1
ref_image = ref_image.permute(0, 3, 1, 2)
pose_images = pose_images.permute(0, 3, 1, 2)
if ref_image.shape[1:3] != (224, 224):
ref_img = comfy.utils.common_upscale(ref_image, 224, 224, "lanczos", "disabled")
else:
ref_img = ref_image
ref_img = ref_img * 2 - 1
pose_images = pose_images * 2 - 1
ref_img = ref_img.to(device).to(dtype)
pose_images = pose_images.to(device).to(dtype)
generator = torch.Generator(device=device)
generator.manual_seed(seed)
frames = pipeline(
ref_image,
ref_img,
image_pose=pose_images,
num_frames=B,
tile_size = 16,
......@@ -177,8 +189,14 @@ class MimicMotionSampler:
output_type="pt",
device=device
).frames
frames = frames.squeeze(0).permute(0, 2, 3, 1).cpu().float()
print(frames.shape)
frames = frames.squeeze(0)[1:].permute(0, 2, 3, 1).cpu().float()
if not keep_model_loaded:
pipeline.unet.to(offload_device)
pipeline.vae.to(offload_device)
mm.soft_empty_cache()
gc.collect()
return frames,
......@@ -194,8 +212,8 @@ class MimicMotionGetPoses:
},
}
RETURN_TYPES = ("IMAGE",)
RETURN_NAMES = ("images",)
RETURN_TYPES = ("IMAGE", "IMAGE",)
RETURN_NAMES = ("poses_with_ref", "pose_images")
FUNCTION = "process"
CATEGORY = "MimicMotionWrapper"
......@@ -246,9 +264,11 @@ class MimicMotionGetPoses:
pose_images_np = pose_images.cpu().numpy() * 255
# read input video
pbar = comfy.utils.ProgressBar(len(pose_images_np))
detected_poses_np_list = []
for img_np in pose_images_np:
detected_poses_np_list.append(dwprocessor(img_np))
pbar.update(1)
detected_bodies = np.stack(
[p['bodies']['candidate'] for p in detected_poses_np_list if p['bodies']['candidate'].shape[0] == 18])[:,
......@@ -277,10 +297,7 @@ class MimicMotionGetPoses:
output_tensor = torch.cat((ref_pose_tensor.unsqueeze(0), output_tensor))
output_tensor = output_tensor.permute(0, 2, 3, 1).cpu().float()
return output_tensor,
return output_tensor, output_tensor[1:]
NODE_CLASS_MAPPINGS = {
"DownloadAndLoadMimicMotionModel": DownloadAndLoadMimicMotionModel,
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论