提交 97197011 authored 作者: kijai's avatar kijai

separate decode phase

上级 e3933422
{
"last_node_id": 54,
"last_link_id": 139,
"last_node_id": 58,
"last_link_id": 151,
"nodes": [
{
"id": 51,
"type": "Note",
"pos": [
766,
105
],
"size": {
"0": 290.1233825683594,
"1": 69.71562957763672
},
"flags": {},
"order": 0,
"mode": 0,
"properties": {
"text": ""
},
"widgets_values": [
"Downloads MimicMotion model and fp16 version of SVD XT 1.1"
],
"color": "#432",
"bgcolor": "#653"
},
{
"id": 9,
"type": "GetImageSizeAndCount",
......@@ -51,25 +28,25 @@
"name": "image",
"type": "IMAGE",
"links": [
92
148
],
"shape": 3,
"slot_index": 0
},
{
"name": "1024 width",
"name": "576 width",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "576 height",
"name": "1024 height",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "129 count",
"name": "16 count",
"type": "INT",
"links": null,
"shape": 3
......@@ -91,14 +68,15 @@
"1": 106
},
"flags": {},
"order": 1,
"order": 0,
"mode": 0,
"outputs": [
{
"name": "mimic_pipeline",
"type": "MIMICPIPE",
"links": [
1
146,
150
],
"shape": 3
}
......@@ -112,122 +90,6 @@
false
]
},
{
"id": 17,
"type": "ImageConcatMulti",
"pos": [
1212,
865
],
"size": {
"0": 210,
"1": 190
},
"flags": {},
"order": 10,
"mode": 0,
"inputs": [
{
"name": "image_1",
"type": "IMAGE",
"link": 95
},
{
"name": "image_2",
"type": "IMAGE",
"link": 137
},
{
"name": "image_3",
"type": "IMAGE",
"link": 138
},
{
"name": "image_4",
"type": "IMAGE",
"link": 139
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
93
],
"shape": 3,
"slot_index": 0
}
],
"properties": {},
"widgets_values": [
4,
"right",
false,
null
]
},
{
"id": 37,
"type": "VHS_VideoCombine",
"pos": [
653,
958
],
"size": [
440,
468.25
],
"flags": {},
"order": 8,
"mode": 0,
"inputs": [
{
"name": "images",
"type": "IMAGE",
"link": 114
},
{
"name": "audio",
"type": "VHS_AUDIO",
"link": null
},
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
}
],
"outputs": [
{
"name": "Filenames",
"type": "VHS_FILENAMES",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_VideoCombine"
},
"widgets_values": {
"frame_rate": 8,
"loop_count": 0,
"filename_prefix": "MimicPose",
"format": "image/webp",
"pingpong": false,
"save_output": false,
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"filename": "MimicPose_00001.webp",
"subfolder": "",
"type": "temp",
"format": "image/webp"
}
}
}
},
{
"id": 42,
"type": "MimicMotionGetPoses",
......@@ -284,70 +146,6 @@
true
]
},
{
"id": 16,
"type": "VHS_VideoCombine",
"pos": [
1483,
95
],
"size": [
2861.6603258383248,
703.6084833210144
],
"flags": {},
"order": 11,
"mode": 0,
"inputs": [
{
"name": "images",
"type": "IMAGE",
"link": 93
},
{
"name": "audio",
"type": "VHS_AUDIO",
"link": null
},
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
}
],
"outputs": [
{
"name": "Filenames",
"type": "VHS_FILENAMES",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_VideoCombine"
},
"widgets_values": {
"frame_rate": 12,
"loop_count": 0,
"filename_prefix": "MimicMotion",
"format": "video/h264-mp4",
"pix_fmt": "yuv420p",
"crf": 19,
"save_metadata": true,
"pingpong": false,
"save_output": false,
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"filename": "MimicMotion_00010.mp4",
"subfolder": "",
"type": "temp",
"format": "video/h264-mp4"
}
}
}
},
{
"id": 3,
"type": "LoadImage",
......@@ -360,7 +158,7 @@
"1": 410.70074462890625
},
"flags": {},
"order": 2,
"order": 1,
"mode": 0,
"outputs": [
{
......@@ -388,24 +186,45 @@
]
},
{
"id": 5,
"type": "VHS_LoadVideo",
"id": 35,
"type": "ImageResizeKJ",
"pos": [
-402,
787
],
"size": [
235.1999969482422,
658.5777723524305
-75,
781
],
"size": {
"0": 315,
"1": 242
},
"flags": {},
"order": 3,
"order": 5,
"mode": 0,
"inputs": [
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"name": "image",
"type": "IMAGE",
"link": 86
},
{
"name": "get_image_size",
"type": "IMAGE",
"link": null
},
{
"name": "width_input",
"type": "INT",
"link": 88,
"widget": {
"name": "width_input"
}
},
{
"name": "height_input",
"type": "INT",
"link": 89,
"widget": {
"name": "height_input"
}
}
],
"outputs": [
......@@ -413,77 +232,57 @@
"name": "IMAGE",
"type": "IMAGE",
"links": [
86
111,
137
],
"shape": 3,
"slot_index": 0
},
{
"name": "frame_count",
"name": "width",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "audio",
"type": "VHS_AUDIO",
"links": null,
"shape": 3
},
{
"name": "video_info",
"type": "VHS_VIDEOINFO",
"name": "height",
"type": "INT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_LoadVideo"
"Node name for S&R": "ImageResizeKJ"
},
"widgets_values": {
"video": "pose1.mp4",
"force_rate": 0,
"force_size": "Disabled",
"custom_width": 512,
"custom_height": 512,
"frame_load_cap": 15,
"skip_first_frames": 0,
"select_every_nth": 2,
"choose video to upload": "image",
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"frame_load_cap": 15,
"skip_first_frames": 0,
"force_rate": 0,
"filename": "pose1.mp4",
"type": "input",
"format": "video/mp4",
"select_every_nth": 2
}
}
}
"widgets_values": [
576,
1024,
"lanczos",
false,
64,
0,
0
]
},
{
"id": 35,
"id": 28,
"type": "ImageResizeKJ",
"pos": [
-75,
781
-71,
481
],
"size": {
"0": 315,
"1": 242
},
"flags": {},
"order": 5,
"order": 4,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 86
"link": 61
},
{
"name": "get_image_size",
......@@ -493,7 +292,7 @@
{
"name": "width_input",
"type": "INT",
"link": 88,
"link": null,
"widget": {
"name": "width_input"
}
......@@ -501,7 +300,7 @@
{
"name": "height_input",
"type": "INT",
"link": 89,
"link": null,
"widget": {
"name": "height_input"
}
......@@ -512,8 +311,9 @@
"name": "IMAGE",
"type": "IMAGE",
"links": [
111,
137
95,
110,
147
],
"shape": 3,
"slot_index": 0
......@@ -521,14 +321,20 @@
{
"name": "width",
"type": "INT",
"links": null,
"shape": 3
"links": [
88
],
"shape": 3,
"slot_index": 1
},
{
"name": "height",
"type": "INT",
"links": null,
"shape": 3
"links": [
89
],
"shape": 3,
"slot_index": 2
}
],
"properties": {
......@@ -538,107 +344,185 @@
576,
1024,
"lanczos",
false,
true,
64,
0,
0
]
},
{
"id": 28,
"type": "ImageResizeKJ",
"id": 16,
"type": "VHS_VideoCombine",
"pos": [
-71,
481
1895,
155
],
"size": [
2861.660400390625,
1566.960177951389
],
"flags": {},
"order": 12,
"mode": 0,
"inputs": [
{
"name": "images",
"type": "IMAGE",
"link": 93
},
{
"name": "audio",
"type": "VHS_AUDIO",
"link": null
},
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
}
],
"outputs": [
{
"name": "Filenames",
"type": "VHS_FILENAMES",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_VideoCombine"
},
"widgets_values": {
"frame_rate": 12,
"loop_count": 0,
"filename_prefix": "MimicMotion",
"format": "video/h264-mp4",
"pix_fmt": "yuv420p",
"crf": 19,
"save_metadata": true,
"pingpong": false,
"save_output": false,
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"filename": "MimicMotion_00001.mp4",
"subfolder": "",
"type": "temp",
"format": "video/h264-mp4"
}
}
}
},
{
"id": 17,
"type": "ImageConcatMulti",
"pos": [
1644,
830
],
"size": {
"0": 315,
"1": 242
"0": 210,
"1": 190
},
"flags": {},
"order": 4,
"order": 11,
"mode": 0,
"inputs": [
{
"name": "image",
"name": "image_1",
"type": "IMAGE",
"link": 61
"link": 95
},
{
"name": "get_image_size",
"name": "image_2",
"type": "IMAGE",
"link": null
"link": 137
},
{
"name": "width_input",
"type": "INT",
"link": null,
"widget": {
"name": "width_input"
}
"name": "image_3",
"type": "IMAGE",
"link": 138
},
{
"name": "height_input",
"type": "INT",
"link": null,
"widget": {
"name": "height_input"
}
"name": "image_4",
"type": "IMAGE",
"link": 151
}
],
"outputs": [
{
"name": "IMAGE",
"name": "images",
"type": "IMAGE",
"links": [
95,
110,
136
93
],
"shape": 3,
"slot_index": 0
}
],
"properties": {},
"widgets_values": [
4,
"right",
false,
null
]
},
{
"name": "width",
"type": "INT",
"links": [
88
"id": 58,
"type": "MimicMotionDecode",
"pos": [
1466,
396
],
"shape": 3,
"slot_index": 1
"size": [
255.46680297851572,
78
],
"flags": {},
"order": 10,
"mode": 0,
"inputs": [
{
"name": "mimic_pipeline",
"type": "MIMICPIPE",
"link": 150,
"slot_index": 0
},
{
"name": "height",
"type": "INT",
"name": "samples",
"type": "LATENT",
"link": 149
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
89
151
],
"shape": 3,
"slot_index": 2
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "ImageResizeKJ"
"Node name for S&R": "MimicMotionDecode"
},
"widgets_values": [
576,
1024,
"lanczos",
true,
64,
0,
0
4
]
},
{
"id": 1,
"id": 57,
"type": "MimicMotionSampler",
"pos": [
1101,
419
],
"size": {
"0": 307.6666259765625,
"0": 315,
"1": 314
},
"flags": {},
......@@ -648,27 +532,25 @@
{
"name": "mimic_pipeline",
"type": "MIMICPIPE",
"link": 1,
"slot_index": 0
"link": 146
},
{
"name": "ref_image",
"type": "IMAGE",
"link": 136,
"slot_index": 1
"link": 147
},
{
"name": "pose_images",
"type": "IMAGE",
"link": 92
"link": 148
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"name": "samples",
"type": "LATENT",
"links": [
139
149
],
"shape": 3,
"slot_index": 0
......@@ -687,19 +569,173 @@
0,
16,
6,
true
false
]
},
{
"id": 37,
"type": "VHS_VideoCombine",
"pos": [
678,
897
],
"size": [
440,
978.6666666666666
],
"flags": {},
"order": 8,
"mode": 0,
"inputs": [
{
"name": "images",
"type": "IMAGE",
"link": 114
},
{
"name": "audio",
"type": "VHS_AUDIO",
"link": null
},
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
}
],
"outputs": [
{
"name": "Filenames",
"type": "VHS_FILENAMES",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_VideoCombine"
},
"widgets_values": {
"frame_rate": 8,
"loop_count": 0,
"filename_prefix": "MimicPose",
"format": "image/webp",
"pingpong": false,
"save_output": false,
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"filename": "MimicPose_00001.webp",
"subfolder": "",
"type": "temp",
"format": "image/webp"
}
}
}
},
{
"id": 51,
"type": "Note",
"pos": [
770,
85
],
"size": [
310.11510095517497,
95.53232006987258
],
"flags": {},
"order": 2,
"mode": 0,
"properties": {
"text": ""
},
"widgets_values": [
"Downloads MimicMotion model and fp16 version of SVD XT 1.1\n\nlcm version is experimental and most likely doesn't work well"
],
"color": "#432",
"bgcolor": "#653"
},
{
"id": 5,
"type": "VHS_LoadVideo",
"pos": [
-402,
787
],
"size": [
235.1999969482422,
658.5777723524305
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
[
1,
2,
0,
1,
0,
"MIMICPIPE"
86
],
"shape": 3,
"slot_index": 0
},
{
"name": "frame_count",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "audio",
"type": "VHS_AUDIO",
"links": null,
"shape": 3
},
{
"name": "video_info",
"type": "VHS_VIDEOINFO",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_LoadVideo"
},
"widgets_values": {
"video": "pose1.mp4",
"force_rate": 0,
"force_size": "Disabled",
"custom_width": 512,
"custom_height": 512,
"frame_load_cap": 15,
"skip_first_frames": 0,
"select_every_nth": 2,
"choose video to upload": "image",
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"frame_load_cap": 15,
"skip_first_frames": 0,
"force_rate": 0,
"filename": "pose1.mp4",
"type": "input",
"format": "video/mp4",
"select_every_nth": 2
}
}
}
}
],
"links": [
[
61,
3,
......@@ -732,14 +768,6 @@
3,
"INT"
],
[
92,
9,
0,
1,
2,
"IMAGE"
],
[
93,
17,
......@@ -788,14 +816,6 @@
0,
"IMAGE"
],
[
136,
28,
0,
1,
1,
"IMAGE"
],
[
137,
35,
......@@ -813,8 +833,48 @@
"IMAGE"
],
[
139,
146,
2,
0,
57,
0,
"MIMICPIPE"
],
[
147,
28,
0,
57,
1,
"IMAGE"
],
[
148,
9,
0,
57,
2,
"IMAGE"
],
[
149,
57,
0,
58,
1,
"LATENT"
],
[
150,
2,
0,
58,
0,
"MIMICPIPE"
],
[
151,
58,
0,
17,
3,
......@@ -825,10 +885,10 @@
"config": {},
"extra": {
"ds": {
"scale": 0.6934334949441352,
"scale": 0.5644739300537777,
"offset": {
"0": 466.36474609375,
"1": 44.73270034790039
"0": 763.3873291015625,
"1": 37.92726135253906
}
}
},
......
......@@ -259,6 +259,7 @@ class MimicMotionPipeline(DiffusionPipeline):
accepts_num_frames = "num_frames" in set(inspect.signature(forward_vae_fn).parameters.keys())
# decode decode_chunk_size frames at a time to avoid OOM
pbar = ProgressBar(latents.shape[0])
frames = []
for i in range(0, latents.shape[0], decode_chunk_size):
num_frames_in = latents[i: i + decode_chunk_size].shape[0]
......@@ -272,6 +273,7 @@ class MimicMotionPipeline(DiffusionPipeline):
self.vae.to(offload_device)
frames.append(frame.cpu())
pbar.update(decode_chunk_size)
frames = torch.cat(frames, dim=0)
# [batch*frames, channels, height, width] -> [batch, channels, frames, height, width]
......@@ -485,6 +487,7 @@ class MimicMotionPipeline(DiffusionPipeline):
width = width or self.unet.config.sample_size * self.vae_scale_factor
num_frames = num_frames if num_frames is not None else self.unet.config.num_frames
print("num_frames: ", num_frames)
decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames
# 1. Check inputs. Raise error if not correct
......
......@@ -7,15 +7,13 @@ import folder_paths
import comfy.model_management as mm
import comfy.utils
from comfy.clip_vision import clip_preprocess
from diffusers.models import AutoencoderKLTemporalDecoder
from diffusers.schedulers import EulerDiscreteScheduler
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
script_directory = os.path.dirname(os.path.abspath(__file__))
from .mimicmotion.pipelines.pipeline_mimicmotion import MimicMotionPipeline
from .mimicmotion.pipelines.pipeline_mimicmotion import MimicMotionPipeline, tensor2vid
from .mimicmotion.modules.unet import UNetSpatioTemporalConditionModel
from .mimicmotion.modules.pose_net import PoseNet
......@@ -140,7 +138,6 @@ class DownloadAndLoadMimicMotionModel:
pipeline.pose_net.to(dtype)
pipeline.vae.to(dtype)
pipeline.image_encoder.to(dtype)
pipeline.pose_net.to(dtype)
mimic_model = {
'pipeline': pipeline,
......@@ -168,8 +165,8 @@ class MimicMotionSampler:
},
}
RETURN_TYPES = ("IMAGE",)
RETURN_NAMES = ("images",)
RETURN_TYPES = ("LATENT",)
RETURN_NAMES = ("samples",)
FUNCTION = "process"
CATEGORY = "MimicMotionWrapper"
......@@ -215,18 +212,47 @@ class MimicMotionSampler:
min_guidance_scale=cfg_min,
max_guidance_scale=cfg_max,
decode_chunk_size=4,
output_type="pt",
output_type="latent",
device=device
).frames
frames = frames.squeeze(0)[1:].permute(0, 2, 3, 1).cpu().float()
#frames = frames.squeeze(0)[1:].permute(0, 2, 3, 1).cpu().float()
if not keep_model_loaded:
pipeline.unet.to(offload_device)
pipeline.vae.to(offload_device)
mm.soft_empty_cache()
gc.collect()
return {"samples": frames},
class MimicMotionDecode:
    """Node that decodes the sampler's ``LATENT`` output into an ``IMAGE`` batch.

    Decoding is kept separate from sampling so the chunk size used for
    decoding can be chosen independently on this node.
    """

    @classmethod
    def INPUT_TYPES(s):
        """Declare the node's required inputs and the chunk-size widget limits."""
        return {"required": {
            "mimic_pipeline": ("MIMICPIPE",),
            "samples": ("LATENT",),
            "decode_chunk_size": ("INT", {"default": 4, "min": 1, "max": 200, "step": 1})
            },
        }

    RETURN_TYPES = ("IMAGE",)
    RETURN_NAMES = ("images",)
    FUNCTION = "process"
    CATEGORY = "MimicMotionWrapper"

    def process(self, mimic_pipeline, samples, decode_chunk_size):
        """Decode ``samples['samples']`` with the pipeline and return images.

        Args:
            mimic_pipeline: Dict holding the loaded pipeline under ``'pipeline'``.
            samples: Dict holding the latent tensor under ``'samples'``.
            decode_chunk_size: Number of frames passed to the decoder per chunk.

        Returns:
            A 1-tuple containing the decoded frames as a float CPU tensor.

        Raises:
            Exception: Whatever ``decode_latents`` raised, when decoding fails
                and ``decode_chunk_size`` is already 1 (no smaller chunk to
                fall back to), or when the single-frame retry fails as well.
        """
        mm.soft_empty_cache()

        pipeline = mimic_pipeline['pipeline']
        latents = samples['samples']
        num_frames = latents.shape[0]
        try:
            frames = pipeline.decode_latents(latents, num_frames, decode_chunk_size)
        except Exception:
            # A bare ``except:`` would also trap KeyboardInterrupt/SystemExit.
            # NOTE(review): the fallback is presumably meant for out-of-memory
            # failures; narrowing to the OOM exception type would be stricter.
            if decode_chunk_size <= 1:
                # Retrying with the same chunk size cannot succeed differently.
                raise
            frames = pipeline.decode_latents(latents, num_frames, 1)
        frames = tensor2vid(frames, pipeline.image_processor, output_type="pt")

        # Drop the first entry along dim 0 and move dim 1 to the end.
        # NOTE(review): presumably entry 0 is the reference frame and the
        # result is channels-last -- confirm against the pipeline output.
        frames = frames.squeeze(1)[1:].permute(0, 2, 3, 1).cpu().float()

        return frames,
class MimicMotionGetPoses:
......@@ -251,6 +277,8 @@ class MimicMotionGetPoses:
from .mimicmotion.dwpose.util import draw_pose
from .mimicmotion.dwpose.dwpose_detector import DWposeDetector
assert ref_image.shape[1:3] == pose_images.shape[1:3], "ref_image and pose_images must have the same resolution"
yolo_model = "yolox_l.onnx"
dw_pose_model = "dw-ll_ucoco_384.onnx"
model_base_path = os.path.join(script_directory, "models", "DWPose")
......@@ -331,11 +359,13 @@ class MimicMotionGetPoses:
NODE_CLASS_MAPPINGS = {
"DownloadAndLoadMimicMotionModel": DownloadAndLoadMimicMotionModel,
"MimicMotionSampler": MimicMotionSampler,
"MimicMotionGetPoses": MimicMotionGetPoses
"MimicMotionGetPoses": MimicMotionGetPoses,
"MimicMotionDecode": MimicMotionDecode
}
NODE_DISPLAY_NAME_MAPPINGS = {
"DownloadAndLoadMimicMotionModel": "DownloadAndLoadMimicMotionModel",
"MimicMotionSampler": "MimicMotionSampler",
"MimicMotionGetPoses": "MimicMotionGetPoses"
"MimicMotionGetPoses": "MimicMotionGetPoses",
"MimicMotionDecode": "MimicMotionDecode"
}
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论