提交 b546cfc3 authored 作者: kijai's avatar kijai

bigger update

上级 4f5929a8
{ {
"last_node_id": 24, "last_node_id": 51,
"last_link_id": 54, "last_link_id": 128,
"nodes": [ "nodes": [
{ {
"id": 9, "id": 42,
"type": "GetImageSizeAndCount", "type": "MimicMotionGetPoses",
"pos": [ "pos": [
841, 327,
663 702
], ],
"size": { "size": {
"0": 210, "0": 330,
"1": 86 "1": 126
}, },
"flags": {}, "flags": {},
"order": 7, "order": 9,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
"name": "image", "name": "ref_image",
"type": "IMAGE",
"link": 110
},
{
"name": "pose_images",
"type": "IMAGE", "type": "IMAGE",
"link": 53 "link": 111
} }
], ],
"outputs": [ "outputs": [
{ {
"name": "image", "name": "poses_with_ref",
"type": "IMAGE", "type": "IMAGE",
"links": [ "links": [
8 112,
114
], ],
"shape": 3, "shape": 3,
"slot_index": 0 "slot_index": 0
}, },
{ {
"name": "512 width", "name": "pose_images",
"type": "INT", "type": "IMAGE",
"links": null,
"shape": 3
},
{
"name": "768 height",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "32 count",
"type": "INT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "GetImageSizeAndCount"
}
},
{
"id": 2,
"type": "DownloadAndLoadMimicMotionModel",
"pos": [
352,
352
],
"size": {
"0": 315,
"1": 82
},
"flags": {},
"order": 0,
"mode": 0,
"outputs": [
{
"name": "mimic_pipeline",
"type": "MIMICPIPE",
"links": [ "links": [
1 113
], ],
"shape": 3 "shape": 3,
"slot_index": 1
} }
], ],
"properties": { "properties": {
"Node name for S&R": "DownloadAndLoadMimicMotionModel" "Node name for S&R": "MimicMotionGetPoses"
}, },
"widgets_values": [ "widgets_values": [
"MimicMotion-fp16.safetensors", true,
"fp16" true,
true
] ]
}, },
{ {
"id": 12, "id": 5,
"type": "PrepImageForClipVision", "type": "VHS_LoadVideo",
"pos": [ "pos": [
359, -402,
498 787
],
"size": [
235.1999969482422,
658.5777723524305
], ],
"size": {
"0": 315,
"1": 106
},
"flags": {}, "flags": {},
"order": 3, "order": 0,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
"name": "image", "name": "meta_batch",
"type": "IMAGE", "type": "VHS_BatchManager",
"link": 12 "link": null
} }
], ],
"outputs": [ "outputs": [
...@@ -114,282 +84,257 @@ ...@@ -114,282 +84,257 @@
"name": "IMAGE", "name": "IMAGE",
"type": "IMAGE", "type": "IMAGE",
"links": [ "links": [
13 86
], ],
"shape": 3, "shape": 3,
"slot_index": 0 "slot_index": 0
}
],
"properties": {
"Node name for S&R": "PrepImageForClipVision"
},
"widgets_values": [
"LANCZOS",
"center",
0
]
}, },
{ {
"id": 3, "name": "frame_count",
"type": "LoadImage", "type": "INT",
"pos": [ "links": null,
-8, "shape": 3
449
],
"size": {
"0": 315,
"1": 314.0000305175781
}, },
"flags": {},
"order": 1,
"mode": 0,
"outputs": [
{ {
"name": "IMAGE", "name": "audio",
"type": "IMAGE", "type": "VHS_AUDIO",
"links": [ "links": null,
12, "shape": 3
36
],
"shape": 3,
"slot_index": 0
}, },
{ {
"name": "MASK", "name": "video_info",
"type": "MASK", "type": "VHS_VIDEOINFO",
"links": null, "links": null,
"shape": 3 "shape": 3
} }
], ],
"properties": { "properties": {
"Node name for S&R": "LoadImage" "Node name for S&R": "VHS_LoadVideo"
}, },
"widgets_values": [ "widgets_values": {
"demo1 (1).jpg", "video": "pose1.mp4",
"image" "force_rate": 0,
] "force_size": "Disabled",
"custom_width": 512,
"custom_height": 512,
"frame_load_cap": 16,
"skip_first_frames": 0,
"select_every_nth": 3,
"choose video to upload": "image",
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"frame_load_cap": 16,
"skip_first_frames": 0,
"force_rate": 0,
"filename": "pose1.mp4",
"type": "input",
"format": "video/mp4",
"select_every_nth": 3
}
}
}
}, },
{ {
"id": 17, "id": 35,
"type": "ImageConcatMulti", "type": "ImageResizeKJ",
"pos": [ "pos": [
1752, -75,
883 781
], ],
"size": { "size": {
"0": 210, "0": 315,
"1": 170 "1": 242
}, },
"flags": {}, "flags": {},
"order": 11, "order": 7,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
"name": "image_1", "name": "image",
"type": "IMAGE", "type": "IMAGE",
"link": 49 "link": 86
}, },
{ {
"name": "image_2", "name": "get_image_size",
"type": "IMAGE", "type": "IMAGE",
"link": 54 "link": null
}, },
{ {
"name": "image_3", "name": "width_input",
"type": "IMAGE", "type": "INT",
"link": 51 "link": 88,
"widget": {
"name": "width_input"
}
},
{
"name": "height_input",
"type": "INT",
"link": 89,
"widget": {
"name": "height_input"
}
} }
], ],
"outputs": [ "outputs": [
{ {
"name": "images", "name": "IMAGE",
"type": "IMAGE", "type": "IMAGE",
"links": [ "links": [
35 111
], ],
"shape": 3, "shape": 3,
"slot_index": 0 "slot_index": 0
},
{
"name": "width",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "height",
"type": "INT",
"links": null,
"shape": 3
} }
], ],
"properties": {}, "properties": {
"Node name for S&R": "ImageResizeKJ"
},
"widgets_values": [ "widgets_values": [
3, 576,
"right", 1024,
"lanczos",
false, false,
null 64,
0,
0
] ]
}, },
{ {
"id": 19, "id": 9,
"type": "VHS_SplitImages", "type": "GetImageSizeAndCount",
"pos": [ "pos": [
1356, 826,
763 505
], ],
"size": { "size": {
"0": 315, "0": 210,
"1": 118 "1": 86
}, },
"flags": {}, "flags": {},
"order": 10, "order": 10,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
"name": "images", "name": "image",
"type": "IMAGE", "type": "IMAGE",
"link": 25 "link": 112
} }
], ],
"outputs": [ "outputs": [
{ {
"name": "IMAGE_A", "name": "image",
"type": "IMAGE", "type": "IMAGE",
"links": null, "links": [
"shape": 3 92
],
"shape": 3,
"slot_index": 0
}, },
{ {
"name": "A_count", "name": "576 width",
"type": "INT", "type": "INT",
"links": null, "links": null,
"shape": 3 "shape": 3
}, },
{ {
"name": "IMAGE_B", "name": "1024 height",
"type": "IMAGE", "type": "INT",
"links": [ "links": null,
51 "shape": 3
],
"shape": 3,
"slot_index": 2
}, },
{ {
"name": "B_count", "name": "17 count",
"type": "INT", "type": "INT",
"links": null, "links": null,
"shape": 3 "shape": 3
} }
], ],
"properties": { "properties": {
"Node name for S&R": "VHS_SplitImages" "Node name for S&R": "GetImageSizeAndCount"
},
"widgets_values": {
"split_index": 1
} }
}, },
{ {
"id": 1, "id": 37,
"type": "MimicMotionSampler", "type": "VHS_VideoCombine",
"pos": [ "pos": [
1165, 723,
426 819
],
"size": [
440,
978.6666666666666
], ],
"size": {
"0": 307.6666259765625,
"1": 290
},
"flags": {}, "flags": {},
"order": 9, "order": 11,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{
"name": "mimic_pipeline",
"type": "MIMICPIPE",
"link": 1,
"slot_index": 0
},
{
"name": "ref_image",
"type": "IMAGE",
"link": 13,
"slot_index": 1
},
{
"name": "pose_images",
"type": "IMAGE",
"link": 8
}
],
"outputs": [
{ {
"name": "images", "name": "images",
"type": "IMAGE", "type": "IMAGE",
"links": [ "link": 114
25
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "MimicMotionSampler"
},
"widgets_values": [
25,
2,
2,
123,
"fixed",
15,
0.02,
0.02
]
}, },
{ {
"id": 6, "name": "audio",
"type": "DWPreprocessor", "type": "VHS_AUDIO",
"pos": [ "link": null
385,
854
],
"size": {
"0": 315,
"1": 198
}, },
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{ {
"name": "image", "name": "meta_batch",
"type": "IMAGE", "type": "VHS_BatchManager",
"link": 5 "link": null
} }
], ],
"outputs": [ "outputs": [
{ {
"name": "IMAGE", "name": "Filenames",
"type": "IMAGE", "type": "VHS_FILENAMES",
"links": [
42
],
"shape": 3,
"slot_index": 0
},
{
"name": "POSE_KEYPOINT",
"type": "POSE_KEYPOINT",
"links": null, "links": null,
"shape": 3 "shape": 3
} }
], ],
"properties": { "properties": {
"Node name for S&R": "DWPreprocessor" "Node name for S&R": "VHS_VideoCombine"
}, },
"widgets_values": [ "widgets_values": {
"enable", "frame_rate": 8,
"enable", "loop_count": 0,
"enable", "filename_prefix": "MimicPose",
512, "format": "image/webp",
"yolox_l.torchscript.pt", "pingpong": false,
"dw-ll_ucoco_384_bs5.torchscript.pt" "save_output": false,
] "videopreview": {
"hidden": false,
"paused": false,
"params": {
"filename": "MimicPose_00001.webp",
"subfolder": "",
"type": "temp",
"format": "image/webp"
}
}
}
}, },
{ {
"id": 23, "id": 28,
"type": "ImageResizeKJ", "type": "ImageResizeKJ",
"pos": [ "pos": [
784, -71,
976 481
], ],
"size": { "size": {
"0": 315, "0": 315,
...@@ -402,7 +347,7 @@ ...@@ -402,7 +347,7 @@
{ {
"name": "image", "name": "image",
"type": "IMAGE", "type": "IMAGE",
"link": 42 "link": 61
}, },
{ {
"name": "get_image_size", "name": "get_image_size",
...@@ -431,8 +376,8 @@ ...@@ -431,8 +376,8 @@
"name": "IMAGE", "name": "IMAGE",
"type": "IMAGE", "type": "IMAGE",
"links": [ "links": [
44, 95,
53 110
], ],
"shape": 3, "shape": 3,
"slot_index": 0 "slot_index": 0
...@@ -440,184 +385,207 @@ ...@@ -440,184 +385,207 @@
{ {
"name": "width", "name": "width",
"type": "INT", "type": "INT",
"links": null, "links": [
"shape": 3 88
],
"shape": 3,
"slot_index": 1
}, },
{ {
"name": "height", "name": "height",
"type": "INT", "type": "INT",
"links": null, "links": [
"shape": 3 89
],
"shape": 3,
"slot_index": 2
} }
], ],
"properties": { "properties": {
"Node name for S&R": "ImageResizeKJ" "Node name for S&R": "ImageResizeKJ"
}, },
"widgets_values": [ "widgets_values": [
512, 576,
768, 1024,
"nearest-exact", "lanczos",
false, false,
8, 64,
0, 0,
0 0
] ]
}, },
{ {
"id": 21, "id": 49,
"type": "ImageResizeKJ", "type": "PreviewImage",
"pos": [ "pos": [
1212, 290,
1157 251
],
"size": [
210,
246
], ],
"size": {
"0": 315,
"1": 242
},
"flags": {}, "flags": {},
"order": 8, "order": 8,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
"name": "image", "name": "images",
"type": "IMAGE",
"link": 36
},
{
"name": "get_image_size",
"type": "IMAGE", "type": "IMAGE",
"link": 37 "link": 125
}, }
{ ],
"name": "width_input", "properties": {
"type": "INT", "Node name for S&R": "PreviewImage"
"link": null,
"widget": {
"name": "width_input"
} }
}, },
{ {
"name": "height_input", "id": 2,
"type": "INT", "type": "DownloadAndLoadMimicMotionModel",
"link": null, "pos": [
"widget": { 764,
"name": "height_input" 229
}
}
], ],
"size": {
"0": 315,
"1": 82
},
"flags": {},
"order": 1,
"mode": 0,
"outputs": [ "outputs": [
{ {
"name": "IMAGE", "name": "mimic_pipeline",
"type": "IMAGE", "type": "MIMICPIPE",
"links": [ "links": [
49 1
], ],
"shape": 3, "shape": 3
"slot_index": 0 }
],
"properties": {
"Node name for S&R": "DownloadAndLoadMimicMotionModel"
},
"widgets_values": [
"MimicMotion-fp16.safetensors",
"fp16"
]
}, },
{ {
"name": "width", "id": 50,
"type": "INT", "type": "Note",
"links": null, "pos": [
"shape": 3 281,
104
],
"size": [
293.73750640869093,
101.2688590393065
],
"flags": {},
"order": 2,
"mode": 0,
"properties": {
"text": ""
},
"widgets_values": [
"ref_image needs to be 224x224 for clip_vision\ndefault seems to be just resizing, you can try other methods like cropping as well"
],
"color": "#432",
"bgcolor": "#653"
}, },
{ {
"name": "height", "id": 51,
"type": "INT", "type": "Note",
"links": null, "pos": [
"shape": 3 779,
} 105
],
"size": [
290.12339111328083,
69.71562744140618
], ],
"flags": {},
"order": 3,
"mode": 0,
"properties": { "properties": {
"Node name for S&R": "ImageResizeKJ" "text": ""
}, },
"widgets_values": [ "widgets_values": [
512, "Downloads MimicMotion model and fp16 version of SVD XT 1.1"
512, ],
"nearest-exact", "color": "#432",
false, "bgcolor": "#653"
2,
0,
0
]
}, },
{ {
"id": 20, "id": 17,
"type": "VHS_SplitImages", "type": "ImageConcatMulti",
"pos": [ "pos": [
1181, 1186,
974 774
], ],
"size": { "size": {
"0": 315, "0": 210,
"1": 118 "1": 170
}, },
"flags": {}, "flags": {},
"order": 6, "order": 13,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
"name": "images", "name": "image_1",
"type": "IMAGE",
"link": 95
},
{
"name": "image_2",
"type": "IMAGE",
"link": 113
},
{
"name": "image_3",
"type": "IMAGE", "type": "IMAGE",
"link": 44 "link": 99
} }
], ],
"outputs": [ "outputs": [
{ {
"name": "IMAGE_A", "name": "images",
"type": "IMAGE",
"links": null,
"shape": 3
},
{
"name": "A_count",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "IMAGE_B",
"type": "IMAGE", "type": "IMAGE",
"links": [ "links": [
37, 93
54
], ],
"shape": 3, "shape": 3,
"slot_index": 2 "slot_index": 0
},
{
"name": "B_count",
"type": "INT",
"links": null,
"shape": 3
} }
], ],
"properties": { "properties": {},
"Node name for S&R": "VHS_SplitImages" "widgets_values": [
}, 3,
"widgets_values": { "right",
"split_index": 1 false,
} null
]
}, },
{ {
"id": 16, "id": 16,
"type": "VHS_VideoCombine", "type": "VHS_VideoCombine",
"pos": [ "pos": [
2010, 1452,
480 196
], ],
"size": [ "size": [
1004.800048828125, 1530.494967759278,
796.4000244140625 1199.1081290425352
], ],
"flags": {}, "flags": {},
"order": 12, "order": 14,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
"name": "images", "name": "images",
"type": "IMAGE", "type": "IMAGE",
"link": 35 "link": 93
}, },
{ {
"name": "audio", "name": "audio",
...@@ -644,102 +612,203 @@ ...@@ -644,102 +612,203 @@
"widgets_values": { "widgets_values": {
"frame_rate": 8, "frame_rate": 8,
"loop_count": 0, "loop_count": 0,
"filename_prefix": "AnimateDiff", "filename_prefix": "MimicMotion",
"format": "video/h264-mp4", "format": "video/h264-mp4",
"pix_fmt": "yuv420p", "pix_fmt": "yuv420p",
"crf": 19, "crf": 19,
"save_metadata": true, "save_metadata": true,
"pingpong": false, "pingpong": false,
"save_output": true, "save_output": false,
"videopreview": { "videopreview": {
"hidden": false, "hidden": false,
"paused": false, "paused": false,
"params": { "params": {
"filename": "AnimateDiff_00027.mp4", "filename": "MimicMotion_00001.mp4",
"subfolder": "", "subfolder": "",
"type": "output", "type": "temp",
"format": "video/h264-mp4" "format": "video/h264-mp4"
} }
} }
} }
}, },
{ {
"id": 5, "id": 1,
"type": "VHS_LoadVideo", "type": "MimicMotionSampler",
"pos": [ "pos": [
25, 1101,
851 419
],
"size": [
235.1999969482422,
658.5777723524305
], ],
"size": {
"0": 307.6666259765625,
"1": 290
},
"flags": {}, "flags": {},
"order": 2, "order": 12,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
"name": "meta_batch", "name": "mimic_pipeline",
"type": "VHS_BatchManager", "type": "MIMICPIPE",
"link": null "link": 1,
"slot_index": 0
},
{
"name": "ref_image",
"type": "IMAGE",
"link": 128,
"slot_index": 1
},
{
"name": "pose_images",
"type": "IMAGE",
"link": 92
} }
], ],
"outputs": [ "outputs": [
{ {
"name": "IMAGE", "name": "images",
"type": "IMAGE", "type": "IMAGE",
"links": [ "links": [
5 99
], ],
"shape": 3, "shape": 3,
"slot_index": 0 "slot_index": 0
}
],
"properties": {
"Node name for S&R": "MimicMotionSampler"
},
"widgets_values": [
25,
2,
2,
123,
"fixed",
15,
0.02,
true
]
}, },
{ {
"name": "frame_count", "id": 3,
"type": "INT", "type": "LoadImage",
"links": null, "pos": [
"shape": 3 -393,
311
],
"size": {
"0": 213.0849151611328,
"1": 410.70074462890625
}, },
"flags": {},
"order": 4,
"mode": 0,
"outputs": [
{ {
"name": "audio", "name": "IMAGE",
"type": "VHS_AUDIO", "type": "IMAGE",
"links": null, "links": [
"shape": 3 61,
124
],
"shape": 3,
"slot_index": 0
}, },
{ {
"name": "video_info", "name": "MASK",
"type": "VHS_VIDEOINFO", "type": "MASK",
"links": null, "links": null,
"shape": 3 "shape": 3
} }
], ],
"properties": { "properties": {
"Node name for S&R": "VHS_LoadVideo" "Node name for S&R": "LoadImage"
}, },
"widgets_values": { "widgets_values": [
"video": "pose1.mp4", "demo1.jpg",
"force_rate": 0, "image"
"force_size": "Disabled", ]
"custom_width": 512, },
"custom_height": 512, {
"frame_load_cap": 16, "id": 48,
"skip_first_frames": 0, "type": "ImageResizeKJ",
"select_every_nth": 2, "pos": [
"choose video to upload": "image", -57,
"videopreview": { 170
"hidden": false, ],
"paused": false, "size": [
"params": { 315,
"frame_load_cap": 16, 242
"skip_first_frames": 0, ],
"force_rate": 0, "flags": {},
"filename": "pose1.mp4", "order": 6,
"type": "input", "mode": 0,
"format": "video/mp4", "inputs": [
"select_every_nth": 2 {
"name": "image",
"type": "IMAGE",
"link": 124
},
{
"name": "get_image_size",
"type": "IMAGE",
"link": null
},
{
"name": "width_input",
"type": "INT",
"link": null,
"widget": {
"name": "width_input"
}
},
{
"name": "height_input",
"type": "INT",
"link": null,
"widget": {
"name": "height_input"
} }
} }
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
125,
128
],
"shape": 3,
"slot_index": 0
},
{
"name": "width",
"type": "INT",
"links": [],
"shape": 3,
"slot_index": 1
},
{
"name": "height",
"type": "INT",
"links": [],
"shape": 3,
"slot_index": 2
} }
],
"properties": {
"Node name for S&R": "ImageResizeKJ"
},
"widgets_values": [
224,
224,
"lanczos",
false,
64,
0,
0
]
} }
], ],
"links": [ "links": [
...@@ -752,114 +821,130 @@ ...@@ -752,114 +821,130 @@
"MIMICPIPE" "MIMICPIPE"
], ],
[ [
5, 61,
5, 3,
0, 0,
6, 28,
0, 0,
"IMAGE" "IMAGE"
], ],
[ [
8, 86,
9, 5,
0,
35,
0, 0,
"IMAGE"
],
[
88,
28,
1, 1,
35,
2, 2,
"IMAGE" "INT"
], ],
[ [
12, 89,
28,
2,
35,
3, 3,
0, "INT"
12,
0,
"IMAGE"
], ],
[ [
13, 92,
12, 9,
0, 0,
1, 1,
1, 2,
"IMAGE" "IMAGE"
], ],
[ [
25, 93,
1, 17,
0, 0,
19, 16,
0, 0,
"IMAGE" "IMAGE"
], ],
[ [
35, 95,
28,
0,
17, 17,
0, 0,
16, "IMAGE"
],
[
99,
1,
0, 0,
17,
2,
"IMAGE" "IMAGE"
], ],
[ [
36, 110,
3, 28,
0, 0,
21, 42,
0, 0,
"IMAGE" "IMAGE"
], ],
[ [
37, 111,
20, 35,
2, 0,
21, 42,
1, 1,
"IMAGE" "IMAGE"
], ],
[ [
112,
42, 42,
6,
0, 0,
23, 9,
0, 0,
"IMAGE" "IMAGE"
], ],
[ [
44, 113,
23, 42,
0, 1,
20, 17,
0, 1,
"IMAGE" "IMAGE"
], ],
[ [
49, 114,
21, 42,
0, 0,
17, 37,
0, 0,
"IMAGE" "IMAGE"
], ],
[ [
51, 124,
19, 3,
2, 0,
17, 48,
2, 0,
"IMAGE" "IMAGE"
], ],
[ [
53, 125,
23, 48,
0, 0,
9, 49,
0, 0,
"IMAGE" "IMAGE"
], ],
[ [
54, 128,
20, 48,
2, 0,
17, 1,
1, 1,
"IMAGE" "IMAGE"
] ]
...@@ -868,10 +953,10 @@ ...@@ -868,10 +953,10 @@
"config": {}, "config": {},
"extra": { "extra": {
"ds": { "ds": {
"scale": 0.7513148009015777, "scale": 0.5644739300537774,
"offset": { "offset": {
"0": 52.74061584472656, "0": 738.7613525390625,
"1": -197.62571716308594 "1": 31.221477508544922
} }
} }
}, },
......
...@@ -20,6 +20,8 @@ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection ...@@ -20,6 +20,8 @@ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
from ..modules.pose_net import PoseNet from ..modules.pose_net import PoseNet
from comfy.utils import ProgressBar from comfy.utils import ProgressBar
import comfy.model_management as mm
offload_device = mm.unet_offload_device()
logger = logging.get_logger(__name__) # pylint: disable=invalid-name logger = logging.get_logger(__name__) # pylint: disable=invalid-name
...@@ -145,8 +147,10 @@ class MimicMotionPipeline(DiffusionPipeline): ...@@ -145,8 +147,10 @@ class MimicMotionPipeline(DiffusionPipeline):
).pixel_values ).pixel_values
image = image.to(device=device, dtype=dtype) image = image.to(device=device, dtype=dtype)
self.image_encoder.to(device)
image_embeddings = self.image_encoder(image).image_embeds image_embeddings = self.image_encoder(image).image_embeds
image_embeddings = image_embeddings.unsqueeze(1) image_embeddings = image_embeddings.unsqueeze(1)
self.image_encoder.to(offload_device)
# duplicate image embeddings for each generation per prompt, using mps friendly method # duplicate image embeddings for each generation per prompt, using mps friendly method
bs_embed, seq_len, _ = image_embeddings.shape bs_embed, seq_len, _ = image_embeddings.shape
...@@ -189,7 +193,9 @@ class MimicMotionPipeline(DiffusionPipeline): ...@@ -189,7 +193,9 @@ class MimicMotionPipeline(DiffusionPipeline):
do_classifier_free_guidance: bool, do_classifier_free_guidance: bool,
): ):
image = image.to(device=device) image = image.to(device=device)
self.vae.to(device)
image_latents = self.vae.encode(image).latent_dist.mode() image_latents = self.vae.encode(image).latent_dist.mode()
self.vae.to(offload_device)
if do_classifier_free_guidance: if do_classifier_free_guidance:
negative_image_latents = torch.zeros_like(image_latents) negative_image_latents = torch.zeros_like(image_latents)
...@@ -256,7 +262,10 @@ class MimicMotionPipeline(DiffusionPipeline): ...@@ -256,7 +262,10 @@ class MimicMotionPipeline(DiffusionPipeline):
# we only pass num_frames_in if it's expected # we only pass num_frames_in if it's expected
decode_kwargs["num_frames"] = num_frames_in decode_kwargs["num_frames"] = num_frames_in
self.vae.to(latents.device)
frame = self.vae.decode(latents[i: i + decode_chunk_size], **decode_kwargs).sample frame = self.vae.decode(latents[i: i + decode_chunk_size], **decode_kwargs).sample
self.vae.to(offload_device)
frames.append(frame.cpu()) frames.append(frame.cpu())
frames = torch.cat(frames, dim=0) frames = torch.cat(frames, dim=0)
...@@ -568,6 +577,8 @@ class MimicMotionPipeline(DiffusionPipeline): ...@@ -568,6 +577,8 @@ class MimicMotionPipeline(DiffusionPipeline):
self._guidance_scale = guidance_scale self._guidance_scale = guidance_scale
# 8. Denoising loop # 8. Denoising loop
self.unet.to(device)
self._num_timesteps = len(timesteps) self._num_timesteps = len(timesteps)
pose_latents = einops.rearrange(pose_latents, '(b f) c h w -> b f c h w', f=num_frames) pose_latents = einops.rearrange(pose_latents, '(b f) c h w -> b f c h w', f=num_frames)
indices = [[0, *range(i + 1, min(i + tile_size, num_frames))] for i in indices = [[0, *range(i + 1, min(i + tile_size, num_frames))] for i in
...@@ -628,6 +639,8 @@ class MimicMotionPipeline(DiffusionPipeline): ...@@ -628,6 +639,8 @@ class MimicMotionPipeline(DiffusionPipeline):
latents = callback_outputs.pop("latents", latents) latents = callback_outputs.pop("latents", latents)
self.unet.to(offload_device)
if not output_type == "latent": if not output_type == "latent":
# cast back to fp16 if needed # cast back to fp16 if needed
# if needs_upcasting: # if needs_upcasting:
......
import os import os
from omegaconf import OmegaConf
import torch import torch
import torch.nn.functional as F
import sys import sys
import numpy as np import numpy as np
import gc
script_directory = os.path.dirname(os.path.abspath(__file__))
sys.path.append(script_directory)
from einops import repeat
import folder_paths import folder_paths
import comfy.model_management as mm import comfy.model_management as mm
import comfy.utils import comfy.utils
from contextlib import nullcontext
try:
from accelerate import init_empty_weights
is_accelerate_available = True
except:
pass
from mimicmotion.pipelines.pipeline_mimicmotion import MimicMotionPipeline
from diffusers.models import AutoencoderKLTemporalDecoder from diffusers.models import AutoencoderKLTemporalDecoder
from diffusers.schedulers import EulerDiscreteScheduler from diffusers.schedulers import EulerDiscreteScheduler
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
script_directory = os.path.dirname(os.path.abspath(__file__))
sys.path.append(script_directory)
from mimicmotion.pipelines.pipeline_mimicmotion import MimicMotionPipeline
from mimicmotion.modules.unet import UNetSpatioTemporalConditionModel from mimicmotion.modules.unet import UNetSpatioTemporalConditionModel
from mimicmotion.modules.pose_net import PoseNet from mimicmotion.modules.pose_net import PoseNet
...@@ -65,6 +55,7 @@ class DownloadAndLoadMimicMotionModel: ...@@ -65,6 +55,7 @@ class DownloadAndLoadMimicMotionModel:
], { ], {
"default": 'fp16' "default": 'fp16'
}), }),
}, },
} }
...@@ -78,6 +69,8 @@ class DownloadAndLoadMimicMotionModel: ...@@ -78,6 +69,8 @@ class DownloadAndLoadMimicMotionModel:
mm.soft_empty_cache() mm.soft_empty_cache()
dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision] dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision]
pbar = comfy.utils.ProgressBar(3)
download_path = os.path.join(folder_paths.models_dir, "mimicmotion") download_path = os.path.join(folder_paths.models_dir, "mimicmotion")
model_path = os.path.join(download_path, model) model_path = os.path.join(download_path, model)
...@@ -89,13 +82,20 @@ class DownloadAndLoadMimicMotionModel: ...@@ -89,13 +82,20 @@ class DownloadAndLoadMimicMotionModel:
local_dir=download_path, local_dir=download_path,
local_dir_use_symlinks=False) local_dir_use_symlinks=False)
ckpt_base_name = os.path.basename(model_path)
print(f"Loading model from: {model_path}") print(f"Loading model from: {model_path}")
pbar.update(1)
svd_path = os.path.join(folder_paths.models_dir, "diffusers", "stable-video-diffusion-img2vid-xt-1-1") svd_path = os.path.join(folder_paths.models_dir, "diffusers", "stable-video-diffusion-img2vid-xt-1-1")
if not os.path.exists(svd_path): if not os.path.exists(svd_path):
raise ValueError(f"Please download stable-video-diffusion-img2vid-xt-1-1 to {svd_path}") #raise ValueError(f"Please download stable-video-diffusion-img2vid-xt-1-1 to {svd_path}")
print(f"Downloading SVD model to: {model_path}")
from huggingface_hub import snapshot_download
snapshot_download(repo_id="vdo/stable-video-diffusion-img2vid-xt-1-1",
allow_patterns=[f"*.json", "*fp16*"],
local_dir=svd_path,
local_dir_use_symlinks=False)
pbar.update(1)
mimicmotion_models = MimicMotionModel(svd_path).to(device=device).eval() mimicmotion_models = MimicMotionModel(svd_path).to(device=device).eval()
mimicmotion_models.load_state_dict(comfy.utils.load_torch_file(model_path), strict=False) mimicmotion_models.load_state_dict(comfy.utils.load_torch_file(model_path), strict=False)
...@@ -108,6 +108,7 @@ class DownloadAndLoadMimicMotionModel: ...@@ -108,6 +108,7 @@ class DownloadAndLoadMimicMotionModel:
feature_extractor=mimicmotion_models.feature_extractor, feature_extractor=mimicmotion_models.feature_extractor,
pose_net=mimicmotion_models.pose_net, pose_net=mimicmotion_models.pose_net,
) )
pipeline.unet.to(dtype) pipeline.unet.to(dtype)
pipeline.pose_net.to(dtype) pipeline.pose_net.to(dtype)
pipeline.vae.to(dtype) pipeline.vae.to(dtype)
...@@ -118,6 +119,7 @@ class DownloadAndLoadMimicMotionModel: ...@@ -118,6 +119,7 @@ class DownloadAndLoadMimicMotionModel:
'pipeline': pipeline, 'pipeline': pipeline,
'dtype': dtype 'dtype': dtype
} }
pbar.update(1)
return (mimic_model,) return (mimic_model,)
class MimicMotionSampler: class MimicMotionSampler:
...@@ -151,16 +153,26 @@ class MimicMotionSampler: ...@@ -151,16 +153,26 @@ class MimicMotionSampler:
pipeline = mimic_pipeline['pipeline'] pipeline = mimic_pipeline['pipeline']
B, H, W, C = pose_images.shape B, H, W, C = pose_images.shape
ref_image = ref_image.permute(0, 3, 1, 2).to(device).to(dtype)
pose_images = pose_images.permute(0, 3, 1, 2).to(device).to(dtype) ref_image = ref_image.permute(0, 3, 1, 2)
ref_image = ref_image * 2 - 1 pose_images = pose_images.permute(0, 3, 1, 2)
if ref_image.shape[1:3] != (224, 224):
ref_img = comfy.utils.common_upscale(ref_image, 224, 224, "lanczos", "disabled")
else:
ref_img = ref_image
ref_img = ref_img * 2 - 1
pose_images = pose_images * 2 - 1 pose_images = pose_images * 2 - 1
ref_img = ref_img.to(device).to(dtype)
pose_images = pose_images.to(device).to(dtype)
generator = torch.Generator(device=device) generator = torch.Generator(device=device)
generator.manual_seed(seed) generator.manual_seed(seed)
frames = pipeline( frames = pipeline(
ref_image, ref_img,
image_pose=pose_images, image_pose=pose_images,
num_frames=B, num_frames=B,
tile_size = 16, tile_size = 16,
...@@ -177,8 +189,14 @@ class MimicMotionSampler: ...@@ -177,8 +189,14 @@ class MimicMotionSampler:
output_type="pt", output_type="pt",
device=device device=device
).frames ).frames
frames = frames.squeeze(0).permute(0, 2, 3, 1).cpu().float() frames = frames.squeeze(0)[1:].permute(0, 2, 3, 1).cpu().float()
print(frames.shape)
if not keep_model_loaded:
pipeline.unet.to(offload_device)
pipeline.vae.to(offload_device)
mm.soft_empty_cache()
gc.collect()
return frames, return frames,
...@@ -194,8 +212,8 @@ class MimicMotionGetPoses: ...@@ -194,8 +212,8 @@ class MimicMotionGetPoses:
}, },
} }
RETURN_TYPES = ("IMAGE",) RETURN_TYPES = ("IMAGE", "IMAGE",)
RETURN_NAMES = ("images",) RETURN_NAMES = ("poses_with_ref", "pose_images")
FUNCTION = "process" FUNCTION = "process"
CATEGORY = "MimicMotionWrapper" CATEGORY = "MimicMotionWrapper"
...@@ -246,9 +264,11 @@ class MimicMotionGetPoses: ...@@ -246,9 +264,11 @@ class MimicMotionGetPoses:
pose_images_np = pose_images.cpu().numpy() * 255 pose_images_np = pose_images.cpu().numpy() * 255
# read input video # read input video
pbar = comfy.utils.ProgressBar(len(pose_images_np))
detected_poses_np_list = [] detected_poses_np_list = []
for img_np in pose_images_np: for img_np in pose_images_np:
detected_poses_np_list.append(dwprocessor(img_np)) detected_poses_np_list.append(dwprocessor(img_np))
pbar.update(1)
detected_bodies = np.stack( detected_bodies = np.stack(
[p['bodies']['candidate'] for p in detected_poses_np_list if p['bodies']['candidate'].shape[0] == 18])[:, [p['bodies']['candidate'] for p in detected_poses_np_list if p['bodies']['candidate'].shape[0] == 18])[:,
...@@ -277,10 +297,7 @@ class MimicMotionGetPoses: ...@@ -277,10 +297,7 @@ class MimicMotionGetPoses:
output_tensor = torch.cat((ref_pose_tensor.unsqueeze(0), output_tensor)) output_tensor = torch.cat((ref_pose_tensor.unsqueeze(0), output_tensor))
output_tensor = output_tensor.permute(0, 2, 3, 1).cpu().float() output_tensor = output_tensor.permute(0, 2, 3, 1).cpu().float()
return output_tensor, return output_tensor, output_tensor[1:]
NODE_CLASS_MAPPINGS = { NODE_CLASS_MAPPINGS = {
"DownloadAndLoadMimicMotionModel": DownloadAndLoadMimicMotionModel, "DownloadAndLoadMimicMotionModel": DownloadAndLoadMimicMotionModel,
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论