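Bigger update: auto-download the SVD XT 1.1 fp16 weights, offload models between pipeline stages, add progress bars, add a second pose output to MimicMotionGetPoses, and rework the example workflow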

b546cfc3 · kijai · 4f5929a8 · b546cfc3 · b546cfc3 · b546cfc3
--- a/examples/mimic_motion_example_01.json
+++ b/examples/mimic_motion_example_01.json
 {
-  "last_node_id": 24,
-  "last_link_id": 54,
+  "last_node_id": 51,
+  "last_link_id": 128,
  "nodes": [
    {
-      "id": 9,
-      "type": "GetImageSizeAndCount",
+      "id": 42,
+      "type": "MimicMotionGetPoses",
      "pos": [
-        841,
-        663
+        327,
+        702
      ],
      "size": {
-        "0": 210,
-        "1": 86
+        "0": 330,
+        "1": 126
      },
      "flags": {},
-      "order": 7,
+      "order": 9,
      "mode": 0,
      "inputs": [
        {
-          "name": "image",
+          "name": "ref_image",
          "type": "IMAGE",
-          "link": 53
+          "link": 110
+        },
+        {
+          "name": "pose_images",
+          "type": "IMAGE",
+          "link": 111
        }
      ],
      "outputs": [
        {
-          "name": "image",
+          "name": "poses_with_ref",
          "type": "IMAGE",
          "links": [
-            8
+            112,
+            114
          ],
          "shape": 3,
          "slot_index": 0
        },
        {
-          "name": "512 width",
-          "type": "INT",
-          "links": null,
-          "shape": 3
-        },
-        {
-          "name": "768 height",
-          "type": "INT",
-          "links": null,
-          "shape": 3
-        },
-        {
-          "name": "32 count",
-          "type": "INT",
-          "links": null,
-          "shape": 3
-        }
-      ],
-      "properties": {
-        "Node name for S&R": "GetImageSizeAndCount"
-      }
-    },
-    {
-      "id": 2,
-      "type": "DownloadAndLoadMimicMotionModel",
-      "pos": [
-        352,
-        352
-      ],
-      "size": {
-        "0": 315,
-        "1": 82
-      },
-      "flags": {},
-      "order": 0,
-      "mode": 0,
-      "outputs": [
-        {
-          "name": "mimic_pipeline",
-          "type": "MIMICPIPE",
+          "name": "pose_images",
+          "type": "IMAGE",
          "links": [
-            1
+            113
          ],
-          "shape": 3
+          "shape": 3,
+          "slot_index": 1
        }
      ],
      "properties": {
-        "Node name for S&R": "DownloadAndLoadMimicMotionModel"
+        "Node name for S&R": "MimicMotionGetPoses"
      },
      "widgets_values": [
-        "MimicMotion-fp16.safetensors",
-        "fp16"
+        true,
+        true,
+        true
      ]
    },
    {
-      "id": 12,
-      "type": "PrepImageForClipVision",
+      "id": 5,
+      "type": "VHS_LoadVideo",
      "pos": [
-        359,
-        498
+        -402,
+        787
+      ],
+      "size": [
+        235.1999969482422,
+        658.5777723524305
      ],
-      "size": {
-        "0": 315,
-        "1": 106
-      },
      "flags": {},
-      "order": 3,
+      "order": 0,
      "mode": 0,
      "inputs": [
        {
-          "name": "image",
-          "type": "IMAGE",
-          "link": 12
+          "name": "meta_batch",
+          "type": "VHS_BatchManager",
+          "link": null
        }
      ],
      "outputs": [
@@ -114,282 +84,257 @@
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
-            13
+            86
          ],
          "shape": 3,
          "slot_index": 0
-        }
-      ],
-      "properties": {
-        "Node name for S&R": "PrepImageForClipVision"
-      },
-      "widgets_values": [
-        "LANCZOS",
-        "center",
-        0
-      ]
-    },
-    {
-      "id": 3,
-      "type": "LoadImage",
-      "pos": [
-        -8,
-        449
-      ],
-      "size": {
-        "0": 315,
-        "1": 314.0000305175781
-      },
-      "flags": {},
-      "order": 1,
-      "mode": 0,
-      "outputs": [
+        },
        {
-          "name": "IMAGE",
-          "type": "IMAGE",
-          "links": [
-            12,
-            36
-          ],
-          "shape": 3,
-          "slot_index": 0
+          "name": "frame_count",
+          "type": "INT",
+          "links": null,
+          "shape": 3
        },
        {
-          "name": "MASK",
-          "type": "MASK",
+          "name": "audio",
+          "type": "VHS_AUDIO",
+          "links": null,
+          "shape": 3
+        },
+        {
+          "name": "video_info",
+          "type": "VHS_VIDEOINFO",
          "links": null,
          "shape": 3
        }
      ],
      "properties": {
-        "Node name for S&R": "LoadImage"
+        "Node name for S&R": "VHS_LoadVideo"
      },
-      "widgets_values": [
-        "demo1 (1).jpg",
-        "image"
-      ]
+      "widgets_values": {
+        "video": "pose1.mp4",
+        "force_rate": 0,
+        "force_size": "Disabled",
+        "custom_width": 512,
+        "custom_height": 512,
+        "frame_load_cap": 16,
+        "skip_first_frames": 0,
+        "select_every_nth": 3,
+        "choose video to upload": "image",
+        "videopreview": {
+          "hidden": false,
+          "paused": false,
+          "params": {
+            "frame_load_cap": 16,
+            "skip_first_frames": 0,
+            "force_rate": 0,
+            "filename": "pose1.mp4",
+            "type": "input",
+            "format": "video/mp4",
+            "select_every_nth": 3
+          }
+        }
+      }
    },
    {
-      "id": 17,
-      "type": "ImageConcatMulti",
+      "id": 35,
+      "type": "ImageResizeKJ",
      "pos": [
-        1752,
-        883
+        -75,
+        781
      ],
      "size": {
-        "0": 210,
-        "1": 170
+        "0": 315,
+        "1": 242
      },
      "flags": {},
-      "order": 11,
+      "order": 7,
      "mode": 0,
      "inputs": [
        {
-          "name": "image_1",
+          "name": "image",
          "type": "IMAGE",
-          "link": 49
+          "link": 86
        },
        {
-          "name": "image_2",
+          "name": "get_image_size",
          "type": "IMAGE",
-          "link": 54
+          "link": null
        },
        {
-          "name": "image_3",
-          "type": "IMAGE",
-          "link": 51
+          "name": "width_input",
+          "type": "INT",
+          "link": 88,
+          "widget": {
+            "name": "width_input"
+          }
+        },
+        {
+          "name": "height_input",
+          "type": "INT",
+          "link": 89,
+          "widget": {
+            "name": "height_input"
+          }
        }
      ],
      "outputs": [
        {
-          "name": "images",
+          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
-            35
+            111
          ],
          "shape": 3,
          "slot_index": 0
+        },
+        {
+          "name": "width",
+          "type": "INT",
+          "links": null,
+          "shape": 3
+        },
+        {
+          "name": "height",
+          "type": "INT",
+          "links": null,
+          "shape": 3
        }
      ],
-      "properties": {},
+      "properties": {
+        "Node name for S&R": "ImageResizeKJ"
+      },
      "widgets_values": [
-        3,
-        "right",
+        576,
+        1024,
+        "lanczos",
        false,
-        null
+        64,
+        0,
+        0
      ]
    },
    {
-      "id": 19,
-      "type": "VHS_SplitImages",
+      "id": 9,
+      "type": "GetImageSizeAndCount",
      "pos": [
-        1356,
-        763
+        826,
+        505
      ],
      "size": {
-        "0": 315,
-        "1": 118
+        "0": 210,
+        "1": 86
      },
      "flags": {},
      "order": 10,
      "mode": 0,
      "inputs": [
        {
-          "name": "images",
+          "name": "image",
          "type": "IMAGE",
-          "link": 25
+          "link": 112
        }
      ],
      "outputs": [
        {
-          "name": "IMAGE_A",
+          "name": "image",
          "type": "IMAGE",
-          "links": null,
-          "shape": 3
+          "links": [
+            92
+          ],
+          "shape": 3,
+          "slot_index": 0
        },
        {
-          "name": "A_count",
+          "name": "576 width",
          "type": "INT",
          "links": null,
          "shape": 3
        },
        {
-          "name": "IMAGE_B",
-          "type": "IMAGE",
-          "links": [
-            51
-          ],
-          "shape": 3,
-          "slot_index": 2
+          "name": "1024 height",
+          "type": "INT",
+          "links": null,
+          "shape": 3
        },
        {
-          "name": "B_count",
+          "name": "17 count",
          "type": "INT",
          "links": null,
          "shape": 3
        }
      ],
      "properties": {
-        "Node name for S&R": "VHS_SplitImages"
-      },
-      "widgets_values": {
-        "split_index": 1
+        "Node name for S&R": "GetImageSizeAndCount"
      }
    },
    {
-      "id": 1,
-      "type": "MimicMotionSampler",
+      "id": 37,
+      "type": "VHS_VideoCombine",
      "pos": [
-        1165,
-        426
+        723,
+        819
+      ],
+      "size": [
+        440,
+        978.6666666666666
      ],
-      "size": {
-        "0": 307.6666259765625,
-        "1": 290
-      },
      "flags": {},
-      "order": 9,
+      "order": 11,
      "mode": 0,
      "inputs": [
        {
-          "name": "mimic_pipeline",
-          "type": "MIMICPIPE",
-          "link": 1,
-          "slot_index": 0
-        },
-        {
-          "name": "ref_image",
+          "name": "images",
          "type": "IMAGE",
-          "link": 13,
-          "slot_index": 1
+          "link": 114
        },
        {
-          "name": "pose_images",
-          "type": "IMAGE",
-          "link": 8
-        }
-      ],
-      "outputs": [
-        {
-          "name": "images",
-          "type": "IMAGE",
-          "links": [
-            25
-          ],
-          "shape": 3,
-          "slot_index": 0
-        }
-      ],
-      "properties": {
-        "Node name for S&R": "MimicMotionSampler"
-      },
-      "widgets_values": [
-        25,
-        2,
-        2,
-        123,
-        "fixed",
-        15,
-        0.02,
-        0.02
-      ]
-    },
-    {
-      "id": 6,
-      "type": "DWPreprocessor",
-      "pos": [
-        385,
-        854
-      ],
-      "size": {
-        "0": 315,
-        "1": 198
-      },
-      "flags": {},
-      "order": 4,
-      "mode": 0,
-      "inputs": [
+          "name": "audio",
+          "type": "VHS_AUDIO",
+          "link": null
+        },
        {
-          "name": "image",
-          "type": "IMAGE",
-          "link": 5
+          "name": "meta_batch",
+          "type": "VHS_BatchManager",
+          "link": null
        }
      ],
      "outputs": [
        {
-          "name": "IMAGE",
-          "type": "IMAGE",
-          "links": [
-            42
-          ],
-          "shape": 3,
-          "slot_index": 0
-        },
-        {
-          "name": "POSE_KEYPOINT",
-          "type": "POSE_KEYPOINT",
+          "name": "Filenames",
+          "type": "VHS_FILENAMES",
          "links": null,
          "shape": 3
        }
      ],
      "properties": {
-        "Node name for S&R": "DWPreprocessor"
+        "Node name for S&R": "VHS_VideoCombine"
      },
-      "widgets_values": [
-        "enable",
-        "enable",
-        "enable",
-        512,
-        "yolox_l.torchscript.pt",
-        "dw-ll_ucoco_384_bs5.torchscript.pt"
-      ]
+      "widgets_values": {
+        "frame_rate": 8,
+        "loop_count": 0,
+        "filename_prefix": "MimicPose",
+        "format": "image/webp",
+        "pingpong": false,
+        "save_output": false,
+        "videopreview": {
+          "hidden": false,
+          "paused": false,
+          "params": {
+            "filename": "MimicPose_00001.webp",
+            "subfolder": "",
+            "type": "temp",
+            "format": "image/webp"
+          }
+        }
+      }
    },
    {
-      "id": 23,
+      "id": 28,
      "type": "ImageResizeKJ",
      "pos": [
-        784,
-        976
+        -71,
+        481
      ],
      "size": {
        "0": 315,
@@ -402,7 +347,7 @@
        {
          "name": "image",
          "type": "IMAGE",
-          "link": 42
+          "link": 61
        },
        {
          "name": "get_image_size",
@@ -431,8 +376,8 @@
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
-            44,
-            53
+            95,
+            110
          ],
          "shape": 3,
          "slot_index": 0
@@ -440,184 +385,207 @@
        {
          "name": "width",
          "type": "INT",
-          "links": null,
-          "shape": 3
+          "links": [
+            88
+          ],
+          "shape": 3,
+          "slot_index": 1
        },
        {
          "name": "height",
          "type": "INT",
-          "links": null,
-          "shape": 3
+          "links": [
+            89
+          ],
+          "shape": 3,
+          "slot_index": 2
        }
      ],
      "properties": {
        "Node name for S&R": "ImageResizeKJ"
      },
      "widgets_values": [
-        512,
-        768,
-        "nearest-exact",
+        576,
+        1024,
+        "lanczos",
        false,
-        8,
+        64,
        0,
        0
      ]
    },
    {
-      "id": 21,
-      "type": "ImageResizeKJ",
+      "id": 49,
+      "type": "PreviewImage",
      "pos": [
-        1212,
-        1157
+        290,
+        251
+      ],
+      "size": [
+        210,
+        246
      ],
-      "size": {
-        "0": 315,
-        "1": 242
-      },
      "flags": {},
      "order": 8,
      "mode": 0,
      "inputs": [
        {
-          "name": "image",
-          "type": "IMAGE",
-          "link": 36
-        },
-        {
-          "name": "get_image_size",
+          "name": "images",
          "type": "IMAGE",
-          "link": 37
-        },
-        {
-          "name": "width_input",
-          "type": "INT",
-          "link": null,
-          "widget": {
-            "name": "width_input"
-          }
-        },
-        {
-          "name": "height_input",
-          "type": "INT",
-          "link": null,
-          "widget": {
-            "name": "height_input"
-          }
+          "link": 125
        }
      ],
+      "properties": {
+        "Node name for S&R": "PreviewImage"
+      }
+    },
+    {
+      "id": 2,
+      "type": "DownloadAndLoadMimicMotionModel",
+      "pos": [
+        764,
+        229
+      ],
+      "size": {
+        "0": 315,
+        "1": 82
+      },
+      "flags": {},
+      "order": 1,
+      "mode": 0,
      "outputs": [
        {
-          "name": "IMAGE",
-          "type": "IMAGE",
+          "name": "mimic_pipeline",
+          "type": "MIMICPIPE",
          "links": [
-            49
+            1
          ],
-          "shape": 3,
-          "slot_index": 0
-        },
-        {
-          "name": "width",
-          "type": "INT",
-          "links": null,
-          "shape": 3
-        },
-        {
-          "name": "height",
-          "type": "INT",
-          "links": null,
          "shape": 3
        }
      ],
      "properties": {
-        "Node name for S&R": "ImageResizeKJ"
+        "Node name for S&R": "DownloadAndLoadMimicMotionModel"
      },
      "widgets_values": [
-        512,
-        512,
-        "nearest-exact",
-        false,
-        2,
-        0,
-        0
+        "MimicMotion-fp16.safetensors",
+        "fp16"
      ]
    },
    {
-      "id": 20,
-      "type": "VHS_SplitImages",
+      "id": 50,
+      "type": "Note",
+      "pos": [
+        281,
+        104
+      ],
+      "size": [
+        293.73750640869093,
+        101.2688590393065
+      ],
+      "flags": {},
+      "order": 2,
+      "mode": 0,
+      "properties": {
+        "text": ""
+      },
+      "widgets_values": [
+        "ref_image needs to be 224x224 for clip_vision\ndefault seems to be just resizing, you can try other methods like cropping as well"
+      ],
+      "color": "#432",
+      "bgcolor": "#653"
+    },
+    {
+      "id": 51,
+      "type": "Note",
+      "pos": [
+        779,
+        105
+      ],
+      "size": [
+        290.12339111328083,
+        69.71562744140618
+      ],
+      "flags": {},
+      "order": 3,
+      "mode": 0,
+      "properties": {
+        "text": ""
+      },
+      "widgets_values": [
+        "Downloads MimicMotion model and fp16 version of SVD XT 1.1"
+      ],
+      "color": "#432",
+      "bgcolor": "#653"
+    },
+    {
+      "id": 17,
+      "type": "ImageConcatMulti",
      "pos": [
-        1181,
-        974
+        1186,
+        774
      ],
      "size": {
-        "0": 315,
-        "1": 118
+        "0": 210,
+        "1": 170
      },
      "flags": {},
-      "order": 6,
+      "order": 13,
      "mode": 0,
      "inputs": [
        {
-          "name": "images",
+          "name": "image_1",
+          "type": "IMAGE",
+          "link": 95
+        },
+        {
+          "name": "image_2",
+          "type": "IMAGE",
+          "link": 113
+        },
+        {
+          "name": "image_3",
          "type": "IMAGE",
-          "link": 44
+          "link": 99
        }
      ],
      "outputs": [
        {
-          "name": "IMAGE_A",
-          "type": "IMAGE",
-          "links": null,
-          "shape": 3
-        },
-        {
-          "name": "A_count",
-          "type": "INT",
-          "links": null,
-          "shape": 3
-        },
-        {
-          "name": "IMAGE_B",
+          "name": "images",
          "type": "IMAGE",
          "links": [
-            37,
-            54
+            93
          ],
          "shape": 3,
-          "slot_index": 2
-        },
-        {
-          "name": "B_count",
-          "type": "INT",
-          "links": null,
-          "shape": 3
+          "slot_index": 0
        }
      ],
-      "properties": {
-        "Node name for S&R": "VHS_SplitImages"
-      },
-      "widgets_values": {
-        "split_index": 1
-      }
+      "properties": {},
+      "widgets_values": [
+        3,
+        "right",
+        false,
+        null
+      ]
    },
    {
      "id": 16,
      "type": "VHS_VideoCombine",
      "pos": [
-        2010,
-        480
+        1452,
+        196
      ],
      "size": [
-        1004.800048828125,
-        796.4000244140625
+        1530.494967759278,
+        1199.1081290425352
      ],
      "flags": {},
-      "order": 12,
+      "order": 14,
      "mode": 0,
      "inputs": [
        {
          "name": "images",
          "type": "IMAGE",
-          "link": 35
+          "link": 93
        },
        {
          "name": "audio",
@@ -644,44 +612,163 @@
      "widgets_values": {
        "frame_rate": 8,
        "loop_count": 0,
-        "filename_prefix": "AnimateDiff",
+        "filename_prefix": "MimicMotion",
        "format": "video/h264-mp4",
        "pix_fmt": "yuv420p",
        "crf": 19,
        "save_metadata": true,
        "pingpong": false,
-        "save_output": true,
+        "save_output": false,
        "videopreview": {
          "hidden": false,
          "paused": false,
          "params": {
-            "filename": "AnimateDiff_00027.mp4",
+            "filename": "MimicMotion_00001.mp4",
            "subfolder": "",
-            "type": "output",
+            "type": "temp",
            "format": "video/h264-mp4"
          }
        }
      }
    },
    {
-      "id": 5,
-      "type": "VHS_LoadVideo",
+      "id": 1,
+      "type": "MimicMotionSampler",
      "pos": [
+        1101,
+        419
+      ],
+      "size": {
+        "0": 307.6666259765625,
+        "1": 290
+      },
+      "flags": {},
+      "order": 12,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "mimic_pipeline",
+          "type": "MIMICPIPE",
+          "link": 1,
+          "slot_index": 0
+        },
+        {
+          "name": "ref_image",
+          "type": "IMAGE",
+          "link": 128,
+          "slot_index": 1
+        },
+        {
+          "name": "pose_images",
+          "type": "IMAGE",
+          "link": 92
+        }
+      ],
+      "outputs": [
+        {
+          "name": "images",
+          "type": "IMAGE",
+          "links": [
+            99
+          ],
+          "shape": 3,
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "MimicMotionSampler"
+      },
+      "widgets_values": [
        25,
-        851
+        2,
+        2,
+        123,
+        "fixed",
+        15,
+        0.02,
+        true
+      ]
+    },
+    {
+      "id": 3,
+      "type": "LoadImage",
+      "pos": [
+        -393,
+        311
+      ],
+      "size": {
+        "0": 213.0849151611328,
+        "1": 410.70074462890625
+      },
+      "flags": {},
+      "order": 4,
+      "mode": 0,
+      "outputs": [
+        {
+          "name": "IMAGE",
+          "type": "IMAGE",
+          "links": [
+            61,
+            124
+          ],
+          "shape": 3,
+          "slot_index": 0
+        },
+        {
+          "name": "MASK",
+          "type": "MASK",
+          "links": null,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "LoadImage"
+      },
+      "widgets_values": [
+        "demo1.jpg",
+        "image"
+      ]
+    },
+    {
+      "id": 48,
+      "type": "ImageResizeKJ",
+      "pos": [
+        -57,
+        170
      ],
      "size": [
-        235.1999969482422,
-        658.5777723524305
+        315,
+        242
      ],
      "flags": {},
-      "order": 2,
+      "order": 6,
      "mode": 0,
      "inputs": [
        {
-          "name": "meta_batch",
-          "type": "VHS_BatchManager",
+          "name": "image",
+          "type": "IMAGE",
+          "link": 124
+        },
+        {
+          "name": "get_image_size",
+          "type": "IMAGE",
          "link": null
+        },
+        {
+          "name": "width_input",
+          "type": "INT",
+          "link": null,
+          "widget": {
+            "name": "width_input"
+          }
+        },
+        {
+          "name": "height_input",
+          "type": "INT",
+          "link": null,
+          "widget": {
+            "name": "height_input"
+          }
        }
      ],
      "outputs": [
@@ -689,57 +776,39 @@
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
-            5
+            125,
+            128
          ],
          "shape": 3,
          "slot_index": 0
        },
        {
-          "name": "frame_count",
+          "name": "width",
          "type": "INT",
-          "links": null,
-          "shape": 3
-        },
-        {
-          "name": "audio",
-          "type": "VHS_AUDIO",
-          "links": null,
-          "shape": 3
+          "links": [],
+          "shape": 3,
+          "slot_index": 1
        },
        {
-          "name": "video_info",
-          "type": "VHS_VIDEOINFO",
-          "links": null,
-          "shape": 3
+          "name": "height",
+          "type": "INT",
+          "links": [],
+          "shape": 3,
+          "slot_index": 2
        }
      ],
      "properties": {
-        "Node name for S&R": "VHS_LoadVideo"
+        "Node name for S&R": "ImageResizeKJ"
      },
-      "widgets_values": {
-        "video": "pose1.mp4",
-        "force_rate": 0,
-        "force_size": "Disabled",
-        "custom_width": 512,
-        "custom_height": 512,
-        "frame_load_cap": 16,
-        "skip_first_frames": 0,
-        "select_every_nth": 2,
-        "choose video to upload": "image",
-        "videopreview": {
-          "hidden": false,
-          "paused": false,
-          "params": {
-            "frame_load_cap": 16,
-            "skip_first_frames": 0,
-            "force_rate": 0,
-            "filename": "pose1.mp4",
-            "type": "input",
-            "format": "video/mp4",
-            "select_every_nth": 2
-          }
-        }
-      }
+      "widgets_values": [
+        224,
+        224,
+        "lanczos",
+        false,
+        64,
+        0,
+        0
+      ]
    }
  ],
  "links": [
@@ -752,114 +821,130 @@
      "MIMICPIPE"
    ],
    [
-      5,
-      5,
+      61,
+      3,
      0,
-      6,
+      28,
      0,
      "IMAGE"
    ],
    [
-      8,
-      9,
+      86,
+      5,
+      0,
+      35,
      0,
+      "IMAGE"
+    ],
+    [
+      88,
+      28,
      1,
+      35,
      2,
-      "IMAGE"
+      "INT"
    ],
    [
-      12,
+      89,
+      28,
+      2,
+      35,
      3,
-      0,
-      12,
-      0,
-      "IMAGE"
+      "INT"
    ],
    [
-      13,
-      12,
+      92,
+      9,
      0,
      1,
-      1,
+      2,
      "IMAGE"
    ],
    [
-      25,
-      1,
+      93,
+      17,
      0,
-      19,
+      16,
      0,
      "IMAGE"
    ],
    [
-      35,
+      95,
+      28,
+      0,
      17,
      0,
-      16,
+      "IMAGE"
+    ],
+    [
+      99,
+      1,
      0,
+      17,
+      2,
      "IMAGE"
    ],
    [
-      36,
-      3,
+      110,
+      28,
      0,
-      21,
+      42,
      0,
      "IMAGE"
    ],
    [
-      37,
-      20,
-      2,
-      21,
+      111,
+      35,
+      0,
+      42,
      1,
      "IMAGE"
    ],
    [
+      112,
      42,
-      6,
      0,
-      23,
+      9,
      0,
      "IMAGE"
    ],
    [
-      44,
-      23,
-      0,
-      20,
-      0,
+      113,
+      42,
+      1,
+      17,
+      1,
      "IMAGE"
    ],
    [
-      49,
-      21,
+      114,
+      42,
      0,
-      17,
+      37,
      0,
      "IMAGE"
    ],
    [
-      51,
-      19,
-      2,
-      17,
-      2,
+      124,
+      3,
+      0,
+      48,
+      0,
      "IMAGE"
    ],
    [
-      53,
-      23,
+      125,
+      48,
      0,
-      9,
+      49,
      0,
      "IMAGE"
    ],
    [
-      54,
-      20,
-      2,
-      17,
+      128,
+      48,
+      0,
+      1,
      1,
      "IMAGE"
    ]
@@ -868,10 +953,10 @@
  "config": {},
  "extra": {
    "ds": {
-      "scale": 0.7513148009015777,
+      "scale": 0.5644739300537774,
      "offset": {
-        "0": 52.74061584472656,
-        "1": -197.62571716308594
+        "0": 738.7613525390625,
+        "1": 31.221477508544922
      }
    }
  },

--- a/mimicmotion/pipelines/pipeline_mimicmotion.py
+++ b/mimicmotion/pipelines/pipeline_mimicmotion.py
@@ -20,6 +20,8 @@ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
 from ..modules.pose_net import PoseNet

 from comfy.utils import ProgressBar
+import comfy.model_management as mm
+offload_device = mm.unet_offload_device()

 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

@@ -145,8 +147,10 @@ class MimicMotionPipeline(DiffusionPipeline):
            ).pixel_values

        image = image.to(device=device, dtype=dtype)
+        self.image_encoder.to(device)
        image_embeddings = self.image_encoder(image).image_embeds
        image_embeddings = image_embeddings.unsqueeze(1)
+        self.image_encoder.to(offload_device)

        # duplicate image embeddings for each generation per prompt, using mps friendly method
        bs_embed, seq_len, _ = image_embeddings.shape
@@ -189,7 +193,9 @@ class MimicMotionPipeline(DiffusionPipeline):
        do_classifier_free_guidance: bool,
    ):
        image = image.to(device=device)
+        self.vae.to(device)
        image_latents = self.vae.encode(image).latent_dist.mode()
+        self.vae.to(offload_device)

        if do_classifier_free_guidance:
            negative_image_latents = torch.zeros_like(image_latents)
@@ -256,7 +262,10 @@ class MimicMotionPipeline(DiffusionPipeline):
                # we only pass num_frames_in if it's expected
                decode_kwargs["num_frames"] = num_frames_in

+            self.vae.to(latents.device)
            frame = self.vae.decode(latents[i: i + decode_chunk_size], **decode_kwargs).sample
+            self.vae.to(offload_device)
+
            frames.append(frame.cpu())
        frames = torch.cat(frames, dim=0)

@@ -568,6 +577,8 @@ class MimicMotionPipeline(DiffusionPipeline):
        self._guidance_scale = guidance_scale

        # 8. Denoising loop
+        self.unet.to(device)
+
        self._num_timesteps = len(timesteps)
        pose_latents = einops.rearrange(pose_latents, '(b f) c h w -> b f c h w', f=num_frames)
        indices = [[0, *range(i + 1, min(i + tile_size, num_frames))] for i in
@@ -627,6 +638,8 @@ class MimicMotionPipeline(DiffusionPipeline):
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    latents = callback_outputs.pop("latents", latents)
+                    
+        self.unet.to(offload_device)

        if not output_type == "latent":
            # cast back to fp16 if needed

--- a/nodes.py
+++ b/nodes.py
 import os
-from omegaconf import OmegaConf
 import torch
-import torch.nn.functional as F
 import sys
 import numpy as np
+import gc

-script_directory = os.path.dirname(os.path.abspath(__file__))
-sys.path.append(script_directory)
-
-from einops import repeat
 import folder_paths
 import comfy.model_management as mm
 import comfy.utils

-from contextlib import nullcontext
-try:
-    from accelerate import init_empty_weights
-    is_accelerate_available = True
-except:
-    pass
-
-from mimicmotion.pipelines.pipeline_mimicmotion import MimicMotionPipeline
-
 from diffusers.models import AutoencoderKLTemporalDecoder
 from diffusers.schedulers import EulerDiscreteScheduler
 from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

+script_directory = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(script_directory)
+
+from mimicmotion.pipelines.pipeline_mimicmotion import MimicMotionPipeline
 from mimicmotion.modules.unet import UNetSpatioTemporalConditionModel
 from mimicmotion.modules.pose_net import PoseNet

@@ -65,6 +55,7 @@ class DownloadAndLoadMimicMotionModel:
                    ], {
                        "default": 'fp16'
                    }),
+            
            },
        }

@@ -77,6 +68,8 @@ class DownloadAndLoadMimicMotionModel:
        device = mm.get_torch_device()
        mm.soft_empty_cache()
        dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision]
+
+        pbar = comfy.utils.ProgressBar(3)
        
        download_path = os.path.join(folder_paths.models_dir, "mimicmotion")
        model_path = os.path.join(download_path, model)
@@ -89,13 +82,20 @@ class DownloadAndLoadMimicMotionModel:
                                local_dir=download_path, 
                                local_dir_use_symlinks=False)

-        ckpt_base_name = os.path.basename(model_path)
        print(f"Loading model from: {model_path}")
+        pbar.update(1)

        svd_path = os.path.join(folder_paths.models_dir, "diffusers", "stable-video-diffusion-img2vid-xt-1-1")
-
+        
        if not os.path.exists(svd_path):
-            raise ValueError(f"Please download stable-video-diffusion-img2vid-xt-1-1 to {svd_path}")
+            #raise ValueError(f"Please download stable-video-diffusion-img2vid-xt-1-1 to {svd_path}")
+            print(f"Downloading SVD model to: {model_path}")
+            from huggingface_hub import snapshot_download
+            snapshot_download(repo_id="vdo/stable-video-diffusion-img2vid-xt-1-1", 
+                                allow_patterns=[f"*.json", "*fp16*"],
+                                local_dir=svd_path, 
+                                local_dir_use_symlinks=False)
+        pbar.update(1)

        mimicmotion_models = MimicMotionModel(svd_path).to(device=device).eval()
        mimicmotion_models.load_state_dict(comfy.utils.load_torch_file(model_path), strict=False)
@@ -108,16 +108,18 @@ class DownloadAndLoadMimicMotionModel:
            feature_extractor=mimicmotion_models.feature_extractor, 
            pose_net=mimicmotion_models.pose_net,
        )
+        
        pipeline.unet.to(dtype)
        pipeline.pose_net.to(dtype)
        pipeline.vae.to(dtype)
        pipeline.image_encoder.to(dtype)
-        pipeline.pose_net.to(dtype)        
-
+        pipeline.pose_net.to(dtype)
+        
        mimic_model = {
            'pipeline': pipeline,
            'dtype': dtype
        }
+        pbar.update(1)
        return (mimic_model,)
    
 class MimicMotionSampler:
@@ -151,16 +153,26 @@ class MimicMotionSampler:
        pipeline = mimic_pipeline['pipeline']

        B, H, W, C = pose_images.shape
-        ref_image = ref_image.permute(0, 3, 1, 2).to(device).to(dtype)
-        pose_images = pose_images.permute(0, 3, 1, 2).to(device).to(dtype)
-        ref_image = ref_image * 2 - 1
+       
+        ref_image = ref_image.permute(0, 3, 1, 2)
+        pose_images = pose_images.permute(0, 3, 1, 2)
+
+        if ref_image.shape[1:3] != (224, 224):
+            ref_img = comfy.utils.common_upscale(ref_image, 224, 224, "lanczos", "disabled")
+        else:
+            ref_img = ref_image
+
+        ref_img = ref_img * 2 - 1
        pose_images = pose_images * 2 - 1

+        ref_img = ref_img.to(device).to(dtype)
+        pose_images = pose_images.to(device).to(dtype)
+
        generator = torch.Generator(device=device)
        generator.manual_seed(seed)
-
+        
        frames = pipeline(
-            ref_image, 
+            ref_img, 
            image_pose=pose_images, 
            num_frames=B,
            tile_size = 16, 
@@ -177,8 +189,14 @@ class MimicMotionSampler:
            output_type="pt", 
            device=device
        ).frames
-        frames = frames.squeeze(0).permute(0, 2, 3, 1).cpu().float()
-        print(frames.shape)
+        frames = frames.squeeze(0)[1:].permute(0, 2, 3, 1).cpu().float()
+
+        if not keep_model_loaded:
+            pipeline.unet.to(offload_device)
+            pipeline.vae.to(offload_device)
+
+            mm.soft_empty_cache()
+            gc.collect()

        return frames,

@@ -194,8 +212,8 @@ class MimicMotionGetPoses:
            },
        }

-    RETURN_TYPES = ("IMAGE",)
-    RETURN_NAMES = ("images",)
+    RETURN_TYPES = ("IMAGE", "IMAGE",)
+    RETURN_NAMES = ("poses_with_ref", "pose_images")
    FUNCTION = "process"
    CATEGORY = "MimicMotionWrapper"

@@ -246,9 +264,11 @@ class MimicMotionGetPoses:
        pose_images_np = pose_images.cpu().numpy() * 255

        # read input video
+        pbar = comfy.utils.ProgressBar(len(pose_images_np))
        detected_poses_np_list = []
        for img_np in pose_images_np:
            detected_poses_np_list.append(dwprocessor(img_np))
+            pbar.update(1)

        detected_bodies = np.stack(
            [p['bodies']['candidate'] for p in detected_poses_np_list if p['bodies']['candidate'].shape[0] == 18])[:,
@@ -277,10 +297,7 @@ class MimicMotionGetPoses:
        output_tensor = torch.cat((ref_pose_tensor.unsqueeze(0), output_tensor))
        output_tensor = output_tensor.permute(0, 2, 3, 1).cpu().float()      
        
-        return output_tensor,
-
-        
-
+        return output_tensor, output_tensor[1:]

 NODE_CLASS_MAPPINGS = {
    "DownloadAndLoadMimicMotionModel": DownloadAndLoadMimicMotionModel,