Commit 2b4729da authored by kijai

initial commit

Parent 8e536752
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
#/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
# custom ignores
.DS_Store
_.*
# models and outputs
models/
outputs/
(diff collapsed)
# WORK IN PROGRESS
# MimicMotion wrapper for ComfyUI
Requires SVD 1.1: download https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt-1-1/tree/main into `ComfyUI/models/diffusers`. The whole folder structure is needed, but each subfolder only needs one of the `.safetensors` variants plus its `config.json` (a rough expected layout is sketched below).
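A rough expected layout (file names follow the standard diffusers layout of that repository and are illustrative; only one precision variant per subfolder is required):

```
ComfyUI/models/diffusers/stable-video-diffusion-img2vid-xt-1-1/
├── model_index.json
├── feature_extractor/
│   └── preprocessor_config.json
├── image_encoder/
│   ├── config.json
│   └── model.fp16.safetensors
├── scheduler/
│   └── scheduler_config.json
├── unet/
│   ├── config.json
│   └── diffusion_pytorch_model.fp16.safetensors
└── vae/
    ├── config.json
    └── diffusion_pytorch_model.fp16.safetensors
```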
from .nodes import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS
__all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"]
# base svd model path
base_model_path: models/SVD/stable-video-diffusion-img2vid-xt-1-1
# checkpoint path
ckpt_path: models/MimicMotion.pth
test_case:
- ref_video_path: assets/example_data/videos/pose1.mp4
ref_image_path: assets/example_data/images/demo1.jpg
num_frames: 16
resolution: 576
frames_overlap: 6
num_inference_steps: 25
noise_aug_strength: 0
guidance_scale: 2.0
sample_stride: 2
fps: 15
seed: 42
# w/h aspect ratio
ASPECT_RATIO = 9 / 16
name: mimicmotion
channels:
- pytorch
- nvidia
dependencies:
- python=3.11
- pytorch=2.0.1
- torchvision=0.15.2
- pytorch-cuda=11.7
- pip
- pip:
- diffusers==0.27.0
- transformers==4.32.1
- decord==0.6.0
- einops
- omegaconf
import os
import argparse
import logging
import math
from omegaconf import OmegaConf
from datetime import datetime
from pathlib import Path
import numpy as np
import torch.jit
from torchvision.datasets.folder import pil_loader
from torchvision.transforms.functional import pil_to_tensor, resize, center_crop
from torchvision.transforms.functional import to_pil_image
from constants import ASPECT_RATIO
from mimicmotion.pipelines.pipeline_mimicmotion import MimicMotionPipeline
from mimicmotion.utils.loader import create_pipeline
from mimicmotion.utils.utils import save_to_mp4
from mimicmotion.dwpose.preprocess import get_video_pose, get_image_pose
logging.basicConfig(level=logging.INFO, format="%(asctime)s: [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def preprocess(video_path, image_path, resolution=576, sample_stride=2):
"""preprocess ref image pose and video pose
Args:
video_path (str): input video pose path
image_path (str): reference image path
resolution (int, optional): Defaults to 576.
sample_stride (int, optional): Defaults to 2.
"""
image_pixels = pil_loader(image_path)
image_pixels = pil_to_tensor(image_pixels) # (c, h, w)
h, w = image_pixels.shape[-2:]
############################ compute target h/w according to original aspect ratio ###############################
if h > w:
w_target, h_target = resolution, int(resolution / ASPECT_RATIO // 64) * 64
else:
w_target, h_target = int(resolution / ASPECT_RATIO // 64) * 64, resolution
h_w_ratio = float(h) / float(w)
if h_w_ratio < h_target / w_target:
h_resize, w_resize = h_target, math.ceil(h_target / h_w_ratio)
else:
h_resize, w_resize = math.ceil(w_target * h_w_ratio), w_target
image_pixels = resize(image_pixels, [h_resize, w_resize], antialias=None)
image_pixels = center_crop(image_pixels, [h_target, w_target])
image_pixels = image_pixels.permute((1, 2, 0)).numpy()
##################################### get image&video pose value #################################################
image_pose = get_image_pose(image_pixels)
video_pose = get_video_pose(video_path, image_pixels, sample_stride=sample_stride)
pose_pixels = np.concatenate([np.expand_dims(image_pose, 0), video_pose])
image_pixels = np.transpose(np.expand_dims(image_pixels, 0), (0, 3, 1, 2))
return torch.from_numpy(pose_pixels.copy()) / 127.5 - 1, torch.from_numpy(image_pixels) / 127.5 - 1
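# Illustrative example (comment only, not executed): with the demo assets referenced
# in the inference config, e.g.
#   preprocess("assets/example_data/videos/pose1.mp4",
#              "assets/example_data/images/demo1.jpg",
#              resolution=576, sample_stride=2)
# returns pose_pixels of shape (num_video_frames + 1, 3, H, W) and image_pixels of
# shape (1, 3, H, W), both normalized to [-1, 1].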
def run_pipeline(pipeline: MimicMotionPipeline, image_pixels, pose_pixels, device, task_config):
image_pixels = [to_pil_image(img.to(torch.uint8)) for img in (image_pixels + 1.0) * 127.5]
pose_pixels = pose_pixels.unsqueeze(0).to(device)
generator = torch.Generator(device=device)
generator.manual_seed(task_config.seed)
frames = pipeline(
image_pixels, image_pose=pose_pixels, num_frames=pose_pixels.size(1),
tile_size=task_config.num_frames, tile_overlap=task_config.frames_overlap,
height=pose_pixels.shape[-2], width=pose_pixels.shape[-1], fps=7,
noise_aug_strength=task_config.noise_aug_strength, num_inference_steps=task_config.num_inference_steps,
generator=generator, min_guidance_scale=task_config.guidance_scale,
max_guidance_scale=task_config.guidance_scale, decode_chunk_size=8, output_type="pt", device=device
).frames.cpu()
video_frames = (frames * 255.0).to(torch.uint8)
for vid_idx in range(video_frames.shape[0]):
# drop the first frame, which corresponds to the reference image pose
_video_frames = video_frames[vid_idx, 1:]
return _video_frames
@torch.no_grad()
def main(args):
if not args.no_use_float16:
torch.set_default_dtype(torch.float16)
infer_config = OmegaConf.load(args.inference_config)
pipeline = create_pipeline(infer_config, device)
for task in infer_config.test_case:
############################################## Pre-process data ##############################################
pose_pixels, image_pixels = preprocess(
task.ref_video_path, task.ref_image_path,
resolution=task.resolution, sample_stride=task.sample_stride
)
########################################### Run MimicMotion pipeline ###########################################
_video_frames = run_pipeline(
pipeline,
image_pixels, pose_pixels,
device, task
)
################################### save results to output folder. ###########################################
save_to_mp4(
_video_frames,
f"{args.output_dir}/{os.path.basename(task.ref_video_path).split('.')[0]}" \
f"_{datetime.now().strftime('%Y%m%d%H%M%S')}.mp4",
fps=task.fps,
)
def set_logger(log_file=None, log_level=logging.INFO):
log_handler = logging.FileHandler(log_file, "w")
log_handler.setFormatter(
logging.Formatter("[%(asctime)s][%(name)s][%(levelname)s]: %(message)s")
)
log_handler.setLevel(log_level)
logger.addHandler(log_handler)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--log_file", type=str, default=None)
parser.add_argument("--inference_config", type=str, default="configs/test.yaml") #ToDo
parser.add_argument("--output_dir", type=str, default="outputs/", help="path to output")
parser.add_argument("--no_use_float16",
action="store_true",
help="Whether use float16 to speed up inference",
)
args = parser.parse_args()
Path(args.output_dir).mkdir(parents=True, exist_ok=True)
set_logger(args.log_file \
if args.log_file is not None else f"{args.output_dir}/{datetime.now().strftime('%Y%m%d%H%M%S')}.log")
main(args)
logger.info(f"--- Finished ---")
(diff collapsed)
from pathlib import Path
import einops
import numpy as np
import torch
import torch.nn as nn
import torch.nn.init as init
class PoseNet(nn.Module):
"""a tiny conv network for introducing pose sequence as the condition
"""
def __init__(self, noise_latent_channels=320, *args, **kwargs):
super().__init__(*args, **kwargs)
# multiple convolution layers
self.conv_layers = nn.Sequential(
nn.Conv2d(in_channels=3, out_channels=3, kernel_size=3, padding=1),
nn.SiLU(),
nn.Conv2d(in_channels=3, out_channels=16, kernel_size=4, stride=2, padding=1),
nn.SiLU(),
nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, padding=1),
nn.SiLU(),
nn.Conv2d(in_channels=16, out_channels=32, kernel_size=4, stride=2, padding=1),
nn.SiLU(),
nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, padding=1),
nn.SiLU(),
nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2, padding=1),
nn.SiLU(),
nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1),
nn.SiLU(),
nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
nn.SiLU()
)
# Final projection layer
self.final_proj = nn.Conv2d(in_channels=128, out_channels=noise_latent_channels, kernel_size=1)
# Initialize layers
self._initialize_weights()
self.scale = nn.Parameter(torch.ones(1) * 2)
def _initialize_weights(self):
"""Initialize weights with He. initialization and zero out the biases
"""
for m in self.conv_layers:
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.in_channels
init.normal_(m.weight, mean=0.0, std=np.sqrt(2. / n))
if m.bias is not None:
init.zeros_(m.bias)
init.zeros_(self.final_proj.weight)
if self.final_proj.bias is not None:
init.zeros_(self.final_proj.bias)
def forward(self, x):
if x.ndim == 5:
x = einops.rearrange(x, "b f c h w -> (b f) c h w")
x = self.conv_layers(x)
x = self.final_proj(x)
return x * self.scale
@classmethod
def from_pretrained(cls, pretrained_model_path):
"""load pretrained pose-net weights
"""
if not Path(pretrained_model_path).exists():
    raise FileNotFoundError(f"There is no model file at {pretrained_model_path}")
state_dict = torch.load(pretrained_model_path, map_location="cpu")
model = PoseNet(noise_latent_channels=320)
model.load_state_dict(state_dict, strict=True)
print(f"Loaded PoseNet's pretrained weights from {pretrained_model_path}.")
return model
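
# Illustrative smoke test (not part of the original module; shapes are assumptions
# chosen for demonstration): feed a dummy pose sequence through PoseNet and check
# that the output channel count matches the requested noise_latent_channels.
if __name__ == "__main__":
    pose_net = PoseNet(noise_latent_channels=320)
    dummy_pose = torch.randn(1, 4, 3, 1024, 576)  # (b, f, c, h, w), values in [-1, 1]
    with torch.no_grad():
        out = pose_net(dummy_pose)
    # three stride-2 convolutions downsample by 8x: expect (4, 320, 128, 72)
    print(out.shape)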
(diff collapsed)
(diff collapsed)
import logging
import torch
import torch.utils.checkpoint
from diffusers.models import AutoencoderKLTemporalDecoder
from diffusers.schedulers import EulerDiscreteScheduler
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
from ..modules.unet import UNetSpatioTemporalConditionModel
from ..modules.pose_net import PoseNet
from ..pipelines.pipeline_mimicmotion import MimicMotionPipeline
logger = logging.getLogger(__name__)
class MimicMotionModel(torch.nn.Module):
def __init__(self, base_model_path):
"""construnct base model components and load pretrained svd model except pose-net
Args:
base_model_path (str): pretrained svd model path
"""
super().__init__()
self.unet = UNetSpatioTemporalConditionModel.from_config(
UNetSpatioTemporalConditionModel.load_config(base_model_path, subfolder="unet"))
self.vae = AutoencoderKLTemporalDecoder.from_pretrained(
base_model_path, subfolder="vae").half()
self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(
base_model_path, subfolder="image_encoder")
self.noise_scheduler = EulerDiscreteScheduler.from_pretrained(
base_model_path, subfolder="scheduler")
self.feature_extractor = CLIPImageProcessor.from_pretrained(
base_model_path, subfolder="feature_extractor")
# pose_net
self.pose_net = PoseNet(noise_latent_channels=self.unet.config.block_out_channels[0])
def create_pipeline(infer_config, device):
"""create mimicmotion pipeline and load pretrained weight
Args:
infer_config (OmegaConf): loaded inference config (base_model_path, ckpt_path, ...)
device (str or torch.device): "cpu" or "cuda:{device_id}"
"""
mimicmotion_models = MimicMotionModel(infer_config.base_model_path).to(device=device).eval()
mimicmotion_models.load_state_dict(torch.load(infer_config.ckpt_path, map_location=device), strict=False)
pipeline = MimicMotionPipeline(
vae=mimicmotion_models.vae,
image_encoder=mimicmotion_models.image_encoder,
unet=mimicmotion_models.unet,
scheduler=mimicmotion_models.noise_scheduler,
feature_extractor=mimicmotion_models.feature_extractor,
pose_net=mimicmotion_models.pose_net
)
return pipeline
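
# Hedged usage sketch (illustrative, not part of this commit): building the pipeline
# from a minimal config. The paths mirror the ones in the example YAML config and are
# assumptions about where the weights actually live on disk.
if __name__ == "__main__":
    from omegaconf import OmegaConf
    cfg = OmegaConf.create({
        "base_model_path": "models/SVD/stable-video-diffusion-img2vid-xt-1-1",
        "ckpt_path": "models/MimicMotion.pth",
    })
    pipe = create_pipeline(cfg, torch.device("cuda" if torch.cuda.is_available() else "cpu"))
    print(type(pipe).__name__)  # MimicMotionPipeline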
import logging
from pathlib import Path
from torchvision.io import write_video
logger = logging.getLogger(__name__)
def save_to_mp4(frames, save_path, fps=7):
frames = frames.permute((0, 2, 3, 1)) # (f, c, h, w) to (f, h, w, c)
Path(save_path).parent.mkdir(parents=True, exist_ok=True)
write_video(save_path, frames, fps=fps)
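
# Illustrative usage (dummy data; output path is an assumption): save_to_mp4 expects
# a uint8 tensor shaped (frames, channels, height, width) and writes it to an .mp4
# file via torchvision.io.write_video.
if __name__ == "__main__":
    import torch
    dummy = torch.randint(0, 256, (15, 3, 256, 256), dtype=torch.uint8)
    save_to_mp4(dummy, "outputs/dummy_clip.mp4", fps=15)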
import os
from omegaconf import OmegaConf
import torch
import torch.nn.functional as F
import sys
script_directory = os.path.dirname(os.path.abspath(__file__))
sys.path.append(script_directory)
from einops import repeat
import folder_paths
import comfy.model_management as mm
import comfy.utils
from contextlib import nullcontext
try:
    from accelerate import init_empty_weights
    is_accelerate_available = True
except ImportError:
    is_accelerate_available = False
from mimicmotion.pipelines.pipeline_mimicmotion import MimicMotionPipeline
from diffusers.models import AutoencoderKLTemporalDecoder
from diffusers.schedulers import EulerDiscreteScheduler
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
from mimicmotion.modules.unet import UNetSpatioTemporalConditionModel
from mimicmotion.modules.pose_net import PoseNet
class MimicMotionModel(torch.nn.Module):
def __init__(self, base_model_path):
"""construnct base model components and load pretrained svd model except pose-net
Args:
base_model_path (str): pretrained svd model path
"""
super().__init__()
self.unet = UNetSpatioTemporalConditionModel.from_config(
UNetSpatioTemporalConditionModel.load_config(base_model_path, subfolder="unet", variant="fp16"))
self.vae = AutoencoderKLTemporalDecoder.from_pretrained(
base_model_path, subfolder="vae", variant="fp16")
self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(
base_model_path, subfolder="image_encoder", variant="fp16")
self.noise_scheduler = EulerDiscreteScheduler.from_pretrained(
base_model_path, subfolder="scheduler")
self.feature_extractor = CLIPImageProcessor.from_pretrained(
base_model_path, subfolder="feature_extractor")
# pose_net
self.pose_net = PoseNet(noise_latent_channels=self.unet.config.block_out_channels[0])
class DownloadAndLoadMimicMotionModel:
@classmethod
def INPUT_TYPES(s):
return {"required": {
"model": (
[ 'MimicMotion-fp16.safetensors',
],
),
"precision": (
[
'fp32',
'fp16',
'bf16',
], {
"default": 'fp16'
}),
},
}
RETURN_TYPES = ("MIMICPIPE",)
RETURN_NAMES = ("mimic_pipeline",)
FUNCTION = "loadmodel"
CATEGORY = "MimicMotionWrapper"
def loadmodel(self, precision, model):
device = mm.get_torch_device()
mm.soft_empty_cache()
dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision]
download_path = os.path.join(folder_paths.models_dir, "mimicmotion")
model_path = os.path.join(download_path, model)
if not os.path.exists(model_path):
print(f"Downloading model to: {model_path}")
from huggingface_hub import snapshot_download
snapshot_download(repo_id="Kijai/MimicMotion_pruned",
allow_patterns=[f"*{model}*"],
local_dir=download_path,
local_dir_use_symlinks=False)
ckpt_base_name = os.path.basename(model_path)
print(f"Loading model from: {model_path}")
svd_path = os.path.join(folder_paths.models_dir, "diffusers", "stable-video-diffusion-img2vid-xt-1-1")
if not os.path.exists(svd_path):
raise ValueError(f"Please download stable-video-diffusion-img2vid-xt-1-1 to {svd_path}")
mimicmotion_models = MimicMotionModel(svd_path).to(device=device).eval()
mimicmotion_models.load_state_dict(comfy.utils.load_torch_file(model_path), strict=False)
pipeline = MimicMotionPipeline(
vae=mimicmotion_models.vae,
image_encoder=mimicmotion_models.image_encoder,
unet=mimicmotion_models.unet,
scheduler=mimicmotion_models.noise_scheduler,
feature_extractor=mimicmotion_models.feature_extractor,
pose_net=mimicmotion_models.pose_net,
)
pipeline.unet.to(dtype)
pipeline.pose_net.to(dtype)
pipeline.vae.to(dtype)
pipeline.image_encoder.to(dtype)
mimic_model = {
'pipeline': pipeline,
'dtype': dtype
}
return (mimic_model,)
class MimicMotionSampler:
@classmethod
def INPUT_TYPES(s):
return {"required": {
"mimic_pipeline": ("MIMICPIPE",),
"ref_image": ("IMAGE",),
"pose_images": ("IMAGE",),
"steps": ("INT", {"default": 25, "min": 1, "max": 200, "step": 1}),
"cfg_min": ("FLOAT", {"default": 2.0, "min": 0.0, "max": 20.0, "step": 0.01}),
"cfg_max": ("FLOAT", {"default": 2.0, "min": 0.0, "max": 20.0, "step": 0.01}),
"seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
"fps": ("INT", {"default": 15, "min": 2, "max": 100, "step": 1}),
"noise_aug_strength": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 10.0, "step": 0.01}),
"keep_model_loaded": ("BOOLEAN", {"default": True}),
},
}
RETURN_TYPES = ("IMAGE",)
RETURN_NAMES = ("images",)
FUNCTION = "process"
CATEGORY = "MimicMotionWrapper"
def process(self, mimic_pipeline, ref_image, pose_images, cfg_min, cfg_max, steps, seed, noise_aug_strength, fps, keep_model_loaded):
device = mm.get_torch_device()
offload_device = mm.unet_offload_device()
mm.unload_all_models()
mm.soft_empty_cache()
dtype = mimic_pipeline['dtype']
pipeline = mimic_pipeline['pipeline']
B, H, W, C = pose_images.shape
ref_image = ref_image.permute(0, 3, 1, 2).to(device).to(dtype)
pose_images = pose_images.permute(0, 3, 1, 2).to(device).to(dtype)
ref_image = ref_image * 2 - 1
pose_images = pose_images * 2 - 1
generator = torch.Generator(device=device)
generator.manual_seed(seed)
frames = pipeline(
ref_image,
image_pose=pose_images,
num_frames=B,
tile_size=16,
tile_overlap=6,
height=H,
width=W,
fps=fps,
noise_aug_strength=noise_aug_strength,
num_inference_steps=steps,
generator=generator,
min_guidance_scale=cfg_min,
max_guidance_scale=cfg_max,
decode_chunk_size=8,
output_type="pt",
device=device
).frames
frames = frames.squeeze(0).permute(0, 2, 3, 1).cpu().float()
print(frames.shape)
return frames,
NODE_CLASS_MAPPINGS = {
"DownloadAndLoadMimicMotionModel": DownloadAndLoadMimicMotionModel,
"MimicMotionSampler": MimicMotionSampler,
}
NODE_DISPLAY_NAME_MAPPINGS = {
"DownloadAndLoadMimicMotionModel": "DownloadAndLoadMimicMotionModel",
"MimicMotionSampler": "MimicMotionSampler",
}
diffusers>=0.27.0
transformers>=4.32.1