Spaces:

prithivMLmods
/

HY-World-2.0-Demo

Running on Zero

App Files Files Community

prithivMLmods committed on Apr 16

Commit

fbcc0a9

verified ·

1 Parent(s): f90be73

update app --files

Browse files

Files changed (42) hide show

hyworldmirror/__init__.py +0 -0
hyworldmirror/comm/__init__.py +0 -0
hyworldmirror/comm/communication.py +61 -0
hyworldmirror/comm/padding.py +134 -0
hyworldmirror/models/__init__.py +0 -0
hyworldmirror/models/heads/__init__.py +0 -0
hyworldmirror/models/heads/camera_head.py +184 -0
hyworldmirror/models/heads/dense_head.py +672 -0
hyworldmirror/models/heads/gs_head.py +83 -0
hyworldmirror/models/layers/__init__.py +5 -0
hyworldmirror/models/layers/attention.py +131 -0
hyworldmirror/models/layers/block.py +269 -0
hyworldmirror/models/layers/drop_path.py +29 -0
hyworldmirror/models/layers/layer_scale.py +17 -0
hyworldmirror/models/layers/mlp.py +64 -0
hyworldmirror/models/layers/norm_rope.py +140 -0
hyworldmirror/models/layers/patch_embed.py +155 -0
hyworldmirror/models/layers/rope.py +182 -0
hyworldmirror/models/layers/swiglu_ffn.py +46 -0
hyworldmirror/models/layers/vision_transformer.py +394 -0
hyworldmirror/models/models/__init__.py +0 -0
hyworldmirror/models/models/rasterization.py +525 -0
hyworldmirror/models/models/visual_transformer.py +542 -0
hyworldmirror/models/models/worldmirror.py +685 -0
hyworldmirror/models/utils/__init__.py +0 -0
hyworldmirror/models/utils/act_gs.py +22 -0
hyworldmirror/models/utils/camera_utils.py +75 -0
hyworldmirror/models/utils/frustum.py +196 -0
hyworldmirror/models/utils/geometry.py +111 -0
hyworldmirror/models/utils/grid.py +90 -0
hyworldmirror/models/utils/priors.py +168 -0
hyworldmirror/models/utils/rotation.py +126 -0
hyworldmirror/models/utils/sh_utils.py +116 -0
hyworldmirror/utils/__init__.py +0 -0
hyworldmirror/utils/geometry.py +531 -0
hyworldmirror/utils/inference_utils.py +824 -0
hyworldmirror/utils/render_utils.py +294 -0
hyworldmirror/utils/save_utils.py +261 -0
hyworldmirror/utils/video_utils.py +557 -0
hyworldmirror/utils/visual_util.py +617 -0
hyworldmirror/utils/warnings.py +29 -0
pipeline.py +847 -0

hyworldmirror/__init__.py ADDED Viewed

File without changes

hyworldmirror/comm/__init__.py ADDED Viewed

File without changes

hyworldmirror/comm/communication.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import torch
+import torch.distributed as dist
+def all2all(tensor, scatter_dim, gather_dim, cur_group, async_op):
+    group_size = dist.get_world_size(group=cur_group)
+    scatter_tensor_list = list(chunk.contiguous() for chunk in torch.chunk(tensor, chunks=group_size, dim=scatter_dim))
+    gather_tensor_list = [torch.zeros_like(x) for x in scatter_tensor_list]
+    comm = dist.all_to_all(gather_tensor_list, scatter_tensor_list, group=cur_group, async_op=async_op)
+    if async_op:
+        def wait():
+            comm.wait()
+            recieved_tensor = torch.cat(gather_tensor_list, dim=gather_dim).contiguous()
+            return recieved_tensor
+        return wait()
+    recieved_tensor = torch.cat(gather_tensor_list, dim=gather_dim).contiguous()
+    return recieved_tensor
+def all_gather(tensor, gather_dim, cur_group, async_op):
+    tensor = tensor.contiguous()
+    group_size = dist.get_world_size(group=cur_group)
+    gather_list = [torch.zeros_like(tensor) for _ in range(group_size)]
+    comm = dist.all_gather(gather_list, tensor, group=cur_group, async_op=async_op)
+    gather_tensor = torch.cat(gather_list, dim=gather_dim)
+    if async_op:
+        def wait():
+            comm.wait()
+            gather_tensor = torch.cat(gather_list, dim=gather_dim)
+            return gather_tensor
+        return wait()
+    return gather_tensor
+class _All2All(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, tensor, scatter_dim, gather_dim, cur_group, async_op):
+        ctx.cur_group = cur_group
+        ctx.scatter_dim = scatter_dim
+        ctx.gather_dim = gather_dim
+        ctx.async_op = async_op
+        return all2all(tensor=tensor, scatter_dim=scatter_dim, gather_dim=gather_dim, cur_group=cur_group, async_op=async_op)
+    @staticmethod
+    def backward(ctx, grad_outputs):
+        input_t = grad_outputs
+        return (all2all(input_t, ctx.gather_dim, ctx.scatter_dim, ctx.cur_group, False), None, None, None, None)
+class _Allgather(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, tensor, gather_dim, cur_group, async_op):
+        ctx.gather_dim = gather_dim
+        ctx.cur_group = cur_group
+        ctx.async_op = async_op
+        return all_gather(tensor=tensor, gather_dim=gather_dim, cur_group=cur_group, async_op=async_op)
+    @staticmethod
+    def backward(ctx, grad_outputs):
+        sp_group = ctx.cur_group
+        sp_group_size = dist.get_world_size(group=sp_group)
+        rank = dist.get_rank()
+        rank_in_group = dist.get_group_rank(group=sp_group, global_rank=rank)
+        return (grad_outputs.split(grad_outputs.shape[ctx.gather_dim] // sp_group_size, dim=ctx.gather_dim)[rank_in_group], None, None, None)

hyworldmirror/comm/padding.py ADDED Viewed

	@@ -0,0 +1,134 @@

+import torch
+import torch.nn.functional as F
+def minimal_pad_to_divisible(tensor: torch.Tensor, sp_size: int, dim: int = 1, pad_value: float = 0.0):
+    """
+    对三维或更高维度的tensor在指定维度进行最小化padding，使其长度能被 sp_size 整除。
+    Args:
+        tensor: 输入的PyTorch tensor (例如：[B, L, C] 或 [B, H, W, C] 等)。
+        sp_size: 要求的最小分割尺寸。
+        dim: 需要进行padding的维度索引（默认为 1，即第二维）。
+        pad_value: 填充的值（默认为 0.0）。
+    Returns:
+        padded_tensor: 填充后的 tensor。
+    """
+    current_size = tensor.size(dim)
+    # 计算需要填充的长度
+    # (sp_size - current_size % sp_size) % sp_size
+    # 保证了如果 current_size 已经是 sp_size 的倍数，padding_len 为 0。
+    # 否则，计算出最小的填充长度。
+    padding_len = (sp_size - current_size % sp_size) % sp_size
+    if padding_len == 0:
+        # 如果长度已经可以整除，直接返回原 tensor
+        return tensor, 0
+    # 构建 pad 元组
+    # torch.nn.functional.pad 的 pad 参数是从**最末尾的维度**开始，**成对** (后填充, 前填充) 指定的。
+    # 假设你的 tensor 是 [D0, D1, D2]
+    # 如果 dim=1 (第二维, D1)，pad 应该是 (0, 0, padding_len, 0, 0, 0, ...)
+    #
+    # 由于我们需要在第二维 (dim=1) 的末尾进行填充，我们需要确定 pad 元组中对应 dim=1 的位置。
+    # 维度数量 D = tensor.dim()
+    # dim=0 对应 pad 元组的最后两位
+    # dim=1 对应 pad 元组的倒数第 4, 3 位
+    # dim=2 对应 pad 元组的倒数第 6, 5 位 (对于三维 tensor，即前两位)
+    # 在 dim 维度进行 '后填充' (在末尾添加)
+    # padding_dims 是一个长度为 2 * D 的元组，所有维度默认不填充
+    padding_dims = [0] * (2 * tensor.dim())
+    # 对应 dim 维度的 '后填充' (即 pad 元组中的偶数索引位置，从后往前数)
+    # 填充的位置是 (2 * tensor.dim() - 2 * dim - 2)
+    # 例如：D=3, dim=1 -> 2*3 - 2*1 - 2 = 2
+    # pad 元组为 (d2_start, d2_end, d1_start, d1_end, d0_start, d0_end)
+    # 我们要填充 d1_end，它在索引 2 的位置
+    # F.pad 要求的是 (最后维度 start, 最后维度 end, 倒数第二维度 start, 倒数第二维度 end, ...)
+    # 我们的 dim=1 是倒数第 (D - 1 - dim) + 1 = D - dim 个维度
+    # 它在 pad 元组中是倒数第 2 * (D - dim) 位和倒数第 2 * (D - dim) - 1 位
+    #
+    # 填充位置的索引 (从 0 开始, 从左往右):
+    # (2 * (tensor.dim() - dim - 1)) 是 '前填充' 的位置
+    # (2 * (tensor.dim() - dim - 1) + 1) 是 '后填充' 的位置
+    pad_index = 2 * (tensor.dim() - dim - 1) + 1
+    if pad_index < len(padding_dims):
+        padding_dims[pad_index] = padding_len
+    else:
+        raise ValueError("Invalid dimension index.")
+    # 转换回 tuple
+    pad = tuple(padding_dims)
+    # 使用 F.pad 进行填充，模式为 'constant'
+    padded_tensor = F.pad(tensor, pad=pad, mode='constant', value=pad_value)
+    return padded_tensor, padding_len
+def depad_by_length(padded_tensor: torch.Tensor, depadding_len: int, dim: int = 1) -> torch.Tensor:
+    """
+    在指定维度上去除末尾的 padding 部分。
+    Args:
+        padded_tensor: 已经经过 padding 的 PyTorch tensor。
+        depadding_len: 需要从末尾去除的长度。
+        dim: 需要去除 padding 的维度索引（默认为 1，即第二维）。
+    Returns:
+        depadded_tensor: 去除 padding 后的 tensor。
+    """
+    # 检查去除长度是否合理
+    current_size = padded_tensor.size(dim)
+    if depadding_len < 0:
+        raise ValueError("depadding_len 必须是非负数。")
+    if depadding_len > current_size:
+        raise ValueError(f"要去除的长度 {depadding_len} 大于当前维度长度 {current_size}。")
+    # 计算去除 padding 后的目标长度
+    target_size = current_size - depadding_len
+    # 构造切片操作所需的索引元组
+    # 对于所有维度，我们默认使用完整的切片 `:`
+    slices = [slice(None)] * padded_tensor.dim()
+    # 在指定维度 dim 上，我们只取从 0 到 target_size 的部分
+    # Python 切片 [0:target_size] 会保留 target_size 个元素，即去除了末尾的 depadding_len
+    slices[dim] = slice(0, target_size)
+    # 使用元组解包进行切片操作
+    depadded_tensor = padded_tensor[tuple(slices)]
+    return depadded_tensor
+def pad_by_length(padded_tensor: torch.Tensor, padding_len: int, dim: int = 1,pad_value: float = 0.0) -> torch.Tensor:
+    if padding_len < 0:
+        raise ValueError("padding_len 必须是非负数。")
+    if dim < 0 or dim >= padded_tensor.dim():
+        raise ValueError(f"维度索引 {dim} 超出有效范围 [0, {padded_tensor.dim() - 1}]。")
+    # 构建padding参数
+    # F.pad需要为每个维度指定左右两边的padding长度
+    # 格式为: (最后一个维度的左边, 最后一个维度的右边, 倒数第二个维度的左边, 倒数第二个维度的右边, ...)
+    pad_tuple = [0] * (2 * padded_tensor.dim())
+    # 将指定维度右边的padding长度设置为padding_len
+    # F.pad的维度顺序是从最后一个维度开始的，所以需要进行转换
+    pad_idx = 2 * (padded_tensor.dim() - 1 - dim) + 1
+    pad_tuple[pad_idx] = padding_len
+    # 调用F.pad进行padding
+    padded_tensor = F.pad(padded_tensor, pad=tuple(pad_tuple), mode='constant', value=pad_value)
+    return padded_tensor

hyworldmirror/models/__init__.py ADDED Viewed

File without changes

hyworldmirror/models/heads/__init__.py ADDED Viewed

File without changes

hyworldmirror/models/heads/camera_head.py ADDED Viewed

	@@ -0,0 +1,184 @@

+# inspired by https://github.com/facebookresearch/vggt/blob/main/src/models/heads/camera_head.py
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from ..layers import Mlp, MlpFP32
+from ..layers.block import Block, DistBlock
+class CameraHead(nn.Module):
+    """
+    Camera head module: predicts camera parameters from token representations using iterative refinement
+    Processes dedicated camera tokens through a series of transformer blocks
+    """
+    def __init__(
+        self,
+        dim_in: int = 2048,
+        trunk_depth: int = 4,
+        num_heads: int = 16,
+        mlp_ratio: int = 4,
+        init_values: float = 0.01,
+        trans_act: str = "linear",
+        quat_act: str = "linear",
+        fl_act: str = "relu",
+        block_fn: nn.Module = Block,
+    ):
+        super().__init__()
+        self.out_dim = 9
+        self.trans_act = trans_act
+        self.quat_act = quat_act
+        self.fl_act = fl_act
+        self.depth = trunk_depth
+        # Build refinement network using transformer block sequence
+        self.refine_net = nn.Sequential(
+            *[
+                block_fn(dim=dim_in, num_heads=num_heads, mlp_ratio=mlp_ratio, init_values=init_values)
+                for _ in range(trunk_depth)
+            ]
+        )
+        # Normalization for camera tokens and network output
+        self.token_norm = nn.LayerNorm(dim_in)
+        self.out_norm = nn.LayerNorm(dim_in)
+        # Learnable initial camera parameter token
+        self.init_token = nn.Parameter(torch.zeros(1, 1, self.out_dim))
+        self.param_embed = nn.Linear(self.out_dim, dim_in)
+        # Generate adaptive normalization parameters: shift, scale, and gate
+        self.adapt_norm_gen = nn.Sequential(nn.SiLU(), nn.Linear(dim_in, 3 * dim_in, bias=True))
+        # Adaptive layer normalization (no learnable parameters)
+        self.adapt_norm = nn.LayerNorm(dim_in, elementwise_affine=False, eps=1e-6)
+        # self.param_predictor = Mlp(in_features=dim_in, hidden_features=dim_in // 2, out_features=self.out_dim, drop=0)
+        self.param_predictor = MlpFP32(in_features=dim_in, hidden_features=dim_in // 2, out_features=self.out_dim, drop=0)
+    def to(self, *args, **kwargs):
+        self.refine_net = self.refine_net.to(*args, **kwargs)
+        self.token_norm = self.token_norm.to(*args, **kwargs)
+        self.out_norm = self.out_norm.to(*args, **kwargs)
+        self.adapt_norm_gen = self.adapt_norm_gen.to(*args, **kwargs)
+        self.adapt_norm = self.adapt_norm.to(*args, **kwargs)
+        self.param_predictor = self.param_predictor.to(*args, **kwargs)
+        # keep these parameters in FP32
+        args, kwargs = MlpFP32.map_to_args_to_float(args, kwargs)
+        self.init_token = nn.Parameter(self.init_token.to(*args, **kwargs))
+        self.param_embed = self.param_embed.to(*args, **kwargs)
+        return self
+    def forward(self, feat_seq: list, steps: int = 4) -> list:
+        """
+        Forward pass to predict camera parameters
+        Args:
+            feat_seq: List of token tensors from network, last one used for prediction
+            steps: Number of iterative refinement steps, default 4
+        Returns:
+            List of predicted camera encodings (post-activation) from each iteration
+        """
+        # Use tokens from last block for camera prediction
+        latest_feat = feat_seq[-1]
+        # Extract camera tokens
+        cam_tokens = latest_feat[:, :, 0]
+        cam_tokens = self.token_norm(cam_tokens)
+        # Iteratively refine camera pose predictions
+        b, seq_len, feat_dim = cam_tokens.shape  # seq_len expected to be 1
+        curr_pred = None
+        pred_seq = []
+        for step in range(steps):
+            # Use learned initial token for first iteration
+            if curr_pred is None:
+                net_input = self.param_embed(self.init_token.expand(b, seq_len, -1))
+            else:
+                curr_pred = curr_pred.detach()
+                net_input = self.param_embed(curr_pred)
+            net_input = net_input.to(cam_tokens.dtype)
+            norm_shift, norm_scale, norm_gate = self.adapt_norm_gen(net_input).chunk(3, dim=-1)
+            mod_cam_feat = norm_gate * self.apply_adaptive_modulation(self.adapt_norm(cam_tokens), norm_shift, norm_scale)
+            mod_cam_feat = mod_cam_feat + cam_tokens
+            proc_feat = self.refine_net(mod_cam_feat)
+            param_delta = self.param_predictor(self.out_norm(proc_feat))
+            if curr_pred is None:
+                curr_pred = param_delta
+            else:
+                curr_pred = curr_pred + param_delta
+            # Apply final activation functions for translation, quaternion, and field-of-view
+            activated_params = self.apply_camera_parameter_activation(curr_pred)
+            pred_seq.append(activated_params)
+        return pred_seq
+    def apply_camera_parameter_activation(self, params: torch.Tensor) -> torch.Tensor:
+        """
+        Apply activation functions to camera parameter components
+        Args:
+            params: Tensor containing camera parameters [translation, quaternion, focal_length]
+        Returns:
+            Activated camera parameters tensor
+        """
+        trans_vec = params[..., :3]
+        quat_vec = params[..., 3:7]
+        fl_vec = params[..., 7:]  # or field of view
+        trans_vec = self.apply_parameter_activation(trans_vec, self.trans_act)
+        quat_vec = self.apply_parameter_activation(quat_vec, self.quat_act)
+        fl_vec = self.apply_parameter_activation(fl_vec, self.fl_act)
+        activated_params = torch.cat([trans_vec, quat_vec, fl_vec], dim=-1)
+        return activated_params
+    def apply_parameter_activation(self, tensor: torch.Tensor, act_type: str) -> torch.Tensor:
+        """
+        Apply specified activation function to parameter tensor
+        Args:
+            tensor: Tensor containing parameter values
+            act_type: Activation type ("linear", "inv_log", "exp", "relu")
+        Returns:
+            Activated parameter tensor
+        """
+        if act_type == "linear":
+            return tensor
+        elif act_type == "inv_log":
+            return self.apply_inverse_logarithm_transform(tensor)
+        elif act_type == "exp":
+            return torch.exp(tensor)
+        elif act_type == "relu":
+            return F.relu(tensor)
+        else:
+            raise ValueError(f"Unknown activation_type: {act_type}")
+    def apply_inverse_logarithm_transform(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Apply inverse logarithm transform: sign(y) * (exp(|y|) - 1)
+        Args:
+            x: Input tensor
+        Returns:
+            Transformed tensor
+        """
+        return torch.sign(x) * (torch.expm1(torch.abs(x)))
+    def apply_adaptive_modulation(self, x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
+        """
+        Apply adaptive modulation to input tensor using scaling and shifting parameters
+        """
+        # Modified from https://github.com/facebookresearch/DiT/blob/796c29e532f47bba17c5b9c5eb39b9354b8b7c64/models.py#L19
+        return x * (1 + scale) + shift

hyworldmirror/models/heads/dense_head.py ADDED Viewed

	@@ -0,0 +1,672 @@

+# inspired by https://github.com/DepthAnything/Depth-Anything-V2
+from typing import List, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.checkpoint import checkpoint
+from ..layers.mlp import MlpFP32
+from ..utils.grid import create_uv_grid, position_grid_to_embed
+class _BaseDPTHead(nn.Module):
+    """
+    Base class with shared DPT feature extraction: projects, resize, scratch, and fusion.
+    Four token tensors are projected to per-level channel counts, resized to four
+    spatial scales, fused by the refinement blocks held in ``self.scratch`` and
+    finally interpolated to the output resolution.
+    """
+    def __init__(
+        self,
+        dim_in: int,
+        patch_size: int = 14,
+        features: int = 256,
+        out_channels: List[int] = [256, 512, 1024, 1024],
+        pos_embed: bool = True,
+        down_ratio: int = 1,
+        gradient_checkpoint: bool = False,
+        _cast_pos_embed_dtype: bool = True,
+    ) -> None:
+        """
+        Args:
+            dim_in: Channel width of the incoming tokens.
+            patch_size: Patch size used to turn image H/W into the token grid size.
+            features: Channel width used inside the fusion blocks.
+            out_channels: Channel width of each of the four projected levels (read only, never mutated here).
+            pos_embed: Whether to add a positional embedding to the features.
+            down_ratio: Divisor applied to the output resolution.
+            gradient_checkpoint: Whether fusion and interpolation run under torch checkpointing.
+            _cast_pos_embed_dtype: Whether the positional embedding is cast to the feature dtype before adding.
+        """
+        super().__init__()
+        self.patch_size = patch_size
+        self.pos_embed = pos_embed
+        self.down_ratio = down_ratio
+        self.gradient_checkpoint = gradient_checkpoint
+        self._cast_pos_embed_dtype = _cast_pos_embed_dtype
+        self.norm = nn.LayerNorm(dim_in)
+        # One 1x1 convolution per level: dim_in -> out_channels[i]
+        self.projects = nn.ModuleList([
+            nn.Conv2d(in_channels=dim_in, out_channels=oc, kernel_size=1, stride=1, padding=0)
+            for oc in out_channels
+        ])
+        # Per-level spatial resizing: stride-4 transposed conv, stride-2 transposed conv,
+        # identity, and a stride-2 convolution
+        self.resize_layers = nn.ModuleList([
+            nn.ConvTranspose2d(
+                in_channels=out_channels[0], out_channels=out_channels[0], kernel_size=4, stride=4, padding=0
+            ),
+            nn.ConvTranspose2d(
+                in_channels=out_channels[1], out_channels=out_channels[1], kernel_size=2, stride=2, padding=0
+            ),
+            nn.Identity(),
+            nn.Conv2d(
+                in_channels=out_channels[3], out_channels=out_channels[3], kernel_size=3, stride=2, padding=1
+            ),
+        ])
+        # _make_scratch / _make_fusion_block are defined elsewhere in this file;
+        # presumably the scratch module provides layer1_rn..layer4_rn used in scratch_forward -- confirm there.
+        self.scratch = _make_scratch(out_channels, features, expand=False)
+        self.scratch.stem_transpose = None
+        self.scratch.refinenet1 = _make_fusion_block(features)
+        self.scratch.refinenet2 = _make_fusion_block(features)
+        self.scratch.refinenet3 = _make_fusion_block(features)
+        # refinenet4 is the first fusion step and is called with a single input (see scratch_forward)
+        self.scratch.refinenet4 = _make_fusion_block(features, has_residual=False)
+        head_features_1 = features
+        # 3x3 convolution that halves the channel count of the fused map
+        self.scratch.output_conv1 = nn.Conv2d(
+            head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1
+        )
+    def _apply_pos_embed(self, x: torch.Tensor, W: int, H: int, ratio: float = 0.1) -> torch.Tensor:
+        """
+        Add a UV-grid positional embedding, scaled by ``ratio``, to feature map ``x``.
+        Args:
+            x: Feature map; its last two dims give the grid size and dim 1 the channel count.
+            W: Image width, used together with H for the grid aspect ratio.
+            H: Image height.
+            ratio: Scale applied to the embedding before it is added.
+        Returns:
+            ``x`` plus the positional embedding broadcast over the batch dimension.
+        """
+        patch_w = x.shape[-1]
+        patch_h = x.shape[-2]
+        pos_embed = create_uv_grid(patch_w, patch_h, aspect_ratio=W / H, dtype=x.dtype, device=x.device)
+        pos_embed = position_grid_to_embed(pos_embed, x.shape[1])
+        pos_embed = pos_embed * ratio
+        # Move the embedding channels first and repeat (as a view) across the batch.
+        # NOTE(review): assumes position_grid_to_embed returns a channels-last 3-D tensor -- confirm.
+        pos_embed = pos_embed.permute(2, 0, 1)[None].expand(x.shape[0], -1, -1, -1)
+        if self._cast_pos_embed_dtype:
+            pos_embed = pos_embed.to(x.dtype)
+        return x + pos_embed
+    def scratch_forward(self, features: List[torch.Tensor]) -> torch.Tensor:
+        """
+        Fuse the four resized feature maps and apply ``output_conv1``.
+        Args:
+            features: Four feature maps, in the same order as ``self.resize_layers``.
+        Returns:
+            Fused feature map after ``output_conv1``.
+        """
+        layer_1, layer_2, layer_3, layer_4 = features
+        layer_1_rn = self.scratch.layer1_rn(layer_1)
+        layer_2_rn = self.scratch.layer2_rn(layer_2)
+        layer_3_rn = self.scratch.layer3_rn(layer_3)
+        layer_4_rn = self.scratch.layer4_rn(layer_4)
+        # Fuse from level 4 towards level 1; each step targets the spatial size of the next level.
+        # The local references are deleted as soon as a level has been consumed.
+        out = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
+        del layer_4_rn, layer_4
+        out = self.scratch.refinenet3(out, layer_3_rn, size=layer_2_rn.shape[2:])
+        del layer_3_rn, layer_3
+        out = self.scratch.refinenet2(out, layer_2_rn, size=layer_1_rn.shape[2:])
+        del layer_2_rn, layer_2
+        out = self.scratch.refinenet1(out, layer_1_rn)
+        del layer_1_rn, layer_1
+        out = self.scratch.output_conv1(out)
+        return out
+    def _extract_fused_features(
+        self,
+        token_list: List[torch.Tensor],
+        B: int,
+        S: int,
+        H: int,
+        W: int,
+        patch_start_idx: int,
+        frame_start: int = None,
+        frame_end: int = None,
+    ) -> torch.Tensor:
+        """
+        Extract multi-scale features from tokens, fuse via scratch network, and upsample.
+        Args:
+            token_list: Token tensors, one per level; paired in order with ``self.projects``
+                and ``self.resize_layers``. Presumably each is [B, S, N, C] -- confirm against caller.
+            B: Batch size used when flattening frames into the batch dimension.
+            S: Number of frames being processed (after any frame slicing).
+            H: Image height.
+            W: Image width.
+            patch_start_idx: Index on the third axis where patch tokens begin.
+            frame_start: Optional first frame of the slice taken on axis 1.
+            frame_end: Optional end (exclusive) of that slice; slicing happens only when both are given.
+        Returns:
+            Fused feature map resized to (ph * patch_size / down_ratio, pw * patch_size / down_ratio).
+        """
+        # Token grid size in patches
+        ph = H // self.patch_size
+        pw = W // self.patch_size
+        feats = []
+        for proj, resize, tokens in zip(self.projects, self.resize_layers, token_list):
+            # Drop the leading non-patch tokens on the third axis
+            patch_tokens = tokens[:, :, patch_start_idx:]
+            if frame_start is not None and frame_end is not None:
+                patch_tokens = patch_tokens[:, frame_start:frame_end]
+            # Flatten frames into the batch dimension, normalize, then reshape to a channels-first grid
+            patch_tokens = patch_tokens.reshape(B * S, -1, patch_tokens.shape[-1])
+            patch_tokens = self.norm(patch_tokens)
+            feat = patch_tokens.permute(0, 2, 1).reshape(B * S, patch_tokens.shape[-1], ph, pw)
+            feat = proj(feat)
+            if self.pos_embed:
+                feat = self._apply_pos_embed(feat, W, H)
+            feat = resize(feat)
+            feats.append(feat)
+        fused = checkpoint(self.scratch_forward, feats, use_reentrant=False) if self.gradient_checkpoint else self.scratch_forward(feats)
+        # custom_interpolate is defined elsewhere in this file; wrapped in a lambda so it can be checkpointed
+        _interpolate_fn = lambda t: custom_interpolate(
+            t,
+            size=(
+                int(ph * self.patch_size / self.down_ratio),
+                int(pw * self.patch_size / self.down_ratio)
+            ),
+            mode="bilinear",
+            align_corners=True,
+        )
+        fused = checkpoint(_interpolate_fn, fused, use_reentrant=False) if self.gradient_checkpoint else _interpolate_fn(fused)
+        # The positional embedding is added a second time at the output resolution
+        if self.pos_embed:
+            fused = self._apply_pos_embed(fused, W, H)
+        return fused
+class DPTHead(_BaseDPTHead):
+    """
+    # DPT Head for dense prediction tasks.
+    # This module implements the DPT (Dense Prediction Transformer) head as proposed in
+    # "Vision Transformers for Dense Prediction" (https://arxiv.org/abs/2103.13413).
+    # It takes features from a vision transformer backbone and generates dense (per-pixel) predictions
+    # by fusing multi-scale features through a series of projection, upsampling, and refinement blocks.
+    # Args:
+    #   dim_in (int): Number of input feature channels.
+    #   patch_size (int, optional): Patch size used by the backbone, default is 14.
+    #   output_dim (int, optional): Number of output channels, default is 4.
+    #   activation (str, optional): Activation function type for the output head, default is "inv_log".
+    #   conf_activation (str, optional): Activation function type for the confidence/output uncertainty head, default is "expp1".
+    #   features (int, optional): Number of channels used in intermediate feature representations, default is 256.
+    #   out_channels (List[int], optional): Number of channels for each intermediate multi-scale feature.
+    #   intermediate_layer_idx (List[int], optional): Indices specifying which backbone layers to use for multi-scale fusion.
+    #   pos_embed (bool, optional): Whether to add positional encoding to the features, default is True.
+    #   feature_only (bool, optional): If True, only return intermediate features (skip final prediction and activations).
+    #   down_ratio (int, optional): Downsampling ratio of the output predictions, default is 1 (no downsampling).
+    """
+    def __init__(
+        self,
+        dim_in: int,
+        patch_size: int = 14,
+        output_dim: int = 4,
+        activation: str = "inv_log+expp1",
+        features: int = 256,
+        out_channels: List[int] = [256, 512, 1024, 1024],
+        pos_embed: bool = True,
+        down_ratio: int = 1,
+        is_gsdpt: bool = False,
+        enable_depth_mask: bool = False,
+        gradient_checkpoint: bool = False,
+    ) -> None:
+        super().__init__(
+            dim_in=dim_in, patch_size=patch_size, features=features,
+            out_channels=out_channels, pos_embed=pos_embed,
+            down_ratio=down_ratio, gradient_checkpoint=gradient_checkpoint,
+        )
+        self.activation = activation
+        self.is_gsdpt = is_gsdpt
+        self.enable_depth_mask = enable_depth_mask
+        head_features_2 = 32
+        conv2_in_channels = features // 2
+        self.scratch.output_conv2 = nn.Sequential(
+            nn.Conv2d(conv2_in_channels, head_features_2, kernel_size=3, stride=1, padding=1),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(head_features_2, output_dim, kernel_size=1, stride=1, padding=0),
+        )
+        if self.is_gsdpt:
+            self.input_merger = nn.Sequential(
+                nn.Conv2d(3, conv2_in_channels, 7, 1, 3),
+                nn.ReLU()
+                )
+    def to(self, *args, **kwargs):
+        self.norm = self.norm.to(*args, **kwargs)
+        self.projects = self.projects.to(*args, **kwargs)
+        self.resize_layers = self.resize_layers.to(*args, **kwargs)
+        if self.is_gsdpt:
+            self.input_merger = self.input_merger.to(*args, **kwargs)
+        for key in ('layer1_rn', 'layer2_rn', 'layer3_rn', 'layer4_rn',
+                    'refinenet1', 'refinenet2', 'refinenet3', 'refinenet4',
+                    'output_conv1'):
+            if not hasattr(self.scratch, key):
+                continue
+            setattr(self.scratch, key, getattr(self.scratch, key).to(*args, **kwargs))
+        # keep output_conv2 in FP32
+        args, kwargs = MlpFP32.map_to_args_to_float(args, kwargs)
+        self.scratch.output_conv2 = self.scratch.output_conv2.to(*args, **kwargs)
+        return self
+    def forward(
+        self,
+        token_list: List[torch.Tensor],
+        images: torch.Tensor,
+        patch_start_idx: int,
+        frames_chunk_size: int = 8,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """
+        Forward pass with optional frame chunking for memory efficiency.
+        Args:
+            token_list: List of token tensors from transformer, each [B, N, C]
+            images: Input images [B, S, 3, H, W], range [0, 1]
+            patch_start_idx: Starting index of patch tokens
+            frames_chunk_size: Number of frames per chunk. If None or >= S, process all at once
+            gradient_checkpoint: Whether to use gradient checkpointing
+        Returns:
+            For is_gsdpt: predictions [B, S, ...]
+            Otherwise: (predictions, confidence), [B, S, X, H, W] and [B, S, 1, H, W]
+        """
+        B, S, _, H, W = images.shape
+        # Process all frames together if chunk size not specified or large enough
+        if frames_chunk_size is None or frames_chunk_size >= S:
+            return self._forward_impl(token_list, images, patch_start_idx)
+        assert frames_chunk_size > 0
+        # Process frames in chunks
+        preds_chunks = []
+        conf_chunks = []
+        gs_chunks = []
+        depth_mask_chunks = []
+        for frame_start in range(0, S, frames_chunk_size):
+            frame_end = min(frame_start + frames_chunk_size, S)
+            if self.is_gsdpt:
+                if self.enable_depth_mask:
+                    gs, preds, conf, depth_mask = self._forward_impl(
+                        token_list, images, patch_start_idx, frame_start, frame_end
+                    )
+                    gs_chunks.append(gs)
+                    preds_chunks.append(preds)
+                    conf_chunks.append(conf)
+                    depth_mask_chunks.append(depth_mask)
+                else:
+                    gs, preds, conf = self._forward_impl(
+                        token_list, images, patch_start_idx, frame_start, frame_end
+                    )
+                    gs_chunks.append(gs)
+                    preds_chunks.append(preds)
+                    conf_chunks.append(conf)
+            else:
+                if self.enable_depth_mask:
+                    preds, conf, depth_mask = self._forward_impl(
+                        token_list, images, patch_start_idx, frame_start, frame_end
+                    )
+                    preds_chunks.append(preds)
+                    conf_chunks.append(conf)
+                    depth_mask_chunks.append(depth_mask)
+                else:
+                    preds, conf = self._forward_impl(
+                        token_list, images, patch_start_idx, frame_start, frame_end
+                    )
+                    preds_chunks.append(preds)
+                    conf_chunks.append(conf)
+        # Concatenate chunks along frame dimension
+        if self.is_gsdpt:
+            if self.enable_depth_mask:
+                return (
+                    torch.cat(gs_chunks, dim=1),
+                    torch.cat(preds_chunks, dim=1),
+                    torch.cat(conf_chunks, dim=1),
+                    torch.cat(depth_mask_chunks, dim=1),
+                )
+            return torch.cat(gs_chunks, dim=1), torch.cat(preds_chunks, dim=1), torch.cat(conf_chunks, dim=1)
+        else:
+            if self.enable_depth_mask:
+                return torch.cat(preds_chunks, dim=1), torch.cat(conf_chunks, dim=1), torch.cat(depth_mask_chunks, dim=1)
+            else:
+                return torch.cat(preds_chunks, dim=1), torch.cat(conf_chunks, dim=1)
+    def _forward_impl(
+        self,
+        token_list: List[torch.Tensor],
+        images: torch.Tensor,
+        patch_start_idx: int,
+        frame_start: int = None,
+        frame_end: int = None,
+    ) -> torch.Tensor:
+        """
+        Core forward implementation for DPT head.
+        Args:
+            token_list: List of transformer tokens from each layer, [B, S, N, C]
+            images: Input images [B, S, 3, H, W]
+            patch_start_idx: Starting index of patch tokens
+            frame_start: Start index for frame chunking (optional)
+            frame_end: End index for frame chunking (optional)
+        Returns:
+            If is_gsdpt: (features, preds, conf)
+            Else: (preds, conf)
+        """
+        if frame_start is not None and frame_end is not None:
+            images = images[:, frame_start:frame_end].contiguous()
+        B, S, _, H, W = images.shape
+        fused = self._extract_fused_features(token_list, B, S, H, W, patch_start_idx, frame_start, frame_end)
+        # Generate predictions and confidence
+        if self.is_gsdpt:
+            out = self.scratch.output_conv2(fused.float().contiguous())
+            if self.enable_depth_mask:
+                preds, conf, depth_mask = self.activate_head(out, activation=self.activation)
+            else:
+                preds, conf = self.activate_head(out, activation=self.activation)
+            preds = preds.reshape(B, S, *preds.shape[1:])
+            conf = conf.reshape(B, S, *conf.shape[1:])
+            # Merge direct image features
+            img_flat = images.reshape(B * S, -1, H, W)
+            img_feat = self.input_merger(img_flat)
+            fused = fused + img_feat
+            fused = fused.reshape(B, S, *fused.shape[1:]).float().contiguous()
+            if self.enable_depth_mask:
+                depth_mask = depth_mask.reshape(B, S, *depth_mask.shape[1:])
+                return fused, preds, conf, depth_mask
+            return fused, preds, conf
+        else:
+            out = self.scratch.output_conv2(fused.float().contiguous())
+            if self.enable_depth_mask:
+                preds, conf, depth_mask = self.activate_head(out, activation=self.activation)
+                preds = preds.reshape(B, S, *preds.shape[1:])
+                conf = conf.reshape(B, S, *conf.shape[1:])
+                depth_mask = depth_mask.reshape(B, S, *depth_mask.shape[1:])
+                return preds, conf, depth_mask
+            else:
+                preds, conf = self.activate_head(out, activation=self.activation)
+                preds = preds.reshape(B, S, *preds.shape[1:])
+                conf = conf.reshape(B, S, *conf.shape[1:])
+                return preds, conf
+    def activate_head(self, out_head: torch.Tensor, activation: str = "inv_log+expp1") -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Process network output to extract attribute (e.g. points, depth, etc.) and confidence values.
+        Args:
+            out_head: Network output tensor (B, C, H, W)
+            activation: Activation type for processing (e.g., "inv_log+expp1")
+        Returns:
+            Tuple of (attribute tensor, confidence tensor)
+        """
+        # Parse activation string
+        if self.enable_depth_mask:
+            act_attr, act_conf, act_depth_mask = (activation.split("+") if "+" in activation else (activation, "expp1", "linear"))
+            # (B,C,H,W) -> (B,H,W,C)
+            feat = out_head.permute(0, 2, 3, 1)
+            attr, conf, depth_mask = feat[..., :-2], feat[..., -2], feat[..., -1]
+        else:
+            act_attr, act_conf = (activation.split("+") if "+" in activation else (activation, "expp1"))
+            # (B,C,H,W) -> (B,H,W,C)
+            feat = out_head.permute(0, 2, 3, 1)
+            attr, conf = feat[..., :-1], feat[..., -1]
+        # Map point activations to lambdas for clarity and conciseness
+        attr_activations = {
+            "norm_exp": lambda x: (x / x.norm(dim=-1, keepdim=True).clamp(min=1e-8)) * torch.expm1(x.norm(dim=-1, keepdim=True)),
+            "norm": lambda x: x / x.norm(dim=-1, keepdim=True),
+            "exp": torch.exp,
+            "relu": F.relu,
+            "inv_log": self._apply_inverse_log_transform,
+            "xy_inv_log": lambda x: torch.cat([
+                x[..., :2] * self._apply_inverse_log_transform(x[..., 2:]),
+                self._apply_inverse_log_transform(x[..., 2:])
+            ], dim=-1),
+            "sigmoid": torch.sigmoid,
+            "linear": lambda x: x
+        }
+        if act_attr not in attr_activations:
+            raise ValueError(f"Unknown attribute activation: {act_attr}")
+        attr_out = attr_activations[act_attr](attr)
+        # Confidence activation mapping
+        conf_activations = {
+            "expp1": lambda c: 1 + c.exp(),
+            "expp0": torch.exp,
+            "sigmoid": torch.sigmoid
+        }
+        if act_conf not in conf_activations:
+            raise ValueError(f"Unknown confidence activation: {act_conf}")
+        conf_out = conf_activations[act_conf](conf)
+        if self.enable_depth_mask:
+            depth_mask_activations = {
+                "sigmoid": torch.sigmoid,
+                "linear": lambda x: x,
+            }
+            if act_depth_mask not in depth_mask_activations:
+                raise ValueError(f"Unknown depth mask activation: {act_depth_mask}")
+            depth_mask_out = depth_mask_activations[act_depth_mask](depth_mask)
+            return attr_out, conf_out, depth_mask_out
+        else:
+            return attr_out, conf_out
+    def _apply_inverse_log_transform(self, input_tensor: torch.Tensor) -> torch.Tensor:
+        """
+        Apply inverse logarithm transform: sign(y) * (exp(|y|) - 1)
+        Args:
+            input_tensor: Input tensor
+        Returns:
+            Transformed tensor
+        """
+        return torch.sign(input_tensor) * (torch.expm1(torch.abs(input_tensor)))
+################################################################################
+# DPT Modules
+################################################################################
+def _make_fusion_block(features: int, size: int = None, has_residual: bool = True, groups: int = 1) -> nn.Module:
+    return FeatureFusionBlock(
+        features,
+        nn.ReLU(inplace=True),
+        deconv=False,
+        bn=False,
+        expand=False,
+        align_corners=True,
+        size=size,
+        has_residual=has_residual,
+        groups=groups,
+    )
+def _make_scratch(in_shape: List[int], out_shape: int, groups: int = 1, expand: bool = False) -> nn.Module:
+    scratch = nn.Module()
+    out_shape1 = out_shape
+    out_shape2 = out_shape
+    out_shape3 = out_shape
+    if len(in_shape) >= 4:
+        out_shape4 = out_shape
+    if expand:
+        out_shape1 = out_shape
+        out_shape2 = out_shape * 2
+        out_shape3 = out_shape * 4
+        if len(in_shape) >= 4:
+            out_shape4 = out_shape * 8
+    scratch.layer1_rn = nn.Conv2d(
+        in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+    )
+    scratch.layer2_rn = nn.Conv2d(
+        in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+    )
+    scratch.layer3_rn = nn.Conv2d(
+        in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+    )
+    if len(in_shape) >= 4:
+        scratch.layer4_rn = nn.Conv2d(
+            in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+        )
+    return scratch
+class ResidualConvUnit(nn.Module):
+    """Residual convolution module with skip connection."""
+    def __init__(self, features, activation, bn, groups=1):
+        """Initialize ResidualConvUnit.
+        Args:
+            features (int): Number of input/output feature channels
+            activation: Activation function to use
+            bn (bool): Whether to use batch normalization (currently unused)
+            groups (int): Number of groups for grouped convolution
+        """
+        super().__init__()
+        self.bn = bn
+        self.groups = groups
+        self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
+        self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
+        self.norm1 = None
+        self.norm2 = None
+        self.activation = activation
+        self.skip_add = nn.quantized.FloatFunctional()
+    def forward(self, x):
+        """Forward pass with residual connection.
+        Args:
+            x (tensor): Input tensor of shape (B, C, H, W)
+        Returns:
+            tensor: Output tensor of shape (B, C, H, W) with residual added
+        """
+        out = self.activation(x)
+        out = self.conv1(out)
+        if self.norm1 is not None:
+            out = self.norm1(out)
+        out = self.activation(out)
+        out = self.conv2(out)
+        if self.norm2 is not None:
+            out = self.norm2(out)
+        return self.skip_add.add(out, x)
+class FeatureFusionBlock(nn.Module):
+    """Feature fusion block."""
+    def __init__(
+        self,
+        features,
+        activation,
+        deconv=False,
+        bn=False,
+        expand=False,
+        align_corners=True,
+        size=None,
+        has_residual=True,
+        groups=1,
+    ):
+        """Initialize FeatureFusionBlock.
+        Args:
+            features (int): Number of input/output feature channels
+            activation: Activation function to use
+            deconv (bool): Whether to use deconvolution
+            bn (bool): Whether to use batch normalization
+            expand (bool): Whether to expand features (halve output channels)
+            align_corners (bool): Align corners for interpolation
+            size: Target size for upsampling
+            has_residual (bool): Whether to include residual connection
+            groups (int): Number of groups for grouped convolution
+        """
+        super(FeatureFusionBlock, self).__init__()
+        self.deconv = deconv
+        self.align_corners = align_corners
+        self.groups = groups
+        self.expand = expand
+        out_features = features
+        if self.expand == True:
+            out_features = features // 2
+        self.out_conv = nn.Conv2d(
+            features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=self.groups
+        )
+        if has_residual:
+            self.resConfUnit1 = ResidualConvUnit(features, activation, bn, groups=self.groups)
+        self.has_residual = has_residual
+        self.resConfUnit2 = ResidualConvUnit(features, activation, bn, groups=self.groups)
+        self.skip_add = nn.quantized.FloatFunctional()
+        self.size = size
+    def forward(self, *xs, size=None):
+        """Forward pass through the feature fusion block.
+        Args:
+            *xs: Variable number of input tensors. First tensor is the main input,
+                 second tensor (if present) is used for residual connection.
+            size: Optional target size for upsampling. If None, uses self.size or scale_factor=2.
+        Returns:
+            torch.Tensor: Fused and upsampled output tensor.
+        """
+        output = xs[0]
+        if self.has_residual:
+            res = self.resConfUnit1(xs[1])
+            output = self.skip_add.add(output, res)
+        output = self.resConfUnit2(output)
+        if (size is None) and (self.size is None):
+            modifier = {"scale_factor": 2}
+        elif size is None:
+            modifier = {"size": self.size}
+        else:
+            modifier = {"size": size}
+        output = custom_interpolate(output, **modifier, mode="bilinear", align_corners=self.align_corners)
+        output = self.out_conv(output)
+        return output
+def custom_interpolate(
+    x: torch.Tensor,
+    size: Tuple[int, int] = None,
+    scale_factor: float = None,
+    mode: str = "bilinear",
+    align_corners: bool = True,
+) -> torch.Tensor:
+    """
+    Custom interpolation function to handle large tensors by chunking.
+    Avoids INT_MAX overflow issues in nn.functional.interpolate when dealing with
+    very large input tensors by splitting them into smaller chunks.
+    Args:
+        x: Input tensor to interpolate
+        size: Target output size (H, W)
+        scale_factor: Scaling factor if size is not provided
+        mode: Interpolation mode (default: "bilinear")
+        align_corners: Whether to align corners in interpolation
+    Returns:
+        Interpolated tensor
+    """
+    if size is None:
+        size = (int(x.shape[-2] * scale_factor), int(x.shape[-1] * scale_factor))
+    INT_MAX = 1610612736
+    input_elements = size[0] * size[1] * x.shape[0] * x.shape[1]
+    if input_elements > INT_MAX:
+        chunks = torch.chunk(x, chunks=(input_elements // INT_MAX) + 1, dim=0)
+        interpolated_chunks = [
+            nn.functional.interpolate(chunk, size=size, mode=mode, align_corners=align_corners) for chunk in chunks
+        ]
+        x = torch.cat(interpolated_chunks, dim=0)
+        return x.contiguous()
+    else:
+        return nn.functional.interpolate(x, size=size, mode=mode, align_corners=align_corners)

hyworldmirror/models/heads/gs_head.py ADDED Viewed

	@@ -0,0 +1,83 @@

+from typing import List
+import torch
+import torch.nn as nn
+from .dense_head import _BaseDPTHead
+class GSFeatHead(_BaseDPTHead):
+    """
+    GS feature head that only outputs fused GS features.
+    This head is used when gs depth is disabled. It skips the prediction
+    conv (output_conv2) and returns only the fused GS feature map.
+    """
+    def __init__(
+        self,
+        dim_in: int,
+        patch_size: int = 14,
+        features: int = 256,
+        out_channels: List[int] = [256, 512, 1024, 1024],
+        pos_embed: bool = True,
+        down_ratio: int = 1,
+        gradient_checkpoint: bool = False,
+    ) -> None:
+        super().__init__(
+            dim_in=dim_in, patch_size=patch_size, features=features,
+            out_channels=out_channels, pos_embed=pos_embed,
+            down_ratio=down_ratio, gradient_checkpoint=gradient_checkpoint,
+            _cast_pos_embed_dtype=False,
+        )
+        conv2_in_channels = features // 2
+        self.input_merger = nn.Sequential(
+            nn.Conv2d(3, conv2_in_channels, 7, 1, 3),
+            nn.ReLU(),
+        )
+    def forward(
+        self,
+        token_list: List[torch.Tensor],
+        images: torch.Tensor,
+        patch_start_idx: int,
+        frames_chunk_size: int = 8,
+    ) -> torch.Tensor:
+        B, S, _, H, W = images.shape
+        if frames_chunk_size is None or frames_chunk_size >= S:
+            return self._forward_impl(token_list, images, patch_start_idx)
+        assert frames_chunk_size > 0
+        gs_chunks = []
+        for frame_start in range(0, S, frames_chunk_size):
+            frame_end = min(frame_start + frames_chunk_size, S)
+            gs = self._forward_impl(
+                token_list, images, patch_start_idx, frame_start, frame_end
+            )
+            gs_chunks.append(gs)
+        return torch.cat(gs_chunks, dim=1)
+    def _forward_impl(
+        self,
+        token_list: List[torch.Tensor],
+        images: torch.Tensor,
+        patch_start_idx: int,
+        frame_start: int = None,
+        frame_end: int = None,
+    ) -> torch.Tensor:
+        if frame_start is not None and frame_end is not None:
+            images = images[:, frame_start:frame_end].contiguous()
+        B, S, _, H, W = images.shape
+        fused = self._extract_fused_features(
+            token_list, B, S, H, W, patch_start_idx, frame_start, frame_end
+        )
+        img_flat = images.reshape(B * S, -1, H, W)
+        img_feat = self.input_merger(img_flat)
+        fused = fused + img_feat
+        fused = fused.reshape(B, S, *fused.shape[1:])
+        return fused

hyworldmirror/models/layers/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .mlp import Mlp, MlpFP32
+from .patch_embed import PatchEmbed, PatchEmbed_Mlp
+from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
+from .block import NestedTensorBlock
+from .attention import MemEffAttention

hyworldmirror/models/layers/attention.py ADDED Viewed

	@@ -0,0 +1,131 @@

+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
+from torch import Tensor
+from torch import nn
+import torch.nn.functional as F
+import torch
+try:
+    from flash_attn_interface import flash_attn_func as flash_attn_func_v3
+    _USE_FLASH_ATTN_V3 = True
+except ImportError:
+    from flash_attn.flash_attn_interface import flash_attn_func as flash_attn_func_v2
+    _USE_FLASH_ATTN_V3 = False
+from ...comm.padding import minimal_pad_to_divisible, depad_by_length, pad_by_length
+import torch.distributed as dist
+from ...comm.communication import _All2All, _Allgather
+class Attention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = True,
+        proj_bias: bool = True,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+        norm_layer: nn.Module = nn.LayerNorm,
+        qk_norm: bool = False,
+        fused_attn: bool = True,  # use F.scaled_dot_product_attention or not
+        rope=None,
+    ) -> None:
+        super().__init__()
+        assert dim % num_heads == 0, "dim should be divisible by num_heads"
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = self.head_dim**-0.5
+        self.fused_attn = fused_attn
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
+        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim, bias=proj_bias)
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.rope = rope
+    def _compute_qkv(self, x: Tensor):
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv.unbind(0)
+        q, k = self.q_norm(q).to(v.dtype), self.k_norm(k).to(v.dtype)
+        return q, k, v, B, N, C
+    def _apply_attention(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
+        if q.dtype==torch.bfloat16 or q.dtype==torch.float16:
+            if q.is_contiguous():
+                q = q.transpose(1,2)
+            else:
+                q = q.transpose(1, 2).contiguous()
+            if k.is_contiguous():
+                k = k.transpose(1, 2)
+            else:
+                k = k.transpose(1, 2).contiguous()
+            if v.is_contiguous():
+                v = v.transpose(1, 2)
+            else:
+                v = v.transpose(1, 2).contiguous()
+            if _USE_FLASH_ATTN_V3:
+                x = flash_attn_func_v3(q, k, v)
+            else:
+                x = flash_attn_func_v2(q, k, v, dropout_p=self.attn_drop.p if self.training else 0.0)
+            if x.is_contiguous():
+                x = x.transpose(1, 2)
+            else:
+                x = x.transpose(1, 2).contiguous()
+        else:
+            x = F.scaled_dot_product_attention(q, k, v, dropout_p=self.attn_drop.p if self.training else 0.0)
+        return x
+    def _project_output(self, x: Tensor, B: int, N: int, C: int) -> Tensor:
+        x = x.transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+    def forward(self, x: Tensor, pos=None) -> Tensor:
+        q, k, v, B, N, C = self._compute_qkv(x)
+        if self.rope is not None:
+            q = self.rope(q, pos)
+            k = self.rope(k, pos)
+        x = self._apply_attention(q, k, v)
+        return self._project_output(x, B, N, C)
+class DistAttention(Attention):
+    def forward(self, x: Tensor, pos=None, sp_size=1, sp_group=None, padding_tokens=0) -> Tensor:
+        q, k, v, B, N, C = self._compute_qkv(x)
+        if sp_size>1:
+            q = _All2All.apply(q,1,2,sp_group,False)
+            k = _All2All.apply(k,1,2,sp_group,False)
+            v = _All2All.apply(v,1,2,sp_group,False)
+            q = depad_by_length(q,padding_tokens,2)
+            k = depad_by_length(k,padding_tokens,2)
+            v = depad_by_length(v,padding_tokens,2)
+        if self.rope is not None:
+            q = self.rope(q, pos)
+            k = self.rope(k, pos)
+        x = self._apply_attention(q, k, v)
+        if sp_size>1:
+            x = pad_by_length(x,padding_tokens,2,0)
+            x = _All2All.apply(x,2,1,sp_group,False)
+        return self._project_output(x, B, N, C)
+class MemEffAttention(Attention):
+    def forward(self, x: Tensor, attn_bias=None, pos=None) -> Tensor:
+        assert pos is None
+        if attn_bias is not None:
+            raise AssertionError("xFormers is required for using nested tensors")
+        return super().forward(x)

hyworldmirror/models/layers/block.py ADDED Viewed

	@@ -0,0 +1,269 @@

+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
+from typing import Callable, List, Any, Tuple, Dict
+import torch
+from torch import nn, Tensor
+from .attention import Attention, DistAttention
+from .drop_path import DropPath
+from .layer_scale import LayerScale
+from .mlp import Mlp
+XFORMERS_AVAILABLE = False
+def modulate(x, shift, scale):
+    return x * (1 + scale.unsqueeze(2)) + shift.unsqueeze(2)
+class Block(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        proj_bias: bool = True,
+        ffn_bias: bool = True,
+        drop: float = 0.0,
+        attn_drop: float = 0.0,
+        init_values=None,
+        drop_path: float = 0.0,
+        act_layer: Callable[..., nn.Module] = nn.GELU,
+        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
+        attn_class: Callable[..., nn.Module] = Attention,
+        ffn_layer: Callable[..., nn.Module] = Mlp,
+        qk_norm: bool = False,
+        fused_attn: bool = True,  # use F.scaled_dot_product_attention or not
+        rope=None
+    ) -> None:
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = attn_class(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            proj_bias=proj_bias,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+            qk_norm=qk_norm,
+            fused_attn=fused_attn,
+            rope=rope,
+        )
+        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = ffn_layer(
+            in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop, bias=ffn_bias
+        )
+        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.sample_drop_ratio = drop_path
+    def forward(self, x: Tensor, pos=None) -> Tensor:
+        def attn_residual_func(x: Tensor, pos=None) -> Tensor:
+            return self.ls1(self.attn(self.norm1(x), pos=pos))
+        def ffn_residual_func(x: Tensor) -> Tensor:
+            return self.ls2(self.mlp(self.norm2(x)))
+        if self.training and self.sample_drop_ratio > 0.1:
+            # the overhead is compensated only for a drop path rate larger than 0.1
+            x = drop_add_residual_stochastic_depth(
+                x, pos=pos, residual_func=attn_residual_func, sample_drop_ratio=self.sample_drop_ratio
+            )
+            x = drop_add_residual_stochastic_depth(
+                x, residual_func=ffn_residual_func, sample_drop_ratio=self.sample_drop_ratio
+            )
+        elif self.training and self.sample_drop_ratio > 0.0:
+            x = x + self.drop_path1(attn_residual_func(x, pos=pos))
+            x = x + self.drop_path1(ffn_residual_func(x))  # FIXME: drop_path2
+        else:
+            x = x + attn_residual_func(x, pos=pos)
+            x = x + ffn_residual_func(x)
+        return x
+class DistBlock(Block):
+    def __init__(self, *args, attn_class: Callable[..., nn.Module] = DistAttention, **kwargs):
+        super().__init__(*args, attn_class=attn_class, **kwargs)
+    def forward(self, x: Tensor, pos=None, sp_size=1,sp_group=None,padding_tokens=0,block_type = None, token_shape=None) -> Tensor:
+        def attn_residual_func(x: Tensor, pos=None, sp_size=1,sp_group=None,padding_tokens=0) -> Tensor:
+            return self.ls1(self.attn(self.norm1(x), pos=pos,sp_size=sp_size,sp_group=sp_group,padding_tokens=padding_tokens))
+        def ffn_residual_func(x: Tensor) -> Tensor:
+            return self.ls2(self.mlp(self.norm2(x)))
+        if self.training and self.sample_drop_ratio > 0.1:
+            # the overhead is compensated only for a drop path rate larger than 0.1
+            x = drop_add_residual_stochastic_depth(
+                x, pos=pos, sp_size=sp_size,sp_group=sp_group,padding_tokens=padding_tokens,residual_func=attn_residual_func, sample_drop_ratio=self.sample_drop_ratio
+            )
+            x = drop_add_residual_stochastic_depth(
+                x, residual_func=ffn_residual_func, sample_drop_ratio=self.sample_drop_ratio
+            )
+        elif self.training and self.sample_drop_ratio > 0.0:
+            x = x + self.drop_path1(attn_residual_func(x, pos=pos,sp_size=sp_size,sp_group=sp_group,padding_tokens=padding_tokens))
+            x = x + self.drop_path1(ffn_residual_func(x))  # FIXME: drop_path2
+        else:
+            x = x + attn_residual_func(x, pos=pos,sp_size=sp_size,sp_group=sp_group,padding_tokens=padding_tokens)
+            x = x + ffn_residual_func(x)
+        return x
+def drop_add_residual_stochastic_depth(
+    x: Tensor, residual_func: Callable[[Tensor], Tensor], sample_drop_ratio: float = 0.0, pos=None
+) -> Tensor:
+    # 1) extract subset using permutation
+    b, n, d = x.shape
+    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
+    brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
+    x_subset = x[brange]
+    # 2) apply residual_func to get residual
+    if pos is not None:
+        # if necessary, apply rope to the subset
+        pos = pos[brange]
+        residual = residual_func(x_subset, pos=pos)
+    else:
+        residual = residual_func(x_subset)
+    x_flat = x.flatten(1)
+    residual = residual.flatten(1)
+    residual_scale_factor = b / sample_subset_size
+    # 3) add the residual
+    x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
+    return x_plus_residual.view_as(x)
+def get_branges_scales(x, sample_drop_ratio=0.0):
+    b, n, d = x.shape
+    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
+    brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
+    residual_scale_factor = b / sample_subset_size
+    return brange, residual_scale_factor
+def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
+    if scaling_vector is None:
+        x_flat = x.flatten(1)
+        residual = residual.flatten(1)
+        x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
+    else:
+        x_plus_residual = scaled_index_add(
+            x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
+        )
+    return x_plus_residual
+attn_bias_cache: Dict[Tuple, Any] = {}
+def get_attn_bias_and_cat(x_list, branges=None):
+    """
+    this will perform the index select, cat the tensors, and provide the attn_bias from cache
+    """
+    batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
+    all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
+    if all_shapes not in attn_bias_cache.keys():
+        seqlens = []
+        for b, x in zip(batch_sizes, x_list):
+            for _ in range(b):
+                seqlens.append(x.shape[1])
+        attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
+        attn_bias._batch_sizes = batch_sizes
+        attn_bias_cache[all_shapes] = attn_bias
+    if branges is not None:
+        cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
+    else:
+        tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
+        cat_tensors = torch.cat(tensors_bs1, dim=1)
+    return attn_bias_cache[all_shapes], cat_tensors
+def drop_add_residual_stochastic_depth_list(
+    x_list: List[Tensor],
+    residual_func: Callable[[Tensor, Any], Tensor],
+    sample_drop_ratio: float = 0.0,
+    scaling_vector=None,
+) -> Tensor:
+    # 1) generate random set of indices for dropping samples in the batch
+    branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
+    branges = [s[0] for s in branges_scales]
+    residual_scale_factors = [s[1] for s in branges_scales]
+    # 2) get attention bias and index+concat the tensors
+    attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
+    # 3) apply residual_func to get residual, and split the result
+    residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore
+    outputs = []
+    for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
+        outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
+    return outputs
+class NestedTensorBlock(Block):
+    def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
+        """
+        x_list contains a list of tensors to nest together and run
+        """
+        assert isinstance(self.attn, MemEffAttention)
+        if self.training and self.sample_drop_ratio > 0.0:
+            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.attn(self.norm1(x), attn_bias=attn_bias)
+            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.mlp(self.norm2(x))
+            x_list = drop_add_residual_stochastic_depth_list(
+                x_list,
+                residual_func=attn_residual_func,
+                sample_drop_ratio=self.sample_drop_ratio,
+                scaling_vector=(self.ls1.gamma if isinstance(self.ls1, LayerScale) else None),
+            )
+            x_list = drop_add_residual_stochastic_depth_list(
+                x_list,
+                residual_func=ffn_residual_func,
+                sample_drop_ratio=self.sample_drop_ratio,
+                scaling_vector=(self.ls2.gamma if isinstance(self.ls1, LayerScale) else None),
+            )
+            return x_list
+        else:
+            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
+            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.ls2(self.mlp(self.norm2(x)))
+            attn_bias, x = get_attn_bias_and_cat(x_list)
+            x = x + attn_residual_func(x, attn_bias=attn_bias)
+            x = x + ffn_residual_func(x)
+            return attn_bias.split(x)
+    def forward(self, x_or_x_list):
+        if isinstance(x_or_x_list, Tensor):
+            return super().forward(x_or_x_list)
+        elif isinstance(x_or_x_list, list):
+            if not XFORMERS_AVAILABLE:
+                raise AssertionError("xFormers is required for using nested tensors")
+            return self.forward_nested(x_or_x_list)
+        else:
+            raise AssertionError

hyworldmirror/models/layers/drop_path.py ADDED Viewed

	@@ -0,0 +1,29 @@

+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
+from torch import nn
+def drop_path(x, drop_prob: float = 0.0, training: bool = False):
+    if drop_prob == 0.0 or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+    if keep_prob > 0.0:
+        random_tensor.div_(keep_prob)
+    output = x * random_tensor
+    return output
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)

hyworldmirror/models/layers/layer_scale.py ADDED Viewed

	@@ -0,0 +1,17 @@

+# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
+from typing import Union
+import torch
+from torch import Tensor
+from torch import nn
+class LayerScale(nn.Module):
+    def __init__(self, dim: int, init_values: Union[float, Tensor] = 1e-5, inplace: bool = False) -> None:
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))
+    def forward(self, x: Tensor) -> Tensor:
+        return x.mul_(self.gamma) if self.inplace else x * self.gamma

hyworldmirror/models/layers/mlp.py ADDED Viewed

	@@ -0,0 +1,64 @@

+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
+from typing import Callable, Optional
+import torch
+from torch import Tensor, nn
+class Mlp(nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+        act_layer: Callable[..., nn.Module] = nn.GELU,
+        drop: float = 0.0,
+        bias: bool = True,
+    ) -> None:
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
+        self.drop = nn.Dropout(drop)
+    def forward(self, x: Tensor) -> Tensor:
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+class MlpFP32(Mlp):
+    @staticmethod
+    def map_to_args_to_float(args, kwargs):
+        args = tuple(
+            torch.float32 if isinstance(arg, torch.dtype) else arg
+            for arg in args
+        )
+        kwargs = dict(kwargs)
+        for key in kwargs:
+            if key == "dtype":
+                kwargs[key] = torch.float32
+        return args, kwargs
+    def to(self, *args, **kwargs):
+        self.fc1 = self.fc1.to(*args, **kwargs)
+        args, kwargs = self.map_to_args_to_float(args, kwargs)
+        self.fc2 = self.fc2.to(*args, **kwargs)
+        return self
+    def forward_infer(self, x):
+        x = self.fc1(x)
+        x = 0.5 * x * (1 + torch.erf(x * 2**-0.5))
+        x = self.fc2(x.float())
+        return x
+    def forward(self, x: Tensor) -> Tensor:
+        return self.forward_infer(x)

hyworldmirror/models/layers/norm_rope.py ADDED Viewed

	@@ -0,0 +1,140 @@

+import math
+from typing import Dict, Literal, Tuple, Union
+import numpy as np
+import torch
+import torch.nn as nn
+from torch import Tensor
+class PositionGetter:
+    """Generates and caches 2D spatial positions for patches in a grid."""
+    def __init__(self) -> None:
+        self.position_cache: Dict[Tuple[int, int], torch.Tensor] = {}
+    def __call__(self, batch_size: int, height: int, width: int, device: torch.device) -> torch.Tensor:
+        if (height, width) not in self.position_cache:
+            y_coords = torch.arange(height, device=device)
+            x_coords = torch.arange(width, device=device)
+            self.position_cache[height, width] = torch.cartesian_prod(y_coords, x_coords)
+        cached_positions = self.position_cache[height, width]
+        return cached_positions.view(1, height * width, 2).expand(batch_size, -1, -1).clone()
+def _rotate_half(x: torch.Tensor) -> torch.Tensor:
+    x1, x2 = x.chunk(2, dim=-1)
+    return torch.cat((-x2, x1), dim=-1)
+class NormalizedRotaryPositionEmbedding2D(nn.Module):
+    """DINOv3-aligned 2D Rotary Position Embedding.
+    Patch-centre coordinates are normalised by the grid size, mapped through
+    ``2 * c - 1``, and turned into rotation angles using a buffer of ``periods``.
+    The resulting sin/cos tables are gathered per token and applied to the tokens.
+    """
+    def __init__(
+        self,
+        *,
+        head_dim: int,
+        base: float = 100.0,
+        normalize_coords: Literal["min", "max", "separate"] = "separate",
+        shift_coords: Union[float, None] = None,
+        jitter_coords: Union[float, None] = None,
+        rescale_coords: Union[float, None] = None,
+        dtype: Union[torch.dtype, None] = None,
+        device: Union[torch.device, None] = None,
+        **ignored_kwargs,
+    ) -> None:
+        """Configure the embedding and allocate the ``periods`` buffer.
+        Args:
+            head_dim: Per-head feature size; must be divisible by 4.
+            base: Base of the geometric progression used for ``periods``.
+            normalize_coords: Divisor for the raw coordinates: ``max(H, W)``,
+                ``min(H, W)``, or H and W separately.
+            shift_coords: If set, bound of a uniform random shift (training only).
+            jitter_coords: If set, bound of a log-uniform per-axis scale (training only).
+            rescale_coords: If set, bound of a log-uniform shared scale (training only).
+            dtype: dtype of the ``periods`` buffer.
+            device: Device of the ``periods`` buffer.
+            **ignored_kwargs: Accepted and silently discarded.
+        Raises:
+            ValueError: if ``head_dim`` is not divisible by 4.
+        """
+        super().__init__()
+        if len(ignored_kwargs) > 0:
+            # maintain parity with DINOv3 implementation that warns on ignored kwargs
+            # NOTE(review): no warning is actually emitted here; extra kwargs are dropped silently.
+            pass
+        if head_dim % 4 != 0:
+            raise ValueError("head_dim must be divisible by 4 for 2D RoPE")
+        self.head_dim = head_dim
+        self.base = base
+        self.normalize_coords = normalize_coords
+        self.shift_coords = shift_coords
+        self.jitter_coords = jitter_coords
+        self.rescale_coords = rescale_coords
+        self.dtype = dtype
+        quarter_dim = head_dim // 4
+        # Allocated uninitialised here; filled in by _init_periods() below.
+        self.register_buffer(
+            "periods",
+            torch.empty(quarter_dim, device=device, dtype=dtype),
+            persistent=True,
+        )
+        self._init_periods()
+    def _init_periods(self) -> None:
+        """Fill ``periods`` with ``base ** (2 * i / (head_dim // 2))`` for i in [0, head_dim // 4)."""
+        quarter_dim = self.periods.shape[0]
+        half_dim = self.head_dim // 2
+        exponents = 2 * torch.arange(quarter_dim, device=self.periods.device, dtype=self.dtype) / half_dim
+        periods = self.base ** exponents
+        self.periods.data.copy_(periods)
+    def _get_sincos_for_grid(self, H: int, W: int, device: torch.device, dtype: torch.dtype) -> Tuple[Tensor, Tensor]:
+        """Build the sin/cos tables for an H x W grid.
+        Returns:
+            ``(sin, cos)`` in that order, each of shape [H * W, head_dim], with
+            rows in row-major (h * W + w) order.
+        Raises:
+            ValueError: if ``normalize_coords`` is not one of the known modes.
+        """
+        dd = {"device": device, "dtype": dtype}
+        # Coordinates start at 0.5 so that they refer to patch centres.
+        if self.normalize_coords == "max":
+            max_hw = max(H, W)
+            coords_h = torch.arange(0.5, H, **dd) / max_hw
+            coords_w = torch.arange(0.5, W, **dd) / max_hw
+        elif self.normalize_coords == "min":
+            min_hw = min(H, W)
+            coords_h = torch.arange(0.5, H, **dd) / min_hw
+            coords_w = torch.arange(0.5, W, **dd) / min_hw
+        elif self.normalize_coords == "separate":
+            coords_h = torch.arange(0.5, H, **dd) / H
+            coords_w = torch.arange(0.5, W, **dd) / W
+        else:
+            raise ValueError(f"Unknown normalize_coords: {self.normalize_coords}")
+        coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing="ij"), dim=-1)  # [H, W, 2]
+        coords = coords.flatten(0, 1)  # [HW, 2]
+        coords = 2.0 * coords - 1.0
+        # Random coordinate augmentations are applied in training mode only.
+        if self.training:
+            if self.shift_coords is not None:
+                # One uniform shift per axis, shared by all positions.
+                shift_hw = torch.empty(2, **dd).uniform_(-self.shift_coords, self.shift_coords)
+                coords += shift_hw[None, :]
+            if self.jitter_coords is not None:
+                # Log-uniform scale drawn independently for each axis.
+                jitter_max = np.log(self.jitter_coords)
+                jitter_hw = torch.empty(2, **dd).uniform_(-jitter_max, jitter_max).exp()
+                coords *= jitter_hw[None, :]
+            if self.rescale_coords is not None:
+                # Single log-uniform scale applied to both axes.
+                rescale_max = np.log(self.rescale_coords)
+                rescale_hw = torch.empty(1, **dd).uniform_(-rescale_max, rescale_max).exp()
+                coords *= rescale_hw
+        periods = self.periods.to(device=device, dtype=dtype)
+        angles = (2 * math.pi * coords[:, :, None]) / periods[None, None, :]  # [HW, 2, D/4]
+        angles = angles.flatten(1, 2)  # [HW, D/2]
+        angles = torch.cat((angles, angles), dim=-1)  # [HW, D]
+        cos = torch.cos(angles)
+        sin = torch.sin(angles)
+        # Note the return order: sin first, then cos.
+        return sin, cos
+    def forward(self, tokens: torch.Tensor, positions: torch.Tensor) -> torch.Tensor:
+        """Apply the rotary embedding to ``tokens`` at the given grid positions.
+        Args:
+            tokens: 4-D tensor unpacked as (B, heads, N, head_dim).
+            positions: Tensor of shape (batch_size, n_tokens, 2); column 0 is used
+                as the row index and column 1 as the column index of each token.
+                Presumably non-negative integer grid coordinates -- TODO confirm.
+        Returns:
+            Tensor with the same shape as ``tokens``.
+        Raises:
+            ValueError: if the last dimension of ``tokens`` differs from ``head_dim``.
+        """
+        # Validate inputs
+        assert tokens.size(-1) % 2 == 0, "Feature dimension must be even"
+        assert positions.ndim == 3 and positions.shape[-1] == 2, "Positions must have shape (batch_size, n_tokens, 2)"
+        B, _, N, C_head = tokens.shape
+        if C_head != self.head_dim:
+            raise ValueError(f"Head dim {C_head} doesn't match configured {self.head_dim}")
+        # Grid extent is inferred from the largest row/column index present.
+        H = int(positions[..., 0].max().item() + 1)
+        W = int(positions[..., 1].max().item() + 1)
+        sin, cos = self._get_sincos_for_grid(H, W, tokens.device, tokens.dtype)
+        # Row-major flat index into the [H * W, head_dim] tables.
+        indices = (positions[..., 0] * W + positions[..., 1]).long()
+        flat_indices = indices.view(-1)
+        # Singleton head dimension lets the tables broadcast across heads.
+        gathered_sin = sin[flat_indices].view(B, 1, N, C_head)
+        gathered_cos = cos[flat_indices].view(B, 1, N, C_head)
+        return (tokens * gathered_cos) + (_rotate_half(tokens) * gathered_sin)

hyworldmirror/models/layers/patch_embed.py ADDED Viewed

	@@ -0,0 +1,155 @@

+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
+from typing import Callable, Optional, Tuple, Union
+import torch
+from torch import Tensor
+import torch.nn as nn
+import torch.nn.functional as F
+from itertools import repeat
+import collections.abc
+def make_2tuple(x):
+    if isinstance(x, tuple):
+        assert len(x) == 2
+        return x
+    assert isinstance(x, int)
+    return (x, x)
+class PatchEmbed(nn.Module):
+    """
+    2D image to patch embedding: (B,C,H,W) -> (B,N,D)
+    Args:
+        img_size: Image size.
+        patch_size: Patch token size.
+        in_chans: Number of input image channels.
+        embed_dim: Number of linear projection output channels.
+        norm_layer: Normalization layer.
+    """
+    def __init__(
+        self,
+        img_size: Union[int, Tuple[int, int]] = 224,
+        patch_size: Union[int, Tuple[int, int]] = 16,
+        in_chans: int = 3,
+        embed_dim: int = 768,
+        norm_layer: Optional[Callable] = None,
+        flatten_embedding: bool = True,
+    ) -> None:
+        super().__init__()
+        image_HW = make_2tuple(img_size)
+        patch_HW = make_2tuple(patch_size)
+        patch_grid_size = (image_HW[0] // patch_HW[0], image_HW[1] // patch_HW[1])
+        self.img_size = image_HW
+        self.patch_size = patch_HW
+        self.patches_resolution = patch_grid_size
+        self.num_patches = patch_grid_size[0] * patch_grid_size[1]
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+        self.flatten_embedding = flatten_embedding
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
+        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+    def forward(self, x: Tensor) -> Tensor:
+        _, _, H, W = x.shape
+        patch_H, patch_W = self.patch_size
+        assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
+        assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"
+        x = self.proj(x)  # B C H W
+        H, W = x.size(2), x.size(3)
+        x = x.flatten(2).transpose(1, 2)  # B HW C
+        x = self.norm(x)
+        if not self.flatten_embedding:
+            x = x.reshape(-1, H, W, self.embed_dim)  # B H W C
+        return x
+class PatchEmbed_Mlp(PatchEmbed):
+    def __init__(self, img_size=224,
+                 patch_size=16,
+                 in_chans=3,
+                 embed_dim=768,
+                 norm_layer=None,
+                 flatten_embedding=True):
+        super().__init__(img_size, patch_size, in_chans, embed_dim, norm_layer, flatten_embedding)
+        self.proj = nn.Sequential(
+            PixelUnshuffle(patch_size),
+            Permute((0,2,3,1)),
+            Mlp(in_chans * patch_size**2, 4*embed_dim, embed_dim),
+            Permute((0,3,1,2)),
+            )
+class PixelUnshuffle (nn.Module):
+    def __init__(self, downscale_factor):
+        super().__init__()
+        self.downscale_factor = downscale_factor
+    def forward(self, input):
+        if input.numel() == 0:
+            # this is not in the original torch implementation
+            C,H,W = input.shape[-3:]
+            assert H and W and H % self.downscale_factor == W%self.downscale_factor == 0
+            return input.view(*input.shape[:-3], C*self.downscale_factor**2, H//self.downscale_factor, W//self.downscale_factor)
+        else:
+            return F.pixel_unshuffle(input, self.downscale_factor)
+class Permute(torch.nn.Module):
+    dims: tuple[int, ...]
+    def __init__(self, dims: tuple[int, ...]) -> None:
+        super().__init__()
+        self.dims = tuple(dims)
+    def __repr__(self):
+        return f"Permute{self.dims}"
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return input.permute(*self.dims)
+def _ntuple(n):
+    def parse(x):
+        if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
+            return x
+        return tuple(repeat(x, n))
+    return parse
+to_2tuple = _ntuple(2)
+class Mlp(nn.Module):
+    """ MLP as used in Vision Transformer, MLP-Mixer and related networks"""
+    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, bias=True, drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        bias = to_2tuple(bias)
+        drop_probs = to_2tuple(drop)
+        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0])
+        self.act = act_layer()
+        self.drop1 = nn.Dropout(drop_probs[0])
+        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1])
+        self.drop2 = nn.Dropout(drop_probs[1])
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop1(x)
+        x = self.fc2(x)
+        x = self.drop2(x)
+        return x

hyworldmirror/models/layers/rope.py ADDED Viewed

	@@ -0,0 +1,182 @@

+# Implementation of 2D Rotary Position Embeddings (RoPE).
+# This module provides a clean implementation of 2D Rotary Position Embeddings,
+# which extends the original RoPE concept to handle 2D spatial positions.
+# Inspired by:
+#         https://github.com/meta-llama/codellama/blob/main/llama/model.py
+#         https://github.com/naver-ai/rope-vit
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Dict, Tuple
+class PositionGetter:
+    """Generates and caches 2D spatial positions for patches in a grid.
+    This class efficiently manages the generation of spatial coordinates for patches
+    in a 2D grid, caching results to avoid redundant computations.
+    Attributes:
+        position_cache: Dictionary storing precomputed position tensors for different
+            grid dimensions.
+    """
+    def __init__(self):
+        """Initializes the position generator with an empty cache."""
+        self.position_cache: Dict[Tuple[int, int], torch.Tensor] = {}
+    def __call__(self, batch_size: int, height: int, width: int, device: torch.device) -> torch.Tensor:
+        """Generates spatial positions for a batch of patches.
+        Args:
+            batch_size: Number of samples in the batch.
+            height: Height of the grid in patches.
+            width: Width of the grid in patches.
+            device: Target device for the position tensor.
+        Returns:
+            Tensor of shape (batch_size, height*width, 2) containing y,x coordinates
+            for each position in the grid, repeated for each batch item.
+        """
+        if (height, width) not in self.position_cache:
+            y_coords = torch.arange(height, device=device)
+            x_coords = torch.arange(width, device=device)
+            positions = torch.cartesian_prod(y_coords, x_coords)
+            self.position_cache[height, width] = positions
+        cached_positions = self.position_cache[height, width]
+        return cached_positions.view(1, height * width, 2).expand(batch_size, -1, -1).clone()
+class RotaryPositionEmbedding2D(nn.Module):
+    """2D Rotary Position Embedding implementation.
+    This module applies rotary position embeddings to input tokens based on their
+    2D spatial positions. It handles the position-dependent rotation of features
+    separately for vertical and horizontal dimensions.
+    Args:
+        frequency: Base frequency for the position embeddings. Default: 100.0
+        scaling_factor: Scaling factor for frequency computation. Default: 1.0
+    Attributes:
+        base_frequency: Base frequency for computing position embeddings.
+        scaling_factor: Factor to scale the computed frequencies.
+        frequency_cache: Cache for storing precomputed frequency components.
+    """
+    def __init__(self, frequency: float = 100.0, scaling_factor: float = 1.0,):
+        """Initializes the 2D RoPE module."""
+        super().__init__()
+        self.base_frequency = frequency
+        self.scaling_factor = scaling_factor
+        self.frequency_cache: Dict[Tuple, Tuple[torch.Tensor, torch.Tensor]] = {}
+    def _compute_frequency_components(
+        self, dim: int, seq_len: int, device: torch.device, dtype: torch.dtype
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Computes frequency components for rotary embeddings.
+        Args:
+            dim: Feature dimension (must be even).
+            seq_len: Maximum sequence length.
+            device: Target device for computations.
+            dtype: Data type for the computed tensors.
+        Returns:
+            Tuple of (cosine, sine) tensors for frequency components.
+        """
+        cache_key = (dim, seq_len, device, dtype)
+        if cache_key not in self.frequency_cache:
+            # Compute frequency bands
+            exponents = torch.arange(0, dim, 2, device=device).float() / dim
+            inv_freq = 1.0 / (self.base_frequency**exponents)
+            # Generate position-dependent frequencies
+            positions = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
+            angles = torch.einsum("i,j->ij", positions, inv_freq)
+            # Compute and cache frequency components
+            angles = angles.to(dtype)
+            angles = torch.cat((angles, angles), dim=-1)
+            cos_components = angles.cos().to(dtype)
+            sin_components = angles.sin().to(dtype)
+            self.frequency_cache[cache_key] = (cos_components, sin_components)
+        return self.frequency_cache[cache_key]
+    @staticmethod
+    def _rotate_features(x: torch.Tensor) -> torch.Tensor:
+        """Performs feature rotation by splitting and recombining feature dimensions.
+        Args:
+            x: Input tensor to rotate.
+        Returns:
+            Rotated feature tensor.
+        """
+        feature_dim = x.shape[-1]
+        x1, x2 = x[..., : feature_dim // 2], x[..., feature_dim // 2 :]
+        return torch.cat((-x2, x1), dim=-1)
+    def _apply_1d_rope(
+        self, tokens: torch.Tensor, positions: torch.Tensor, cos_comp: torch.Tensor, sin_comp: torch.Tensor
+    ) -> torch.Tensor:
+        """Applies 1D rotary position embeddings along one dimension.
+        Args:
+            tokens: Input token features.
+            positions: Position indices.
+            cos_comp: Cosine components for rotation.
+            sin_comp: Sine components for rotation.
+        Returns:
+            Tokens with applied rotary position embeddings.
+        """
+        # Embed positions with frequency components
+        cos = F.embedding(positions, cos_comp)[:, None, :, :]
+        sin = F.embedding(positions, sin_comp)[:, None, :, :]
+        # Apply rotation
+        return (tokens * cos) + (self._rotate_features(tokens) * sin)
+    def forward(self, tokens: torch.Tensor, positions: torch.Tensor) -> torch.Tensor:
+        """Applies 2D rotary position embeddings to input tokens.
+        Args:
+            tokens: Input tensor of shape (batch_size, n_heads, n_tokens, dim).
+                   The feature dimension (dim) must be divisible by 4.
+            positions: Position tensor of shape (batch_size, n_tokens, 2) containing
+                      the y and x coordinates for each token.
+        Returns:
+            Tensor of same shape as input with applied 2D rotary position embeddings.
+        Raises:
+            AssertionError: If input dimensions are invalid or positions are malformed.
+        """
+        # Validate inputs
+        assert tokens.size(-1) % 2 == 0, "Feature dimension must be even"
+        assert positions.ndim == 3 and positions.shape[-1] == 2, "Positions must have shape (batch_size, n_tokens, 2)"
+        # Compute feature dimension for each spatial direction
+        feature_dim = tokens.size(-1) // 2
+        # Get frequency components
+        max_position = int(positions.max()) + 1
+        cos_comp, sin_comp = self._compute_frequency_components(feature_dim, max_position, tokens.device, tokens.dtype)
+        # Split features for vertical and horizontal processing
+        vertical_features, horizontal_features = tokens.chunk(2, dim=-1)
+        # Apply RoPE separately for each dimension
+        vertical_features = self._apply_1d_rope(vertical_features, positions[..., 0], cos_comp, sin_comp)
+        horizontal_features = self._apply_1d_rope(horizontal_features, positions[..., 1], cos_comp, sin_comp)
+        # Combine processed features
+        return torch.cat((vertical_features, horizontal_features), dim=-1)

hyworldmirror/models/layers/swiglu_ffn.py ADDED Viewed

	@@ -0,0 +1,46 @@

+from typing import Callable, Optional
+from torch import Tensor, nn
+import torch.nn.functional as F
+class SwiGLUFFN(nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+        act_layer: Callable[..., nn.Module] = None,
+        drop: float = 0.0,
+        bias: bool = True,
+    ) -> None:
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
+        self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
+    def forward(self, x: Tensor) -> Tensor:
+        x12 = self.w12(x)
+        x1, x2 = x12.chunk(2, dim=-1)
+        hidden = F.silu(x1) * x2
+        return self.w3(hidden)
+SwiGLU = SwiGLUFFN
+class SwiGLUFFNFused(SwiGLU):
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+        act_layer: Callable[..., nn.Module] = None,
+        drop: float = 0.0,
+        bias: bool = True,
+    ) -> None:
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
+        super().__init__(in_features=in_features, hidden_features=hidden_features, out_features=out_features, bias=bias)

hyworldmirror/models/layers/vision_transformer.py ADDED Viewed

	@@ -0,0 +1,394 @@

+# References:
+#   https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
+from functools import partial
+import math
+import logging
+from typing import Sequence, Tuple, Union, Callable
+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+from torch.nn.init import trunc_normal_
+from . import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
+log = logging.getLogger(__name__)
+def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
+    if not depth_first and include_root:
+        fn(module=module, name=name)
+    for child_name, child_module in module.named_children():
+        child_name = ".".join((name, child_name)) if name else child_name
+        named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
+    if depth_first and include_root:
+        fn(module=module, name=name)
+    return module
+class BlockChunk(nn.ModuleList):
+    def forward(self, x):
+        for b in self:
+            x = b(x)
+        return x
+class DinoVisionTransformer(nn.Module):
+    def __init__(
+        self,
+        img_size=224,
+        patch_size=16,
+        in_chans=3,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        ffn_bias=True,
+        proj_bias=True,
+        drop_path_rate=0.0,
+        drop_path_uniform=False,
+        init_values=None,  # for layerscale: None or 0 => no layerscale
+        embed_layer=PatchEmbed,
+        act_layer=nn.GELU,
+        block_fn=Block,
+        ffn_layer="mlp",
+        block_chunks=1,
+        num_register_tokens=0,
+        interpolate_antialias=False,
+        interpolate_offset=0.1,
+        qk_norm=False,
+    ):
+        """
+        Build the patch embedding, tokens, transformer blocks and final norm.
+        Args:
+            img_size (int, tuple): input image size
+            patch_size (int, tuple): patch size
+            in_chans (int): number of input channels
+            embed_dim (int): embedding dimension
+            depth (int): depth of transformer
+            num_heads (int): number of attention heads
+            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
+            qkv_bias (bool): enable bias for qkv if True
+            proj_bias (bool): enable bias for proj in attn if True
+            ffn_bias (bool): enable bias for ffn if True
+            drop_path_rate (float): stochastic depth rate
+            drop_path_uniform (bool): apply uniform drop rate across blocks
+            init_values (float): layer-scale init values
+            embed_layer (nn.Module): patch embedding layer
+            act_layer (nn.Module): MLP activation layer
+            block_fn (nn.Module): transformer block class
+            ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
+            block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
+            num_register_tokens: (int) number of extra cls tokens (so-called "registers")
+            interpolate_antialias: (bool) passed as ``antialias`` when interpolating positional embeddings
+            interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
+            qk_norm: forwarded unchanged to ``block_fn``
+        Raises:
+            NotImplementedError: if ``ffn_layer`` is not one of the supported names.
+        """
+        super().__init__()
+        norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        self.num_tokens = 1
+        self.n_blocks = depth
+        self.num_heads = num_heads
+        self.patch_size = patch_size
+        self.num_register_tokens = num_register_tokens
+        self.interpolate_antialias = interpolate_antialias
+        self.interpolate_offset = interpolate_offset
+        self.use_reentrant = False # hardcoded to False
+        self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+        num_patches = self.patch_embed.num_patches
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        # One positional embedding per patch plus one for the class token.
+        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
+        assert num_register_tokens >= 0
+        self.register_tokens = (
+            nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
+        )
+        if drop_path_uniform is True:
+            dpr = [drop_path_rate] * depth
+        else:
+            dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        # Resolve the FFN name to a constructor; "swiglu" and "swiglufused" both
+        # select SwiGLUFFNFused.
+        if ffn_layer == "mlp":
+            log.info("using MLP layer as FFN")
+            ffn_layer = Mlp
+        elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
+            log.info("using SwiGLU layer as FFN")
+            ffn_layer = SwiGLUFFNFused
+        elif ffn_layer == "identity":
+            log.info("using Identity layer as FFN")
+            # Factory that ignores every argument and returns an identity module.
+            def f(*args, **kwargs):
+                return nn.Identity()
+            ffn_layer = f
+        else:
+            raise NotImplementedError
+        blocks_list = [
+            block_fn(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                proj_bias=proj_bias,
+                ffn_bias=ffn_bias,
+                drop_path=dpr[i],
+                norm_layer=norm_layer,
+                act_layer=act_layer,
+                ffn_layer=ffn_layer,
+                init_values=init_values,
+                qk_norm=qk_norm,
+            )
+            for i in range(depth)
+        ]
+        if block_chunks > 0:
+            self.chunked_blocks = True
+            chunked_blocks = []
+            # NOTE(review): depth // block_chunks is 0 when block_chunks > depth,
+            # which makes the range() below raise ValueError -- confirm callers avoid this.
+            chunksize = depth // block_chunks
+            for i in range(0, depth, chunksize):
+                # this is to keep the block index consistent if we chunk the block list
+                chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
+            self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
+        else:
+            self.chunked_blocks = False
+            self.blocks = nn.ModuleList(blocks_list)
+        self.norm = norm_layer(embed_dim)
+        self.head = nn.Identity()
+        # Embedding substituted for masked patches in prepare_tokens_with_masks.
+        self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
+        self.init_weights()
+    def init_weights(self):
+        """Initialise the positional/class/register tokens, then all submodules.
+        ``pos_embed`` gets a truncated normal (std 0.02); ``cls_token`` and, when
+        present, ``register_tokens`` get a normal with std 1e-6. Finally
+        ``init_weights_vit_timm`` is applied to every descendant module via
+        ``named_apply`` (the root module itself is not included).
+        """
+        trunc_normal_(self.pos_embed, std=0.02)
+        nn.init.normal_(self.cls_token, std=1e-6)
+        if self.register_tokens is not None:
+            nn.init.normal_(self.register_tokens, std=1e-6)
+        named_apply(init_weights_vit_timm, self)
+    def interpolate_pos_encoding(self, x, w, h):
+        previous_dtype = x.dtype
+        npatch = x.shape[1] - 1
+        N = self.pos_embed.shape[1] - 1
+        if npatch == N and w == h:
+            return self.pos_embed
+        pos_embed = self.pos_embed.float()
+        class_pos_embed = pos_embed[:, 0]
+        patch_pos_embed = pos_embed[:, 1:]
+        dim = x.shape[-1]
+        w0 = w // self.patch_size
+        h0 = h // self.patch_size
+        M = int(math.sqrt(N))  # Recover the number of patches in each dimension
+        assert N == M * M
+        kwargs = {}
+        if self.interpolate_offset:
+            # Historical kludge: add a small number to avoid floating point error in the interpolation, see https://github.com/facebookresearch/dino/issues/8
+            # Note: still needed for backward-compatibility, the underlying operators are using both output size and scale factors
+            sx = float(w0 + self.interpolate_offset) / M
+            sy = float(h0 + self.interpolate_offset) / M
+            kwargs["scale_factor"] = (sx, sy)
+        else:
+            # Simply specify an output size instead of a scale factor
+            kwargs["size"] = (w0, h0)
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2),
+            mode="bicubic",
+            antialias=self.interpolate_antialias,
+            **kwargs,
+        )
+        assert (w0, h0) == patch_pos_embed.shape[-2:]
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
+    def prepare_tokens_with_masks(self, x, masks=None):
+        B, nc, w, h = x.shape
+        x = self.patch_embed(x)
+        if masks is not None:
+            x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
+        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
+        x = x + self.interpolate_pos_encoding(x, w, h)
+        if self.register_tokens is not None:
+            x = torch.cat((x[:, :1], self.register_tokens.expand(x.shape[0], -1, -1), x[:, 1:]), dim=1)
+        return x
+    def forward_features_list(self, x_list, masks_list):
+        x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
+        for blk in self.blocks:
+            if self.training:
+                # x = blk(x)
+                x = checkpoint(blk, x, use_reentrant=self.use_reentrant)
+            else:
+                x = blk(x)
+        all_x = x
+        output = []
+        for x, masks in zip(all_x, masks_list):
+            x_norm = self.norm(x)
+            output.append(
+                {
+                    "x_norm_clstoken": x_norm[:, 0],
+                    "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
+                    "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
+                    "x_prenorm": x,
+                    "masks": masks,
+                }
+            )
+        return output
+    def forward_features(self, x, masks=None):
+        if isinstance(x, list):
+            return self.forward_features_list(x, masks)
+        x = self.prepare_tokens_with_masks(x, masks)
+        for blk in self.blocks:
+            if self.training:
+                # x = blk(x)
+                x = checkpoint(blk, x, use_reentrant=self.use_reentrant)
+            else:
+                x = blk(x)
+        x_norm = self.norm(x)
+        return {
+            "x_norm_clstoken": x_norm[:, 0],
+            "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
+            "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
+            "x_prenorm": x,
+            "masks": masks,
+        }
+    def _get_intermediate_layers_not_chunked(self, x, n=1):
+        x = self.prepare_tokens_with_masks(x)
+        # If n is an int, take the n last blocks. If it's a list, take them
+        output, total_block_len = [], len(self.blocks)
+        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+        for i, blk in enumerate(self.blocks):
+            x = blk(x)
+            if i in blocks_to_take:
+                output.append(x)
+        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
+        return output
+    def _get_intermediate_layers_chunked(self, x, n=1):
+        x = self.prepare_tokens_with_masks(x)
+        output, i, total_block_len = [], 0, len(self.blocks[-1])
+        # If n is an int, take the n last blocks. If it's a list, take them
+        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+        for block_chunk in self.blocks:
+            for blk in block_chunk[i:]:  # Passing the nn.Identity()
+                x = blk(x)
+                if i in blocks_to_take:
+                    output.append(x)
+                i += 1
+        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
+        return output
+    def get_intermediate_layers(
+        self,
+        x: torch.Tensor,
+        n: Union[int, Sequence] = 1,  # Layers or n last layers to take
+        reshape: bool = False,
+        return_class_token: bool = False,
+        norm=True,
+    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
+        if self.chunked_blocks:
+            outputs = self._get_intermediate_layers_chunked(x, n)
+        else:
+            outputs = self._get_intermediate_layers_not_chunked(x, n)
+        if norm:
+            outputs = [self.norm(out) for out in outputs]
+        class_tokens = [out[:, 0] for out in outputs]
+        outputs = [out[:, 1 + self.num_register_tokens :] for out in outputs]
+        if reshape:
+            B, _, w, h = x.shape
+            outputs = [
+                out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
+                for out in outputs
+            ]
+        if return_class_token:
+            return tuple(zip(outputs, class_tokens))
+        return tuple(outputs)
+    def forward(self, *args, is_training=True, **kwargs):
+        ret = self.forward_features(*args, **kwargs)
+        if is_training:
+            return ret
+        else:
+            return self.head(ret["x_norm_clstoken"])
+def init_weights_vit_timm(module: nn.Module, name: str = ""):
+    """ViT weight initialization, original timm impl (for reproducibility)"""
+    if isinstance(module, nn.Linear):
+        trunc_normal_(module.weight, std=0.02)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
+    model = DinoVisionTransformer(
+        patch_size=patch_size,
+        embed_dim=384,
+        depth=12,
+        num_heads=6,
+        mlp_ratio=4,
+        block_fn=partial(Block, attn_class=MemEffAttention),
+        num_register_tokens=num_register_tokens,
+        **kwargs,
+    )
+    return model
+def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
+    model = DinoVisionTransformer(
+        patch_size=patch_size,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4,
+        block_fn=partial(Block, attn_class=MemEffAttention),
+        num_register_tokens=num_register_tokens,
+        **kwargs,
+    )
+    return model
+def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
+    model = DinoVisionTransformer(
+        patch_size=patch_size,
+        embed_dim=1024,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4,
+        block_fn=partial(Block, attn_class=MemEffAttention),
+        num_register_tokens=num_register_tokens,
+        **kwargs,
+    )
+    return model
+def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
+    """
+    Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
+    """
+    model = DinoVisionTransformer(
+        patch_size=patch_size,
+        embed_dim=1536,
+        depth=40,
+        num_heads=24,
+        mlp_ratio=4,
+        block_fn=partial(Block, attn_class=MemEffAttention),
+        num_register_tokens=num_register_tokens,
+        **kwargs,
+    )
+    return model

hyworldmirror/models/models/__init__.py ADDED Viewed

File without changes

hyworldmirror/models/models/rasterization.py ADDED Viewed

	@@ -0,0 +1,525 @@

+from typing import Dict, Tuple
+import numpy as np
+import torch
+import torch.nn as nn
+from torch import Tensor
+from einops import rearrange
+from gsplat.rendering import rasterization
+from gsplat.strategy import DefaultStrategy
+from ..utils.frustum import calculate_unprojected_mask
+from ..utils.geometry import depth_to_world_coords_points
+from ..utils import sh_utils, act_gs
+from typing import List
+class Rasterizer:
+    def __init__(self, rasterization_mode="classic", packed=True, abs_grad=True, with_eval3d=False,
+                 camera_model="pinhole", sparse_grad=False, distributed=False, grad_strategy=DefaultStrategy):
+        self.rasterization_mode = rasterization_mode
+        self.packed = packed
+        self.abs_grad = abs_grad
+        self.camera_model = camera_model
+        self.sparse_grad = sparse_grad
+        self.grad_strategy = grad_strategy
+        self.distributed = distributed
+        self.with_eval3d = with_eval3d
+    def rasterize_splats(
+        self,
+        means,
+        quats,
+        scales,
+        opacities,
+        colors,
+        camtoworlds: Tensor,
+        Ks: Tensor,
+        width: int,
+        height: int,
+        **kwargs,
+    ) -> Tuple[Tensor, Tensor, Dict]:
+        render_colors, render_alphas, _ = rasterization(
+            means=means,
+            quats=quats,
+            scales=scales,
+            opacities=opacities,
+            colors=colors,
+            viewmats=torch.linalg.inv(camtoworlds),  # [C, 4, 4]
+            Ks=Ks,  # [C, 3, 3]
+            width=width,
+            height=height,
+            packed=self.packed,
+            absgrad=(
+                self.abs_grad
+                if isinstance(self.grad_strategy, DefaultStrategy)
+                else False
+            ),
+            sparse_grad=self.sparse_grad,
+            rasterize_mode=self.rasterization_mode,
+            distributed=self.distributed,
+            camera_model=self.camera_model,
+            with_eval3d=self.with_eval3d,
+            render_mode="RGB+ED",
+            **kwargs,
+        )
+        return render_colors[..., :3], render_colors[..., 3:], render_alphas
+    def rasterize_batches(self, means, quats, scales, opacities, colors, viewmats, Ks, width, height, **kwargs):
+        rendered_colors, rendered_depths, rendered_alphas = [], [], []
+        batch_size = len(means)
+        for i in range(batch_size):
+            means_i = means[i]  # [N, 4]
+            quats_i = quats[i]  # [N, 4]
+            scales_i = scales[i]  # [N, 3]
+            opacities_i = opacities[i]  # [N,]
+            colors_i = colors[i]  # [N, 3]
+            viewmats_i = viewmats[i]  # [V, 4, 4]
+            Ks_i = Ks[i]  # [V, 3, 3]
+            render_colors_i, render_depths_i, render_alphas_i = self.rasterize_splats(
+                means_i, quats_i, scales_i, opacities_i, colors_i, viewmats_i, Ks_i, width, height, **kwargs
+            )
+            rendered_colors.append(render_colors_i)  # V H W 3
+            rendered_depths.append(render_depths_i)  # V H W 1
+            rendered_alphas.append(render_alphas_i)  # V H W 1
+        rendered_colors = torch.stack(rendered_colors, dim=0)  # B V H W 3
+        rendered_depths = torch.stack(rendered_depths, dim=0)  # B V H W 1
+        rendered_alphas = torch.stack(rendered_alphas, dim=0)  # B V H W 1
+        return rendered_colors, rendered_depths, rendered_alphas
+class GaussianSplatRenderer(nn.Module):
+    def __init__(
+        self,
+        feature_dim: int = 256,       # Output channels of gs_feat_head
+        sh_degree: int = 0,
+        enable_prune: bool = True,
+        voxel_size: float = 0.002,    # Default voxel size for prune_gs
+        enable_conf_filter: bool = False,  # Enable confidence filtering
+        conf_threshold_percent: float = 30.0,  # Confidence threshold percentage
+        max_gaussians: int = 5000000,  # Maximum number of Gaussians
+    ):
+        super().__init__()
+        self.feature_dim = feature_dim
+        self.sh_degree = sh_degree
+        self.nums_sh = (sh_degree + 1) ** 2
+        self.voxel_size = voxel_size
+        self.enable_prune = enable_prune
+        self.enable_conf_filter = enable_conf_filter
+        self.conf_threshold_percent = conf_threshold_percent
+        self.max_gaussians = max_gaussians
+        # Predict Gaussian parameters from GS features (quaternions/scales/opacities/SH/weights)
+        splits_and_inits = [
+            (4, 1.0, 0.0),                # quats
+            (3, 0.00003, -7.0),           # scales
+            (1, 1.0, -2.0),               # opacities
+            (3 * self.nums_sh, 1.0, 0.0), # residual_sh
+            (1, 1.0, -2.0),               # weights
+        ]
+        gaussian_raw_channels = 4 + 3 + 1 + self.nums_sh * 3 + 1
+        self.gs_head = nn.Sequential(
+            nn.Conv2d(feature_dim // 2, feature_dim, kernel_size=3, padding=1, bias=False),
+            nn.ReLU(True),
+            nn.Conv2d(feature_dim, gaussian_raw_channels, kernel_size=1),
+        )
+        # Initialize weights and biases of the final layer by segments
+        final_conv_layer = self.gs_head[-1]
+        start_channels = 0
+        for out_channel, s, b in splits_and_inits:
+            nn.init.xavier_uniform_(final_conv_layer.weight[start_channels:start_channels+out_channel], s)
+            nn.init.constant_(final_conv_layer.bias[start_channels:start_channels+out_channel], b)
+            start_channels += out_channel
+        # Rasterizer
+        self.rasterizer = Rasterizer()
+    # ======== Main entry point: Complete GS rendering and fill results back to predictions ========
+    def render(
+        self,
+        gs_feats: torch.Tensor,                    # [B, S, 3, H, W]
+        images: torch.Tensor,                      # [B, S+V, 3, H, W]
+        predictions: Dict[str, torch.Tensor],      # From WorldMirror: pose/depth/pts3d etc
+        views: Dict[str, torch.Tensor],
+        context_predictions: Dict[str, torch.Tensor],
+        is_inference: bool=True,
+    ) -> Dict[str, torch.Tensor]:
+        """
+        Returns predictions with the following fields filled:
+        - rendered_colors / rendered_depths / (rendered_alphas during training)
+        - gt_colors / gt_depths / valid_masks
+        - splats / rendered_extrinsics / rendered_intrinsics
+        """
+        B, _, _, H, W = images.shape
+        S = context_predictions.get("imgs", images).shape[1] # context view nums
+        V = images.shape[1] - S                              # target view nums
+        # 1) Predict GS features from tokens, then convert to Gaussian parameters
+        gs_feats_reshape = rearrange(gs_feats, "b s c h w -> (b s) c h w")
+        # Align input dtype with gs_head weights (handles fp32 input from
+        # precision-critical DPT output_conv2 when model runs in bf16 mode)
+        head_dtype = next(self.gs_head.parameters()).dtype
+        if gs_feats_reshape.dtype != head_dtype:
+            gs_feats_reshape = gs_feats_reshape.to(head_dtype)
+        gs_params = self.gs_head(gs_feats_reshape)
+        gt_colors = images.permute(0, 1, 3, 4, 2)
+        # 2) Select rendering cameras
+        if self.training:
+            # Using all gt cameras
+            render_viewmats, render_Ks = self.prepare_cameras(views, S + V)
+            gt_valid_masks_src = views["valid_mask"][:, :S]      # [B, S, H, W]
+            gt_valid_masks_tgt = views["valid_mask"][:, S:]     # [B, V, H, W]
+            unproject_masks = calculate_unprojected_mask(views, S)     # [B, V, H, W]
+            valid_masks = torch.cat([gt_valid_masks_src, (gt_valid_masks_tgt & unproject_masks)], dim=1)
+        else:
+            # Re-predict the camera for novel views and perform translation scale alignment
+            pred_all_extrinsic, pred_all_intrinsic = self.prepare_cameras(predictions, S + V)
+            scale_factor = torch.ones(B, device=images.device)
+            if "camera_poses" in context_predictions:
+                pred_context_extrinsic, _ = self.prepare_cameras(context_predictions, S)
+                scale_factor = pred_context_extrinsic[:, :, :3, 3].norm(dim=-1).mean(dim=1, keepdim=True) / (
+                    pred_all_extrinsic[:, :S, :3, 3].norm(dim=-1).mean(dim=1, keepdim=True) + 1e-6
+                )
+            pred_all_extrinsic[..., :3, 3] = pred_all_extrinsic[..., :3, 3] * scale_factor.unsqueeze(-1)
+            render_viewmats, render_Ks = pred_all_extrinsic, pred_all_intrinsic
+            valid_masks = views.get("valid_mask", torch.ones(B, S + V, H, W, dtype=bool, device=images.device))
+        # 3) Generate splats from gs_params + predictions, and perform voxel merging
+        if self.training:
+            splats = self.prepare_splats(
+                views,
+                predictions,
+                images,
+                gs_params,
+                S,
+                position_from="gsdepth+gtcamera",
+            )
+        elif not is_inference:
+            splats = self.prepare_splats(
+                views,
+                predictions,
+                images,
+                gs_params,
+                S,
+                context_predictions,
+                position_from="gsdepth+predcamera",
+            )
+        else:
+            splats = self.prepare_splats(
+                views,
+                predictions,
+                images,
+                gs_params,
+                S,
+                position_from="gsdepth+predcamera",
+            )
+        if is_inference:
+            predictions["splats"] = splats
+            return predictions
+        # Apply confidence filtering before pruning
+        if self.enable_conf_filter and "gs_depth_conf" in predictions:
+            splats = self.apply_confidence_filter(splats, predictions["gs_depth_conf"])
+        if self.enable_prune:
+            splats = self.prune_gs(splats, voxel_size=self.voxel_size)
+        predictions["splats"] = splats
+        # 4) Rasterization rendering (training: chunked rendering + novel view valid mask correction; evaluation: view-by-view)
+        # Prevent OOM by using chunked rendering
+        rendered_colors_list, rendered_depths_list, rendered_alphas_list = [], [], []
+        chunk_size = 2
+        for i in range(0, gt_colors.shape[1], chunk_size):
+            end_idx = min(i + chunk_size, gt_colors.shape[1])
+            viewmats_i = render_viewmats[:, i:end_idx]
+            Ks_i = render_Ks[:, i:end_idx]
+            rendered_colors, rendered_depths, rendered_alphas = self.rasterizer.rasterize_batches(
+                splats["means"], splats["quats"], splats["scales"], splats["opacities"],
+                splats["sh"] if "sh" in splats else splats["colors"],
+                viewmats_i.detach(), Ks_i.detach(),
+                width=images.shape[-1], height=images.shape[-2],
+                sh_degree=min(self.sh_degree, 0) if "sh" in splats else None,
+            )
+            rendered_colors_list.append(rendered_colors)
+            rendered_depths_list.append(rendered_depths)
+            rendered_alphas_list.append(rendered_alphas)
+        rendered_colors = torch.cat(rendered_colors_list, dim=1)
+        rendered_depths = torch.cat(rendered_depths_list, dim=1)
+        rendered_alphas = torch.cat(rendered_alphas_list, dim=1)
+        if self.training and V > 0:
+            nvs_rendered_mask = rendered_alphas[:, S:, ..., 0].detach() > 0.1
+            valid_masks[:, S:] = nvs_rendered_mask & valid_masks[:, S:]
+        # 5) return predictions
+        predictions["rendered_colors"] = rendered_colors
+        predictions["rendered_depths"] = rendered_depths
+        predictions["rendered_alphas"] = rendered_alphas
+        predictions["gt_colors"] = gt_colors.float()
+        predictions["gt_depths"] = views.get("depthmap")
+        predictions["valid_masks"] = valid_masks.bool()
+        predictions["rendered_extrinsics"] = render_viewmats
+        predictions["rendered_intrinsics"] = render_Ks
+        return predictions
+    def apply_confidence_filter(self, splats, gs_depth_conf):
+        """
+        Apply confidence filtering to Gaussian splats before pruning.
+        Discard bottom p% confidence points, keep top (100-p)%.
+        Args:
+            splats: Dictionary containing Gaussian parameters
+            gs_depth_conf: Confidence tensor [B, S, H, W]
+        Returns:
+            Filtered splats dictionary
+        """
+        if not self.enable_conf_filter or gs_depth_conf is None:
+            return splats
+        device = splats["means"].device
+        B, N = splats["means"].shape[:2]
+        # Flatten confidence: [B, S, H, W] -> [B, N]
+        conf = gs_depth_conf.flatten(1).to(device)
+        # Mask invalid/very small values
+        conf = conf.masked_fill(conf <= 1e-5, float("-inf"))
+        # Keep top (100-p)% points, discard bottom p%
+        if self.conf_threshold_percent > 0:
+            keep_from_percent = int(np.ceil(N * (100.0 - self.conf_threshold_percent) / 100.0))
+        else:
+            keep_from_percent = N
+        K = max(1, min(self.max_gaussians, keep_from_percent))
+        # Select top-K indices for each batch (deterministic, no randomness)
+        topk_idx = torch.topk(conf, K, dim=1, largest=True, sorted=False).indices  # [B, K]
+        filtered = {}
+        mask_keys = ["means", "quats", "scales", "opacities", "sh", "weights"]
+        for key in splats.keys():
+            if key in mask_keys and key in splats:
+                x = splats[key]
+                if x.ndim == 2:  # [B, N]
+                    filtered[key] = torch.gather(x, 1, topk_idx)
+                else:
+                    # Expand indices to match tensor dimensions
+                    expand_idx = topk_idx.clone()
+                    for i in range(x.ndim - 2):
+                        expand_idx = expand_idx.unsqueeze(-1)
+                    expand_idx = expand_idx.expand(-1, -1, *x.shape[2:])
+                    filtered[key] = torch.gather(x, 1, expand_idx)
+            else:
+                filtered[key] = splats[key]
+        return filtered
+    def prune_gs(self, splats, voxel_size=0.002, filter_mask=None):
+        """
+        Prune Gaussian splats by optional mask filtering + voxel merging.
+        For every batch item, Gaussians whose means fall into the same voxel of edge length
+        voxel_size are merged into one Gaussian using the per-point "weights" entry.
+        Args:
+            splats: Dictionary containing Gaussian parameters. Must provide "means", "quats",
+                    "scales", "opacities", "sh" and "weights".
+                    Each value is [B, S*H*W, ...] (batch of per-pixel gaussians).
+            voxel_size: Size of voxels for spatial grouping.
+            filter_mask: Optional bool tensor [B, S*H*W] or numpy [S, H, W].
+                         True = keep, False = discard.  Applied before voxel merge.
+                         A numpy array or a 3-D tensor is flattened and reused for every batch item.
+        Returns:
+            Dictionary with keys "means", "sh", "opacities", "scales", "quats"; each value is a
+            Python list holding one tensor per batch item (point counts may differ between items).
+            "weights" is consumed by the merge and not returned.
+        """
+        B = splats["means"].shape[0]
+        merged_splats_list = []
+        device = splats["means"].device
+        for i in range(B):
+            # Extract splats for current batch
+            splats_i = {k: splats[k][i] for k in ["means", "quats", "scales", "opacities", "sh", "weights"]}
+            # --- Apply filter_mask (discard unwanted gaussians before merge) ---
+            if filter_mask is not None:
+                if isinstance(filter_mask, np.ndarray):
+                    # Numpy mask: flattened as a whole, i.e. the same mask for every batch item.
+                    fm = torch.from_numpy(filter_mask.reshape(-1)).to(device)
+                elif filter_mask.dim() == 3:
+                    # [S, H, W] -> flatten
+                    fm = filter_mask.reshape(-1).to(device)
+                else:
+                    # Batched mask: take the row belonging to this batch item.
+                    fm = filter_mask[i].to(device)
+                fm = fm.bool()
+                splats_i = {k: v[fm] for k, v in splats_i.items()}
+            N_in = splats_i["means"].shape[0]
+            if N_in == 0:
+                # Everything was filtered out: push empty tensors with the expected trailing shapes.
+                merged_splats_list.append({
+                    "means": torch.zeros((0, 3), device=device),
+                    "quats": torch.zeros((0, 4), device=device),
+                    "scales": torch.zeros((0, 3), device=device),
+                    "opacities": torch.zeros(0, device=device),
+                    "sh": torch.zeros((0, self.nums_sh, 3), device=device),
+                })
+                continue
+            # Compute integer voxel coordinates, shifted so the smallest index on each axis is 0
+            coords = splats_i["means"]
+            voxel_indices = (coords / voxel_size).floor().long()
+            min_indices = voxel_indices.min(dim=0)[0]
+            voxel_indices = voxel_indices - min_indices
+            max_dims = voxel_indices.max(dim=0)[0] + 1
+            # Flatten 3D voxel indices to 1D (row-major over the occupied bounding box)
+            flat_indices = (voxel_indices[:, 0] * max_dims[1] * max_dims[2] +
+                           voxel_indices[:, 1] * max_dims[2] +
+                           voxel_indices[:, 2])
+            # Find unique voxels and inverse mapping (inverse_indices[p] = merged slot of point p)
+            unique_voxels, inverse_indices = torch.unique(flat_indices, return_inverse=True)
+            K = len(unique_voxels)
+            # Initialize merged splats (accumulators are created with torch's default dtype)
+            merged = {
+                "means": torch.zeros((K, 3), device=device),
+                "quats": torch.zeros((K, 4), device=device),
+                "scales": torch.zeros((K, 3), device=device),
+                "opacities": torch.zeros(K, device=device),
+                "sh": torch.zeros((K, self.nums_sh, 3), device=device)
+            }
+            # Get weights and compute weight sums per voxel; the clamp guards the divisions below
+            weights = splats_i["weights"]
+            weight_sums = torch.zeros(K, device=device)
+            weight_sums.scatter_add_(0, inverse_indices, weights)
+            weight_sums = torch.clamp(weight_sums, min=1e-8)
+            # Merge means (weighted average)
+            for d in range(3):
+                merged["means"][:, d].scatter_add_(0, inverse_indices,
+                                                 splats_i["means"][:, d] * weights)
+            merged["means"] = merged["means"] / weight_sums.unsqueeze(1)
+            # Merge spherical harmonics (weighted average)
+            # Only SH band 0 is accumulated; higher bands of the merged result stay zero.
+            for d in range(3):
+                merged["sh"][:, 0, d].scatter_add_(0, inverse_indices,
+                                                  splats_i["sh"][:, 0, d] * weights)
+            merged["sh"] = merged["sh"] / weight_sums.unsqueeze(-1).unsqueeze(-1)
+            # Merge opacities: sum(w^2) / sum(w) per voxel
+            # NOTE(review): the incoming "opacities" values are not read here; the merged opacity is
+            # derived from "weights" alone -- confirm this is intended.
+            merged["opacities"].scatter_add_(0, inverse_indices, weights * weights)
+            merged["opacities"] = merged["opacities"] / weight_sums
+            # Merge scales (weighted average)
+            for d in range(3):
+                merged["scales"][:, d].scatter_add_(0, inverse_indices,
+                                                  splats_i["scales"][:, d] * weights)
+            merged["scales"] = merged["scales"] / weight_sums.unsqueeze(1)
+            # Merge quaternions (weighted sum, then re-normalized to unit length)
+            for d in range(4):
+                merged["quats"][:, d].scatter_add_(0, inverse_indices,
+                                                 splats_i["quats"][:, d] * weights)
+            quat_norms = torch.norm(merged["quats"], dim=1, keepdim=True)
+            merged["quats"] = merged["quats"] / torch.clamp(quat_norms, min=1e-8)
+            merged_splats_list.append(merged)
+        # Reorganize output: dict of per-key lists instead of list of per-item dicts
+        output = {}
+        for key in ["means", "sh", "opacities", "scales", "quats"]:
+            output[key] = [merged[key] for merged in merged_splats_list]
+        return output
+    def prepare_splats(self, views, predictions, images, gs_params, context_nums,
+                       context_predictions={}, position_from="gsdepth+gtcamera"):
+        """
+        Prepare Gaussian splats from model predictions and input data.
+        Args:
+            views: Dictionary containing view data (camera poses, intrinsics, etc.)
+            predictions: Model predictions including depth, pose_enc, etc.
+            images: Input images [B, S_all, 3, H, W]
+            gs_params: Gaussian splatting parameters from model
+            context_predictions: Optional context predictions for camera poses
+            position_from: Method to compute 3D positions ("pts3d", "gsdepth+gtcamera", "gsdepth+predcamera",
+            "depth_head+gtcamera", "depth_head+predcamera")
+            debug: Whether to use debug mode with ground truth data
+        Returns:
+            splats: Dictionary containing prepared Gaussian splat parameters
+        """
+        B, _, _, H, W = images.shape
+        S = context_nums
+        splats = {}
+        # Only take parameters from source view branch
+        gs_params = rearrange(gs_params, "(b s) c h w -> b s h w c", b=B)
+        splats["gs_feats"] = gs_params.reshape(B, S*H*W, -1)
+        # Split Gaussian parameters
+        quats, scales, opacities, residual_sh, weights = torch.split(
+            gs_params, [4, 3, 1, self.nums_sh * 3, 1], dim=-1
+        )
+        # Apply activation functions to Gaussian parameters
+        splats["quats"] = act_gs.reg_dense_rotation(quats.reshape(B, S * H * W, 4))
+        splats["scales"] = act_gs.reg_dense_scales(scales.reshape(B, S * H * W, 3)).clamp_max(0.3)
+        splats["opacities"] = act_gs.reg_dense_opacities(opacities.reshape(B, S * H * W))
+        residual_sh = act_gs.reg_dense_sh(residual_sh.reshape(B, S * H * W, self.nums_sh * 3))
+        # Handle spherical harmonics (SH) coefficients
+        new_sh = torch.zeros_like(residual_sh)
+        new_sh[..., 0, :] = sh_utils.RGB2SH(
+            images[:, :S].permute(0, 1, 3, 4, 2).reshape(B, S * H * W, 3)
+        )
+        splats['sh'] = new_sh + residual_sh
+        splats['residual_sh'] = residual_sh
+        splats["weights"] = act_gs.reg_dense_weights(weights.reshape(B, S * H * W))
+        # Compute 3D positions based on specified method
+        if position_from == "pts3d":
+            pts3d = predictions["pts3d"][:, :S].reshape(B, S * H * W, 3)
+            splats["means"] = pts3d
+        elif position_from == "gsdepth+gtcamera":
+            depth = predictions["gs_depth"][:, :S].reshape(B * S, H, W)
+            pose4x4 = views["camera_poses"][:, :S].reshape(B * S, 4, 4)
+            intrinsic = views["camera_intrs"][:, :S].reshape(B * S, 3, 3)
+            pts3d, _, _ = depth_to_world_coords_points(depth, pose4x4, intrinsic)
+            pts3d = pts3d.reshape(B, S * H * W, 3)
+            splats["means"] = pts3d
+        elif position_from == "gsdepth+predcamera":
+            depth = predictions["gs_depth"][:, :S].reshape(B * S , H, W)
+            pose4x4 = context_predictions.get("camera_poses", predictions["camera_poses"])[:, :S].reshape(B * S, 4, 4)
+            intrinsic = context_predictions.get("camera_intrs", predictions["camera_intrs"])[:, :S].reshape(B * S, 3, 3)
+            pts3d, _, _ = depth_to_world_coords_points(depth, pose4x4.detach(), intrinsic.detach())
+            pts3d = pts3d.reshape(B, S * H * W, 3)
+            splats["means"] = pts3d
+        else:
+            raise ValueError(f"Invalid position_from={position_from}")
+        return splats
+    def prepare_cameras(self, views, nums):
+        viewmats = views['camera_poses'][:, :nums]
+        Ks = views['camera_intrs'][:, :nums]
+        return viewmats, Ks

hyworldmirror/models/models/visual_transformer.py ADDED Viewed

	@@ -0,0 +1,542 @@

+import logging
+import random
+from typing import Tuple, List
+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+from ..layers import PatchEmbed, PatchEmbed_Mlp
+from ..layers.vision_transformer import vit_small, vit_base, vit_large, vit_giant2
+from ..layers.block import Block, DistBlock
+from ...comm.padding import minimal_pad_to_divisible,depad_by_length,pad_by_length
+import torch.distributed as dist
+from ...comm.communication import _All2All,_Allgather
+logger = logging.getLogger(__name__)
+_RESNET_MEAN = [0.485, 0.456, 0.406]
+_RESNET_STD = [0.229, 0.224, 0.225]
+class VisualGeometryTransformer(nn.Module):
+    """
+    The VisualGeometryTransformer applies alternating-attention over input frames,
+    as described in VGGT: Visual Geometry Grounded Transformer.
+    Args:
+        img_size (int): Image size in pixels.
+        patch_size (int): Size of each patch for PatchEmbed.
+        embed_dim (int): Dimension of the token embeddings.
+        depth (int): Number of blocks.
+        num_heads (int): Number of attention heads.
+        mlp_ratio (float): Ratio of MLP hidden dim to embedding dim.
+        num_register_tokens (int): Number of register tokens.
+        block_fn (nn.Module): The block type used for attention (Block by default).
+        qkv_bias (bool): Whether to include bias in QKV projections.
+        proj_bias (bool): Whether to include bias in the output projection.
+        ffn_bias (bool): Whether to include bias in MLP layers.
+        patch_embed (str): Type of patch embed. e.g., "conv" or "dinov2_vitl14_reg".
+        aa_order (list[str]): The order of alternating attention, e.g. ["frame", "global"].
+        qk_norm (bool): Whether to apply QK normalization.
+        rope_base (int): Base frequency for rotary embedding.
+        rope_normalize_coords (str): Normalize coordinates for rotary embedding.
+        rope_shift_coords (float): Shift coordinates for rotary embedding.
+        rope_jitter_coords (float): Jitter coordinates for rotary embedding.
+        rope_rescale_coords (float): Rescale coordinates for rotary embedding.
+        init_values (float): Init scale for layer scale.
+        enable_condition (bool): Whether to enable conditioning inputs.
+        sampling_strategy (str): Sampling strategy for patches.
+        fixed_patch_embed (bool): Whether to fix patch embedding weights.
+        condition_strategy (list[str]): Strategy for each conditioning input.
+    """
+    def __init__(
+        self,
+        img_size=518,
+        patch_size=14,
+        embed_dim=1024,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4.0,
+        num_register_tokens=4,
+        block_fn=Block,
+        qkv_bias=True,
+        proj_bias=True,
+        ffn_bias=True,
+        patch_embed="dinov2_vitl14_reg",
+        qk_norm=True,
+        rope_base=100.0,
+        normalized_rope=False,
+        rope_normalize_coords="separate",
+        rope_shift_coords=None,
+        rope_jitter_coords=None,
+        rope_rescale_coords=None,
+        init_values=0.01,
+        enable_cond=False,
+        sampling_strategy="uniform",
+        fixed_patch_embed=False,
+        condition_strategy=["token", "pow3r", "token"],
+        intermediate_idxs: List[int] = [4, 11, 17, 23]
+    ):
+        super().__init__()
+        # Store config parameters
+        self.enable_cond = enable_cond
+        self.sampling_strategy = sampling_strategy
+        self.cond_methods = condition_strategy
+        self.intermediate_idxs = intermediate_idxs
+        self.depth = depth
+        self.patch_size = patch_size
+        # Initialize patch embedding module
+        self.patch_embed = self._init_patch_embedding_module(
+            patch_embed, img_size, patch_size, num_register_tokens,
+            embed_dim=embed_dim, is_fixed=fixed_patch_embed
+        )
+        # Initialize conditioning embeddings if enabled
+        if self.enable_cond:
+            self._init_cond_embeddings(embed_dim, img_size, patch_size, num_register_tokens)
+        # Initialize rotary position embedding
+        self._init_rotary_position_embedding(rope_base, normalized_rope, embed_dim // num_heads, rope_normalize_coords, rope_shift_coords, rope_jitter_coords, rope_rescale_coords)
+        # Initialize transformer blocks
+        self._init_transformer_blocks(block_fn, embed_dim, num_heads, mlp_ratio, qkv_bias, proj_bias, ffn_bias, init_values, qk_norm)
+        # Initialize learnable tokens
+        self._init_learnable_tokens(embed_dim, num_register_tokens)
+        # Calculate patch start index based on conditioning
+        if self.enable_cond:
+            self.patch_start_idx = 1 + num_register_tokens + 1 + 1  # camera + register + pose + rays
+        else:
+            self.patch_start_idx = 1 + num_register_tokens  # camera + register
+        # Register normalization constants
+        for name, value in (("_resnet_mean", _RESNET_MEAN), ("_resnet_std", _RESNET_STD)):
+            self.register_buffer(name, torch.FloatTensor(value).reshape(1, 1, 3, 1, 1), persistent=False)
+        self.use_reentrant = False
+    def _init_patch_embedding_module(
+        self,
+        patch_embed_type,
+        img_size,
+        patch_size,
+        num_reg_tokens,
+        interpolate_antialias=True,
+        interpolate_offset=0.0,
+        block_chunks=0,
+        init_values=1.0,
+        embed_dim=1024,
+        is_fixed=False,
+        in_chans=3
+    ):
+        """
+        Create the patch embedding module. If 'conv', we use a
+        simple PatchEmbed conv layer. Otherwise, we use a vision transformer.
+        """
+        if "conv" in patch_embed_type:
+            if 'mlp' in patch_embed_type:
+                patch_embed_module = PatchEmbed_Mlp(
+                    img_size=img_size,
+                    patch_size=patch_size,
+                    in_chans=in_chans,
+                    embed_dim=embed_dim
+                )
+            else:
+                patch_embed_module = PatchEmbed(
+                    img_size=img_size,
+                    patch_size=patch_size,
+                    in_chans=in_chans,
+                    embed_dim=embed_dim
+                )
+        else:
+            vit_models = {
+                "dinov2_vitl14_reg": vit_large,
+                "dinov2_vitb14_reg": vit_base,
+                "dinov2_vits14_reg": vit_small,
+                "dinov2_vitg2_reg": vit_giant2,
+            }
+            patch_embed_module = vit_models[patch_embed_type](
+                img_size=img_size,
+                patch_size=patch_size,
+                num_register_tokens=num_reg_tokens,
+                interpolate_antialias=interpolate_antialias,
+                interpolate_offset=interpolate_offset,
+                block_chunks=block_chunks,
+                init_values=init_values,
+            )
+            # Disable gradient updates for mask token
+            if hasattr(patch_embed_module, "mask_token"):
+                patch_embed_module.mask_token.requires_grad_(False)
+        if is_fixed:
+            for param in patch_embed_module.parameters():
+                param.requires_grad_(False)
+        return patch_embed_module
+    def _init_cond_embeddings(self, embed_dim, img_size, patch_size, num_reg_tokens):
+        """Initialize conditioning embeddings for camera, depth, and rays."""
+        assert self.cond_methods is not None
+        assert self.cond_methods[0] == "token"
+        # Camera pose embedding
+        if self.cond_methods[0] == "token":
+            self.pose_embed = nn.Sequential(
+                nn.Linear(7, embed_dim, bias=True),
+                nn.SiLU(),
+                nn.Linear(embed_dim, embed_dim, bias=True)
+            )
+        else:
+            raise NotImplementedError
+        # Depth map embedding
+        if self.cond_methods[1] == "pow3r":
+            self.depth_embed = self._init_patch_embedding_module(
+                "conv+mlp", img_size, patch_size, num_reg_tokens,
+                embed_dim=embed_dim, in_chans=1
+            )
+        else:
+            raise NotImplementedError
+        # Ray direction embedding
+        if self.cond_methods[2] == "token":
+            self.ray_embed = nn.Sequential(
+                nn.Linear(4, embed_dim, bias=True),
+                nn.SiLU(),
+                nn.Linear(embed_dim, embed_dim, bias=True)
+            )
+        else:
+            raise NotImplementedError
+    def _init_rotary_position_embedding(self, rope_base, normalized_rope, head_dim, rope_normalize_coords, rope_shift_coords, rope_jitter_coords, rope_rescale_coords):
+        if normalized_rope:
+            print("[INFO] Using normalized RoPE!")
+            from ..layers.norm_rope import NormalizedRotaryPositionEmbedding2D, PositionGetter
+            if head_dim % 4 != 0:
+                raise ValueError("RoPE requires head_dim divisible by 4 (embed_dim must be divisible by 4*num_heads)")
+            self.rope = NormalizedRotaryPositionEmbedding2D(
+                head_dim=head_dim,
+                base=rope_base,
+                normalize_coords=rope_normalize_coords,
+                shift_coords=rope_shift_coords,
+                jitter_coords=rope_jitter_coords,
+                rescale_coords=rope_rescale_coords,
+            ) if rope_base > 0 else None
+            self.pos_getter = PositionGetter() if self.rope is not None else None
+        else:
+            from ..layers.rope import RotaryPositionEmbedding2D, PositionGetter
+            print("[INFO] Using standard RoPE!")
+            self.rope = RotaryPositionEmbedding2D(
+                frequency=rope_base,
+            ) if rope_base > 0 else None
+            self.pos_getter = PositionGetter() if self.rope is not None else None
+    def _init_transformer_blocks(self, block_fn, embed_dim, num_heads, mlp_ratio, qkv_bias, proj_bias, ffn_bias, init_values, qk_norm):
+        self.frame_blocks = nn.ModuleList([
+            block_fn(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                proj_bias=proj_bias,
+                ffn_bias=ffn_bias,
+                init_values=init_values,
+                qk_norm=qk_norm,
+                rope=self.rope,
+            )
+            for _ in range(self.depth)
+        ])
+        self.global_blocks = nn.ModuleList([
+            block_fn(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                proj_bias=proj_bias,
+                ffn_bias=ffn_bias,
+                init_values=init_values,
+                qk_norm=qk_norm,
+                rope=self.rope
+            )
+            for _ in range(self.depth)
+        ])
+    def _init_learnable_tokens(self, embed_dim, num_reg_tokens):
+        """Initialize learnable tokens."""
+        self.cam_token = nn.Parameter(torch.zeros(1, 2, 1, embed_dim))
+        self.reg_token = nn.Parameter(torch.zeros(1, 2, num_reg_tokens, embed_dim))
+        nn.init.normal_(self.cam_token, std=1e-6)
+        nn.init.normal_(self.reg_token, std=1e-6)
+    def forward(self, images: torch.Tensor, priors: List | None=None, cond_flags: List[int]=[0,0,0], ctx_frames: int=None, enable_bf16=False, sp_size: int=1, sp_group: torch._C._distributed_c10d.ProcessGroup=None) -> Tuple[List[torch.Tensor], int]:
+        """
+        Args:
+            images: Input images with shape [B, S, 3, H, W], in range [0, 1]
+            priors: Optional tuple of (depth, rays, poses) for conditioning
+            cond_flags: List indicating which conditions to use [pose, depth, rays]
+            ctx_frames: Number of context frames to use
+        Returns:
+            (list[torch.Tensor], int): List of attention block outputs and patch_start_idx
+        """
+        depth_maps, ray_dirs, poses = priors if priors is not None else (None, None, None)
+        # Slice to context frames if specified
+        if ctx_frames is not None:
+            for var_name in ['images', 'depth_maps', 'ray_dirs', 'poses']:
+                var = locals()[var_name]
+                if var is not None:
+                    locals()[var_name] = var[:, :ctx_frames].clone()
+        # Process image tokens
+        b, seq_len, ch, h, w = images.shape
+        if ch != 3:
+            raise ValueError(f"Expected 3 input channels, got {ch}")
+        with torch.amp.autocast('cuda', enabled=(not enable_bf16), dtype=torch.bfloat16):
+            images = (images - self._resnet_mean) / self._resnet_std
+            images = images.reshape(b * seq_len, ch, h, w)
+            patch_tokens = self.patch_embed(images)
+            if isinstance(patch_tokens, dict):
+                patch_tokens = patch_tokens["x_norm_patchtokens"]
+        _, patch_count, embed_dim = patch_tokens.shape
+        # Prepare special tokens
+        cam_tokens = expand_and_flatten_special_tokens(self.cam_token, b, seq_len)
+        reg_tokens = expand_and_flatten_special_tokens(self.reg_token, b, seq_len)
+        # Process all tokens (optional conditioning)
+        if self.enable_cond:
+            pose_tokens, depth_tokens, ray_tokens = self._process_conditioning(depth_maps, ray_dirs, poses, b, seq_len, patch_count, embed_dim, images, cond_flags)
+            # Add condition tokens to patch tokens
+            patch_tokens = patch_tokens + depth_tokens
+            all_tokens = torch.cat([cam_tokens, reg_tokens, pose_tokens, ray_tokens, patch_tokens], dim=1)
+        else:
+            all_tokens = torch.cat([cam_tokens, reg_tokens, patch_tokens], dim=1)
+        _, patch_count, embed_dim = all_tokens.shape
+        # Position embedding
+        pos_emb = None
+        if self.rope is not None:
+            pos_emb = self.pos_getter(b * seq_len, h // self.patch_size, w // self.patch_size, device=images.device)
+            if self.patch_start_idx > 0:
+                pos_emb = pos_emb + 1
+                special_pos = torch.zeros(b * seq_len, self.patch_start_idx, 2, device=images.device, dtype=pos_emb.dtype)
+                pos_emb = torch.cat([special_pos, pos_emb], dim=1)
+        if sp_size>1:
+            rank_in_sp_group = dist.get_group_rank(sp_group,dist.get_rank())
+            all_tokens,tk_padding_len = minimal_pad_to_divisible(all_tokens, sp_size, dim=1,pad_value=0)
+            all_tokens = torch.chunk(all_tokens, sp_size,dim=1)[rank_in_sp_group]
+        _, patch_count, embed_dim = all_tokens.shape
+        token_shape = (b, seq_len, patch_count, embed_dim)
+        # Forward through attention blocks
+        with torch.amp.autocast('cuda', enabled=(not enable_bf16), dtype=torch.bfloat16):
+            outputs = []
+            global_tokens = None
+            if sp_size>1:
+                for idx in range(self.depth):
+                    local_tokens = self._process_dist_attention_blocks(
+                                tokens=all_tokens if global_tokens is None else global_tokens,
+                                b=b,
+                                seq_len=seq_len,
+                                patch_count=patch_count,
+                                embed_dim=embed_dim,
+                                block_idx=idx,
+                                blocks=self.frame_blocks,
+                                block_type='frame',
+                                pos=pos_emb,
+                                sp_size = sp_size,
+                                sp_group = sp_group,
+                                padding_tokens = tk_padding_len
+                            )
+                    global_tokens = self._process_dist_attention_blocks(
+                                tokens=local_tokens,
+                                b=b,
+                                seq_len=seq_len,
+                                patch_count=patch_count,
+                                embed_dim=embed_dim,
+                                block_idx=idx,
+                                blocks=self.global_blocks,
+                                block_type='global',
+                                pos=pos_emb,
+                                sp_size = sp_size,
+                                sp_group = sp_group,
+                                padding_tokens = tk_padding_len
+                            )
+                    global_tokens = global_tokens.reshape(b,-1,embed_dim)
+                    global_tokens = _Allgather.apply(global_tokens,1,sp_group,False)
+                    global_tokens = depad_by_length(global_tokens,tk_padding_len*seq_len,1)
+                    global_tokens = global_tokens.reshape(b,seq_len,-1,embed_dim)
+                    global_tokens = pad_by_length(global_tokens,tk_padding_len,2)
+                    global_tokens = torch.chunk(global_tokens, sp_size,dim=2)[rank_in_sp_group]
+                    # Combine frame and global intermediates
+                    if idx in self.intermediate_idxs:
+                        local_tokens = _Allgather.apply(local_tokens,2,sp_group,False)
+                        local_tokens = depad_by_length(local_tokens,tk_padding_len,2)
+                        global_tokens = _Allgather.apply(global_tokens,2,sp_group,False)
+                        global_tokens = depad_by_length(global_tokens,tk_padding_len,2)
+                        combined_out = torch.cat([local_tokens, global_tokens], dim=-1)
+                        outputs.append(combined_out)
+                        global_tokens = pad_by_length(global_tokens,tk_padding_len,2)
+                        global_tokens = torch.chunk(global_tokens, sp_size,dim=2)[rank_in_sp_group]
+            else:
+                for idx in range(self.depth):
+                    local_tokens = self._process_attention_blocks(
+                                tokens=all_tokens if global_tokens is None else global_tokens,
+                                b=b,
+                                seq_len=seq_len,
+                                patch_count=patch_count,
+                                embed_dim=embed_dim,
+                                block_idx=idx,
+                                blocks=self.frame_blocks,
+                                block_type='frame',
+                                pos=pos_emb,
+                            )
+                    global_tokens = self._process_attention_blocks(
+                                tokens=local_tokens,
+                                b=b,
+                                seq_len=seq_len,
+                                patch_count=patch_count,
+                                embed_dim=embed_dim,
+                                block_idx=idx,
+                                blocks=self.global_blocks,
+                                block_type='global',
+                                pos=pos_emb,
+                            )
+                    # Combine frame and global intermediates
+                    if idx in self.intermediate_idxs:
+                        combined_out = torch.cat([local_tokens, global_tokens], dim=-1)
+                        outputs.append(combined_out)
+                # Combine frame and global intermediates
+                if idx in self.intermediate_idxs:
+                    combined_out = torch.cat([local_tokens, global_tokens], dim=-1)
+                    outputs.append(combined_out)
+        return outputs, self.patch_start_idx
+    def _process_conditioning(self, depth_maps, ray_dirs, poses, b, seq_len, patch_count, embed_dim, images, cond_flags):
+        """Process conditioning inputs."""
+        h, w = images.shape[-2:]
+        # Process camera pose embedding
+        use_poses = (cond_flags[0] == 1 and poses is not None)
+        if use_poses:
+            poses = poses.reshape(b*seq_len, -1)
+            pose_tokens = self.pose_embed(poses).unsqueeze(1)
+        else:
+            pose_tokens = torch.zeros((b*seq_len, 1, embed_dim), device=images.device, dtype=images.dtype)
+        # Process depth map embedding
+        use_depth = cond_flags[1] == 1 and depth_maps is not None
+        if use_depth:
+            depth_maps = depth_maps.reshape(b*seq_len, 1, h, w)
+            depth_tokens = self.depth_embed(depth_maps).reshape(b * seq_len, patch_count, embed_dim)
+        else:
+            depth_tokens = torch.zeros((b*seq_len, patch_count, embed_dim), device=images.device, dtype=images.dtype)
+        # Process ray direction embedding
+        use_rays = cond_flags[2] == 1 and ray_dirs is not None
+        if use_rays:
+            ray_dirs = ray_dirs.reshape(b*seq_len, -1)
+            ray_tokens = self.ray_embed(ray_dirs).unsqueeze(1)
+        else:
+            ray_tokens = torch.zeros((b*seq_len, 1, embed_dim), device=images.device, dtype=images.dtype)
+        return pose_tokens, depth_tokens, ray_tokens
+    def _process_attention_blocks(self, tokens, b, seq_len, patch_count, embed_dim, block_idx, blocks, block_type, pos=None):
+        """Process attention blocks with tokens in shape (B*S, P, C)."""
+        token_shape = (b, seq_len, patch_count, embed_dim)
+        if block_type == 'frame': # local
+            target_shape = (b * seq_len, patch_count, embed_dim)
+            pos_target_shape = (b * seq_len, patch_count, 2) if pos is not None else None
+        else:  # global
+            target_shape = (b, seq_len * patch_count, embed_dim)
+            pos_target_shape = (b, seq_len * patch_count, 2) if pos is not None else None
+        if tokens.shape != target_shape:
+            tokens = tokens.reshape(*target_shape)
+        if pos is not None and pos.shape != pos_target_shape:
+            pos = pos.reshape(*pos_target_shape)
+        if self.training:
+            # tokens = blocks[block_idx](tokens, pos=pos)
+            tokens = checkpoint(blocks[block_idx], tokens, pos=pos, use_reentrant=self.use_reentrant)
+        else:
+            tokens = blocks[block_idx](tokens, pos=pos)
+        return tokens.reshape(*token_shape)
+    def _process_dist_attention_blocks(self, tokens, b, seq_len, patch_count, embed_dim, block_idx, blocks, block_type, pos=None,
+                                sp_size = 1,
+                                sp_group = None,
+                                padding_tokens = 0):
+        """
+        Sequence-parallel variant of ``_process_attention_blocks``.
+        Runs ``blocks[block_idx]`` on this rank's token shard and returns the result
+        reshaped to (b, seq_len, patch_count, embed_dim), where ``patch_count`` is the
+        per-rank token count passed in by the caller.
+        Args:
+            tokens: This rank's shard of the token sequence.
+            b, seq_len, patch_count, embed_dim: Sizes used to build the target shapes.
+            block_idx: Index into ``blocks``.
+            blocks: Block list to index (frame or global stack).
+            block_type: 'frame' for per-frame attention, 'global' for attention over
+                the flattened frame*token axis.
+            pos: Optional positions; reshaped to the unpadded full token length
+                (``patch_count*sp_size - padding_tokens`` per frame) with last dim 2.
+            sp_size: Number of ranks in the sequence-parallel group.
+            sp_group: Process group used for the collectives.
+            padding_tokens: Per-frame padding length that was added to the token axis.
+        NOTE(review): the exact semantics of ``_Allgather``, ``pad_by_length`` and
+        ``depad_by_length`` are defined elsewhere; the comments below describe what
+        their names and call sites suggest - confirm against the comm helpers.
+        """
+        token_shape = (b, seq_len, patch_count, embed_dim)
+        if block_type == 'frame': # local
+            target_shape = (b * seq_len, patch_count, embed_dim)
+            pos_target_shape = (b * seq_len, patch_count*sp_size-padding_tokens, 2) if pos is not None else None
+        else:  # global
+            target_shape = (b, seq_len * patch_count, embed_dim)
+            pos_target_shape = (b, seq_len * (patch_count*sp_size-padding_tokens), 2) if pos is not None else None
+            # padding_tokens = padding_tokens*seq_len
+        if block_type=="global":
+            # Re-shard from the per-frame token axis (dim 2) to the flattened
+            # frame*token axis (dim 1). Shape comments are examples from one run.
+            rank_in_sp_group = dist.get_group_rank(sp_group,dist.get_rank())
+            # presumably gathers all shards along dim 2
+            tokens = _Allgather.apply(tokens,2,sp_group,False) # e.g. (1,7,4*146,64)
+            # presumably drops the per-frame padding again
+            tokens = depad_by_length(tokens,padding_tokens,2) # e.g. (1,7,4*146-2,64)
+            tokens = tokens.reshape(b,-1,embed_dim) # e.g. (1,7*(4*146-2),64)
+            # From here on padding_tokens is the padding of the flattened axis;
+            # this scaled value is also what gets passed to the block below
+            padding_tokens = padding_tokens*seq_len
+            tokens = pad_by_length(tokens,padding_tokens,1) # e.g. (1,4088,1024)
+            # Keep only this rank's chunk of the flattened sequence
+            tokens = torch.chunk(tokens, sp_size,dim=1)[rank_in_sp_group]
+        if tokens.shape != target_shape:
+            tokens = tokens.reshape(*target_shape)
+        if pos is not None and pos.shape != pos_target_shape:
+            pos = pos.reshape(*pos_target_shape)
+        if self.training:
+            # Training runs the block under activation checkpointing
+            # tokens = blocks[block_idx](tokens, pos=pos)
+            tokens = checkpoint(blocks[block_idx], tokens, pos=pos, use_reentrant=self.use_reentrant, sp_size=sp_size, sp_group=sp_group, padding_tokens=padding_tokens, block_type =block_type,token_shape=token_shape)
+        else:
+            tokens = blocks[block_idx](tokens, pos=pos, sp_size=sp_size, sp_group=sp_group, padding_tokens=padding_tokens, block_type =block_type,token_shape=token_shape)
+        return tokens.reshape(*token_shape)
+def expand_and_flatten_special_tokens(token_tensor, b, seq_len):
+    """
+    Processes specialized tokens with shape (1, 2, X, C) for multi-frame processing.
+    Uses first position for frame 0, second position for remaining frames.
+    Args:
+        token_tensor: Input tensor with shape (1, 2, X, C)
+        b: Batch size
+        seq_len: Sequence length
+    Returns:
+        torch.Tensor: Processed tokens with shape (B*S, X, C)
+    """
+    # First frame uses position 0, remaining frames use position 1
+    first_frame_tokens = token_tensor[:, 0:1, ...].expand(b, 1, *token_tensor.shape[2:])
+    remaining_frame_tokens = token_tensor[:, 1:, ...].expand(b, seq_len - 1, *token_tensor.shape[2:])
+    # Concatenate and flatten
+    combined_tokens = torch.cat([first_frame_tokens, remaining_frame_tokens], dim=1)
+    return combined_tokens.reshape(b * seq_len, *combined_tokens.shape[2:])

hyworldmirror/models/models/worldmirror.py ADDED Viewed

	@@ -0,0 +1,685 @@

+from typing import Dict, List
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .visual_transformer import VisualGeometryTransformer
+from ..heads.camera_head import CameraHead
+from ..heads.dense_head import DPTHead
+from ..heads.gs_head import GSFeatHead
+from .rasterization import GaussianSplatRenderer
+from ..utils.camera_utils import (
+    vector_to_camera_matrices,
+    extrinsics_to_vector,
+)
+from ..utils.priors import normalize_depth, normalize_poses
+from huggingface_hub import PyTorchModelHubMixin
+from ..layers.block import Block, DistBlock
+import torch.distributed as dist
+class WorldMirror(nn.Module, PyTorchModelHubMixin):
+    def __init__(
+        self,
+        img_size=518,
+        patch_size=14,
+        model_size="large",
+        embed_dim=1024,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4.0,
+        gs_dim=256,
+        num_register_tokens=4,
+        enable_cond=True,
+        enable_cam=True,
+        enable_pts=True,
+        enable_depth=True,
+        enable_depth_mask=True,
+        enable_norm=True,
+        enable_gs=True,
+        enable_bf16=False,
+        patch_embed="dinov2_vitl14_reg",
+        fixed_patch_embed=False,
+        sampling_strategy="uniform",
+        dpt_gradient_checkpoint=False,
+        condition_strategy=["token", "pow3r", "token"],
+        rope_base=100.0,
+        normalized_rope=True,
+        rope_normalize_coords="separate",
+        rope_shift_coords=None,
+        rope_jitter_coords=None,
+        rope_rescale_coords=None,
+        sp_size=1,
+        # Legacy parameters (ignored, kept for checkpoint compatibility)
+        set_sky_region_to_maxdepth=False,
+        disable_gs_depth=False,
+    ):
+        super().__init__()
+        self.intermediate_layer_idx = {
+            "small": [2, 5, 8, 11],
+            "base": [2, 5, 8, 11],
+            "large": [4, 11, 17, 23],
+            "giant": [9, 19, 29, 39],
+        }
+        self.model_size = model_size
+        if model_size == "large":
+            embed_dim = 1024
+            depth = 24
+            num_heads = 16
+            mlp_ratio = 4.0
+            gs_dim = 256
+            num_register_tokens = 4
+        elif model_size == "base":
+            embed_dim = 768
+            depth = 12
+            num_heads = 12
+            mlp_ratio = 4.0
+            gs_dim = 256
+            num_register_tokens = 4
+        elif model_size == "small":
+            embed_dim = 384
+            depth = 12
+            num_heads = 6
+            mlp_ratio = 4.0
+            gs_dim = 128
+            num_register_tokens = 4
+        elif model_size is None:
+            pass
+        print(
+            f"[WorldMirror] model_size: {model_size}, embed_dim: {embed_dim}, "
+            f"depth: {depth}, num_heads: {num_heads}, mlp_ratio: {mlp_ratio}, "
+            f"gs_dim: {gs_dim}, num_register_tokens: {num_register_tokens}"
+        )
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.embed_dim = embed_dim
+        self.depth = depth
+        self.num_heads = num_heads
+        self.mlp_ratio = mlp_ratio
+        self.gs_dim = gs_dim
+        self.num_register_tokens = num_register_tokens
+        self.normalized_rope = normalized_rope
+        self.rope_normalize_coords = rope_normalize_coords
+        self.rope_shift_coords = rope_shift_coords
+        self.rope_jitter_coords = rope_jitter_coords
+        self.rope_rescale_coords = rope_rescale_coords
+        self.enable_cam = enable_cam
+        self.enable_pts = enable_pts
+        self.enable_depth = enable_depth
+        self.enable_depth_mask = enable_depth_mask
+        self.enable_cond = enable_cond
+        self.enable_norm = enable_norm
+        self.enable_gs = enable_gs
+        self.enable_bf16 = enable_bf16
+        self.patch_embed = patch_embed
+        self.sampling = sampling_strategy
+        self.dpt_checkpoint = dpt_gradient_checkpoint
+        self.cond_methods = condition_strategy
+        self.config = self._store_config()
+        self.sp_size = sp_size
+        self.visual_geometry_transformer = VisualGeometryTransformer(
+            img_size=img_size,
+            patch_size=patch_size,
+            embed_dim=embed_dim,
+            depth=depth,
+            num_heads=num_heads,
+            mlp_ratio=mlp_ratio,
+            num_register_tokens=num_register_tokens,
+            block_fn=Block if self.sp_size == 1 else DistBlock,
+            normalized_rope=normalized_rope,
+            rope_normalize_coords=rope_normalize_coords,
+            rope_shift_coords=rope_shift_coords,
+            rope_jitter_coords=rope_jitter_coords,
+            rope_rescale_coords=rope_rescale_coords,
+            enable_cond=enable_cond,
+            sampling_strategy=sampling_strategy,
+            patch_embed=patch_embed,
+            fixed_patch_embed=fixed_patch_embed,
+            condition_strategy=condition_strategy,
+            intermediate_idxs=self.intermediate_layer_idx[model_size],
+        )
+        self._init_heads(embed_dim, patch_size, gs_dim)
+        if enable_bf16:
+            self.to = self._bf16_to
+    def _store_config(self):
+        """Save the model configuration."""
+        return {
+            "img_size": self.img_size,
+            "patch_size": self.patch_size,
+            "embed_dim": self.embed_dim,
+            "depth": self.depth,
+            "num_heads": self.num_heads,
+            "mlp_ratio": self.mlp_ratio,
+            "gs_dim": self.gs_dim,
+            "num_register_tokens": self.num_register_tokens,
+            "normalized_rope": self.normalized_rope,
+            "rope_normalize_coords": self.rope_normalize_coords,
+            "rope_shift_coords": self.rope_shift_coords,
+            "rope_jitter_coords": self.rope_jitter_coords,
+            "rope_rescale_coords": self.rope_rescale_coords,
+            "enable_cam": self.enable_cam,
+            "enable_pts": self.enable_pts,
+            "enable_depth": self.enable_depth,
+            "enable_depth_mask": self.enable_depth_mask,
+            "enable_norm": self.enable_norm,
+            "enable_gs": self.enable_gs,
+            "patch_embed": self.patch_embed,
+            "sampling_strategy": self.sampling,
+            "dpt_gradient_checkpoint": self.dpt_checkpoint,
+            "condition_strategy": self.cond_methods,
+            "model_size": self.model_size,
+        }
+    def _init_heads(self, dim, patch_size, gs_dim):
+        """Initialize all prediction heads."""
+        if self.enable_cam:
+            self.cam_head = CameraHead(
+                dim_in=2 * dim,
+                block_fn=Block if self.sp_size == 1 else DistBlock,
+            )
+        if self.enable_pts:
+            self.pts_head = DPTHead(
+                dim_in=2 * dim,
+                output_dim=4,
+                patch_size=patch_size,
+                activation="inv_log+expp1",
+                gradient_checkpoint=self.dpt_checkpoint,
+            )
+        if self.enable_depth:
+            self.depth_head = DPTHead(
+                dim_in=2 * dim,
+                output_dim=2 if not self.enable_depth_mask else 3,
+                patch_size=patch_size,
+                activation="exp+expp1" if not self.enable_depth_mask else "exp+expp1+linear",
+                enable_depth_mask=self.enable_depth_mask,
+                gradient_checkpoint=self.dpt_checkpoint,
+            )
+        if self.enable_norm:
+            self.norm_head = DPTHead(
+                dim_in=2 * dim,
+                output_dim=4,
+                patch_size=patch_size,
+                activation="norm+expp1",
+                gradient_checkpoint=self.dpt_checkpoint,
+            )
+        if self.enable_gs:
+            self.gs_head = DPTHead(
+                dim_in=2 * dim,
+                output_dim=2 if not self.enable_depth_mask else 3,
+                patch_size=patch_size,
+                features=gs_dim,
+                is_gsdpt=True,
+                activation="exp+expp1" if not self.enable_depth_mask else "exp+expp1+linear",
+                enable_depth_mask=self.enable_depth_mask,
+                gradient_checkpoint=self.dpt_checkpoint,
+            )
+            self.gs_renderer = GaussianSplatRenderer(
+                feature_dim=gs_dim,
+                sh_degree=0,
+                enable_prune=True,
+                voxel_size=0.002,
+            )
+    def _bf16_to(self, *args, **kwargs):
+        """Custom to() for bf16 mode: selectively move heads to target device/dtype."""
+        self.visual_geometry_transformer = self.visual_geometry_transformer.to(*args, **kwargs)
+        if self.enable_cam:
+            self.cam_head = self.cam_head.to(*args, **kwargs)
+        if self.enable_pts:
+            self.pts_head = self.pts_head.to(*args, **kwargs)
+        if self.enable_depth:
+            self.depth_head = self.depth_head.to(*args, **kwargs)
+        if self.enable_norm:
+            self.norm_head = self.norm_head.to(*args, **kwargs)
+        if self.enable_gs:
+            self.gs_head = self.gs_head.to(*args, **kwargs)
+            self.gs_renderer = self.gs_renderer.to(*args, **kwargs)
+        return self
+    def forward(
+        self,
+        views: Dict[str, torch.Tensor],
+        cond_flags: List[int] = [0, 0, 0],
+        is_inference=True,
+        sp_size=1,
+        sp_group=None,
+    ):
+        """Execute forward pass through the WorldMirror model.
+        Args:
+            views: Input data dictionary containing 'img' and optional priors.
+            cond_flags: Conditioning flags [pose, depth, intrinsics].
+            is_inference: Whether running in inference mode.
+            sp_size: Sequence parallel size (>1 for multi-GPU).
+            sp_group: Process group for SP communication.
+        Returns:
+            dict: Prediction results dictionary.
+        """
+        if self.enable_bf16:
+            views['img'] = views['img'].to(torch.bfloat16)
+        imgs = views["img"]
+        use_cond = sum(cond_flags) > 0
+        if use_cond:
+            priors = self.extract_priors(views)
+            token_list, patch_start_idx = self.visual_geometry_transformer(
+                imgs, priors, cond_flags=cond_flags,
+                enable_bf16=self.enable_bf16, sp_size=sp_size, sp_group=sp_group,
+            )
+        else:
+            token_list, patch_start_idx = self.visual_geometry_transformer(
+                imgs, enable_bf16=self.enable_bf16, sp_size=sp_size, sp_group=sp_group,
+            )
+        with torch.amp.autocast('cuda', enabled=(not self.enable_bf16), dtype=torch.float32):
+            if sp_size > 1:
+                preds = self._gen_all_preds_frame_sp(
+                    token_list, imgs, patch_start_idx, views, cond_flags,
+                    is_inference, sp_size, sp_group,
+                )
+            else:
+                preds = self._gen_all_preds(
+                    token_list, imgs, patch_start_idx, views, cond_flags, is_inference,
+                )
+        return preds
+    def _gen_all_preds_frame_sp(
+        self, token_list, imgs, patch_start_idx, views, cond_flags, is_inference,
+        sp_size, sp_group,
+    ):
+        """Generate predictions with frame-parallel DPT heads for SP inference.
+        The S frames (dim 1 of ``imgs``) are split into contiguous chunks, one per
+        rank of ``sp_group``. When S >= sp_size the first ``S % sp_size`` ranks get
+        one extra frame; when S < sp_size the first S ranks get one frame each and
+        the remaining ranks get none. Each rank runs the enabled dense heads on its
+        own chunk and the per-rank results are reassembled along dim 1 with
+        ``_frame_allgather_variable``.
+        The camera head always runs on the full ``token_list`` on every rank.
+        Ranks without frames build zero-length placeholder tensors so that they
+        still take part in every gather.
+        Args:
+            token_list: Backbone token tensors, sliced along dim 1 per rank.
+            imgs: Input frames, unpacked as (B, S, C, H, W).
+            patch_start_idx: Forwarded unchanged to every dense head.
+            views: Input view dictionary (forwarded to context prep and rendering).
+            cond_flags: Conditioning flags (forwarded to ``prepare_contexts``).
+            is_inference: Inference/training switch (forwarded).
+            sp_size: Number of ranks the frames are split across.
+            sp_group: Process group used for the gathers.
+        Returns:
+            dict: Prediction dictionary holding the gathered full-length outputs.
+        """
+        preds = {}
+        rank = dist.get_rank()
+        rank_in_sp = dist.get_group_rank(sp_group, rank)
+        B, S, C_img, H, W = imgs.shape
+        # Determine frame assignment for this rank
+        if S >= sp_size:
+            base_chunk = S // sp_size
+            remainder = S % sp_size
+            if rank_in_sp < remainder:
+                # One of the first `remainder` ranks: takes one extra frame.
+                my_count = base_chunk + 1
+                my_start = rank_in_sp * (base_chunk + 1)
+            else:
+                # Skip past the enlarged chunks owned by the first `remainder` ranks.
+                my_count = base_chunk
+                my_start = remainder * (base_chunk + 1) + (rank_in_sp - remainder) * base_chunk
+        else:
+            # Fewer frames than ranks: one frame per rank until frames run out.
+            if rank_in_sp < S:
+                my_count = 1
+                my_start = rank_in_sp
+            else:
+                my_count = 0
+                my_start = S
+        my_end = my_start + my_count
+        has_frames = my_count > 0
+        if has_frames:
+            token_list_chunk = [t[:, my_start:my_end].contiguous() for t in token_list]
+            imgs_chunk = imgs[:, my_start:my_end].contiguous()
+        # Camera head: runs on ALL frames on every rank (cross-view attention)
+        if self.enable_cam:
+            cam_seq = self.cam_head(token_list)
+            # Only the last element of the returned sequence is used.
+            cam_params = cam_seq[-1]
+            preds["camera_params"] = cam_params
+            c2w_mat, int_mat = self.transform_camera_vector(cam_params, H, W)
+            preds["camera_poses"] = c2w_mat
+            preds["camera_intrs"] = int_mat
+        # DPT heads: frame-parallel
+        if self.enable_depth:
+            if has_frames:
+                if self.enable_depth_mask:
+                    depth_chunk, depth_conf_chunk, depth_mask_logits_chunk = self.depth_head(
+                        token_list_chunk, images=imgs_chunk, patch_start_idx=patch_start_idx,
+                    )
+                else:
+                    depth_chunk, depth_conf_chunk = self.depth_head(
+                        token_list_chunk, images=imgs_chunk, patch_start_idx=patch_start_idx,
+                    )
+            else:
+                # Zero-frame placeholders so this rank still joins the gathers below.
+                # NOTE(review): shapes/dtype are assumed to match the head outputs — confirm.
+                depth_chunk = torch.zeros(B, 0, H, W, 1, dtype=imgs.dtype, device=imgs.device)
+                depth_conf_chunk = torch.zeros(B, 0, H, W, dtype=imgs.dtype, device=imgs.device)
+                if self.enable_depth_mask:
+                    depth_mask_logits_chunk = torch.zeros(B, 0, H, W, dtype=imgs.dtype, device=imgs.device)
+            preds["depth"] = self._frame_allgather_variable(depth_chunk, my_count, S, sp_size, sp_group, dim=1)
+            preds["depth_conf"] = self._frame_allgather_variable(depth_conf_chunk, my_count, S, sp_size, sp_group, dim=1)
+            if self.enable_depth_mask:
+                depth_mask_logits_full = self._frame_allgather_variable(
+                    depth_mask_logits_chunk, my_count, S, sp_size, sp_group, dim=1,
+                )
+                preds["depth_mask_logits"] = depth_mask_logits_full
+                preds["depth_mask"] = depth_mask_logits_full.sigmoid()
+        if self.enable_pts:
+            if has_frames:
+                pts_chunk, pts_conf_chunk = self.pts_head(
+                    token_list_chunk, images=imgs_chunk, patch_start_idx=patch_start_idx,
+                )
+            else:
+                # Zero-frame placeholders (see the depth branch above).
+                pts_chunk = torch.zeros(B, 0, H, W, 3, dtype=imgs.dtype, device=imgs.device)
+                pts_conf_chunk = torch.zeros(B, 0, H, W, dtype=imgs.dtype, device=imgs.device)
+            preds["pts3d"] = self._frame_allgather_variable(pts_chunk, my_count, S, sp_size, sp_group, dim=1)
+            preds["pts3d_conf"] = self._frame_allgather_variable(pts_conf_chunk, my_count, S, sp_size, sp_group, dim=1)
+        if self.enable_norm:
+            if has_frames:
+                normals_chunk, norm_conf_chunk = self.norm_head(
+                    token_list_chunk, images=imgs_chunk, patch_start_idx=patch_start_idx,
+                )
+            else:
+                # Zero-frame placeholders (see the depth branch above).
+                normals_chunk = torch.zeros(B, 0, H, W, 3, dtype=imgs.dtype, device=imgs.device)
+                norm_conf_chunk = torch.zeros(B, 0, H, W, dtype=imgs.dtype, device=imgs.device)
+            preds["normals"] = self._frame_allgather_variable(normals_chunk, my_count, S, sp_size, sp_group, dim=1)
+            preds["normals_conf"] = self._frame_allgather_variable(norm_conf_chunk, my_count, S, sp_size, sp_group, dim=1)
+        # GS head: frame-parallel, then render on full gathered data
+        if self.enable_gs:
+            context_preds, context_nums = self.prepare_contexts(views, cond_flags, is_inference)
+            # Fall back to the full-view tokens/images when no context entries were produced.
+            gs_token_list = context_preds.get("token_list", token_list)
+            gs_imgs = context_preds.get("imgs", imgs)
+            gs_S = gs_imgs.shape[1]
+            # The frame split computed above is only reused when the GS input has
+            # the same number of frames as `imgs`.
+            if gs_S == S and has_frames:
+                gs_token_chunk = [t[:, my_start:my_end].contiguous() for t in gs_token_list]
+                gs_imgs_chunk = gs_imgs[:, my_start:my_end].contiguous()
+                if self.enable_depth_mask:
+                    gs_feat_chunk, gs_depth_chunk, gs_depth_conf_chunk, gs_dmask_chunk = self.gs_head(
+                        gs_token_chunk, images=gs_imgs_chunk, patch_start_idx=patch_start_idx,
+                    )
+                else:
+                    gs_feat_chunk, gs_depth_chunk, gs_depth_conf_chunk = self.gs_head(
+                        gs_token_chunk, images=gs_imgs_chunk, patch_start_idx=patch_start_idx,
+                    )
+                gs_feat = self._frame_allgather_variable(gs_feat_chunk, my_count, gs_S, sp_size, sp_group, dim=1)
+                gs_depth = self._frame_allgather_variable(gs_depth_chunk, my_count, gs_S, sp_size, sp_group, dim=1)
+                gs_depth_conf = self._frame_allgather_variable(gs_depth_conf_chunk, my_count, gs_S, sp_size, sp_group, dim=1)
+                if self.enable_depth_mask:
+                    gs_depth_mask_logits = self._frame_allgather_variable(
+                        gs_dmask_chunk, my_count, gs_S, sp_size, sp_group, dim=1,
+                    )
+                    preds["gs_depth_mask_logits"] = gs_depth_mask_logits
+                    preds["gs_depth_mask"] = gs_depth_mask_logits.sigmoid()
+            elif gs_S == S and not has_frames:
+                # This rank owns no frames: contribute zero-length tensors so the
+                # gathers line up with the ranks that ran the GS head.
+                # NOTE(review): the feature placeholder uses gs_dim // 2 channels at
+                # dim 2 — presumably the GS head's feature layout; confirm.
+                gs_feat_c = self.gs_dim // 2
+                gs_feat_chunk = torch.zeros(B, 0, gs_feat_c, H, W, dtype=imgs.dtype, device=imgs.device)
+                gs_depth_chunk = torch.zeros(B, 0, H, W, 1, dtype=imgs.dtype, device=imgs.device)
+                gs_depth_conf_chunk = torch.zeros(B, 0, H, W, dtype=imgs.dtype, device=imgs.device)
+                gs_feat = self._frame_allgather_variable(gs_feat_chunk, 0, gs_S, sp_size, sp_group, dim=1)
+                gs_depth = self._frame_allgather_variable(gs_depth_chunk, 0, gs_S, sp_size, sp_group, dim=1)
+                gs_depth_conf = self._frame_allgather_variable(gs_depth_conf_chunk, 0, gs_S, sp_size, sp_group, dim=1)
+                if self.enable_depth_mask:
+                    gs_dmask_chunk = torch.zeros(B, 0, H, W, dtype=imgs.dtype, device=imgs.device)
+                    gs_depth_mask_logits = self._frame_allgather_variable(
+                        gs_dmask_chunk, 0, gs_S, sp_size, sp_group, dim=1,
+                    )
+                    preds["gs_depth_mask_logits"] = gs_depth_mask_logits
+                    preds["gs_depth_mask"] = gs_depth_mask_logits.sigmoid()
+            else:
+                # Frame counts differ (gs_S != S): run the GS head on all GS frames
+                # on this rank, with no split and no gather.
+                if self.enable_depth_mask:
+                    gs_feat, gs_depth, gs_depth_conf, gs_depth_mask_logits = self.gs_head(
+                        gs_token_list, images=gs_imgs, patch_start_idx=patch_start_idx,
+                    )
+                    preds["gs_depth_mask_logits"] = gs_depth_mask_logits
+                    preds["gs_depth_mask"] = gs_depth_mask_logits.sigmoid()
+                else:
+                    gs_feat, gs_depth, gs_depth_conf = self.gs_head(
+                        gs_token_list, images=gs_imgs, patch_start_idx=patch_start_idx,
+                    )
+            preds["gs_depth"] = gs_depth
+            preds["gs_depth_conf"] = gs_depth_conf
+            # The renderer receives the full `imgs` and returns the updated prediction dict.
+            preds = self.gs_renderer.render(
+                gs_feats=gs_feat,
+                images=imgs,
+                predictions=preds,
+                views=views,
+                context_predictions=context_preds,
+                is_inference=is_inference,
+            )
+        return preds
+    def _frame_allgather_variable(self, chunk, my_count, total_S, sp_size, sp_group, dim=1):
+        """Allgather tensors with potentially variable chunk sizes across ranks.
+        Pads each chunk to max_chunk_size, allgathers, then extracts valid frames
+        from each rank's chunk to reconstruct the correct frame order.
+        """
+        if sp_size <= 1:
+            return chunk
+        if total_S >= sp_size:
+            base_chunk = total_S // sp_size
+            remainder = total_S % sp_size
+            counts = [(base_chunk + 1) if r < remainder else base_chunk
+                      for r in range(sp_size)]
+        else:
+            counts = [1 if r < total_S else 0 for r in range(sp_size)]
+        max_chunk = max(counts)
+        current_size = chunk.shape[dim]
+        if current_size < max_chunk:
+            pad_size = max_chunk - current_size
+            pad_shape = list(chunk.shape)
+            pad_shape[dim] = pad_size
+            padding = torch.zeros(pad_shape, dtype=chunk.dtype, device=chunk.device)
+            chunk = torch.cat([chunk, padding], dim=dim)
+        chunk = chunk.contiguous()
+        gathered_list = [torch.zeros_like(chunk) for _ in range(sp_size)]
+        dist.all_gather(gathered_list, chunk, group=sp_group)
+        valid_chunks = []
+        for r in range(sp_size):
+            cnt = counts[r]
+            if cnt > 0:
+                slices = [slice(None)] * gathered_list[r].dim()
+                slices[dim] = slice(0, cnt)
+                valid_chunks.append(gathered_list[r][tuple(slices)])
+        return torch.cat(valid_chunks, dim=dim).contiguous()
+    def _gen_all_preds(
+        self, token_list, imgs, patch_start_idx, views, cond_flags, is_inference
+    ):
+        """Generate all enabled predictions (single-GPU path)."""
+        preds = {}
+        if self.enable_cam:
+            cam_seq = self.cam_head(token_list)
+            cam_params = cam_seq[-1]
+            preds["camera_params"] = cam_params
+            c2w_mat, int_mat = self.transform_camera_vector(
+                cam_params, imgs.shape[-2], imgs.shape[-1]
+            )
+            preds["camera_poses"] = c2w_mat
+            preds["camera_intrs"] = int_mat
+        if self.enable_depth:
+            if self.enable_depth_mask:
+                depth, depth_conf, depth_mask_logits = self.depth_head(
+                    token_list, images=imgs, patch_start_idx=patch_start_idx,
+                )
+                preds["depth_mask_logits"] = depth_mask_logits
+                preds["depth_mask"] = depth_mask_logits.sigmoid()
+            else:
+                depth, depth_conf = self.depth_head(
+                    token_list, images=imgs, patch_start_idx=patch_start_idx,
+                )
+            preds["depth"] = depth
+            preds["depth_conf"] = depth_conf
+        if self.enable_pts:
+            pts, pts_conf = self.pts_head(
+                token_list, images=imgs, patch_start_idx=patch_start_idx,
+            )
+            preds["pts3d"] = pts
+            preds["pts3d_conf"] = pts_conf
+        if self.enable_norm:
+            normals, norm_conf = self.norm_head(
+                token_list, images=imgs, patch_start_idx=patch_start_idx,
+            )
+            preds["normals"] = normals
+            preds["normals_conf"] = norm_conf
+        if self.enable_gs:
+            context_preds, context_nums = self.prepare_contexts(views, cond_flags, is_inference)
+            if self.enable_depth_mask:
+                gs_feat, gs_depth, gs_depth_conf, gs_depth_mask_logits = self.gs_head(
+                    context_preds.get("token_list", token_list),
+                    images=context_preds.get("imgs", imgs),
+                    patch_start_idx=patch_start_idx,
+                )
+                preds["gs_depth_mask_logits"] = gs_depth_mask_logits
+                preds["gs_depth_mask"] = gs_depth_mask_logits.sigmoid()
+            else:
+                gs_feat, gs_depth, gs_depth_conf = self.gs_head(
+                    context_preds.get("token_list", token_list),
+                    images=context_preds.get("imgs", imgs),
+                    patch_start_idx=patch_start_idx,
+                )
+            preds["gs_depth"] = gs_depth
+            preds["gs_depth_conf"] = gs_depth_conf
+            preds = self.gs_renderer.render(
+                gs_feats=gs_feat,
+                images=imgs,
+                predictions=preds,
+                views=views,
+                context_predictions=context_preds,
+                is_inference=is_inference,
+            )
+        return preds
+    def extract_priors(self, views):
+        """Extract and normalize geometric priors from input views.
+        Returns (depths, rays, poses) tuple — each may be None if unavailable.
+        """
+        h, w = views["img"].shape[-2:]
+        depths = rays = poses = None
+        if "camera_poses" in views:
+            extrinsics = views["camera_poses"][:, :, :3]
+            extrinsics = normalize_poses(extrinsics)
+            cam_params = extrinsics_to_vector(extrinsics)
+            poses = cam_params[:, :, :7]
+            if self.enable_bf16:
+                poses = poses.to(torch.bfloat16)
+        if "depthmap" in views:
+            depth_h, depth_w = views["depthmap"].shape[-2:]
+            depths = views["depthmap"]
+            if depth_h != h or depth_w != w:
+                depths = F.interpolate(depths, size=(h, w), mode="bilinear", align_corners=False)
+            depths = normalize_depth(depths)
+            if self.enable_bf16:
+                depths = depths.to(torch.bfloat16)
+        if "camera_intrs" in views:
+            intrinsics = views["camera_intrs"][:, :, :3, :3]
+            fx, fy = intrinsics[:, :, 0, 0] / w, intrinsics[:, :, 1, 1] / h
+            cx, cy = intrinsics[:, :, 0, 2] / w, intrinsics[:, :, 1, 2] / h
+            rays = torch.stack([fx, fy, cx, cy], dim=-1)
+            if self.enable_bf16:
+                rays = rays.to(torch.bfloat16)
+        return (depths, rays, poses)
+    def transform_camera_vector(self, camera_params, h, w):
+        """Convert camera parameter vector to c2w and intrinsic matrices."""
+        ext_mat, int_mat = vector_to_camera_matrices(camera_params, image_hw=(h, w))
+        homo_row = torch.tensor([0, 0, 0, 1], device=ext_mat.device).view(1, 1, 1, 4)
+        homo_row = homo_row.repeat(ext_mat.shape[0], ext_mat.shape[1], 1, 1)
+        w2c_mat = torch.cat([ext_mat, homo_row], dim=2)
+        try:
+            c2w_mat = torch.linalg.inv(w2c_mat)
+        except Exception as e:
+            print(f"[WorldMirror] linalg.inv fallback to CPU: {e}")
+            c2w_mat = torch.linalg.inv(w2c_mat.cpu()).to(camera_params.device)
+        return c2w_mat, int_mat
+    def prepare_contexts(self, views, cond_flags, is_inference):
+        """Prepare context views for GS rendering (training only, passthrough in inference)."""
+        context_preds = {}
+        if is_inference:
+            return context_preds, views["img"].shape[1]
+        assert self.enable_cam and self.enable_gs
+        if "is_target" not in views:
+            context_nums = views["img"].shape[1]
+        else:
+            context_nums = (views["is_target"][0] == False).sum().item()
+        context_imgs = views["img"][:, :context_nums]
+        use_cond = sum(cond_flags) > 0
+        if self.enable_bf16:
+            context_imgs = context_imgs.to(torch.bfloat16)
+        with torch.amp.autocast('cuda', enabled=(not self.enable_bf16), dtype=torch.bfloat16):
+            if use_cond:
+                priors = self.extract_priors(views)
+                context_priors = (
+                    prior[:, :context_nums] if prior is not None else None
+                    for prior in priors
+                )
+                context_token_list, _ = self.visual_geometry_transformer(
+                    context_imgs, context_priors, cond_flags=cond_flags,
+                    enable_bf16=self.enable_bf16,
+                )
+            else:
+                context_token_list, _ = self.visual_geometry_transformer(
+                    context_imgs, enable_bf16=self.enable_bf16,
+                )
+        context_cam_seq = self.cam_head(context_token_list)
+        context_cam_params = context_cam_seq[-1]
+        context_c2w_mat, context_int_mat = self.transform_camera_vector(
+            context_cam_params, context_imgs.shape[-2], context_imgs.shape[-1]
+        )
+        context_preds["camera_poses"] = context_c2w_mat
+        context_preds["camera_intrs"] = context_int_mat
+        context_preds["token_list"] = context_token_list
+        context_preds["imgs"] = context_imgs
+        return context_preds, context_nums

hyworldmirror/models/utils/__init__.py ADDED Viewed

File without changes

hyworldmirror/models/utils/act_gs.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import torch
+from einops import rearrange
+def reg_dense_offsets(xyz, shift=6.0):
+    d = xyz.norm(dim=-1, keepdim=True)
+    return xyz / d.clamp(min=1e-8) * (torch.exp(d - shift) - torch.exp(-shift))
+def reg_dense_scales(scales):
+    return scales.exp()
+def reg_dense_rotation(rotations, eps=1e-8):
+    return rotations / (rotations.norm(dim=-1, keepdim=True) + eps)
+def reg_dense_sh(sh):
+    return rearrange(sh, '... (d_sh xyz) -> ... d_sh xyz', xyz=3)
+def reg_dense_opacities(opacities):
+    return opacities.sigmoid()
+def reg_dense_weights(weights):
+    return weights.sigmoid()

hyworldmirror/models/utils/camera_utils.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import torch
+from .rotation import quat_to_rotmat, rotmat_to_quat
+def camera_params_to_vector(
+    ext, intr, image_hw=None
+):
+    """Convert camera matrices to a compact vector."""
+    # ext: (..., 3, 4): Camera-to-world extrinsic [R|t]
+    # intr: (..., 3, 3): Intrinsics
+    # image_hw: (h, w)
+    R = ext[..., :3, :3]           # Rotation part
+    t = ext[..., :3, 3]            # Translation part
+    q = rotmat_to_quat(R)  # Quaternion (wxyz)
+    h, w = image_hw
+    fov_v = 2.0 * torch.atan(h * 0.5 / intr[..., 1, 1])  # Vertical FOV
+    fov_u = 2.0 * torch.atan(w * 0.5 / intr[..., 0, 0])  # Horizontal FOV
+    vec = torch.stack([
+        t[..., 0], t[..., 1], t[..., 2],
+        q[..., 0], q[..., 1], q[..., 2], q[..., 3],
+        fov_v, fov_u
+    ], dim=-1).float()
+    return vec
+def extrinsics_to_vector(ext):
+    """Convert extrinsics to [t, q] vector."""
+    # ext: (..., 3, 4)
+    R = ext[..., :3, :3]
+    t = ext[..., :3, 3]
+    q = rotmat_to_quat(R)
+    vec = torch.stack([
+        t[..., 0], t[..., 1], t[..., 2],
+        q[..., 0], q[..., 1], q[..., 2], q[..., 3]
+    ], dim=-1).float()
+    return vec
+def vector_to_extrinsics(cam_vec):
+    """Convert [t, q] vector to extrinsic matrix."""
+    # cam_vec: (..., 7)
+    q = cam_vec[..., 3:7]
+    t = cam_vec[..., :3]
+    R = quat_to_rotmat(q)
+    ext = torch.cat([R, t.unsqueeze(-1)], dim=-1)
+    return ext
+def vector_to_camera_matrices(
+    cam_vec, image_hw=None, build_intr=True
+):
+    """Reconstruct extrinsic and intrinsic matrix from vector."""
+    # cam_vec: (..., 9)
+    intr = None
+    # Decompose vector
+    t = cam_vec[..., 0:3]
+    q = cam_vec[..., 3:7]
+    fov_v = cam_vec[..., 7]
+    fov_u = cam_vec[..., 8]
+    # Build extrinsic: [R|t]
+    R = quat_to_rotmat(q)
+    ext = torch.cat([R, t.unsqueeze(-1)], dim=-1)
+    # Build intrinsic if needed
+    if build_intr:
+        h, w = image_hw
+        fy = h * 0.5 / torch.tan(fov_v * 0.5)
+        fx = w * 0.5 / torch.tan(fov_u * 0.5)
+        shape = cam_vec.shape[:-1] + (3, 3)
+        intr = torch.zeros(shape, device=cam_vec.device, dtype=cam_vec.dtype)
+        intr[..., 0, 0] = fx
+        intr[..., 1, 1] = fy
+        intr[..., 0, 2] = w * 0.5
+        intr[..., 1, 2] = h * 0.5
+        intr[..., 2, 2] = 1.0
+    return ext, intr

hyworldmirror/models/utils/frustum.py ADDED Viewed

	@@ -0,0 +1,196 @@

+import einops
+import torch
+# Calculate the loss mask for the target views in the batch
+@torch.no_grad()
+def calculate_unprojected_mask(views, context_nums):
+    '''Calcuate the loss mask for the target views in the batch'''
+    target_depth = views["depthmap"][:, context_nums:]
+    target_intrinsics = views["camera_intrs"][:, context_nums:]
+    target_c2w = views["camera_poses"][:, context_nums:]
+    context_depth = views["depthmap"][:, :context_nums]
+    context_intrinsics = views["camera_intrs"][:, :context_nums]
+    context_c2w = views["camera_poses"][:, :context_nums]
+    target_intrinsics = target_intrinsics[..., :3, :3]
+    context_intrinsics = context_intrinsics[..., :3, :3]
+    mask = calculate_in_frustum_mask(
+        target_depth, target_intrinsics, target_c2w,
+        context_depth, context_intrinsics, context_c2w
+    )
+    return mask
+@torch.no_grad()
+def calculate_in_frustum_mask(depth_1, intrinsics_1, c2w_1, depth_2, intrinsics_2, c2w_2):
+    """
+    A function that takes in the depth, intrinsics and c2w matrices of two sets
+    of views, and then works out which of the pixels in the first set of views
+    has a direct corresponding pixel in any of views in the second set
+    Args:
+        depth_1: (b, v1, h, w)
+        intrinsics_1: (b, v1, 3, 3)
+        c2w_1: (b, v1, 4, 4)
+        depth_2: (b, v2, h, w)
+        intrinsics_2: (b, v2, 3, 3)
+        c2w_2: (b, v2, 4, 4)
+    Returns:
+        torch.Tensor: valid mask with shape (b, v1, v2, h, w).
+    """
+    _, v1, h, w = depth_1.shape
+    _, v2, _, _ = depth_2.shape
+    # Unproject the depth to get the 3D points in world space
+    points_3d = unproject_depth(depth_1[..., None], intrinsics_1, c2w_1)  # (b, v1, h, w, 3)
+    # Project the 3D points into the pixel space of all the second views simultaneously
+    camera_points = world_space_to_camera_space(points_3d, c2w_2)  # (b, v1, v2, h, w, 3)
+    points_2d = camera_space_to_pixel_space(camera_points, intrinsics_2)  # (b, v1, v2, h, w, 2)
+    # Calculate the depth of each point
+    rendered_depth = camera_points[..., 2]  # (b, v1, v2, h, w)
+    # We use three conditions to determine if a point should be masked
+    # Condition 1: Check if the points are in the frustum of any of the v2 views
+    in_frustum_mask = (
+        (points_2d[..., 0] > 0) &
+        (points_2d[..., 0] < w) &
+        (points_2d[..., 1] > 0) &
+        (points_2d[..., 1] < h)
+    )  # (b, v1, v2, h, w)
+    in_frustum_mask = in_frustum_mask.any(dim=-3)  # (b, v1, h, w)
+    # Condition 2: Check if the points have non-zero (i.e. valid) depth in the input view
+    non_zero_depth = depth_1 > 1e-6
+    # Condition 3: Check if the points have matching depth to any of the v2
+    # views torch.nn.functional.grid_sample expects the input coordinates to
+    # be normalized to the range [-1, 1], so we normalize first
+    points_2d[..., 0] /= w
+    points_2d[..., 1] /= h
+    points_2d = points_2d * 2 - 1
+    matching_depth = torch.ones_like(rendered_depth, dtype=torch.bool)
+    for b in range(depth_1.shape[0]):
+        for i in range(v1):
+            for j in range(v2):
+                depth = einops.rearrange(depth_2[b, j], 'h w -> 1 1 h w')
+                coords = einops.rearrange(points_2d[b, i, j], 'h w c -> 1 h w c')
+                sampled_depths = torch.nn.functional.grid_sample(depth, coords, align_corners=False)[0, 0]
+                matching_depth[b, i, j] = torch.isclose(rendered_depth[b, i, j], sampled_depths, atol=1e-1)
+    matching_depth = matching_depth.any(dim=-3)  # (..., v1, h, w)
+    mask = in_frustum_mask & non_zero_depth & matching_depth
+    return mask
+# --- Projections ---
+def homogenize_points(points):
+    """Append a '1' along the final dimension of the tensor (i.e. convert xyz->xyz1)"""
+    return torch.cat([points, torch.ones_like(points[..., :1])], dim=-1)
+def normalize_homogenous_points(points):
+    """Normalize the point vectors"""
+    return points / points[..., -1:]
+def pixel_space_to_camera_space(pixel_space_points, depth, intrinsics):
+    """
+    Convert pixel space points to camera space points.
+    Args:
+        pixel_space_points (torch.Tensor): Pixel space points with shape (h, w, 2)
+        depth (torch.Tensor): Depth map with shape (b, v, h, w, 1)
+        intrinsics (torch.Tensor): Camera intrinsics with shape (b, v, 3, 3)
+    Returns:
+        torch.Tensor: Camera space points with shape (b, v, h, w, 3).
+    """
+    pixel_space_points = homogenize_points(pixel_space_points)
+    camera_space_points = torch.einsum('b v i j , h w j -> b v h w i', intrinsics.inverse(), pixel_space_points)
+    camera_space_points = camera_space_points * depth
+    return camera_space_points
+def camera_space_to_world_space(camera_space_points, c2w):
+    """
+    Convert camera space points to world space points.
+    Args:
+        camera_space_points (torch.Tensor): Camera space points with shape (b, v, h, w, 3)
+        c2w (torch.Tensor): Camera to world extrinsics matrix with shape (b, v, 4, 4)
+    Returns:
+        torch.Tensor: World space points with shape (b, v, h, w, 3).
+    """
+    camera_space_points = homogenize_points(camera_space_points)
+    world_space_points = torch.einsum('b v i j , b v h w j -> b v h w i', c2w, camera_space_points)
+    return world_space_points[..., :3]
+def camera_space_to_pixel_space(camera_space_points, intrinsics):
+    """
+    Convert camera space points to pixel space points.
+    Args:
+        camera_space_points (torch.Tensor): Camera space points with shape (b, v1, v2, h, w, 3)
+        c2w (torch.Tensor): Camera to world extrinsics matrix with shape (b, v2, 3, 3)
+    Returns:
+        torch.Tensor: World space points with shape (b, v1, v2, h, w, 2).
+    """
+    camera_space_points = normalize_homogenous_points(camera_space_points)
+    pixel_space_points = torch.einsum('b u i j , b v u h w j -> b v u h w i', intrinsics, camera_space_points)
+    return pixel_space_points[..., :2]
+def world_space_to_camera_space(world_space_points, c2w):
+    """
+    Convert world space points to pixel space points.
+    Args:
+        world_space_points (torch.Tensor): World space points with shape (b, v1, h, w, 3)
+        c2w (torch.Tensor): Camera to world extrinsics matrix with shape (b, v2, 4, 4)
+    Returns:
+        torch.Tensor: Camera space points with shape (b, v1, v2, h, w, 3).
+    """
+    world_space_points = homogenize_points(world_space_points)
+    camera_space_points = torch.einsum('b u i j , b v h w j -> b v u h w i', c2w.inverse(), world_space_points)
+    return camera_space_points[..., :3]
+def unproject_depth(depth, intrinsics, c2w):
+    """
+    Turn the depth map into a 3D point cloud in world space
+    Args:
+        depth: (b, v, h, w, 1)
+        intrinsics: (b, v, 3, 3)
+        c2w: (b, v, 4, 4)
+    Returns:
+        torch.Tensor: World space points with shape (b, v, h, w, 3).
+    """
+    # Compute indices of pixels
+    h, w = depth.shape[-3], depth.shape[-2]
+    x_grid, y_grid = torch.meshgrid(
+        torch.arange(w, device=depth.device, dtype=torch.float32),
+        torch.arange(h, device=depth.device, dtype=torch.float32),
+        indexing='xy'
+    )  # (h, w), (h, w)
+    # Compute coordinates of pixels in camera space
+    pixel_space_points = torch.stack((x_grid, y_grid), dim=-1)  # (..., h, w, 2)
+    camera_points = pixel_space_to_camera_space(pixel_space_points, depth, intrinsics)  # (..., h, w, 3)
+    # Convert points to world space
+    world_points = camera_space_to_world_space(camera_points, c2w)  # (..., h, w, 3)
+    return world_points

hyworldmirror/models/utils/geometry.py ADDED Viewed

	@@ -0,0 +1,111 @@

+import torch
+import numpy as np
+from typing import Tuple
+def depth_to_camera_coords(depthmap, camera_intrinsics):
+    """
+    Convert depth map to 3D camera coordinates.
+    Args:
+        depthmap (BxHxW tensor): Batch of depth maps
+        camera_intrinsics (Bx3x3 tensor): Camera intrinsics matrix for each camera
+    Returns:
+        X_cam (BxHxWx3 tensor): 3D points in camera coordinates
+        valid_mask (BxHxW tensor): Mask indicating valid depth pixels
+    """
+    B, H, W = depthmap.shape
+    device = depthmap.device
+    dtype = depthmap.dtype
+    # Ensure intrinsics are float
+    camera_intrinsics = camera_intrinsics.float()
+    # Extract focal lengths and principal points
+    fx = camera_intrinsics[:, 0, 0]  # (B,)
+    fy = camera_intrinsics[:, 1, 1]  # (B,)
+    cx = camera_intrinsics[:, 0, 2]  # (B,)
+    cy = camera_intrinsics[:, 1, 2]  # (B,)
+    # Generate pixel grid
+    v_grid, u_grid = torch.meshgrid(
+        torch.arange(H, dtype=dtype, device=device),
+        torch.arange(W, dtype=dtype, device=device),
+        indexing='ij'
+    )
+    # Reshape for broadcasting: (1, H, W)
+    u_grid = u_grid.unsqueeze(0)
+    v_grid = v_grid.unsqueeze(0)
+    # Compute 3D camera coordinates
+    # X = (u - cx) * Z / fx
+    # Y = (v - cy) * Z / fy
+    # Z = depth
+    z_cam = depthmap  # (B, H, W)
+    x_cam = (u_grid - cx.view(B, 1, 1)) * z_cam / fx.view(B, 1, 1)
+    y_cam = (v_grid - cy.view(B, 1, 1)) * z_cam / fy.view(B, 1, 1)
+    # Stack to form (B, H, W, 3)
+    X_cam = torch.stack([x_cam, y_cam, z_cam], dim=-1)
+    # Valid depth mask
+    valid_mask = depthmap > 0.0
+    return X_cam, valid_mask
def depth_to_world_coords_points(
    depth_map: torch.Tensor, extrinsic: torch.Tensor, intrinsic: torch.Tensor, eps=1e-8
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Lift a batch of depth maps to 3D points in world coordinates.

    Args:
        depth_map (torch.Tensor): (B, H, W) Depth map; may be None
        extrinsic (torch.Tensor): (B, 4, 4) Camera extrinsic matrix (camera-to-world transformation)
        intrinsic (torch.Tensor): (B, 3, 3) Camera intrinsic matrix
        eps: Depths must be strictly greater than this value to count as valid
    Returns:
        world_coords_points (torch.Tensor): (B, H, W, 3) World coordinates
        camera_points (torch.Tensor): (B, H, W, 3) Camera coordinates
        point_mask (torch.Tensor): (B, H, W) Valid depth mask
        All three are None when depth_map is None.
    """
    if depth_map is None:
        return None, None, None

    # Back-project to the camera frame. The helper's own mask (depth > 0) is
    # discarded in favour of the eps-based mask computed below.
    cam_pts, _ = depth_to_camera_coords(depth_map, intrinsic)

    rotation = extrinsic[:, :3, :3]    # (B, 3, 3)
    translation = extrinsic[:, :3, 3]  # (B, 3)

    # world = R @ cam + t, applied to every pixel of every batch element.
    rotated = torch.einsum('bhwi,bji->bhwj', cam_pts, rotation)
    world_pts = rotated + translation[:, None, None, :]

    valid = depth_map > eps
    return world_pts, cam_pts, valid
def closed_form_inverse_se3(se3: torch.Tensor) -> torch.Tensor:
    """
    Invert batched rigid transforms in closed form.

    For [R | t] the inverse is [R^T | -R^T t]. The bottom row of the result is
    always written as (0, 0, 0, 1), regardless of the input's bottom row.

    Args:
        se3 (torch.Tensor): (B, 4, 4) Transformation matrices
    Returns:
        out (torch.Tensor): (B, 4, 4) Inverse transformation matrices
    """
    assert se3.ndim == 3 and se3.shape[1:] == (4, 4), f"se3 must be (B, 4, 4), got {se3.shape}"

    rot_t = se3[:, :3, :3].transpose(1, 2)  # (B, 3, 3)
    trans = se3[:, :3, 3:4]                 # (B, 3, 1)

    inverse = se3.new_zeros(se3.shape)
    inverse[:, :3, :3] = rot_t
    inverse[:, :3, 3] = -torch.bmm(rot_t, trans).squeeze(-1)
    inverse[:, 3, 3] = 1.0
    return inverse

hyworldmirror/models/utils/grid.py ADDED Viewed

	@@ -0,0 +1,90 @@

+import torch
def position_grid_to_embed(pos_grid: torch.Tensor, embed_dim: int, omega_0: float = 100) -> torch.Tensor:
    """
    Convert 2D position grid (HxWx2) to sinusoidal embeddings (HxWxC)

    Each coordinate is encoded with embed_dim // 4 frequencies, and the output
    channels are laid out as [sin(x), cos(x), sin(y), cos(y)].

    Args:
        pos_grid: Tensor of shape (H, W, 2) containing 2D coordinates
        embed_dim: Output channel dimension for embeddings; must be a multiple of 4
        omega_0: Base frequency for sinusoidal encoding
    Returns:
        Tensor of shape (H, W, embed_dim) with positional embeddings (float32)
    Raises:
        AssertionError: If the last grid dimension is not 2 or embed_dim is not
            a multiple of 4.
    """
    H, W, grid_dim = pos_grid.shape
    assert grid_dim == 2, f"pos_grid must have shape (H, W, 2), got last dim {grid_dim}"
    # Four groups of embed_dim // 4 channels are concatenated below, so any
    # embed_dim that is not a multiple of 4 cannot be reshaped to
    # (H, W, embed_dim). Fail here with a clear message instead of in .view().
    assert embed_dim % 4 == 0, f"embed_dim must be a multiple of 4, got {embed_dim}"
    device = pos_grid.device
    pos_flat = pos_grid.reshape(-1, grid_dim)  # Flatten to (H*W, 2)
    # Generate frequency bands (float32 on MPS, which lacks float64; double elsewhere)
    omega = torch.arange(embed_dim // 4, dtype=torch.float32 if device.type == "mps" else torch.double, device=device)
    omega /= embed_dim / 4.0
    omega = 1.0 / omega_0**omega  # (D/4,)
    # Process x and y coordinates separately
    pos_x = pos_flat[:, 0].reshape(-1)  # (H*W,)
    pos_y = pos_flat[:, 1].reshape(-1)  # (H*W,)
    # Compute outer products
    out_x = torch.einsum("m,d->md", pos_x, omega)  # (H*W, D/4)
    out_y = torch.einsum("m,d->md", pos_y, omega)  # (H*W, D/4)
    # Apply sin and cos
    emb_x = torch.cat([torch.sin(out_x), torch.cos(out_x)], dim=1)  # (H*W, D/2)
    emb_y = torch.cat([torch.sin(out_y), torch.cos(out_y)], dim=1)  # (H*W, D/2)
    # Combine x and y embeddings
    emb = torch.cat([emb_x, emb_y], dim=-1)  # (H*W, D)
    return emb.float().view(H, W, embed_dim)  # [H, W, D]
+# Inspired by https://github.com/microsoft/moge
def create_uv_grid(
    width: int, height: int, aspect_ratio: float = None, dtype: torch.dtype = None, device: torch.device = None
) -> torch.Tensor:
    """
    Create a normalized UV grid of shape (height, width, 2).

    The grid spans horizontally and vertically according to an aspect ratio,
    normalized by the diagonal of the plane. Samples sit at pixel centers, so
    the first sample is at (-x_span * (width - 1) / width,
    -y_span * (height - 1) / height) and the last one at the mirrored
    positive position.

    Args:
        width (int): Number of points horizontally.
        height (int): Number of points vertically.
        aspect_ratio (float, optional): Width-to-height ratio. Defaults to width/height.
        dtype (torch.dtype, optional): Data type of the resulting tensor.
        device (torch.device, optional): Device on which the tensor is created.
    Returns:
        torch.Tensor: A (height, width, 2) tensor of UV coordinates; the last
        axis holds (u, v), with u varying along the width axis and v along the
        height axis.
    """
    # Derive aspect ratio if not explicitly provided
    if aspect_ratio is None:
        aspect_ratio = float(width) / float(height)
    # Compute normalized spans for X and Y (span_x**2 + span_y**2 == 1)
    diag_factor = (aspect_ratio**2 + 1.0) ** 0.5
    span_x = aspect_ratio / diag_factor
    span_y = 1.0 / diag_factor
    # Establish the linspace boundaries (pixel centers, hence the (n - 1) / n shrink)
    left_x = -span_x * (width - 1) / width
    right_x = span_x * (width - 1) / width
    top_y = -span_y * (height - 1) / height
    bottom_y = span_y * (height - 1) / height
    # Generate 1D coordinates
    x_coords = torch.linspace(left_x, right_x, steps=width, dtype=dtype, device=device)
    y_coords = torch.linspace(top_y, bottom_y, steps=height, dtype=dtype, device=device)
    # indexing="xy" yields (height, width) grids: rows follow y, columns follow x
    uu, vv = torch.meshgrid(x_coords, y_coords, indexing="xy")
    uv_grid = torch.stack((uu, vv), dim=-1)
    return uv_grid

hyworldmirror/models/utils/priors.py ADDED Viewed

	@@ -0,0 +1,168 @@

+import torch
def normalize_poses(extrinsics, padding=0.1, return_stats=False):
    """
    Normalize camera positions to unit cube, processing each batch separately.

    Only the translation column ``[..., :3, 3]`` is rescaled; all other
    entries are copied through (after NaN/Inf replacement). The input tensor
    is never modified.

    Args:
        extrinsics: Camera extrinsic matrices with shape (B, S, 3, 4)
        padding: Boundary space within [0,1] range to prevent values near boundaries
        return_stats: Whether to return normalization statistics
    Returns:
        normalized_extrinsics: Normalized extrinsic matrices
        (optional) stats: Dictionary with 'scale_factors' (B,) and
            'translation_vectors' (B, 3), such that
            normalized = (position - translation) / scale + 0.5 before clamping
    Raises:
        AssertionError: If the result still contains non-finite values.
    """
    B, _, _, _ = extrinsics.shape
    device = extrinsics.device

    # Sanitize NaN/Inf on a private copy so the caller's tensor stays untouched.
    sanitized = extrinsics.clone()
    for i in range(B):
        if not torch.isfinite(sanitized[i]).all():
            print("Warning: dataset sample has NaN/Inf in extrinsics")
            sanitized[i] = torch.nan_to_num(
                sanitized[i], nan=0.0, posinf=1e6, neginf=-1e6
            )
    normalized_extrinsics = sanitized.clone()

    # Store normalization parameters if needed
    stats = None
    if return_stats:
        stats = {
            'scale_factors': torch.zeros(B, device=device),
            'translation_vectors': torch.zeros(B, 3, device=device)
        }

    for b in range(B):
        # Camera positions of this batch element
        positions = sanitized[b, :, :3, 3]  # (S, 3)

        # NOTE: every entry is finite after the sanitization above, so this
        # mask can only come out empty when there are no cameras (S == 0).
        valid_mask = torch.isfinite(positions).all(dim=1)  # (S,)
        if valid_mask.sum() == 0:
            print(f"Warning: Batch {b} has no valid camera positions")
            normalized_extrinsics[b, :, :3, 3] = 0.5  # Place at center
            if return_stats:
                stats['scale_factors'][b] = 1.0
                stats['translation_vectors'][b] = 0.0
            continue
        valid_positions = positions[valid_mask]

        # Robust bounds: 5%/95% percentiles with enough cameras, min/max otherwise
        if valid_positions.shape[0] > 10:
            min_pos = torch.quantile(valid_positions, 0.05, dim=0)
            max_pos = torch.quantile(valid_positions, 0.95, dim=0)
        else:
            min_pos = torch.min(valid_positions, dim=0)[0]
            max_pos = torch.max(valid_positions, dim=0)[0]

        # Per-axis extent, floored by a small epsilon to prevent dimension collapse
        pos_range = max_pos - min_pos
        eps = torch.maximum(
            torch.tensor(1e-6, device=device),
            torch.abs(max_pos) * 1e-6
        )
        pos_range = torch.maximum(pos_range, eps)

        # Largest extent is used for all axes so the scaling stays uniform
        scale_factor = torch.max(pos_range)
        scale_factor = torch.clamp(scale_factor, min=1e-6, max=1e6)
        center = (min_pos + max_pos) / 2.0

        # Center first, then scale so the bounds land at [padding, 1 - padding]
        actual_scale = scale_factor / (1 - 2 * padding)
        normalized_positions = (positions - center) / actual_scale + 0.5
        # Positions outside the robust bounds are clipped to the cube
        normalized_positions = torch.clamp(normalized_positions, 0.0, 1.0)

        # Handle invalid positions by setting them to scene center
        invalid_mask = ~valid_mask
        if invalid_mask.any():
            normalized_positions[invalid_mask] = 0.5

        normalized_extrinsics[b, :, :3, 3] = normalized_positions
        if return_stats:
            stats['scale_factors'][b] = actual_scale
            stats['translation_vectors'][b] = center

    # Final validation
    assert torch.isfinite(normalized_extrinsics).all(), "Output contains non-finite values"
    if return_stats:
        return normalized_extrinsics, stats
    return normalized_extrinsics
def normalize_depth(depth, eps=1e-6, min_percentile=1, max_percentile=99):
    """
    Rescale every depth image independently to [0, 1] with robust bounds.

    NaN and -Inf are replaced by 0 and +Inf by 1e6 first. The bounds of each
    image are taken from its strictly positive values (from all values if
    none is positive): percentiles when more than 100 values are available,
    plain min/max otherwise.

    Args:
        depth: Input depth tensor with shape (B, S, H, W)
        eps: Small epsilon value to prevent division by zero
        min_percentile: Lower percentile for robust min calculation (default: 1)
        max_percentile: Upper percentile for robust max calculation (default: 99)
    Returns:
        normalized_depth: Depth tensor normalized to [0, 1] range with same shape (B, S, H, W)
    """
    B, S, H, W = depth.shape
    frames = torch.nan_to_num(depth.flatten(0, 1), nan=0.0, posinf=1e6, neginf=0.0)  # [B*S, H, W]

    low_q = min_percentile / 100.0
    high_q = max_percentile / 100.0

    outputs = []
    for frame in frames:
        samples = frame.flatten()
        positive = samples > 0
        if positive.sum() > 0:
            # Zeros are left out of the bound estimate
            samples = samples[positive]

        if samples.numel() > 100:
            lower = torch.quantile(samples, low_q)
            upper = torch.quantile(samples, high_q)
        else:
            # Too few samples for meaningful percentiles
            lower = samples.min()
            upper = samples.max()

        # Constant image: fall back to a unit range
        if upper == lower:
            upper = lower + 1.0

        span = upper - lower
        # Epsilon relative to the range, never smaller than eps itself
        guard = max(eps, torch.abs(span).item() * eps)

        scaled = (frame - lower) / (span + guard)
        outputs.append(torch.clamp(scaled, 0.0, 1.0))

    return torch.stack(outputs).reshape(B, S, H, W)

hyworldmirror/models/utils/rotation.py ADDED Viewed

	@@ -0,0 +1,126 @@

+# Modified from PyTorch3D, https://github.com/facebookresearch/pytorch3d
+import torch
+import numpy as np
+import torch.nn.functional as F
def quat_to_rotmat(quaternions: torch.Tensor) -> torch.Tensor:
    """
    Turn scalar-last quaternions (XYZW order) into rotation matrices.

    The input does not have to be normalized: every term is divided by the
    squared norm of the quaternion.

    Args:
        quaternions: quaternions with real part last,
            as tensor of shape (..., 4).
    Returns:
        Rotation matrices as tensor of shape (..., 3, 3).
    """
    qx, qy, qz, qw = torch.unbind(quaternions, -1)
    # 2 / |q|^2, the common factor of every off-identity term
    scale = 2.0 / (quaternions * quaternions).sum(-1)

    row0 = (
        1 - scale * (qy * qy + qz * qz),
        scale * (qx * qy - qz * qw),
        scale * (qx * qz + qy * qw),
    )
    row1 = (
        scale * (qx * qy + qz * qw),
        1 - scale * (qx * qx + qz * qz),
        scale * (qy * qz - qx * qw),
    )
    row2 = (
        scale * (qx * qz - qy * qw),
        scale * (qy * qz + qx * qw),
        1 - scale * (qx * qx + qy * qy),
    )

    flat = torch.stack(row0 + row1 + row2, -1)  # (..., 9), row-major
    return flat.reshape(quaternions.shape[:-1] + (3, 3))
def rotmat_to_quat(matrix: torch.Tensor) -> torch.Tensor:
    """
    Turn rotation matrices into scalar-last quaternions (XYZW order).

    Args:
        matrix: Rotation matrices as tensor of shape (..., 3, 3).
    Returns:
        quaternions with real part last, as tensor of shape (..., 4),
        standardized so that the real part is non-negative.
    Raises:
        ValueError: If the last two dimensions of matrix are not 3x3.
    """
    if not (matrix.size(-1) == 3 and matrix.size(-2) == 3):
        raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")

    lead_shape = matrix.shape[:-2]
    flat = matrix.reshape(lead_shape + (9,))
    m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind(flat, dim=-1)

    # Twice the magnitude of each quaternion component, ordered (r, i, j, k)
    squared = torch.stack(
        [
            1.0 + m00 + m11 + m22,
            1.0 + m00 - m11 - m22,
            1.0 - m00 + m11 - m22,
            1.0 - m00 - m11 + m22,
        ],
        dim=-1,
    )
    q_abs = _sqrt_positive_part(squared)

    # Row n holds the desired quaternion (in r, i, j, k order) multiplied by
    # its n-th component.
    times_r = torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1)
    times_i = torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1)
    times_j = torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1)
    times_k = torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1)
    quat_by_rijk = torch.stack([times_r, times_i, times_j, times_k], dim=-2)

    # The denominator is floored at 0.1; the exact level is not important,
    # since a candidate with a small q_abs is never the one selected below.
    floor = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
    quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(floor))

    # Up to sign and numerical error all candidates agree; keep the
    # best-conditioned one, i.e. the one with the largest denominator.
    best = F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5
    chosen = quat_candidates[best, :].reshape(lead_shape + (4,))

    # Reorder rijk -> ijkr (scalar-last) and fix the sign convention
    chosen = chosen[..., [1, 2, 3, 0]]
    return standardize_quaternion(chosen)
+def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
+    """
+    Returns torch.sqrt(torch.max(0, x))
+    but with a zero subgradient where x is 0.
+    """
+    ret = torch.zeros_like(x)
+    positive_mask = x > 0
+    if torch.is_grad_enabled():
+        ret[positive_mask] = torch.sqrt(x[positive_mask])
+    else:
+        ret = torch.where(positive_mask, torch.sqrt(x), ret)
+    return ret
def standardize_quaternion(quaternions: torch.Tensor) -> torch.Tensor:
    """
    Flip the sign of quaternions whose real part is negative.

    q and -q encode the same rotation; the returned representative is the one
    with a non-negative real (last) component.

    Args:
        quaternions: Quaternions with real part last,
            as tensor of shape (..., 4).
    Returns:
        Standardized quaternions as tensor of shape (..., 4).
    """
    negative_real = quaternions[..., 3:4] < 0  # (..., 1), broadcasts over all four components
    return torch.where(negative_real, -quaternions, quaternions)

hyworldmirror/models/utils/sh_utils.py ADDED Viewed

	@@ -0,0 +1,116 @@

+#  Copyright 2021 The PlenOctree Authors.
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are met:
+#
+#  1. Redistributions of source code must retain the above copyright notice,
+#  this list of conditions and the following disclaimer.
+#
+#  2. Redistributions in binary form must reproduce the above copyright notice,
+#  this list of conditions and the following disclaimer in the documentation
+#  and/or other materials provided with the distribution.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+#  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+#  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+#  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+#  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+#  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+#  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+#  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+#  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+#  POSSIBILITY OF SUCH DAMAGE.
# Constant factors of the hard-coded spherical-harmonics polynomials evaluated
# in eval_sh, grouped by degree: C0 scales the single degree-0 term, C1 the
# three degree-1 terms, and C2/C3/C4 hold one factor per term of degrees 2/3/4.
C0 = 0.28209479177387814
C1 = 0.4886025119029199
# Degree 2: 5 terms
C2 = [
    1.0925484305920792,
    -1.0925484305920792,
    0.31539156525252005,
    -1.0925484305920792,
    0.5462742152960396,
]
# Degree 3: 7 terms
C3 = [
    -0.5900435899266435,
    2.890611442640554,
    -0.4570457994644658,
    0.3731763325901154,
    -0.4570457994644658,
    1.445305721320277,
    -0.5900435899266435,
]
# Degree 4: 9 terms
C4 = [
    2.5033429417967046,
    -1.7701307697799304,
    0.9461746957575601,
    -0.6690465435572892,
    0.10578554691520431,
    -0.6690465435572892,
    0.47308734787878004,
    -1.7701307697799304,
    0.6258357354491761,
]
def eval_sh(deg, sh, dirs):
    """
    Evaluate spherical harmonics at unit directions
    using hardcoded SH polynomials.

    Only array indexing and arithmetic are used, so torch/np/jnp inputs work.
    ... can be 0 or more batch dimensions.

    Args:
        deg: int SH degree, 0 to 4 inclusive
        sh: SH coeffs [..., C, (deg + 1) ** 2]; extra trailing coefficients
            beyond (deg + 1) ** 2 are ignored
        dirs: unit directions [..., 3]
    Returns:
        [..., C]
    """
    assert 0 <= deg <= 4
    num_coeffs = (deg + 1) ** 2
    assert sh.shape[-1] >= num_coeffs

    # Degree 0: constant term
    result = C0 * sh[..., 0]
    if deg <= 0:
        return result

    # Degree 1: linear in the direction components
    x, y, z = dirs[..., 0:1], dirs[..., 1:2], dirs[..., 2:3]
    result = (result -
              C1 * y * sh[..., 1] +
              C1 * z * sh[..., 2] -
              C1 * x * sh[..., 3])
    if deg <= 1:
        return result

    # Degree 2: quadratic terms
    xx, yy, zz = x * x, y * y, z * z
    xy, yz, xz = x * y, y * z, x * z
    result = (result +
              C2[0] * xy * sh[..., 4] +
              C2[1] * yz * sh[..., 5] +
              C2[2] * (2.0 * zz - xx - yy) * sh[..., 6] +
              C2[3] * xz * sh[..., 7] +
              C2[4] * (xx - yy) * sh[..., 8])
    if deg <= 2:
        return result

    # Degree 3: cubic terms
    result = (result +
              C3[0] * y * (3 * xx - yy) * sh[..., 9] +
              C3[1] * xy * z * sh[..., 10] +
              C3[2] * y * (4 * zz - xx - yy) * sh[..., 11] +
              C3[3] * z * (2 * zz - 3 * xx - 3 * yy) * sh[..., 12] +
              C3[4] * x * (4 * zz - xx - yy) * sh[..., 13] +
              C3[5] * z * (xx - yy) * sh[..., 14] +
              C3[6] * x * (xx - 3 * yy) * sh[..., 15])
    if deg <= 3:
        return result

    # Degree 4: quartic terms
    result = (result + C4[0] * xy * (xx - yy) * sh[..., 16] +
              C4[1] * yz * (3 * xx - yy) * sh[..., 17] +
              C4[2] * xy * (7 * zz - 1) * sh[..., 18] +
              C4[3] * yz * (7 * zz - 3) * sh[..., 19] +
              C4[4] * (zz * (35 * zz - 30) + 3) * sh[..., 20] +
              C4[5] * xz * (7 * zz - 3) * sh[..., 21] +
              C4[6] * (xx - yy) * (7 * zz - 1) * sh[..., 22] +
              C4[7] * xz * (xx - 3 * yy) * sh[..., 23] +
              C4[8] * (xx * (xx - 3 * yy) - yy * (3 * xx - yy)) * sh[..., 24])
    return result
def RGB2SH(rgb):
    """Map a color value to its degree-0 SH coefficient: (rgb - 0.5) / C0."""
    centered = rgb - 0.5
    return centered / C0
def SH2RGB(sh):
    """Map a degree-0 SH coefficient back to a color value: sh * C0 + 0.5."""
    scaled = sh * C0
    return scaled + 0.5

hyworldmirror/utils/__init__.py ADDED Viewed

File without changes

hyworldmirror/utils/geometry.py ADDED Viewed

	@@ -0,0 +1,531 @@

+"""
+Utilities for geometry operations.
+References: DUSt3R, MoGe
+"""
+from numbers import Number
+from typing import Tuple, Union
+import numpy as np
+from .warnings import no_warnings
def colmap_to_opencv_intrinsics(K):
    """
    Return a copy of K with the principal point moved from the Colmap to the
    OpenCV pixel convention.

    The center of the top-left pixel is at (0.5, 0.5) in Colmap and at (0, 0)
    in OpenCV, so cx and cy are each reduced by 0.5. The input is not modified.
    """
    converted = K.copy()
    for row in (0, 1):  # cx lives at [0, 2], cy at [1, 2]
        converted[row, 2] -= 0.5
    return converted
def opencv_to_colmap_intrinsics(K):
    """
    Return a copy of K with the principal point moved from the OpenCV to the
    Colmap pixel convention.

    The center of the top-left pixel is at (0, 0) in OpenCV and at (0.5, 0.5)
    in Colmap, so cx and cy are each increased by 0.5. The input is not modified.
    """
    converted = K.copy()
    for row in (0, 1):  # cx lives at [0, 2], cy at [1, 2]
        converted[row, 2] += 0.5
    return converted
def angle_diff_vec3_numpy(v1: np.ndarray, v2: np.ndarray, eps: float = 1e-12):
    """
    Angle between pairs of 3D vectors, computed as atan2(|v1 x v2| + eps, v1 . v2).

    Args:
        v1 (np.ndarray): First vector of shape (..., 3)
        v2 (np.ndarray): Second vector of shape (..., 3)
        eps (float, optional): Added to the cross-product norm for numerical stability. Defaults to 1e-12.
    Returns:
        np.ndarray: Angle differences in radians
    """
    cross_norm = np.linalg.norm(np.cross(v1, v2, axis=-1), axis=-1)
    dot = (v1 * v2).sum(axis=-1)
    return np.arctan2(cross_norm + eps, dot)
@no_warnings(category=RuntimeWarning)
def points_to_normals(
    point: np.ndarray, mask: np.ndarray = None, edge_threshold: float = None
) -> np.ndarray:
    """
    Estimate a normal map from a point map. Value range is [-1, 1].

    For every pixel, the four cross products of consecutive neighbour offsets
    (up/left, left/down, down/right, right/up) are normalized, the valid ones
    are summed, and the sum is normalized again.

    Args:
        point (np.ndarray): shape (height, width, 3), point map
        mask (optional, np.ndarray): shape (height, width), dtype=bool. Mask of valid depth pixels. Defaults to None.
        edge_threshold (optional, float): threshold for the angle (in degrees) between the normal and the view direction. Defaults to None.
    Returns:
        normal (np.ndarray): shape (height, width, 3), normal map.
        normal_mask (np.ndarray): shape (height, width), True where at least
            one neighbour pair was valid. Only returned, as the second element
            of a tuple, when `mask` is given.
    """
    height, width = point.shape[-3:-1]
    mask_given = mask is not None
    if not mask_given:
        mask = np.ones_like(point[..., 0], dtype=bool)

    # Pad by one pixel on every side; the border is marked invalid so that
    # neighbours outside the image never contribute.
    padded_mask = np.zeros((height + 2, width + 2), dtype=bool)
    padded_mask[1:-1, 1:-1] = mask
    padded_pts = np.zeros((height + 2, width + 2, 3), dtype=point.dtype)
    padded_pts[1:-1, 1:-1, :] = point

    # Offsets from each pixel to its four neighbours, in the order up, left, down, right
    center = padded_pts[1:-1, 1:-1, :]
    offsets = (
        padded_pts[:-2, 1:-1, :] - center,
        padded_pts[1:-1, :-2, :] - center,
        padded_pts[2:, 1:-1, :] - center,
        padded_pts[1:-1, 2:, :] - center,
    )
    neighbour_ok = (
        padded_mask[:-2, 1:-1],
        padded_mask[1:-1, :-2],
        padded_mask[2:, 1:-1],
        padded_mask[1:-1, 2:],
    )

    # One candidate normal per pair of consecutive neighbours (cyclic order)
    normal = np.stack(
        [np.cross(a, b, axis=-1) for a, b in zip(offsets, offsets[1:] + offsets[:1])]
    )
    normal = normal / (np.linalg.norm(normal, axis=-1, keepdims=True) + 1e-12)

    # A candidate counts only if both neighbours and the pixel itself are valid
    valid = (
        np.stack(
            [a & b for a, b in zip(neighbour_ok, neighbour_ok[1:] + neighbour_ok[:1])]
        )
        & padded_mask[None, 1:-1, 1:-1]
    )

    if edge_threshold is not None:
        # Angle between the pixel's position vector and each candidate normal,
        # folded to [0, pi/2] so the sign of the normal does not matter.
        view_angle = angle_diff_vec3_numpy(padded_pts[None, 1:-1, 1:-1, :], normal)
        view_angle = np.minimum(view_angle, np.pi - view_angle)
        valid = valid & (view_angle < np.deg2rad(edge_threshold))

    # Combine the valid candidates and renormalize
    normal = (normal * valid[..., None]).sum(axis=0)
    normal = normal / (np.linalg.norm(normal, axis=-1, keepdims=True) + 1e-12)

    if not mask_given:
        return normal
    normal_mask = valid.any(axis=0)
    normal = np.where(normal_mask[..., None], normal, 0)
    return normal, normal_mask
def sliding_window_1d(x: np.ndarray, window_size: int, stride: int, axis: int = -1):
    """
    Create a sliding window view of the input array along a specified axis.

    The result is a strided view that shares memory with `x` (no data is
    copied), so writing to it writes to `x`. The window dimension is appended
    to the end of the output array's shape.

    Args:
        x (np.ndarray): Input array with shape (..., axis_size, ...)
        window_size (int): Size of the sliding window
        stride (int): Stride of the sliding window (step size between consecutive windows)
        axis (int, optional): Axis to perform sliding window over. Defaults to -1 (last axis)
    Returns:
        np.ndarray: View of the input array with shape (..., n_windows, ..., window_size),
                   where n_windows = (axis_size - window_size) // stride + 1
    Raises:
        AssertionError: If window_size is larger than the size of the specified axis
    Example:
        >>> x = np.array([1, 2, 3, 4, 5, 6])
        >>> sliding_window_1d(x, window_size=3, stride=2)
        array([[1, 2, 3],
               [3, 4, 5]])
        >>> sliding_window_1d(np.arange(7), window_size=3, stride=2)
        array([[0, 1, 2],
               [2, 3, 4],
               [4, 5, 6]])
    """
    assert x.shape[axis] >= window_size, (
        f"kernel_size ({window_size}) is larger than axis_size ({x.shape[axis]})"
    )
    axis = axis % x.ndim
    # Window k starts at k * stride and must end inside the axis, i.e.
    # k * stride + window_size <= axis_size. Counting the window at k = 0
    # gives floor((axis_size - window_size) / stride) + 1 windows. The former
    # (axis_size - window_size + 1) // stride dropped the last window whenever
    # stride > 1 divided (axis_size - window_size) exactly.
    n_windows = (x.shape[axis] - window_size) // stride + 1
    shape = (
        *x.shape[:axis],
        n_windows,
        *x.shape[axis + 1 :],
        window_size,
    )
    strides = (
        *x.strides[:axis],
        stride * x.strides[axis],  # step between window starts
        *x.strides[axis + 1 :],
        x.strides[axis],           # step between elements inside a window
    )
    x_sliding = np.lib.stride_tricks.as_strided(x, shape=shape, strides=strides)
    return x_sliding
def sliding_window_nd(
    x: np.ndarray,
    window_size: Tuple[int, ...],
    stride: Tuple[int, ...],
    axis: Tuple[int, ...],
) -> np.ndarray:
    """
    Create sliding windows along several axes of the input array.

    sliding_window_1d is applied once per entry of `axis`, in the given order,
    so one window dimension per axis is appended to the end of the shape.

    Args:
        x (np.ndarray): Input array
        window_size (Tuple[int, ...]): Size of the sliding window for each axis
        stride (Tuple[int, ...]): Stride of the sliding window for each axis
        axis (Tuple[int, ...]): Axes to perform sliding window over
    Returns:
        np.ndarray: Array with sliding windows along the specified dimensions.
                   The window dimensions are appended to the end of the shape.
    Note:
        window_size and stride need one entry per entry of axis.
    Example:
        >>> x = np.random.rand(10, 10)
        >>> windows = sliding_window_nd(x, window_size=(3, 3), stride=(2, 2), axis=(-2, -1))
        >>> windows.shape
        (4, 4, 3, 3)
    """
    # Resolve negative axes against the ORIGINAL rank: every pass appends a
    # dimension, which would shift the meaning of a negative index.
    original_ndim = x.ndim
    resolved_axes = [ax % original_ndim for ax in axis]
    for position, ax in enumerate(resolved_axes):
        x = sliding_window_1d(x, window_size[position], stride[position], ax)
    return x
def sliding_window_2d(
    x: np.ndarray,
    window_size: Union[int, Tuple[int, int]],
    stride: Union[int, Tuple[int, int]],
    axis: Tuple[int, int] = (-2, -1),
) -> np.ndarray:
    """
    Create 2D sliding windows over two axes of the input array.

    Thin wrapper around sliding_window_nd that accepts a single int for
    window_size and/or stride.

    Args:
        x (np.ndarray): Input array
        window_size (Union[int, Tuple[int, int]]): Size of the 2D sliding window.
                                                  If int, same size is used for both dimensions.
        stride (Union[int, Tuple[int, int]]): Stride of the 2D sliding window.
                                             If int, same stride is used for both dimensions.
        axis (Tuple[int, int], optional): Two axes to perform sliding window over.
                                         Defaults to (-2, -1) (last two dimensions).
    Returns:
        np.ndarray: Array with 2D sliding windows. The two window dimensions
                   are appended to the end of the shape, in the order of `axis`.
    Example:
        >>> image = np.random.rand(100, 100)
        >>> patches = sliding_window_2d(image, window_size=8, stride=4)
        >>> # Creates 8x8 patches with stride 4 from the image
    """
    window_pair = (window_size, window_size) if isinstance(window_size, int) else window_size
    stride_pair = (stride, stride) if isinstance(stride, int) else stride
    return sliding_window_nd(x, window_pair, stride_pair, axis)
def max_pool_1d(
    x: np.ndarray, kernel_size: int, stride: int, padding: int = 0, axis: int = -1
):
    """
    Perform 1D max pooling on the input array.

    The array is optionally padded on both sides of `axis`, cut into sliding
    windows with sliding_window_1d, and reduced with np.nanmax over each window.

    Args:
        x (np.ndarray): Input array
        kernel_size (int): Size of the pooling kernel
        stride (int): Stride of the pooling operation
        padding (int, optional): Amount of padding to add on both sides. Defaults to 0.
        axis (int, optional): Axis to perform max pooling over. Defaults to -1.
    Returns:
        np.ndarray: Max pooled array with reduced size along the specified axis
    Note:
        - For floating point arrays, padding is done with np.nan values
        - For all other dtypes, padding uses np.iinfo(dtype).min
        - np.nanmax is used, so NaN entries (including the padding) are ignored
    Example:
        >>> x = np.array([1, 3, 2, 4, 5, 1])
        >>> max_pool_1d(x, kernel_size=3, stride=2)
        array([3, 5])
    """
    axis = axis % x.ndim
    if padding > 0:
        if x.dtype.kind == "f":
            pad_value = np.nan
        else:
            pad_value = np.iinfo(x.dtype).min
        # Same shape as x, except `padding` entries along the pooled axis
        pad_shape = (*x.shape[:axis], padding, *x.shape[axis + 1 :])
        pad_block = np.full(pad_shape, fill_value=pad_value, dtype=x.dtype)
        x = np.concatenate([pad_block, x, pad_block], axis=axis)
    windows = sliding_window_1d(x, kernel_size, stride, axis)
    return np.nanmax(windows, axis=-1)
def max_pool_nd(
    x: np.ndarray,
    kernel_size: Tuple[int, ...],
    stride: Tuple[int, ...],
    padding: Tuple[int, ...],
    axis: Tuple[int, ...],
) -> np.ndarray:
    """
    Perform N-dimensional max pooling on the input array.

    max_pool_1d is applied once per entry of `axis`, in the given order, each
    pass using the kernel size, stride and padding at the same position.

    Args:
        x (np.ndarray): Input array
        kernel_size (Tuple[int, ...]): Size of the pooling kernel for each axis
        stride (Tuple[int, ...]): Stride of the pooling operation for each axis
        padding (Tuple[int, ...]): Amount of padding for each axis
        axis (Tuple[int, ...]): Axes to perform max pooling over
    Returns:
        np.ndarray: Max pooled array with reduced size along the specified axes
    Note:
        kernel_size, stride and padding need one entry per entry of axis.
    Example:
        >>> x = np.random.rand(10, 10, 10)
        >>> pooled = max_pool_nd(x, kernel_size=(2, 2, 2), stride=(2, 2, 2),
        ...                      padding=(0, 0, 0), axis=(-3, -2, -1))
        >>> # 2x2x2 max pooling with stride 2 along the last three axes
    """
    pooled = x
    for position, ax in enumerate(axis):
        pooled = max_pool_1d(
            pooled, kernel_size[position], stride[position], padding[position], ax
        )
    return pooled
+def max_pool_2d(
+    x: np.ndarray,
+    kernel_size: Union[int, Tuple[int, int]],
+    stride: Union[int, Tuple[int, int]],
+    padding: Union[int, Tuple[int, int]],
+    axis: Tuple[int, int] = (-2, -1),
+):
+    """
+    Perform 2D max pooling on the input array.
+    Convenience function for 2D max pooling, commonly used in computer vision
+    and image processing for downsampling images while preserving important features.
+    Args:
+        x (np.ndarray): Input array
+        kernel_size (Union[int, Tuple[int, int]]): Size of the 2D pooling kernel.
+                                                  If int, same size is used for both dimensions.
+        stride (Union[int, Tuple[int, int]]): Stride of the 2D pooling operation.
+                                             If int, same stride is used for both dimensions.
+        padding (Union[int, Tuple[int, int]]): Amount of padding for both dimensions.
+                                              If int, same padding is used for both dimensions.
+        axis (Tuple[int, int], optional): Two axes to perform max pooling over.
+                                         Defaults to (-2, -1) (last two dimensions).
+    Returns:
+        np.ndarray: 2D max pooled array with reduced size along the specified axes
+    Example:
+        >>> image = np.random.rand(64, 64)
+        >>> pooled = max_pool_2d(image, kernel_size=2, stride=2, padding=0)
+        >>> # Reduces image size from 64x64 to 32x32 with 2x2 max pooling
+    """
+    if isinstance(kernel_size, Number):
+        kernel_size = (kernel_size, kernel_size)
+    if isinstance(stride, Number):
+        stride = (stride, stride)
+    if isinstance(padding, Number):
+        padding = (padding, padding)
+    axis = tuple(axis)
+    return max_pool_nd(x, kernel_size, stride, padding, axis)
+@no_warnings(category=RuntimeWarning)
+def depth_edge(
+    depth: np.ndarray,
+    atol: float = None,
+    rtol: float = None,
+    kernel_size: int = 3,
+    mask: np.ndarray = None,
+) -> np.ndarray:
+    """
+    Compute the edge mask from depth map. The edge is defined as the pixels whose neighbors have large difference in depth.
+    Args:
+        depth (np.ndarray): shape (..., height, width), linear depth map
+        atol (float): absolute tolerance
+        rtol (float): relative tolerance
+    Returns:
+        edge (np.ndarray): shape (..., height, width) of dtype torch.bool
+    """
+    if mask is None:
+        diff = max_pool_2d(
+            depth, kernel_size, stride=1, padding=kernel_size // 2
+        ) + max_pool_2d(-depth, kernel_size, stride=1, padding=kernel_size // 2)
+    else:
+        diff = max_pool_2d(
+            np.where(mask, depth, -np.inf),
+            kernel_size,
+            stride=1,
+            padding=kernel_size // 2,
+        ) + max_pool_2d(
+            np.where(mask, -depth, -np.inf),
+            kernel_size,
+            stride=1,
+            padding=kernel_size // 2,
+        )
+    edge = np.zeros_like(depth, dtype=bool)
+    if atol is not None:
+        edge |= diff > atol
+    if rtol is not None:
+        edge |= diff / depth > rtol
+    return edge
+def depth_aliasing(
+    depth: np.ndarray,
+    atol: float = None,
+    rtol: float = None,
+    kernel_size: int = 3,
+    mask: np.ndarray = None,
+) -> np.ndarray:
+    """
+    Compute the map that indicates the aliasing of x depth map. The aliasing is defined as the pixels which neither close to the maximum nor the minimum of its neighbors.
+    Args:
+        depth (np.ndarray): shape (..., height, width), linear depth map
+        atol (float): absolute tolerance
+        rtol (float): relative tolerance
+    Returns:
+        edge (np.ndarray): shape (..., height, width) of dtype torch.bool
+    """
+    if mask is None:
+        diff_max = (
+            max_pool_2d(depth, kernel_size, stride=1, padding=kernel_size // 2) - depth
+        )
+        diff_min = (
+            max_pool_2d(-depth, kernel_size, stride=1, padding=kernel_size // 2) + depth
+        )
+    else:
+        diff_max = (
+            max_pool_2d(
+                np.where(mask, depth, -np.inf),
+                kernel_size,
+                stride=1,
+                padding=kernel_size // 2,
+            )
+            - depth
+        )
+        diff_min = (
+            max_pool_2d(
+                np.where(mask, -depth, -np.inf),
+                kernel_size,
+                stride=1,
+                padding=kernel_size // 2,
+            )
+            + depth
+        )
+    diff = np.minimum(diff_max, diff_min)
+    edge = np.zeros_like(depth, dtype=bool)
+    if atol is not None:
+        edge |= diff > atol
+    if rtol is not None:
+        edge |= diff / depth > rtol
+    return edge
+@no_warnings(category=RuntimeWarning)
+def normals_edge(
+    normals: np.ndarray, tol: float, kernel_size: int = 3, mask: np.ndarray = None
+) -> np.ndarray:
+    """
+    Compute the edge mask from normal map.
+    Args:
+        normal (np.ndarray): shape (..., height, width, 3), normal map
+        tol (float): tolerance in degrees
+    Returns:
+        edge (np.ndarray): shape (..., height, width) of dtype torch.bool
+    """
+    assert normals.ndim >= 3 and normals.shape[-1] == 3, (
+        "normal should be of shape (..., height, width, 3)"
+    )
+    normals = normals / (np.linalg.norm(normals, axis=-1, keepdims=True) + 1e-12)
+    padding = kernel_size // 2
+    normals_window = sliding_window_2d(
+        np.pad(
+            normals,
+            (
+                *([(0, 0)] * (normals.ndim - 3)),
+                (padding, padding),
+                (padding, padding),
+                (0, 0),
+            ),
+            mode="edge",
+        ),
+        window_size=kernel_size,
+        stride=1,
+        axis=(-3, -2),
+    )
+    if mask is None:
+        angle_diff = np.arccos(
+            (normals[..., None, None] * normals_window).sum(axis=-3)
+        ).max(axis=(-2, -1))
+    else:
+        mask_window = sliding_window_2d(
+            np.pad(
+                mask,
+                (*([(0, 0)] * (mask.ndim - 3)), (padding, padding), (padding, padding)),
+                mode="edge",
+            ),
+            window_size=kernel_size,
+            stride=1,
+            axis=(-3, -2),
+        )
+        angle_diff = np.where(
+            mask_window,
+            np.arccos((normals[..., None, None] * normals_window).sum(axis=-3)),
+            0,
+        ).max(axis=(-2, -1))
+    angle_diff = max_pool_2d(
+        angle_diff, kernel_size, stride=1, padding=kernel_size // 2
+    )
+    edge = angle_diff > np.deg2rad(tol)
+    return edge

hyworldmirror/utils/inference_utils.py ADDED Viewed

	@@ -0,0 +1,824 @@

+"""
+Inference utilities for WorldMirror pipeline.
+Includes: image preprocessing, input preparation, prior loading, mask computation,
+result saving, and timing utilities.
+"""
+import glob
+import json
+import os
+import time
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+import cv2
+import numpy as np
+import torch
+from PIL import Image
+from torchvision import transforms
+from ..models.utils.camera_utils import vector_to_camera_matrices
+from ..models.utils.geometry import depth_to_world_coords_points
+from .save_utils import (
+    save_depth_png, save_depth_npy, save_normal_png,
+    save_gs_ply, save_points_ply, save_camera_params,
+)
+from .video_utils import video_to_image_frames, video_to_image_frames_new
+from .visual_util import segment_sky, download_file_from_url
+from .geometry import depth_edge, normals_edge
+_IO_WORKERS = 8
+# ============================================================
+# Image Preprocessing
+# ============================================================
+def _handle_alpha_channel(img_data):
+    """Process RGBA images by blending with white background."""
+    if img_data.mode == "RGBA":
+        white_bg = Image.new("RGBA", img_data.size, (255, 255, 255, 255))
+        img_data = Image.alpha_composite(white_bg, img_data)
+    return img_data.convert("RGB")
+def _calculate_resize_dims(orig_w, orig_h, max_dim, resize_strategy, patch_size=14):
+    """Calculate new dimensions based on resize strategy."""
+    if orig_w >= orig_h:
+        new_w = max_dim
+        new_h = round(orig_h * (new_w / orig_w) / patch_size) * patch_size
+    else:
+        new_h = max_dim
+        new_w = round(orig_w * (new_h / orig_h) / patch_size) * patch_size
+    return new_w, new_h
+def _apply_padding(tensor_img, target_dim):
+    """Apply padding to make tensor square."""
+    h_pad = target_dim - tensor_img.shape[1]
+    w_pad = target_dim - tensor_img.shape[2]
+    if h_pad > 0 or w_pad > 0:
+        pad_top, pad_bottom = h_pad // 2, h_pad - h_pad // 2
+        pad_left, pad_right = w_pad // 2, w_pad - w_pad // 2
+        return torch.nn.functional.pad(
+            tensor_img, (pad_left, pad_right, pad_top, pad_bottom),
+            mode="constant", value=1.0,
+        )
+    return tensor_img
+def prepare_images_to_tensor(file_paths, resize_strategy="crop", target_size=518):
+    """Process image files into uniform tensor batch [1, N, 3, H, W]."""
+    if not file_paths:
+        raise ValueError("At least 1 image is required")
+    if resize_strategy not in ["crop", "pad"]:
+        raise ValueError("Strategy must be 'crop' or 'pad'")
+    tensor_list = []
+    converter = transforms.ToTensor()
+    for file_path in file_paths:
+        img_data = Image.open(file_path)
+        img_data = _handle_alpha_channel(img_data)
+        orig_w, orig_h = img_data.size
+        new_w, new_h = _calculate_resize_dims(orig_w, orig_h, target_size, resize_strategy)
+        img_data = img_data.resize((new_w, new_h), Image.Resampling.BICUBIC)
+        tensor_img = converter(img_data)
+        if resize_strategy == "crop":
+            if new_h > target_size:
+                crop_start = (new_h - target_size) // 2
+                tensor_img = tensor_img[:, crop_start:crop_start + target_size, :]
+            if new_w > target_size:
+                crop_start = (new_w - target_size) // 2
+                tensor_img = tensor_img[:, :, crop_start:crop_start + target_size]
+        elif resize_strategy == "pad":
+            tensor_img = _apply_padding(tensor_img, target_size)
+        tensor_list.append(tensor_img)
+    shapes = set((t.shape[1], t.shape[2]) for t in tensor_list)
+    if len(shapes) > 1:
+        raise ValueError(
+            f"Inconsistent resolutions after preprocessing: {shapes}. "
+            f"All input images must have the same aspect ratio."
+        )
+    batch_tensor = torch.stack(tensor_list)
+    if batch_tensor.dim() == 3:
+        batch_tensor = batch_tensor.unsqueeze(0)
+    return batch_tensor.unsqueeze(0)
+# ============================================================
+# Input Preparation
+# ============================================================
+def prepare_input(input_path, target_size=518, fps=1,
+                  video_strategy="new", min_frames=1, max_frames=64,
+                  temp_dir=None):
+    """Read images or extract video frames. Returns (img_paths, subdir_name)."""
+    input_path = Path(input_path)
+    video_exts = ['.mp4', '.avi', '.mov', '.webm', '.gif']
+    if input_path.is_file() and input_path.suffix.lower() in video_exts:
+        subdir_name = input_path.stem
+        frames_dir = Path(temp_dir or "/tmp") / f"frames_{subdir_name}"
+        frames_dir.mkdir(parents=True, exist_ok=True)
+        min_f = max(1, min_frames)
+        max_f = min(64, max_frames)
+        if video_strategy == "new":
+            img_paths = video_to_image_frames_new(
+                str(input_path), str(frames_dir),
+                min_frames=min_f, max_frames=max_f, fallback_fps=fps,
+            )
+        else:
+            img_paths = video_to_image_frames(str(input_path), str(frames_dir), fps=fps)
+            if len(img_paths) > max_f:
+                indices = np.linspace(0, len(img_paths) - 1, max_f, dtype=int)
+                img_paths = [img_paths[i] for i in indices]
+        if not img_paths:
+            raise RuntimeError(f"Failed to extract frames from {input_path}")
+        img_paths = sorted(img_paths)
+        print(f"[Input] Extracted {len(img_paths)} frames from video: {input_path}")
+    elif input_path.is_dir():
+        subdir_name = input_path.name
+        img_paths = []
+        for ext in ["*.jpeg", "*.jpg", "*.png", "*.webp"]:
+            img_paths.extend(glob.glob(os.path.join(str(input_path), ext)))
+        if not img_paths:
+            raise FileNotFoundError(f"No images found in {input_path}")
+        img_paths = sorted(img_paths)
+        print(f"[Input] Loaded {len(img_paths)} images from: {input_path}")
+    elif input_path.is_file() and input_path.suffix.lower() in ['.jpg', '.jpeg', '.png', '.webp']:
+        subdir_name = input_path.stem
+        img_paths = [str(input_path)]
+        print(f"[Input] Single image input: {input_path}")
+    else:
+        raise ValueError(f"Invalid input path: {input_path}")
+    return img_paths, subdir_name
+def compute_adaptive_target_size(img_paths, max_target_size=518, patch_size=14):
+    """Compute inference resolution = min(image_longest_edge, max_target_size).
+    Rounds down to nearest multiple of patch_size. Avoids upsampling small images.
+    """
+    first_img = Image.open(img_paths[0])
+    orig_w, orig_h = first_img.size
+    longest_edge = max(orig_w, orig_h)
+    effective = min(longest_edge, max_target_size)
+    effective = (effective // patch_size) * patch_size
+    return max(effective, patch_size * 2)
+# ============================================================
+# Prior Loading
+# ============================================================
+def compute_preprocessing_transform(img_paths, target_size, patch_size=14):
+    """Compute the resize + center-crop transform applied by prepare_images_to_tensor.
+    Returns dict with orig/new/final sizes and scale/crop parameters.
+    """
+    first_img = Image.open(img_paths[0])
+    orig_w, orig_h = first_img.size
+    new_w, new_h = _calculate_resize_dims(orig_w, orig_h, target_size, "crop", patch_size)
+    crop_y = (new_h - target_size) // 2 if new_h > target_size else 0
+    crop_x = (new_w - target_size) // 2 if new_w > target_size else 0
+    return {
+        "orig_w": orig_w, "orig_h": orig_h,
+        "new_w": new_w, "new_h": new_h,
+        "crop_x": crop_x, "crop_y": crop_y,
+        "final_w": min(new_w, target_size), "final_h": min(new_h, target_size),
+        "scale_x": new_w / orig_w, "scale_y": new_h / orig_h,
+    }
+def load_prior_camera(prior_cam_path, img_paths, preprocess_transform=None):
+    """Load camera priors from JSON. Returns (extrinsics [1,N,4,4], intrinsics [1,N,3,3])."""
+    with open(prior_cam_path, "r") as f:
+        cam_data = json.load(f)
+    stem_to_idx = {Path(p).stem: i for i, p in enumerate(img_paths)}
+    N = len(img_paths)
+    extrinsics = None
+    extr_list = cam_data.get("extrinsics", [])
+    if extr_list:
+        extr_array = np.zeros((N, 4, 4), dtype=np.float32)
+        matched = 0
+        for entry in extr_list:
+            cam_id = str(entry["camera_id"])
+            idx = stem_to_idx.get(cam_id)
+            if idx is None and cam_id.isdigit() and int(cam_id) < N:
+                idx = int(cam_id)
+            if idx is not None:
+                extr_array[idx] = np.array(entry["matrix"], dtype=np.float32)
+                matched += 1
+        if matched == N:
+            extrinsics = torch.from_numpy(extr_array).unsqueeze(0)
+            print(f"[Prior] Loaded extrinsics for {matched}/{N} cameras")
+        else:
+            print(f"[Prior] Warning: extrinsics matched {matched}/{N}, disabling")
+    intrinsics = None
+    intr_list = cam_data.get("intrinsics", [])
+    if intr_list:
+        intr_array = np.zeros((N, 3, 3), dtype=np.float32)
+        matched = 0
+        for entry in intr_list:
+            cam_id = str(entry["camera_id"])
+            idx = stem_to_idx.get(cam_id)
+            if idx is None and cam_id.isdigit() and int(cam_id) < N:
+                idx = int(cam_id)
+            if idx is not None:
+                intr_array[idx] = np.array(entry["matrix"], dtype=np.float32)
+                matched += 1
+        if matched == N:
+            intrinsics = torch.from_numpy(intr_array).unsqueeze(0)
+            print(f"[Prior] Loaded intrinsics for {matched}/{N} cameras")
+        else:
+            print(f"[Prior] Warning: intrinsics matched {matched}/{N}, disabling")
+    if intrinsics is not None and preprocess_transform is not None:
+        sx, sy = preprocess_transform["scale_x"], preprocess_transform["scale_y"]
+        cx_off, cy_off = preprocess_transform["crop_x"], preprocess_transform["crop_y"]
+        intrinsics = intrinsics.clone()
+        intrinsics[:, :, 0, :] *= sx
+        intrinsics[:, :, 1, :] *= sy
+        intrinsics[:, :, 0, 2] -= cx_off
+        intrinsics[:, :, 1, 2] -= cy_off
+    return extrinsics, intrinsics
+def _read_depth_file(depth_path):
+    """Read a single depth file (.npy, .exr, .png). Returns float32 [H, W]."""
+    ext = Path(depth_path).suffix.lower()
+    if ext == ".npy":
+        depthmap = np.load(depth_path).astype(np.float32)
+        if depthmap.ndim == 3:
+            depthmap = depthmap[:, :, 0]
+    elif ext == ".exr":
+        depthmap = cv2.imread(depth_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH).astype(np.float32)
+        if depthmap.ndim == 3:
+            depthmap = depthmap[:, :, 0]
+    elif ext == ".png":
+        depthmap = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED)
+        if depthmap is None:
+            raise FileNotFoundError(f"Cannot read depth PNG: {depth_path}")
+        depthmap = depthmap.astype(np.float32)
+        if depthmap.ndim == 3:
+            depthmap = depthmap[:, :, 0]
+        if depthmap.max() > 255:
+            depthmap = depthmap / 1000.0
+    else:
+        raise ValueError(f"Unsupported depth format: {ext}")
+    return np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0)
+def load_prior_depth(prior_depth_path, img_paths, target_h, target_w,
+                     preprocess_transform=None):
+    """Load depth priors from a folder. Returns [1, N, H, W] or None."""
+    depth_dir = Path(prior_depth_path)
+    if not depth_dir.is_dir():
+        return None
+    depth_files = {}
+    for f in sorted(depth_dir.iterdir()):
+        if f.suffix.lower() in (".npy", ".exr", ".png"):
+            if f.stem not in depth_files or f.suffix.lower() == ".npy":
+                depth_files[f.stem] = str(f)
+    N = len(img_paths)
+    depth_maps = []
+    for img_p in img_paths:
+        img_stem = Path(img_p).stem
+        dpath = depth_files.get(img_stem)
+        if dpath is None:
+            img_nums = ''.join(filter(str.isdigit, img_stem))
+            for dstem, dc in depth_files.items():
+                if img_nums and img_nums == ''.join(filter(str.isdigit, dstem)):
+                    dpath = dc
+                    break
+        if dpath is None:
+            return None
+        depthmap = _read_depth_file(dpath)
+        if preprocess_transform is not None:
+            nw, nh = preprocess_transform["new_w"], preprocess_transform["new_h"]
+            cx, cy = preprocess_transform["crop_x"], preprocess_transform["crop_y"]
+            fw, fh = preprocess_transform["final_w"], preprocess_transform["final_h"]
+            if depthmap.shape[:2] != (nh, nw):
+                depthmap = cv2.resize(depthmap, (nw, nh), interpolation=cv2.INTER_LINEAR)
+            depthmap = depthmap[cy:cy + fh, cx:cx + fw]
+        else:
+            if depthmap.shape[:2] != (target_h, target_w):
+                depthmap = cv2.resize(depthmap, (target_w, target_h), interpolation=cv2.INTER_LINEAR)
+        depth_maps.append(depthmap)
+    depth_tensor = torch.from_numpy(np.stack(depth_maps, axis=0)).unsqueeze(0)
+    print(f"[Prior] Loaded {N} depth maps from {prior_depth_path}")
+    return depth_tensor
+# ============================================================
+# Mask Computation
+# ============================================================
+def create_filter_mask(
+    pts3d_conf, depth_preds, normal_preds, sky_mask,
+    confidence_percentile=10.0, edge_normal_threshold=5.0,
+    edge_depth_threshold=0.03, apply_confidence_mask=True,
+    apply_edge_mask=True, apply_sky_mask=False, gs_depth_preds=None,
+):
+    """Create filter mask based on confidence, edges, and sky segmentation.
+    Returns pts_mask [S,H,W] or (pts_mask, gs_mask) tuple if gs_depth_preds given.
+    """
+    S, H, W = pts3d_conf.shape[:3]
+    final_mask_list = []
+    gs_mask_list = [] if gs_depth_preds is not None else None
+    for i in range(S):
+        final_mask = None
+        if apply_confidence_mask:
+            threshold = np.quantile(pts3d_conf[i], confidence_percentile / 100.0)
+            conf_mask = pts3d_conf[i] >= threshold
+            final_mask = conf_mask if final_mask is None else final_mask & conf_mask
+        pre_edge_mask = final_mask
+        if apply_edge_mask:
+            n_edges = normals_edge(normal_preds[i], tol=edge_normal_threshold, mask=pre_edge_mask)
+            d_edges = depth_edge(depth_preds[i, :, :, 0], rtol=edge_depth_threshold, mask=pre_edge_mask)
+            edge_mask = ~(d_edges & n_edges)
+            final_mask = edge_mask if final_mask is None else final_mask & edge_mask
+            if gs_depth_preds is not None:
+                gs_d_edges = depth_edge(gs_depth_preds[i, :, :, 0], rtol=edge_depth_threshold, mask=pre_edge_mask)
+                gs_edge_mask = ~(gs_d_edges & n_edges)
+                gs_frame_mask = gs_edge_mask if pre_edge_mask is None else pre_edge_mask & gs_edge_mask
+        if apply_sky_mask:
+            final_mask = sky_mask[i] if final_mask is None else final_mask & sky_mask[i]
+            if gs_depth_preds is not None and apply_edge_mask:
+                gs_frame_mask = gs_frame_mask & sky_mask[i]
+        final_mask_list.append(final_mask)
+        if gs_mask_list is not None:
+            gs_mask_list.append(gs_frame_mask if apply_edge_mask else final_mask)
+    def _stack(ml):
+        return np.stack(ml, axis=0) if ml[0] is not None else np.ones((S, H, W), dtype=bool)
+    pts_mask = _stack(final_mask_list)
+    if gs_mask_list is not None:
+        return pts_mask, _stack(gs_mask_list)
+    return pts_mask
+def _compute_sky_mask_from_model(predictions, H, W, S, threshold=0.5):
+    """Build sky mask from model predictions. Returns [S,H,W] bool or None."""
+    for key in ("gs_depth_mask_logits", "gs_depth_mask", "depth_mask_logits", "depth_mask"):
+        if key in predictions:
+            prob = predictions[key].sigmoid() if "logits" in key else predictions[key]
+            dm = prob[0].detach().cpu()
+            if dm.dim() == 4 and dm.shape[-1] == 1:
+                dm = dm.squeeze(-1)
+            if dm.dim() != 3 or dm.shape[0] != S:
+                return None
+            mask = (dm > threshold).numpy().astype(bool)
+            if mask.shape[1] != H or mask.shape[2] != W:
+                mask = np.stack([cv2.resize(mask[i].astype(np.uint8), (W, H),
+                                            interpolation=cv2.INTER_NEAREST) > 0
+                                 for i in range(S)], axis=0)
+            return mask
+    return None
+def compute_sky_mask(img_paths, H, W, S, predictions=None, source="auto",
+                     model_threshold=0.5, processed_aspect_ratio=None):
+    """Compute sky segmentation mask [S,H,W] (True=non-sky, False=sky)."""
+    if source == "model":
+        mask = _compute_sky_mask_from_model(predictions, H, W, S, model_threshold) if predictions else None
+        return mask if mask is not None else np.ones((S, H, W), dtype=bool)
+    skyseg_path = "skyseg.onnx"
+    if not os.path.exists(skyseg_path):
+        download_file_from_url(
+            "https://huggingface.co/JianyuanWang/skyseg/resolve/main/skyseg.onnx",
+            skyseg_path,
+        )
+    import onnxruntime
+    session = onnxruntime.InferenceSession(skyseg_path)
+    sky_list = []
+    for i in range(S):
+        if processed_aspect_ratio is not None:
+            pil_img = Image.open(img_paths[i]).convert("RGB")
+            sw, sh = pil_img.size
+            if sw / sh > processed_aspect_ratio:
+                cw = int(round(sh * processed_aspect_ratio))
+                ch = sh
+            else:
+                cw = sw
+                ch = int(round(sw / processed_aspect_ratio))
+            left, top = (sw - cw) // 2, (sh - ch) // 2
+            pil_img = pil_img.crop((left, top, left + cw, top + ch))
+            frame = segment_sky(cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR), session)
+        else:
+            frame = segment_sky(img_paths[i], session)
+        if frame.shape[:2] != (H, W):
+            frame = cv2.resize(frame, (W, H))
+        sky_list.append(frame)
+    sky_mask = np.stack(sky_list, axis=0) > 0
+    if source == "auto" and predictions is not None:
+        model_mask = _compute_sky_mask_from_model(predictions, H, W, S, model_threshold)
+        if model_mask is not None:
+            sky_mask = sky_mask & model_mask
+    return sky_mask
+def compute_filter_mask(predictions, imgs, img_paths, H, W, S,
+                        apply_confidence_mask=False, apply_edge_mask=False,
+                        apply_sky_mask=False, confidence_percentile=10.0,
+                        edge_normal_threshold=5.0, edge_depth_threshold=0.03,
+                        sky_mask=None, use_gs_depth=False):
+    """Compute unified filter mask. Returns (filter_mask, gs_filter_mask) tuple."""
+    if not (apply_confidence_mask or apply_edge_mask or apply_sky_mask):
+        return np.ones((S, H, W), dtype=bool), None
+    if apply_sky_mask and sky_mask is None:
+        sky_mask = compute_sky_mask(img_paths, H, W, S, processed_aspect_ratio=W / H)
+    elif sky_mask is None:
+        sky_mask = np.ones((S, H, W), dtype=bool)
+    if "pts3d_conf" in predictions:
+        conf_np = predictions["pts3d_conf"][0].detach().cpu().float().numpy()
+    elif "depth_conf" in predictions:
+        conf_np = predictions["depth_conf"][0].detach().cpu().float().numpy()
+    else:
+        conf_np = np.ones((S, H, W), dtype=np.float32)
+    depth_np = predictions["depth"][0].detach().cpu().float().numpy()
+    normal_np = predictions["normals"][0].detach().cpu().float().numpy()
+    gs_depth_np = None
+    if use_gs_depth and "gs_depth" in predictions:
+        raw = predictions["gs_depth"][0].detach().cpu().float().numpy()
+        gs_depth_np = raw if raw.ndim == 4 else raw[..., np.newaxis]
+    result = create_filter_mask(
+        conf_np, depth_np, normal_np, sky_mask,
+        confidence_percentile=confidence_percentile,
+        edge_normal_threshold=edge_normal_threshold,
+        edge_depth_threshold=edge_depth_threshold,
+        apply_confidence_mask=apply_confidence_mask,
+        apply_edge_mask=apply_edge_mask,
+        apply_sky_mask=apply_sky_mask,
+        gs_depth_preds=gs_depth_np,
+    )
+    if gs_depth_np is not None:
+        pts_mask, gs_mask = result
+        total = pts_mask.size
+        print(f"[Mask] Filter: pts kept {pts_mask.sum()}/{total}, gs kept {gs_mask.sum()}/{total}")
+        return pts_mask, gs_mask
+    print(f"[Mask] Filter: kept {result.sum()}/{result.size} points")
+    return result, None
+# ============================================================
+# Save Utilities
+# ============================================================
+def _timed_call(func, *args, **kwargs):
+    t0 = time.perf_counter()
+    result = func(*args, **kwargs)
+    return result, time.perf_counter() - t0
+def _save_depth_parallel(depth_cpu, depth_dir, S):
+    def _save_one(i):
+        save_depth_png(depth_dir / f"depth_{i:04d}.png", depth_cpu[i, :, :, 0])
+        save_depth_npy(depth_dir / f"depth_{i:04d}.npy", depth_cpu[i, :, :, 0])
+    with ThreadPoolExecutor(max_workers=_IO_WORKERS) as pool:
+        list(pool.map(_save_one, range(S)))
+def _save_conf_parallel(depth_conf_cpu, conf_dir, S):
+    def _save_one(i):
+        conf = depth_conf_cpu[i]
+        c_min, c_max = conf.min(), conf.max()
+        norm = (conf - c_min) / (c_max - c_min) if c_max - c_min > 1e-8 else torch.ones_like(conf)
+        Image.fromarray((norm.clamp(0, 1) * 255).to(torch.uint8).numpy(), mode="L").save(
+            str(conf_dir / f"conf_{i+1:04d}.png"))
+    with ThreadPoolExecutor(max_workers=_IO_WORKERS) as pool:
+        list(pool.map(_save_one, range(S)))
+def _save_normal_parallel(normals_cpu, normal_dir, S):
+    def _save_one(i):
+        save_normal_png(normal_dir / f"normal_{i:04d}.png", normals_cpu[i])
+    with ThreadPoolExecutor(max_workers=_IO_WORKERS) as pool:
+        list(pool.map(_save_one, range(S)))
+def _save_sky_mask_parallel(sky_mask, sky_mask_dir, S):
+    def _save_one(i):
+        Image.fromarray((~sky_mask[i]).astype(np.uint8) * 255, mode="L").save(
+            str(sky_mask_dir / f"sky_mask_{i:04d}.png"))
+    with ThreadPoolExecutor(max_workers=_IO_WORKERS) as pool:
+        list(pool.map(_save_one, range(S)))
+def _voxel_prune_gaussians(means, scales, quats, colors, opacities, weights, voxel_size=0.002):
+    """Voxel-based merging of Gaussian splats via weighted average."""
+    N = means.shape[0]
+    if N == 0:
+        return means, scales, quats, colors, opacities
+    voxel_idx = (means / voxel_size).floor().long()
+    voxel_idx = voxel_idx - voxel_idx.min(dim=0)[0]
+    vmax = voxel_idx.max(dim=0)[0] + 1
+    flat = voxel_idx[:, 0] * vmax[1] * vmax[2] + voxel_idx[:, 1] * vmax[2] + voxel_idx[:, 2]
+    unique, inv = torch.unique(flat, return_inverse=True)
+    K = len(unique)
+    if K == N:
+        return means, scales, quats, colors, opacities
+    w = weights
+    wsum = torch.zeros(K, dtype=w.dtype).scatter_add_(0, inv, w).clamp(min=1e-8)
+    def _wavg(vals):
+        out = torch.zeros(K, *vals.shape[1:], dtype=vals.dtype)
+        for d in range(vals.shape[1]):
+            out[:, d].scatter_add_(0, inv, vals[:, d] * w)
+        return out / wsum.unsqueeze(-1)
+    m_opa = torch.zeros(K, dtype=opacities.dtype).scatter_add_(0, inv, w * w) / wsum
+    m_quats = torch.zeros(K, 4, dtype=quats.dtype)
+    for d in range(4):
+        m_quats[:, d].scatter_add_(0, inv, quats[:, d] * w)
+    m_quats = m_quats / m_quats.norm(dim=1, keepdim=True).clamp(min=1e-8)
+    print(f"[Save] Voxel prune: {N} -> {K} gaussians")
+    return _wavg(means), _wavg(scales), m_quats, _wavg(colors), m_opa
+def _compress_points_voxel_then_sample(pts_np, cols_np, max_points=2_000_000, voxel_size=0.005):
+    """Compress point cloud: voxel merge then uniform random sampling."""
+    n_in = int(pts_np.shape[0])
+    if n_in == 0:
+        return pts_np, cols_np
+    if voxel_size > 0:
+        voxel = np.floor(pts_np / voxel_size).astype(np.int64)
+        voxel -= voxel.min(axis=0, keepdims=True)
+        _, inv = np.unique(voxel, axis=0, return_inverse=True)
+        k = int(inv.max()) + 1
+        if k < n_in:
+            counts = np.maximum(np.bincount(inv, minlength=k).astype(np.float32), 1.0)
+            pts_np = np.stack([np.bincount(inv, weights=pts_np[:, d], minlength=k)
+                               for d in range(3)], axis=1).astype(np.float32) / counts[:, None]
+            cols_np = np.clip(np.round(
+                np.stack([np.bincount(inv, weights=cols_np[:, d].astype(np.float32), minlength=k)
+                          for d in range(3)], axis=1) / counts[:, None]
+            ), 0, 255).astype(np.uint8)
+    if max_points > 0 and pts_np.shape[0] > max_points:
+        idx = np.random.default_rng(42).choice(pts_np.shape[0], size=max_points, replace=False)
+        pts_np, cols_np = pts_np[idx], cols_np[idx]
+    return pts_np, cols_np
+def _compute_points_from_depth(depth_pred, imgs, extrinsics, intrinsics, S, H, W, filter_mask=None):
+    """Derive 3D point cloud from depth + camera outputs."""
+    depth_pred, extrinsics, intrinsics = depth_pred.float(), extrinsics.float(), intrinsics.float()
+    points_list, colors_list = [], []
+    for i in range(S):
+        d = depth_pred[0, i, :, :, 0]
+        w2c = torch.cat([extrinsics[i][:3, :4],
+                         torch.tensor([[0, 0, 0, 1]], device=extrinsics.device)], dim=0)
+        c2w = torch.linalg.inv(w2c)[:3, :4]
+        pts_i, _, mask = depth_to_world_coords_points(d[None], c2w[None], intrinsics[i][None])
+        img_colors = (imgs[0, i].permute(1, 2, 0) * 255).to(torch.uint8)
+        valid = mask[0]
+        if filter_mask is not None:
+            valid = valid & torch.from_numpy(filter_mask[i]).to(valid.device)
+        if valid.sum().item() > 0:
+            points_list.append(pts_i[0][valid])
+            colors_list.append(img_colors[valid])
+    if not points_list:
+        return np.empty((0, 3), dtype=np.float32), np.empty((0, 3), dtype=np.uint8)
+    return (torch.cat(points_list).detach().cpu().float().numpy(),
+            torch.cat(colors_list).detach().cpu().to(torch.uint8).numpy())
+def _save_colmap_lightweight(extrinsics, intrinsics, outdir, final_w, final_h, S, image_names):
+    """Save lightweight COLMAP reconstruction (cameras + images only)."""
+    import pycolmap
+    sparse_dir = outdir / "sparse" / "0"
+    sparse_dir.mkdir(parents=True, exist_ok=True)
+    scene = pycolmap.Reconstruction()
+    for i in range(S):
+        focal_avg = (intrinsics[i][0, 0] + intrinsics[i][1, 1]) / 2
+        camera = pycolmap.Camera(
+            model="SIMPLE_PINHOLE", width=final_w, height=final_h,
+            params=np.array([focal_avg, intrinsics[i][0, 2], intrinsics[i][1, 2]]),
+            camera_id=i + 1,
+        )
+        scene.add_camera(camera)
+        cam_from_world = pycolmap.Rigid3d(
+            pycolmap.Rotation3d(extrinsics[i][:3, :3]), extrinsics[i][:3, 3])
+        img = pycolmap.Image(id=i + 1, name=image_names[i], camera_id=i + 1,
+                             cam_from_world=cam_from_world)
+        img.registered = True
+        scene.add_image(img)
+    scene.write(str(sparse_dir))
+    print(f"[Save] COLMAP sparse -> {sparse_dir}")
+def save_results(predictions, imgs, img_paths, outdir,
+                 save_depth=True, save_normal=True, save_gs=True,
+                 save_camera=True, save_colmap=False, save_points=True,
+                 save_sky_mask=False, save_conf=False, log_time=False,
+                 max_resolution=1920,
+                 filter_mask=None, gs_filter_mask=None, sky_mask=None,
+                 compress_pts=True, compress_pts_max_points=2_000_000,
+                 compress_pts_voxel_size=0.002,
+                 compress_gs_max_points=5_000_000):
+    """Save all results with parallel I/O. Returns timing dict."""
+    timings = {}
+    outdir = Path(outdir)
+    outdir.mkdir(parents=True, exist_ok=True)
+    B, S, C, H, W = imgs.shape
+    ar = W / H
+    max_w = max(Image.open(p).size[0] for p in img_paths)
+    new_w, new_h = max_w, int(round(max_w / ar))
+    longest = max(new_w, new_h)
+    if longest > max_resolution:
+        sf = max_resolution / longest
+        new_w, new_h = int(new_w * sf), int(new_h * sf)
+    new_w -= new_w % 2
+    new_h -= new_h % 2
+    image_names = [f"image_{i+1:04d}.jpg" for i in range(S)]
+    depth_cpu = predictions["depth"][0].detach().cpu() if "depth" in predictions else None
+    conf_cpu = predictions.get("depth_conf", [None])[0]
+    if conf_cpu is not None:
+        conf_cpu = conf_cpu.detach().cpu()
+    normals_cpu = predictions["normals"][0].detach().cpu() if "normals" in predictions else None
+    futures = {}
+    executor = ThreadPoolExecutor(max_workers=_IO_WORKERS)
+    if save_depth and depth_cpu is not None:
+        d_dir = outdir / "depth"
+        d_dir.mkdir(exist_ok=True)
+        futures["save_depth"] = executor.submit(_timed_call, _save_depth_parallel, depth_cpu, d_dir, S)
+    if save_conf and conf_cpu is not None:
+        c_dir = outdir / "depth_conf"
+        c_dir.mkdir(exist_ok=True)
+        futures["save_conf"] = executor.submit(_timed_call, _save_conf_parallel, conf_cpu, c_dir, S)
+    if save_normal and normals_cpu is not None:
+        n_dir = outdir / "normal"
+        n_dir.mkdir(exist_ok=True)
+        futures["save_normal"] = executor.submit(_timed_call, _save_normal_parallel, normals_cpu, n_dir, S)
+    if save_sky_mask and sky_mask is not None:
+        sm_dir = outdir / "sky_mask"
+        sm_dir.mkdir(exist_ok=True)
+        futures["save_sky_mask"] = executor.submit(_timed_call, _save_sky_mask_parallel, sky_mask, sm_dir, S)
+    if save_gs and "splats" in predictions:
+        sp = predictions["splats"]
+        means = sp["means"][0].reshape(-1, 3).detach().cpu()
+        scales = sp["scales"][0].reshape(-1, 3).detach().cpu()
+        quats = sp["quats"][0].reshape(-1, 4).detach().cpu()
+        colors = (sp["sh"][0] if "sh" in sp else sp["colors"][0]).reshape(-1, 3).detach().cpu()
+        opacities = sp["opacities"][0].reshape(-1).detach().cpu()
+        weights = sp["weights"][0].reshape(-1).detach().cpu() if "weights" in sp else torch.ones_like(opacities)
+        keep = None
+        if gs_filter_mask is not None:
+            keep = torch.from_numpy(gs_filter_mask.reshape(-1)).bool()
+        elif filter_mask is not None:
+            keep = torch.from_numpy(filter_mask.reshape(-1)).bool()
+        if keep is not None:
+            means, scales, quats = means[keep], scales[keep], quats[keep]
+            colors, opacities, weights = colors[keep], opacities[keep], weights[keep]
+        means, scales, quats, colors, opacities = _voxel_prune_gaussians(
+            means, scales, quats, colors, opacities, weights)
+        if compress_gs_max_points > 0 and means.shape[0] > compress_gs_max_points:
+            idx = torch.from_numpy(
+                np.random.default_rng(42).choice(means.shape[0], size=compress_gs_max_points, replace=False)
+            ).long()
+            means, scales, quats, colors, opacities = means[idx], scales[idx], quats[idx], colors[idx], opacities[idx]
+        futures["save_gs_ply"] = executor.submit(
+            _timed_call, save_gs_ply, outdir / "gaussians.ply", means, scales, quats, colors, opacities)
+    if save_camera and "camera_poses" in predictions and "camera_intrs" in predictions:
+        cam_p = predictions["camera_poses"][0].detach().cpu().float().numpy()
+        cam_i = predictions["camera_intrs"][0].detach().cpu().float().numpy()
+        futures["save_camera"] = executor.submit(_timed_call, save_camera_params, cam_p, cam_i, str(outdir))
+    if save_points and "depth" in predictions and "camera_params" in predictions:
+        e3x4, intr = vector_to_camera_matrices(predictions["camera_params"], image_hw=(H, W))
+        pts_np, cols_np = _compute_points_from_depth(
+            predictions["depth"], imgs, e3x4[0], intr[0], S, H, W, filter_mask=filter_mask)
+        futures["save_points"] = executor.submit(
+            _timed_call, _save_points_artifacts, outdir / "points.ply", pts_np, cols_np,
+            compress_pts, compress_pts_max_points, compress_pts_voxel_size)
+    if save_colmap and "camera_params" in predictions:
+        e3x4, intr = vector_to_camera_matrices(predictions["camera_params"], image_hw=(new_h, new_w))
+        futures["save_colmap"] = executor.submit(
+            _timed_call, _save_colmap_lightweight,
+            e3x4[0].detach().cpu().float().numpy(), intr[0].detach().cpu().float().numpy(),
+            outdir, new_w, new_h, S, image_names)
+    for key, future in futures.items():
+        result, elapsed = future.result()
+        if log_time:
+            timings[key] = elapsed
+            if isinstance(result, dict):
+                timings.update(result)
+    executor.shutdown(wait=False)
+    return timings
+def _save_points_artifacts(path, pts_np, cols_np,
+                           compress=False, max_points=2_000_000,
+                           voxel_size=0.005):
+    timings = {}
+    if compress:
+        t0 = time.perf_counter()
+        pts_np, cols_np = _compress_points_voxel_then_sample(pts_np, cols_np, max_points, voxel_size)
+        timings["compress_points"] = time.perf_counter() - t0
+    save_points_ply(path, pts_np, cols_np)
+    return timings
+# ============================================================
+# Timing Report
+# ============================================================
+def print_and_save_timings(timings, outdir):
+    """Print formatted timing table and save to JSON."""
+    def _p(label, value, indent=0):
+        print(f"{'  ' * (indent + 1)}{label:<38s} {value:>10.3f}s")
+    print(f"\n{'='*72}\n  TIMING REPORT\n{'='*72}")
+    print("  [Serial Stages]")
+    for key, label in [("data_loading", "Data loading"),
+                       ("inference_preprocess", "Inference preprocess"),
+                       ("inference", "Model inference"),
+                       ("compute_mask", "Compute filter mask")]:
+        if key in timings:
+            _p(label, timings[key], 1)
+    save_wall = timings.get("save_total_wall")
+    save_keys = [("save_depth", "Depth"), ("save_conf", "Depth conf"),
+                 ("save_normal", "Normal"), ("save_sky_mask", "Sky mask"),
+                 ("save_gs_ply", "Gaussians"), ("save_camera", "Camera"),
+                 ("save_points", "Points"), ("save_colmap", "COLMAP")]
+    present = [(k, n) for k, n in save_keys if k in timings]
+    if save_wall is not None or present:
+        print("  [Save Stage | Parallel]")
+        if save_wall is not None:
+            _p("Save wall-clock", save_wall, 1)
+        for k, name in present:
+            _p(f"- {name}", timings[k], 2)
+    if "case_total" in timings:
+        print("  [Total]")
+        _p("Case total", timings["case_total"], 1)
+    if "gpu_mem_peak_per_rank_gb" in timings:
+        print("  [GPU Memory]")
+        for i, p in enumerate(timings["gpu_mem_peak_per_rank_gb"]):
+            print(f"    Rank {i}: {p:.2f} GB")
+        print(f"    Average: {timings['gpu_mem_peak_avg_gb']:.2f} GB")
+    print(f"{'='*72}\n")
+    outdir = Path(outdir)
+    outdir.mkdir(parents=True, exist_ok=True)
+    with open(outdir / "pipeline_timing.json", "w") as f:
+        json.dump(timings, f, indent=2)

hyworldmirror/utils/render_utils.py ADDED Viewed

	@@ -0,0 +1,294 @@

+"""
+Render interpolated video from Gaussian Splatting predictions.
+Interpolates smooth camera trajectories using SLERP quaternions,
+renders each frame via gsplat, and saves MP4 videos.
+"""
+from pathlib import Path
+import numpy as np
+import torch
+from tqdm import tqdm
+from ..models.models.rasterization import GaussianSplatRenderer
+def rotation_matrix_to_quaternion(R):
+    """Convert rotation matrix to quaternion (scalar-first: [w, x, y, z]).
+    Note: This uses the Hamilton convention [w, x, y, z], which differs from
+    models/utils/rotation.py that uses PyTorch3D convention [x, y, z, w].
+    """
+    trace = R[..., 0, 0] + R[..., 1, 1] + R[..., 2, 2]
+    q = torch.zeros(R.shape[:-2] + (4,), device=R.device, dtype=R.dtype)
+    mask1 = trace > 0
+    s = torch.sqrt(trace[mask1] + 1.0) * 2
+    q[mask1, 0] = 0.25 * s
+    q[mask1, 1] = (R[mask1, 2, 1] - R[mask1, 1, 2]) / s
+    q[mask1, 2] = (R[mask1, 0, 2] - R[mask1, 2, 0]) / s
+    q[mask1, 3] = (R[mask1, 1, 0] - R[mask1, 0, 1]) / s
+    mask2 = (~mask1) & (R[..., 0, 0] > R[..., 1, 1]) & (R[..., 0, 0] > R[..., 2, 2])
+    s = torch.sqrt(1.0 + R[mask2, 0, 0] - R[mask2, 1, 1] - R[mask2, 2, 2]) * 2
+    q[mask2, 0] = (R[mask2, 2, 1] - R[mask2, 1, 2]) / s
+    q[mask2, 1] = 0.25 * s
+    q[mask2, 2] = (R[mask2, 0, 1] + R[mask2, 1, 0]) / s
+    q[mask2, 3] = (R[mask2, 0, 2] + R[mask2, 2, 0]) / s
+    mask3 = (~mask1) & (~mask2) & (R[..., 1, 1] > R[..., 2, 2])
+    s = torch.sqrt(1.0 + R[mask3, 1, 1] - R[mask3, 0, 0] - R[mask3, 2, 2]) * 2
+    q[mask3, 0] = (R[mask3, 0, 2] - R[mask3, 2, 0]) / s
+    q[mask3, 1] = (R[mask3, 0, 1] + R[mask3, 1, 0]) / s
+    q[mask3, 2] = 0.25 * s
+    q[mask3, 3] = (R[mask3, 1, 2] + R[mask3, 2, 1]) / s
+    mask4 = (~mask1) & (~mask2) & (~mask3)
+    s = torch.sqrt(1.0 + R[mask4, 2, 2] - R[mask4, 0, 0] - R[mask4, 1, 1]) * 2
+    q[mask4, 0] = (R[mask4, 1, 0] - R[mask4, 0, 1]) / s
+    q[mask4, 1] = (R[mask4, 0, 2] + R[mask4, 2, 0]) / s
+    q[mask4, 2] = (R[mask4, 1, 2] + R[mask4, 2, 1]) / s
+    q[mask4, 3] = 0.25 * s
+    return q
+def quaternion_to_rotation_matrix(q):
+    """Convert quaternion (scalar-first: [w, x, y, z]) to rotation matrix."""
+    w, x, y, z = q[..., 0], q[..., 1], q[..., 2], q[..., 3]
+    norm = torch.sqrt(w*w + x*x + y*y + z*z)
+    w, x, y, z = w/norm, x/norm, y/norm, z/norm
+    R = torch.zeros(q.shape[:-1] + (3, 3), device=q.device, dtype=q.dtype)
+    R[..., 0, 0] = 1 - 2*(y*y + z*z)
+    R[..., 0, 1] = 2*(x*y - w*z)
+    R[..., 0, 2] = 2*(x*z + w*y)
+    R[..., 1, 0] = 2*(x*y + w*z)
+    R[..., 1, 1] = 1 - 2*(x*x + z*z)
+    R[..., 1, 2] = 2*(y*z - w*x)
+    R[..., 2, 0] = 2*(x*z - w*y)
+    R[..., 2, 1] = 2*(y*z + w*x)
+    R[..., 2, 2] = 1 - 2*(x*x + y*y)
+    return R
+def slerp_quaternions(q1, q2, t):
+    """Spherical linear interpolation between quaternions."""
+    dot = (q1 * q2).sum(dim=-1, keepdim=True)
+    mask = dot < 0
+    q2 = torch.where(mask, -q2, q2)
+    dot = torch.where(mask, -dot, dot)
+    DOT_THRESHOLD = 0.9995
+    mask_linear = dot > DOT_THRESHOLD
+    result = torch.zeros_like(q1)
+    if mask_linear.any():
+        result_linear = q1 + t * (q2 - q1)
+        norm = torch.norm(result_linear, dim=-1, keepdim=True)
+        result_linear = result_linear / norm
+        result = torch.where(mask_linear, result_linear, result)
+    mask_slerp = ~mask_linear
+    if mask_slerp.any():
+        theta_0 = torch.acos(torch.abs(dot))
+        sin_theta_0 = torch.sin(theta_0)
+        theta = theta_0 * t
+        sin_theta = torch.sin(theta)
+        s0 = torch.cos(theta) - dot * sin_theta / sin_theta_0
+        s1 = sin_theta / sin_theta_0
+        result_slerp = (s0 * q1) + (s1 * q2)
+        result = torch.where(mask_slerp, result_slerp, result)
+    return result
+def render_interpolated_video(gs_renderer: GaussianSplatRenderer,
+                              splats: dict,
+                              camtoworlds: torch.Tensor,
+                              intrinsics: torch.Tensor,
+                              hw: tuple,
+                              out_path: Path,
+                              interp_per_pair: int = 20,
+                              loop_reverse: bool = True,
+                              save_mode: str = "split",
+                              frame_times: list = None,
+                              render_depth: bool = False) -> None:
+    """Render an interpolated fly-through video from Gaussian splat predictions.
+    Args:
+        gs_renderer: GaussianSplatRenderer instance (from the model).
+        splats: Dict with keys 'means', 'scales', 'quats', 'opacities', 'sh'/'colors'.
+        camtoworlds: Camera-to-world matrices [B, S, 4, 4].
+        intrinsics: Camera intrinsic matrices [B, S, 3, 3].
+        hw: Tuple of (height, width) for rendering.
+        out_path: Output path (without extension).
+        interp_per_pair: Number of interpolated frames per camera pair.
+        loop_reverse: Append reversed video for smooth looping.
+        save_mode: 'split' (separate rgb/depth) or 'both' (combined).
+        frame_times: Optional list of timestamps for adaptive interpolation.
+        render_depth: Whether to also render depth video.
+    """
+    import moviepy.editor as mpy
+    b, s, _, _ = camtoworlds.shape
+    h, w = hw
+    def build_interpolated_traj(index, base_interp_per_pair: int):
+        exts, ints = [], []
+        tmp_camtoworlds = camtoworlds[:, index]
+        tmp_intrinsics = intrinsics[:, index]
+        use_time_based = frame_times is not None and len(frame_times) == len(index)
+        if use_time_based and len(index) > 1:
+            times = np.array([frame_times[i] for i in index], dtype=np.float32)
+            gaps = np.diff(times)
+            gaps[gaps < 0] = 0.0
+            total_gap = float(gaps.sum())
+            target_total_interp = max(1, (len(index) - 1) * base_interp_per_pair)
+            gap_scale = target_total_interp / total_gap if total_gap > 1e-6 else 0.0
+        else:
+            gaps = None
+            gap_scale = None
+        for i in range(len(index)-1):
+            exts.append(tmp_camtoworlds[:, i:i+1])
+            ints.append(tmp_intrinsics[:, i:i+1])
+            R0, t0 = tmp_camtoworlds[:, i, :3, :3], tmp_camtoworlds[:, i, :3, 3]
+            R1, t1 = tmp_camtoworlds[:, i + 1, :3, :3], tmp_camtoworlds[:, i + 1, :3, 3]
+            q0 = rotation_matrix_to_quaternion(R0)
+            q1 = rotation_matrix_to_quaternion(R1)
+            if use_time_based:
+                gap = float(gaps[i]) if gaps is not None else 0.0
+                num_interp = max(0, int(round(gap * gap_scale)))
+            else:
+                num_interp = base_interp_per_pair
+            for j in range(1, num_interp + 1):
+                alpha = j / (num_interp + 1)
+                t_interp = (1 - alpha) * t0 + alpha * t1
+                q_interp = slerp_quaternions(q0, q1, alpha)
+                R_interp = quaternion_to_rotation_matrix(q_interp)
+                ext = torch.eye(4, device=R_interp.device, dtype=R_interp.dtype)[None].repeat(b, 1, 1)
+                ext[:, :3, :3] = R_interp
+                ext[:, :3, 3] = t_interp
+                K0 = tmp_intrinsics[:, i]
+                K1 = tmp_intrinsics[:, i + 1]
+                K = (1 - alpha) * K0 + alpha * K1
+                exts.append(ext[:, None])
+                ints.append(K[:, None])
+        exts = torch.cat(exts, dim=1)[:1]
+        ints = torch.cat(ints, dim=1)[:1]
+        return exts, ints
+    def build_wobble_traj(nums, delta):
+        if s != 1:
+            raise ValueError("Wobble trajectory requires exactly 1 input view")
+        t = torch.linspace(0, 1, nums, dtype=torch.float32, device=camtoworlds.device)
+        t = (torch.cos(torch.pi * (t + 1)) + 1) / 2
+        tf = torch.eye(4, dtype=torch.float32, device=camtoworlds.device)
+        radius = delta * 0.15
+        tf = tf.broadcast_to((*radius.shape, t.shape[0], 4, 4)).clone()
+        radius = radius[..., None]
+        radius = radius * t
+        tf[..., 0, 3] = torch.sin(2 * torch.pi * t) * radius
+        tf[..., 1, 3] = -torch.cos(2 * torch.pi * t) * radius
+        exts = camtoworlds @ tf
+        ints = intrinsics.repeat(1, exts.shape[1], 1, 1)
+        return exts, ints
+    if s > 1:
+        all_ext, all_int = build_interpolated_traj([i for i in range(s)], interp_per_pair)
+    else:
+        all_ext, all_int = build_wobble_traj(interp_per_pair * 12, splats["means"][0].median(dim=0).values.norm(dim=-1)[None])
+    rendered_rgbs, rendered_depths = [], []
+    chunk = 40
+    # Always prune splats to remove scale outliers
+    try:
+        pruned_splats = gs_renderer.prune_gs(splats, gs_renderer.voxel_size)
+    except (AttributeError, RuntimeError):
+        pruned_splats = splats
+    for st in tqdm(range(0, all_ext.shape[1], chunk)):
+        ed = min(st + chunk, all_ext.shape[1])
+        colors, depths, _ = gs_renderer.rasterizer.rasterize_batches(
+            pruned_splats["means"][:1], pruned_splats["quats"][:1], pruned_splats["scales"][:1],
+            pruned_splats["opacities"][:1],
+            pruned_splats["sh"][:1] if "sh" in pruned_splats else pruned_splats["colors"][:1],
+            all_ext[:, st:ed].to(torch.float32), all_int[:, st:ed].to(torch.float32),
+            width=w, height=h, sh_degree=gs_renderer.sh_degree if "sh" in pruned_splats else None,
+        )
+        rendered_rgbs.append(colors)
+        if render_depth:
+            rendered_depths.append(depths)
+    rgbs = torch.cat(rendered_rgbs, dim=1)[0]  # [N, H, W, 3]
+    if render_depth:
+        depths_all = torch.cat(rendered_depths, dim=1)[0, ..., 0]  # [N, H, W]
+    del rendered_rgbs, rendered_depths
+    def _depth_vis(d: torch.Tensor) -> torch.Tensor:
+        """Simple turbo colormap depth visualization."""
+        import matplotlib.pyplot as plt
+        valid = d > 0
+        if valid.any():
+            near = d[valid].float().quantile(0.01).log()
+        else:
+            near = torch.tensor(0.0, device=d.device)
+        far = d.flatten().float().quantile(0.99).log()
+        x = d.float().clamp(min=1e-9).log()
+        x = 1.0 - (x - near) / (far - near + 1e-9)
+        x_np = x.cpu().numpy()
+        colored = torch.from_numpy(plt.cm.turbo(x_np)[..., :3]).permute(2, 0, 1).float()
+        return colored
+    rgb_frames = []
+    depth_frames = []
+    if render_depth:
+        for rgb, dep in zip(rgbs, depths_all):
+            rgb_frames.append(rgb.permute(2, 0, 1))
+            depth_frames.append(_depth_vis(dep))
+    else:
+        for rgb in rgbs:
+            rgb_frames.append(rgb.permute(2, 0, 1))
+    def _make_video(frames, path):
+        video = torch.stack([f.cpu() for f in frames]).clamp(0, 1)
+        video = video.permute(0, 2, 3, 1)
+        video = (video * 255).to(torch.uint8).numpy()
+        if loop_reverse and video.shape[0] > 1:
+            video = np.concatenate([video, video[::-1][1:-1]], axis=0)
+        clip = mpy.ImageSequenceClip(list(video), fps=30)
+        clip.write_videofile(str(path), logger=None)
+    out_path = Path(out_path)
+    out_path.mkdir(parents=True, exist_ok=True)
+    if save_mode == 'split':
+        _make_video(rgb_frames, out_path / "rendered_rgb.mp4")
+        if render_depth:
+            _make_video(depth_frames, out_path / "rendered_depth.mp4")
+    elif save_mode == 'both' and render_depth:
+        combined = [torch.cat([r, d], dim=1) for r, d in zip(rgb_frames, depth_frames)]
+        _make_video(combined, out_path / "rendered.mp4")
+    print(f"Video saved to {out_path} (mode: {save_mode})")
+    torch.cuda.empty_cache()

hyworldmirror/utils/save_utils.py ADDED Viewed

	@@ -0,0 +1,261 @@

+"""
+Utilities for saving images, depths, normals, point clouds, and Gaussian splat data.
+tencent
+"""
+from pathlib import Path
+import numpy as np
+import torch
+from PIL import Image
+from plyfile import PlyData, PlyElement
+from io import BytesIO
+import json
+import os
+def save_camera_params(extrinsics, intrinsics, target_dir):
+    """
+    Save camera parameters (extrinsics and intrinsics) in JSON format
+    Args:
+        extrinsics: numpy array, shape [N, 4, 4] - extrinsic matrices for N cameras
+        intrinsics: numpy array, shape [N, 3, 3] - intrinsic matrices for N cameras
+        target_dir: str - directory to save the parameters
+    Returns:
+        str: path to the saved file
+    """
+    camera_data = {
+        "num_cameras": int(extrinsics.shape[0]),
+        "extrinsics": [],
+        "intrinsics": []
+    }
+    # Convert each camera's parameters to list format
+    for i in range(extrinsics.shape[0]):
+        camera_data["extrinsics"].append({
+            "camera_id": i,
+            "matrix": extrinsics[i].tolist()  # [4, 4] -> list
+        })
+        camera_data["intrinsics"].append({
+            "camera_id": i,
+            "matrix": intrinsics[i].tolist()  # [3, 3] -> list
+        })
+    # Save as JSON file
+    camera_params_path = os.path.join(target_dir, "camera_params.json")
+    with open(camera_params_path, 'w') as f:
+        json.dump(camera_data, f, indent=2)
+    return camera_params_path
+def save_image_png(path: Path, image_tensor: torch.Tensor) -> None:
+    # image_tensor: [H, W, 3]
+    img = (image_tensor.detach().cpu() * 255.0).to(torch.uint8).numpy()
+    Image.fromarray(img).save(str(path))
+def save_depth_png(path: Path, depth_tensor: torch.Tensor) -> None:
+    # depth_tensor: [H, W]
+    d = depth_tensor.detach()
+    d = d - d.min()
+    d = d / (d.max() + 1e-9)
+    img = (d.clamp(0, 1) * 255.0).to(torch.uint8).cpu().numpy()
+    Image.fromarray(img, mode="L").save(str(path))
+def save_depth_npy(path: Path, depth_tensor: torch.Tensor) -> None:
+    # depth_tensor: [H, W]
+    # Save actual depth values in numpy format
+    d = depth_tensor.detach().cpu().numpy()
+    np.save(str(path), d)
+def save_normal_png(path: Path, normal_hwc: torch.Tensor) -> None:
+    # normal_hwc: [H, W, 3], in [-1, 1]
+    n = (normal_hwc.detach().cpu() + 1.0) * 0.5
+    img = (n.clamp(0, 1) * 255.0).to(torch.uint8).numpy()
+    Image.fromarray(img).save(str(path))
+def _build_vertex_ply_element(pts: np.ndarray, colors: np.ndarray) -> PlyElement:
+    """Build a PLY vertex element from points and colors arrays.
+    Args:
+        pts: Point coordinates, shape [N, 3], dtype float32
+        colors: RGB colors, shape [N, 3], dtype uint8
+    Returns:
+        PlyElement describing the vertices
+    """
+    vertex_dtype = [("x", "f4"), ("y", "f4"), ("z", "f4"),
+                    ("red", "u1"), ("green", "u1"), ("blue", "u1")]
+    vertex_elements = np.empty(len(pts), dtype=vertex_dtype)
+    vertex_elements["x"] = pts[:, 0]
+    vertex_elements["y"] = pts[:, 1]
+    vertex_elements["z"] = pts[:, 2]
+    vertex_elements["red"] = colors[:, 0]
+    vertex_elements["green"] = colors[:, 1]
+    vertex_elements["blue"] = colors[:, 2]
+    return PlyElement.describe(vertex_elements, "vertex")
+def save_scene_ply(path: Path,
+                   points_xyz: torch.Tensor,
+                   point_colors: torch.Tensor,
+                   valid_mask: torch.Tensor = None) -> None:
+    """Save point cloud to PLY format"""
+    pts = points_xyz.detach().cpu().to(torch.float32).numpy().reshape(-1, 3)
+    colors = point_colors.detach().cpu().to(torch.uint8).numpy().reshape(-1, 3)
+    # Filter out invalid points (NaN, Inf)
+    if valid_mask is None:
+        valid_mask = np.isfinite(pts).all(axis=1)
+    else:
+        valid_mask = valid_mask.detach().cpu().numpy().reshape(-1)
+    pts = pts[valid_mask]
+    colors = colors[valid_mask]
+    # Handle empty point cloud
+    if len(pts) == 0:
+        pts = np.array([[0, 0, 0]], dtype=np.float32)
+        colors = np.array([[255, 255, 255]], dtype=np.uint8)
+    PlyData([_build_vertex_ply_element(pts, colors)]).write(str(path))
+def save_points_ply(path: Path, pts_np: np.ndarray, cols_np: np.ndarray) -> None:
+    """Save point cloud to PLY format from numpy arrays"""
+    PlyData([_build_vertex_ply_element(pts_np, cols_np)]).write(str(path))
+def _build_gs_ply_data(means, scales, rotations, rgbs, opacities, quantile_threshold):
+    """Build Gaussian splat PLY data with scale-based filtering.
+    Args:
+        means: Gaussian centers [N, 3]
+        scales: Gaussian scales [N, 3]
+        rotations: Gaussian rotations as quaternions [N, 4]
+        rgbs: RGB colors [N, 3]
+        opacities: Opacity values [N]
+        quantile_threshold: Percentile threshold for scale filtering (e.g. 0.98 or 0.90)
+    Returns:
+        PlyData object ready to be written or returned
+    """
+    scale_threshold = torch.quantile(scales.max(dim=-1)[0], quantile_threshold, dim=0)
+    filter_mask = scales.max(dim=-1)[0] <= scale_threshold
+    means = means[filter_mask].reshape(-1, 3)
+    scales = scales[filter_mask].reshape(-1, 3)
+    rotations = rotations[filter_mask].reshape(-1, 4)
+    rgbs = rgbs[filter_mask].reshape(-1, 3)
+    opacities = opacities[filter_mask].reshape(-1)
+    attributes = ["x", "y", "z", "nx", "ny", "nz"]
+    for i in range(3):
+        attributes.append(f"f_dc_{i}")
+    attributes.append("opacity")
+    for i in range(3):
+        attributes.append(f"scale_{i}")
+    for i in range(4):
+        attributes.append(f"rot_{i}")
+    dtype_full = [(attribute, "f4") for attribute in attributes]
+    elements = np.empty(means.shape[0], dtype=dtype_full)
+    attributes_data = (
+        means.float().detach().cpu().numpy(),
+        torch.zeros_like(means).float().detach().cpu().numpy(),
+        rgbs.detach().cpu().contiguous().numpy(),
+        opacities[..., None].detach().cpu().numpy(),
+        scales.log().detach().cpu().numpy(),
+        rotations.detach().cpu().numpy(),
+    )
+    attributes_data = np.concatenate(attributes_data, axis=1)
+    elements[:] = list(map(tuple, attributes_data))
+    return PlyData([PlyElement.describe(elements, "vertex")])
+def save_gs_ply(path: Path,
+                means: torch.Tensor,
+                scales: torch.Tensor,
+                rotations: torch.Tensor,
+                rgbs: torch.Tensor,
+                opacities: torch.Tensor) -> None:
+    """
+    Export Gaussian splat data to PLY format.
+    Args:
+        path: Output PLY file path
+        means: Gaussian centers [N, 3]
+        scales: Gaussian scales [N, 3]
+        rotations: Gaussian rotations as quaternions [N, 4]
+        rgbs: RGB colors [N, 3]
+        opacities: Opacity values [N]
+    """
+    # Ensure float32 for quantile and numpy conversion (bf16 not supported)
+    means, scales, rotations, rgbs, opacities = (
+        t.float() for t in (means, scales, rotations, rgbs, opacities)
+    )
+    plydata = _build_gs_ply_data(means, scales, rotations, rgbs, opacities, quantile_threshold=0.98)
+    plydata.write(str(path))
+def convert_gs_to_ply(means, scales, rotations, rgbs, opacities):
+    """
+    Export Gaussian splat data to PLY format.
+    Args:
+        means: Gaussian centers [N, 3]
+        scales: Gaussian scales [N, 3]
+        rotations: Gaussian rotations as quaternions [N, 4]
+        rgbs: RGB colors [N, 3]
+        opacities: Opacity values [N]
+    """
+    return _build_gs_ply_data(means, scales, rotations, rgbs, opacities, quantile_threshold=0.90)
def process_ply_to_splat(plydata, output_path):
    """
    Serialize the ``"vertex"`` element of *plydata* into a flat binary file.

    Vertices are emitted in ascending order of
    ``-exp(scale_0 + scale_1 + scale_2) / (1 + exp(-opacity))``, i.e. the
    vertex with the largest value of that product comes first.  Each vertex
    contributes 32 bytes:

    * 3 x float32  position (x, y, z)
    * 3 x float32  exponentiated scales
    * 4 x uint8    color: ``0.5 + SH_C0 * f_dc_k`` for k = 0..2 and the
                   logistic of ``opacity``, each scaled by 255 and clipped
    * 4 x uint8    rotation, normalized to unit length then mapped with
                   ``* 128 + 128`` and clipped to [0, 255]

    Args:
        plydata: Mapping-like object whose ``"vertex"`` entry exposes the
            fields read above by name and supports integer indexing.
        output_path: Destination file path (opened in binary write mode).
    Returns:
        ``output_path`` unchanged.
    """
    SH_C0 = 0.28209479177387814
    vertices = plydata["vertex"]

    # Negated so that an ascending argsort yields the largest product first.
    ordering_key = -np.exp(
        vertices["scale_0"] + vertices["scale_1"] + vertices["scale_2"]
    ) / (1 + np.exp(-vertices["opacity"]))
    emit_order = np.argsort(ordering_key)

    stream = BytesIO()
    for vertex_id in emit_order:
        gaussian = plydata["vertex"][vertex_id]

        xyz = np.array(
            [gaussian["x"], gaussian["y"], gaussian["z"]], dtype=np.float32
        )
        linear_scales = np.exp(
            np.array(
                [gaussian["scale_0"], gaussian["scale_1"], gaussian["scale_2"]],
                dtype=np.float32,
            )
        )
        quaternion = np.array(
            [
                gaussian["rot_0"],
                gaussian["rot_1"],
                gaussian["rot_2"],
                gaussian["rot_3"],
            ],
            dtype=np.float32,
        )
        rgba = np.array(
            [
                0.5 + SH_C0 * gaussian["f_dc_0"],
                0.5 + SH_C0 * gaussian["f_dc_1"],
                0.5 + SH_C0 * gaussian["f_dc_2"],
                1 / (1 + np.exp(-gaussian["opacity"])),
            ]
        )

        rgba_bytes = (rgba * 255).clip(0, 255).astype(np.uint8)
        unit_quaternion = quaternion / np.linalg.norm(quaternion)
        quaternion_bytes = (unit_quaternion * 128 + 128).clip(0, 255).astype(np.uint8)

        stream.write(xyz.tobytes())
        stream.write(linear_scales.tobytes())
        stream.write(rgba_bytes.tobytes())
        stream.write(quaternion_bytes.tobytes())

    with open(output_path, "wb") as f:
        f.write(stream.getvalue())
    return output_path

hyworldmirror/utils/video_utils.py ADDED Viewed

	@@ -0,0 +1,557 @@

+import os
+import json
+import csv
+import time
+from concurrent.futures import ThreadPoolExecutor
+import cv2
+import numpy as np
+from PIL import Image
+import subprocess
def video_to_image_frames(input_video_path, save_directory=None, fps=1):
    """
    Extract frames from a video at a fixed rate and save them as JPEG files.

    Three strategies are tried in order, each falling back to the next when
    it raises or yields no frames:

    1. ``.gif``  -> PIL, keeping every ``int(gif_fps / fps)``-th frame.
    2. ``.webm`` -> the ``ffmpeg`` executable with an ``fps=`` filter.
    3. anything  -> OpenCV, keeping every ``int(source_fps / fps)``-th frame.

    On success a ``frame_metadata.json`` file is written next to the frames
    through ``_save_old_metadata``.

    Args:
        input_video_path: Path to the input video file.
        save_directory: Directory that receives the extracted frames. It is
            created if missing. When ``None`` nothing can be written, so an
            error is printed and an empty list is returned.
        fps: Number of frames to extract per second (default: 1).
    Returns:
        List of file paths to the extracted frames (possibly empty).
    """
    if save_directory is None:
        # Previously every os.path.join(None, ...) failed one frame at a time
        # after the whole video had been decoded; fail fast with the same
        # empty result instead.
        print("Error extracting frames: save_directory must be provided")
        return []
    os.makedirs(save_directory, exist_ok=True)

    lowered_path = input_video_path.lower()
    is_webm = lowered_path.endswith('.webm')

    # --- Strategy 1: GIF via PIL -------------------------------------------
    if lowered_path.endswith('.gif'):
        # Branch-local lists so a half-finished attempt cannot leak entries
        # into the OpenCV fallback below.
        gif_paths, gif_indices = [], []
        try:
            print(f"Processing GIF file using PIL: {input_video_path}")
            with Image.open(input_video_path) as gif_img:
                frame_duration_ms = gif_img.info.get('duration', 100)
                gif_frame_rate = 1000.0 / frame_duration_ms if frame_duration_ms > 0 else 10.0
                print(f"GIF properties: {gif_img.n_frames} frames, {gif_frame_rate:.2f} FPS, {frame_duration_ms}ms per frame")
                sampling_interval = max(1, int(gif_frame_rate / fps)) if fps < gif_frame_rate else 1
                for current_frame_index in range(gif_img.n_frames):
                    gif_img.seek(current_frame_index)
                    if current_frame_index % sampling_interval != 0:
                        continue
                    frame_output_path = os.path.join(save_directory, f"frame_{len(gif_paths):06d}.jpg")
                    # Save the converted PIL image directly; the former
                    # ndarray round trip produced the same pixels.
                    gif_img.convert('RGB').save(frame_output_path, 'JPEG', quality=95)
                    gif_paths.append(frame_output_path)
                    gif_indices.append(current_frame_index)
            if gif_paths:
                print(f"Successfully extracted {len(gif_paths)} frames from GIF using PIL")
                _save_old_metadata(save_directory, gif_indices, gif_frame_rate)
                return gif_paths
        except Exception as error:
            print(f"PIL GIF extraction error: {str(error)}, falling back to OpenCV")

    # --- Strategy 2: WebM via FFmpeg ---------------------------------------
    if is_webm:
        try:
            print(f"Processing WebM file using FFmpeg: {input_video_path}")
            probe = cv2.VideoCapture(input_video_path)
            try:
                webm_source_fps = probe.get(cv2.CAP_PROP_FPS) or fps
            finally:
                probe.release()
            output_frame_pattern = os.path.join(save_directory, "frame_%04d.jpg")
            ffmpeg_command = [
                "ffmpeg",
                "-i", input_video_path,
                "-vf", f"fps={fps}",
                "-q:v", "2",
                output_frame_pattern
            ]
            # stdin is detached so ffmpeg can never block waiting for
            # interactive input. The exit status is deliberately not
            # checked: truncated files may still yield usable frames.
            subprocess.run(
                ffmpeg_command,
                stdin=subprocess.DEVNULL,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            webm_paths, webm_indices = [], []
            # NOTE(review): this picks up every frame_*.jpg in the directory,
            # including files left by an earlier run.
            for filename in sorted(os.listdir(save_directory)):
                if not (filename.startswith("frame_") and filename.endswith(".jpg")):
                    continue
                webm_paths.append(os.path.join(save_directory, filename))
                try:
                    frame_num = int(filename.split("_")[1].split(".")[0])
                    # The %04d pattern numbers files from 1 by default, so
                    # file k is the (k - 1)-th sampled frame.
                    webm_indices.append(int(max(0, frame_num - 1) * webm_source_fps / fps))
                except (ValueError, IndexError):
                    webm_indices.append(len(webm_indices))
            if webm_paths:
                print(f"Successfully extracted {len(webm_paths)} frames from WebM using FFmpeg")
                _save_old_metadata(save_directory, webm_indices, webm_source_fps)
                return webm_paths
            print("FFmpeg extraction failed, falling back to OpenCV")
        except Exception as error:
            print(f"FFmpeg extraction error: {str(error)}, falling back to OpenCV")

    # --- Strategy 3: OpenCV (default and fallback) -------------------------
    extracted_frame_paths = []
    frame_indices = []
    video_capture = None
    try:
        video_capture = cv2.VideoCapture(input_video_path)
        if is_webm:
            video_capture.set(cv2.CAP_PROP_FOURCC, cv2.VideoWriter_fourcc(*'VP80'))
        source_fps = video_capture.get(cv2.CAP_PROP_FPS) or fps
        extraction_interval = max(1, int(source_fps / fps))
        processed_frame_count = 0
        cv2.setLogLevel(0)
        while True:
            read_success, current_frame = video_capture.read()
            if not read_success:
                break
            if processed_frame_count % extraction_interval == 0:
                try:
                    if current_frame is not None and current_frame.size > 0:
                        frame_output_path = os.path.join(save_directory, f"frame_{len(extracted_frame_paths):06d}.jpg")
                        # imwrite expects BGR, which is what read() returns;
                        # only record the frame if the file was written.
                        if cv2.imwrite(frame_output_path, current_frame):
                            extracted_frame_paths.append(frame_output_path)
                            frame_indices.append(processed_frame_count)
                        else:
                            print(f"Warning: Failed to process frame {processed_frame_count}: could not write {frame_output_path}")
                except Exception as error:
                    print(f"Warning: Failed to process frame {processed_frame_count}: {str(error)}")
            processed_frame_count += 1
            # Hard cap on decoded frames; longer videos are truncated here.
            if processed_frame_count > 1000:
                break
        print(f"Extracted {len(extracted_frame_paths)} frames from video using OpenCV")
        if extracted_frame_paths:
            _save_old_metadata(save_directory, frame_indices, source_fps)
    except Exception as error:
        print(f"Error extracting frames: {str(error)}")
    finally:
        # Release the decoder even when extraction raised part-way through.
        if video_capture is not None:
            video_capture.release()
    return extracted_frame_paths
+def _save_old_metadata(save_directory, frame_indices, fps):
+    """Save metadata for old sampling strategy."""
+    if not frame_indices or not fps:
+        return
+    try:
+        meta = {
+            "frame_indices": frame_indices,
+            "frame_times": [idx / fps for idx in frame_indices],
+            "fps": fps,
+            "algorithm": "uniform_fps_based"
+        }
+        metadata_path = os.path.join(save_directory, "frame_metadata.json")
+        with open(metadata_path, "w") as f:
+            json.dump(meta, f, indent=2)
+    except Exception as e:
+        print(f"Warning: Failed to save metadata: {e}")
+def _resize_for_flow(frame, long_edge=320):
+    height, width = frame.shape[:2]
+    long_side = max(height, width)
+    if long_side <= long_edge:
+        return frame
+    scale = long_edge / float(long_side)
+    new_w = max(1, int(width * scale))
+    new_h = max(1, int(height * scale))
+    return cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_AREA)
+def _resize_for_clarity(frame, long_edge=480):
+    """Resize frame for clarity calculation (480p for better accuracy)."""
+    height, width = frame.shape[:2]
+    long_side = max(height, width)
+    if long_side <= long_edge:
+        return frame
+    scale = long_edge / float(long_side)
+    new_w = max(1, int(width * scale))
+    new_h = max(1, int(height * scale))
+    return cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_AREA)
def _create_dis_flow():
    """
    Create a DIS optical-flow estimator with the ULTRAFAST preset.

    Tries ``cv2.optflow.createOptFlow_DIS`` first, then
    ``cv2.DISOpticalFlow_create``.  Returns ``None`` when the installed
    OpenCV build exposes neither factory.
    """
    has_contrib_factory = hasattr(cv2, "optflow") and hasattr(cv2.optflow, "createOptFlow_DIS")
    if has_contrib_factory:
        preset = cv2.optflow.DISOPTICAL_FLOW_PRESET_ULTRAFAST
        return cv2.optflow.createOptFlow_DIS(preset)
    if not hasattr(cv2, "DISOpticalFlow_create"):
        return None
    return cv2.DISOpticalFlow_create(cv2.DISOPTICAL_FLOW_PRESET_ULTRAFAST)
def _calculate_histogram(image):
    """
    Compute a flattened, normalized HSV histogram of a BGR image.

    The image is converted with ``cv2.COLOR_BGR2HSV`` and binned into
    8 x 4 x 4 (H, S, V) cells, giving a 128-element vector that is
    normalized in place with ``cv2.normalize``'s default settings.  It is
    used as the signature for global frame deduplication.
    """
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    channel_ids = [0, 1, 2]
    bin_counts = [8, 4, 4]
    value_ranges = [0, 180, 0, 256, 0, 256]
    histogram = cv2.calcHist([hsv_image], channel_ids, None, bin_counts, value_ranges)
    cv2.normalize(histogram, histogram)
    return histogram.flatten()
def _calculate_hist_similarity(hist1, hist2):
    """Return the ``cv2.HISTCMP_CORREL`` comparison score of two histograms."""
    comparison_method = cv2.HISTCMP_CORREL
    return cv2.compareHist(hist1, hist2, comparison_method)
+def _advance_cap_to_frame(cap, current_pos, target_idx):
+    """Advance cap so that next read() returns frame target_idx. Returns target_idx."""
+    dist = target_idx - current_pos
+    if dist <= 0:
+        cap.set(cv2.CAP_PROP_POS_FRAMES, target_idx)
+        return target_idx
+    if dist < 50:
+        for _ in range(dist):
+            cap.grab()
+        return target_idx
+    cap.set(cv2.CAP_PROP_POS_FRAMES, target_idx)
+    return target_idx
+def _merge_search_windows(candidate_indices, window_size=3):
+    """
+    Merge adjacent search windows to reduce disk seeks.
+    Returns list of (start_idx, end_idx, target_indices) tuples.
+    """
+    if not candidate_indices:
+        return []
+    merged = []
+    sorted_indices = sorted(candidate_indices)
+    i = 0
+    while i < len(sorted_indices):
+        start_idx = max(0, sorted_indices[i] - window_size)
+        end_idx = sorted_indices[i] + window_size
+        targets_in_window = [sorted_indices[i]]
+        # Extend window to include adjacent candidates
+        j = i + 1
+        while j < len(sorted_indices):
+            next_start = max(0, sorted_indices[j] - window_size)
+            if next_start <= end_idx:
+                end_idx = sorted_indices[j] + window_size
+                targets_in_window.append(sorted_indices[j])
+                j += 1
+            else:
+                break
+        merged.append((start_idx, end_idx, targets_in_window))
+        i = j
+    return merged
def _sparse_motion_analysis(cap, fps, total_frames):
    """
    Phase 1: sample the video sparsely and measure motion between samples.

    Roughly one frame every ``int(fps * 0.5)`` frames is decoded, reduced
    with ``_resize_for_flow`` and converted to grayscale.  Motion is the mean
    optical-flow magnitude against the previous sample, using the DIS
    estimator when available and Farneback otherwise; the first sample has
    motion 0.0.  ``total_frames`` is accepted but not used here.

    Returns:
        List of dicts with keys ``"idx"`` (frame index), ``"motion"`` and
        ``"hist"`` (signature from ``_calculate_histogram``).
    """
    frames_between_samples = max(1, int(fps * 0.5)) - 1
    flow_engine = _create_dis_flow()
    samples = []
    previous_gray = None
    frame_pos = 0
    while True:
        # After the first sample, skip ahead so samples are evenly spaced.
        if frame_pos > 0 and frames_between_samples > 0:
            frame_pos = _advance_cap_to_frame(cap, frame_pos, frame_pos + frames_between_samples)
        ok, frame = cap.read()
        if not ok:
            break
        reduced = _resize_for_flow(frame, long_edge=320)
        gray = cv2.cvtColor(reduced, cv2.COLOR_BGR2GRAY)
        if previous_gray is None:
            magnitude = 0.0
        else:
            if flow_engine is not None:
                flow = flow_engine.calc(previous_gray, gray, None)
            else:
                flow = cv2.calcOpticalFlowFarneback(previous_gray, gray, None, 0.5, 3, 15, 2, 5, 1.2, 0)
            magnitude = float(np.mean(np.sqrt(flow[..., 0]**2 + flow[..., 1]**2)))
        samples.append({
            "idx": frame_pos,
            "motion": magnitude,
            "hist": _calculate_histogram(reduced),
        })
        previous_gray = gray
        frame_pos += 1
    return samples
+def _adaptive_frame_selection(sparse_samples, fps, max_frames):
+    """Phase 2: Adaptive threshold allocation with deduplication."""
+    motions = [s["motion"] for s in sparse_samples[1:]]
+    if not motions:
+        return [sparse_samples[0]["idx"]]
+    # Calculate adaptive threshold
+    static_floor = 1.0
+    total_motion = sum(motions)
+    estimated_step = total_motion / max_frames if max_frames > 0 else total_motion
+    step_threshold = max(static_floor * 5.0, estimated_step)
+    # Select frames based on accumulated motion
+    candidate_indices = [sparse_samples[0]["idx"]]
+    selected_hists = [sparse_samples[0]["hist"]]
+    current_accum = 0.0
+    last_selected_idx = sparse_samples[0]["idx"]
+    for i in range(1, len(sparse_samples)):
+        s = sparse_samples[i]
+        effective_motion = s["motion"] if s["motion"] >= static_floor else 0.0
+        current_accum += effective_motion
+        time_gap = s["idx"] - last_selected_idx
+        should_select = (current_accum >= step_threshold) or (time_gap > (4.0 * fps))
+        if should_select:
+            is_duplicate = any(_calculate_hist_similarity(s["hist"], h) > 0.999 for h in selected_hists)
+            if not is_duplicate:
+                candidate_indices.append(s["idx"])
+                selected_hists.append(s["hist"])
+                current_accum = 0.0
+                last_selected_idx = s["idx"]
+    # Always check last frame
+    if sparse_samples[-1]["idx"] != candidate_indices[-1]:
+        last_hist = sparse_samples[-1]["hist"]
+        if not any(_calculate_hist_similarity(last_hist, h) > 0.999 for h in selected_hists):
+            candidate_indices.append(sparse_samples[-1]["idx"])
+    return sorted(list(set(candidate_indices)))
+def _enforce_frame_constraints(candidate_indices, sparse_samples, min_frames, max_frames):
+    """Enforce min/max frame constraints."""
+    if len(candidate_indices) < min_frames:
+        needed = min_frames - len(candidate_indices)
+        all_indices = [s["idx"] for s in sparse_samples]
+        extras = np.linspace(0, len(all_indices)-1, needed+2)[1:-1]
+        candidate_indices.extend([all_indices[int(e)] for e in extras])
+        candidate_indices = sorted(list(set(candidate_indices)))
+    if len(candidate_indices) > max_frames:
+        indices_to_keep = np.linspace(0, len(candidate_indices)-1, max_frames)
+        candidate_indices = [candidate_indices[int(round(i))] for i in indices_to_keep]
+    return candidate_indices
def _read_window_frames(cap, merged_windows, total_frames):
    """
    Decode every frame covered by the merged search windows.

    Args:
        cap: Freshly opened capture object (positioned at frame 0).
        merged_windows: ``(start_idx, end_idx, targets)`` tuples as produced
            by ``_merge_search_windows``; ``end_idx`` is inclusive.
        total_frames: Frame count reported for the video.  Windows are
            clipped to it; a non-positive value is treated as "unknown" and
            reading simply stops at the first failed ``read()``.
    Returns:
        List of ``(window_idx, frame_idx, frame)`` tuples in read order.
    """
    all_frames = []
    # A new capture's next read() returns frame 0. Starting from -1 made
    # _advance_cap_to_frame grab (and drop) frame 0, so every frame of the
    # first window was labelled one index too low.
    current_pos = 0
    frame_count_known = total_frames > 0
    for window_idx, (window_start, window_end, _) in enumerate(merged_windows):
        current_pos = _advance_cap_to_frame(cap, current_pos, window_start)
        stop = window_end + 1
        if frame_count_known:
            stop = min(stop, total_frames)
        for idx in range(window_start, stop):
            ret, frame = cap.read()
            if not ret:
                break
            all_frames.append((window_idx, idx, frame))
            current_pos = idx + 1
    return all_frames
def _compute_clarity_parallel(all_frames):
    """
    Score the sharpness of every frame using a thread pool.

    Each ``(window_idx, frame_idx, frame)`` item is reduced with
    ``_resize_for_clarity``, converted to grayscale, and scored as the
    variance of its Laplacian.  Up to 8 worker threads are used.

    Returns:
        List of ``(window_idx, frame_idx, frame, clarity)`` tuples in the
        same order as the input.
    """
    def _score(entry):
        win_id, frame_no, image = entry
        reduced = _resize_for_clarity(image, long_edge=480)
        grayscale = cv2.cvtColor(reduced, cv2.COLOR_BGR2GRAY)
        sharpness = cv2.Laplacian(grayscale, cv2.CV_64F).var()
        return (win_id, frame_no, image, sharpness)

    worker_count = min(8, len(all_frames) or 1)
    with ThreadPoolExecutor(max_workers=worker_count) as pool:
        scored = list(pool.map(_score, all_frames))
    return scored
+def _select_best_frames(clarity_results, merged_windows, candidate_indices, search_window_size=3):
+    """Select best frame for each candidate based on clarity."""
+    # Group by window
+    window_frames = {}
+    for window_idx, frame_idx, frame, clarity in clarity_results:
+        if window_idx not in window_frames:
+            window_frames[window_idx] = []
+        window_frames[window_idx].append((frame_idx, frame, clarity))
+    # Select best frame for each target
+    target_to_best = {}
+    for window_idx, (_, _, targets) in enumerate(merged_windows):
+        frames = window_frames.get(window_idx, [])
+        for target_idx in targets:
+            candidates = [(idx, f, c) for idx, f, c in frames
+                         if abs(idx - target_idx) <= search_window_size]
+            if candidates:
+                best_idx, best_frame, _ = max(candidates, key=lambda x: x[2])
+                target_to_best[target_idx] = (best_idx, best_frame)
+            elif frames:
+                closest = min(frames, key=lambda x: abs(x[0] - target_idx))
+                target_to_best[target_idx] = (closest[0], closest[1])
+    return target_to_best
+def _save_frames_parallel(target_to_best, candidate_indices, save_directory):
+    """Parallel frame saving."""
+    path_frame_list = []
+    final_indices = []
+    for target_idx in sorted(candidate_indices):
+        if target_idx in target_to_best:
+            best_idx, best_frame = target_to_best[target_idx]
+            final_indices.append(best_idx)
+            path_frame_list.append((
+                os.path.join(save_directory, f"frame_{len(path_frame_list):06d}.jpg"),
+                best_frame
+            ))
+    def _write(p_f):
+        cv2.imwrite(p_f[0], p_f[1])
+        return p_f[0]
+    with ThreadPoolExecutor(max_workers=min(8, len(path_frame_list) or 1)) as ex:
+        paths = list(ex.map(_write, path_frame_list))
+    return final_indices, paths
def video_to_image_frames_new(
    input_video_path,
    save_directory=None,
    min_frames=1,
    max_frames=64,
    fallback_fps=1,
):
    """
    Motion-aware frame extraction with local clarity refinement.

    Strategy:
    1. Sparse sampling (~0.5s) with DIS optical flow
    2. Adaptive threshold allocation based on motion
    3. Local clarity refinement (±3 frames) to avoid blur

    Besides the JPEG frames, ``frame_metadata.json`` and
    ``frame_metrics.csv`` are written to ``save_directory`` on a best-effort
    basis (a failure there only prints a warning).

    Args:
        input_video_path: Path to the input video file.
        save_directory: Output directory; created if it does not exist.
        min_frames: Minimum number of frames wanted (clipped to
            ``[1, max_frames]``).
        max_frames: Maximum number of frames (clipped to ``[1, 64]``).
        fallback_fps: Frame rate assumed when the container reports none.
    Returns:
        List of paths of the written frames; empty when the video cannot be
        opened or yields no samples.
    Raises:
        ValueError: If ``save_directory`` is ``None``.
    """
    if save_directory is None:
        raise ValueError("save_directory must be provided")
    # cv2.imwrite does not create missing directories.
    os.makedirs(save_directory, exist_ok=True)
    max_frames = int(np.clip(max_frames, 1, 64))
    min_frames = int(np.clip(min_frames, 1, max_frames))

    cap = cv2.VideoCapture(input_video_path)
    if not cap.isOpened():
        cap.release()
        print(f"Error: Failed to open video {input_video_path}")
        return []
    try:
        fps = cap.get(cv2.CAP_PROP_FPS) or fallback_fps or 30.0
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        t_start = time.perf_counter()
        # Phase 1: Sparse motion analysis
        sparse_samples = _sparse_motion_analysis(cap, fps, total_frames)
    finally:
        # Release the decoder even if the analysis raised.
        cap.release()
    t_phase1 = time.perf_counter()
    print(f"[Timing] Phase 1 (Sparse Flow): {t_phase1 - t_start:.3f}s, Samples: {len(sparse_samples)}")
    if not sparse_samples:
        return []

    # Phase 2: Adaptive frame selection
    candidate_indices = _adaptive_frame_selection(sparse_samples, fps, max_frames)
    candidate_indices = _enforce_frame_constraints(candidate_indices, sparse_samples, min_frames, max_frames)

    # Phase 3: Local clarity refinement (second pass over the file)
    cap = cv2.VideoCapture(input_video_path)
    if not cap.isOpened():
        cap.release()
        return []
    t_phase3_start = time.perf_counter()
    search_window_size = 3
    merged_windows = _merge_search_windows(candidate_indices, window_size=search_window_size)
    # Read frames
    t_read_start = time.perf_counter()
    try:
        all_frames = _read_window_frames(cap, merged_windows, total_frames)
    finally:
        cap.release()
    t_read_end = time.perf_counter()
    # Parallel clarity calculation
    t_clarity_start = time.perf_counter()
    clarity_results = _compute_clarity_parallel(all_frames)
    t_clarity_end = time.perf_counter()
    # Select best frames
    target_to_best = _select_best_frames(clarity_results, merged_windows, candidate_indices, search_window_size)
    # Parallel save
    t_save_start = time.perf_counter()
    final_indices, extracted_paths = _save_frames_parallel(target_to_best, candidate_indices, save_directory)
    t_save_end = time.perf_counter()
    t_phase3_end = time.perf_counter()
    print(f"[Timing] Phase 3 (Clarity Refinement + Save): {t_phase3_end - t_phase3_start:.3f}s")
    print(f"  - Read frames: {t_read_end - t_read_start:.3f}s")
    print(f"  - Parallel clarity: {t_clarity_end - t_clarity_start:.3f}s")
    print(f"  - Parallel save: {t_save_end - t_save_start:.3f}s, Saved: {len(extracted_paths)}")

    # Save metadata (best effort: frames are already on disk)
    try:
        meta = {
            "frame_indices": final_indices,
            "frame_times": [i / fps for i in final_indices],
            "fps": fps,
            "algorithm": "sparse_dis_clarity_refined"
        }
        with open(os.path.join(save_directory, "frame_metadata.json"), "w") as f:
            json.dump(meta, f, indent=2)
        selected_lookup = set(final_indices)  # O(1) membership per CSV row
        with open(os.path.join(save_directory, "frame_metrics.csv"), "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["frame_index", "time_sec", "motion", "selected"])
            for s in sparse_samples:
                writer.writerow([s["idx"], s["idx"] / fps, s["motion"],
                                 1 if s["idx"] in selected_lookup else 0])
    except Exception as e:
        # Report instead of silently swallowing (and no longer traps
        # KeyboardInterrupt/SystemExit as the bare except did).
        print(f"Warning: Failed to save metadata: {e}")
    print(f"Extracted {len(extracted_paths)} frames using DIS flow + local clarity refinement.")
    return extracted_paths

hyworldmirror/utils/visual_util.py ADDED Viewed

	@@ -0,0 +1,617 @@

+""" Visual utilities for HuggingFace integration.
+References: https://github.com/facebookresearch/vggt
+"""
+import copy
+import os
+from typing import Tuple
+import cv2
+import matplotlib
+import numpy as np
+import requests
+import trimesh
+from scipy.spatial.transform import Rotation
def segment_sky(image_or_path, onnx_session):
    """
    Produce a binary sky mask for an image with an ONNX segmentation model.

    Model credit: https://github.com/xiongzhu666/Sky-Segmentation-and-Post-processing

    Args:
        image_or_path: Path to input image (str / PathLike) or BGR numpy
            array (H, W, 3)
        onnx_session: ONNX runtime session with loaded model
    Returns:
        np.ndarray: Mask at the input resolution holding 255 where the
        resized model score is below 32 and 0 elsewhere.  Per the original
        documentation 255 marks non-sky regions.
    """
    is_path = isinstance(image_or_path, (str, os.PathLike))
    image = cv2.imread(str(image_or_path)) if is_path else image_or_path

    # The model runs at a fixed 320x320 input; scores come back as uint8.
    score_map = run_skyseg(onnx_session, [320, 320], image)
    image_height, image_width = image.shape[0], image.shape[1]
    full_res_scores = cv2.resize(score_map, (image_width, image_height))

    # NOTE(review): the previous comment said the model scores sky LOW, yet
    # low scores are what get marked 255 ("non-sky") here. Presumably the
    # model scores sky high and the old comment was wrong -- confirm.
    low_score_pixels = full_res_scores < 32
    output_mask = np.zeros_like(full_res_scores)
    output_mask[low_score_pixels] = 255
    return output_mask
def run_skyseg(onnx_session, input_size, image):
    """
    Runs sky segmentation inference using ONNX model.

    Args:
        onnx_session: ONNX runtime session
        input_size: Target size for model input (width, height)
        image: Input image in BGR format
    Returns:
        np.ndarray: uint8 segmentation map, min-max rescaled to [0, 255].
        All zeros when the model output is constant.
    """
    target_width, target_height = input_size[0], input_size[1]

    # Pre process: resize, BGR->RGB, ImageNet-style standardization, CHW.
    # cv2.resize returns a new array, so no defensive copy of the input is
    # needed.
    resized = cv2.resize(image, dsize=(target_width, target_height))
    x = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
    x = np.array(x, dtype=np.float32)
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    x = (x / 255 - mean) / std
    x = x.transpose(2, 0, 1)
    # After the transpose the array is (3, height, width). The batch shape
    # must therefore be (N, 3, height, width); the previous
    # (N, 3, width, height) only worked for square inputs.
    x = x.reshape(-1, 3, target_height, target_width).astype("float32")

    # Inference
    input_name = onnx_session.get_inputs()[0].name
    output_name = onnx_session.get_outputs()[0].name
    onnx_result = onnx_session.run([output_name], {input_name: x})

    # Post process: min-max normalize to 0..255
    scores = np.array(onnx_result).squeeze()
    min_value = np.min(scores)
    max_value = np.max(scores)
    value_span = max_value - min_value
    if value_span == 0:
        # Constant output: avoid 0/0 (NaN cast to uint8 is undefined).
        return np.zeros(scores.shape, dtype="uint8")
    scores = (scores - min_value) / value_span
    scores *= 255
    return scores.astype("uint8")
def download_file_from_url(url, filename):
    """
    Download *url* to *filename*, following HTTP redirects.

    The body is streamed to disk in 8 KiB chunks.  Any ``requests`` error
    (connection failure, timeout, 4xx/5xx status) is reported with a printed
    message instead of being raised; nothing is returned.

    Args:
        url: Source URL (e.g. a file in a Hugging Face model repo).
        filename: Destination path, opened in binary write mode.
    """
    try:
        # Let requests follow redirects itself: the former manual handling
        # accepted only a 302 and rejected a direct 200 as well as
        # 301/307/308 redirects. The timeout (connect, read) keeps a stalled
        # server from hanging the caller forever.
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
            with open(filename, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        print(f"Downloaded {filename} successfully.")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")
def create_image_mesh(
    *image_data: np.ndarray,
    mask: np.ndarray = None,
    triangulate: bool = False,
    return_vertex_indices: bool = False,
) -> Tuple[np.ndarray, ...]:
    """
    Create a mesh from image data using pixel coordinates as vertices and grid connections as faces.

    Args:
        *image_data (np.ndarray): Image arrays with shape (height, width, [channels])
        mask (np.ndarray, optional): Mask with shape (height, width); any
            dtype is accepted and interpreted by truthiness. Defaults to None.
        triangulate (bool): Convert quad faces to triangular faces. Defaults to False.
        return_vertex_indices (bool): Include vertex indices in output. Defaults to False.
    Returns:
        faces (np.ndarray): Face connectivity array. Shape (N, 4) for quads or (N, 3) for triangles
        *vertex_data (np.ndarray): Vertex attributes corresponding to input image_data
        vertex_indices (np.ndarray, optional): Original (flattened, row-major)
            vertex indices if return_vertex_indices=True
    Raises:
        AssertionError: If neither an image nor a mask is given, or if the
            images do not all share the grid's height and width.
    """
    # Validate inputs (asserts kept so callers see the same exception type)
    assert (len(image_data) > 0) or (mask is not None), "Need at least one image or mask"
    if mask is None:
        height, width = image_data[0].shape[:2]
    else:
        # A uint8/int mask would turn `faces[valid_quads]` below into integer
        # fancy indexing and silently select the wrong faces.
        mask = np.asarray(mask, dtype=bool)
        height, width = mask.shape
    for img in image_data:
        assert img.shape[:2] == (height, width), "All images must have same height and width"

    # Quad corners for the first row of pixels, as flattened pixel indices
    base_quad = np.stack([
        np.arange(0, width - 1, dtype=np.int32),           # bottom-left
        np.arange(width, 2 * width - 1, dtype=np.int32),   # top-left
        np.arange(1 + width, 2 * width, dtype=np.int32),   # top-right
        np.arange(1, width, dtype=np.int32),               # bottom-right
    ], axis=1)
    # Replicate the quad pattern for every row by adding the row offset
    row_offsets = np.arange(0, (height - 1) * width, width, dtype=np.int32)
    faces = (row_offsets[:, None, None] + base_quad[None, :, :]).reshape((-1, 4))

    if mask is None:
        # No masking - use all faces and vertices
        if triangulate:
            faces = _convert_quads_to_triangles(faces)
        output = [faces]
        for img in image_data:
            output.append(img.reshape(-1, *img.shape[2:]))
        if return_vertex_indices:
            output.append(np.arange(height * width, dtype=np.int32))
        return tuple(output)

    # Apply mask - only keep faces where all 4 corners are valid
    valid_quads = (
        mask[:-1, :-1] & mask[1:, :-1] &
        mask[1:, 1:] & mask[:-1, 1:]
    ).ravel()
    faces = faces[valid_quads]
    if triangulate:
        faces = _convert_quads_to_triangles(faces)
    # Remove unused vertices and remap face indices onto the compacted set
    num_face_vertices = faces.shape[-1]
    unique_vertices, remapped_indices = np.unique(faces, return_inverse=True)
    faces = remapped_indices.astype(np.int32).reshape(-1, num_face_vertices)
    output = [faces]
    for img in image_data:
        flattened_img = img.reshape(-1, *img.shape[2:])
        output.append(flattened_img[unique_vertices])
    if return_vertex_indices:
        output.append(unique_vertices)
    return tuple(output)
+def _convert_quads_to_triangles(quad_faces: np.ndarray) -> np.ndarray:
+    """Convert quadrilateral faces to triangular faces."""
+    if quad_faces.shape[-1] == 3:
+        return quad_faces  # Already triangular
+    num_vertices_per_face = quad_faces.shape[-1]
+    triangle_indices = np.stack([
+        np.zeros(num_vertices_per_face - 2, dtype=int),                   # First vertex
+        np.arange(1, num_vertices_per_face - 1, dtype=int),               # Sequential vertices
+        np.arange(2, num_vertices_per_face, dtype=int),                   # Next sequential vertices
+    ], axis=1)
+    return quad_faces[:, triangle_indices].reshape((-1, 3))
+def convert_predictions_to_glb_scene(
+    predictions,
+    filter_by_frames="all",
+    show_camera=True,
+    mask_sky_bg=False,
+    mask_ambiguous=False,
+    as_mesh=True,
+) -> trimesh.Scene:
+    """
+    Converts model predictions to a 3D scene represented as a GLB file.
+    Args:
+        predictions (dict): Dictionary containing model predictions with keys:
+            - world_points: 3D point coordinates (S, H, W, 3)
+            - images: Input images (S, H, W, 3)
+            - camera_poses: Camera extrinsic matrices (S, 3, 4)
+        filter_by_frames (str): Frame filter specification (default: "all")
+        show_camera (bool): Include camera visualization (default: True)
+        mask_sky_bg (bool): Mask out sky background pixels (default: False)
+        mask_ambiguous (bool): Apply final mask to filter ambiguous predictions (default: False)
+        as_mesh (bool): Represent the data as a mesh instead of point cloud (default: False)
+    Returns:
+        trimesh.Scene: Processed 3D scene containing point cloud/mesh and cameras
+    Raises:
+        ValueError: If input predictions structure is invalid
+    """
+    if not isinstance(predictions, dict):
+        raise ValueError("predictions must be a dictionary")
+    print("Building GLB scene")
+    # Parse frame selection from filter string
+    target_frame_index = None
+    if filter_by_frames not in ["all", "All"]:
+        try:
+            # Extract numeric index before colon separator
+            target_frame_index = int(filter_by_frames.split(":")[0])
+        except (ValueError, IndexError):
+            pass
+    # Validate required data in predictions
+    print("Using Pointmap Branch")
+    if "world_points" not in predictions:
+        raise ValueError(
+            "world_points not found in predictions. Pointmap Branch requires 'world_points' key. "
+            "Depthmap and Camera branches have been removed."
+        )
+    # Extract prediction data
+    point_cloud_3d = predictions["world_points"]
+    input_images = predictions["images"]
+    extrinsic_matrices = predictions["camera_poses"]
+    ambiguity_mask = predictions["final_mask"]
+    sky_region_mask = predictions["sky_mask"]
+    # Filter to single frame if specified
+    if target_frame_index is not None:
+        point_cloud_3d = point_cloud_3d[target_frame_index][None]
+        input_images = input_images[target_frame_index][None]
+        extrinsic_matrices = extrinsic_matrices[target_frame_index][None]
+        ambiguity_mask = ambiguity_mask[target_frame_index][None]
+        sky_region_mask = sky_region_mask[target_frame_index][None]
+    # Flatten 3D points to vertex array
+    flattened_vertices = point_cloud_3d.reshape(-1, 3)
+    # Convert images to RGB color array
+    if input_images.ndim == 4 and input_images.shape[1] == 3:  # NCHW format
+        rgb_colors = np.transpose(input_images, (0, 2, 3, 1))
+    else:  # Already in NHWC format
+        rgb_colors = input_images
+    rgb_colors = (rgb_colors.reshape(-1, 3) * 255).astype(np.uint8)
+    # Build composite filtering mask
+    valid_points_mask = np.ones(len(flattened_vertices), dtype=bool)
+    # Apply ambiguity filtering if requested
+    if mask_ambiguous:
+        flat_ambiguity_mask = ambiguity_mask.reshape(-1)
+        valid_points_mask = valid_points_mask & flat_ambiguity_mask
+    # Apply sky region filtering if requested
+    if mask_sky_bg:
+        flat_sky_mask = sky_region_mask.reshape(-1)
+        valid_points_mask = valid_points_mask & flat_sky_mask
+    # Apply mask to filter vertices and colors
+    filtered_vertices = flattened_vertices[valid_points_mask].copy()
+    filtered_colors = rgb_colors[valid_points_mask].copy()
+    # Handle empty geometry case
+    if filtered_vertices is None or np.asarray(filtered_vertices).size == 0:
+        filtered_vertices = np.array([[1, 0, 0]])
+        filtered_colors = np.array([[255, 255, 255]])
+        scene_scale_factor = 1
+    else:
+        # Compute scene scale from percentile-based bounding box
+        percentile_lower = np.percentile(filtered_vertices, 5, axis=0)
+        percentile_upper = np.percentile(filtered_vertices, 95, axis=0)
+        scene_scale_factor = np.linalg.norm(percentile_upper - percentile_lower)
+    # Initialize color mapping for cameras
+    color_palette = matplotlib.colormaps.get_cmap("gist_rainbow")
+    # Create empty 3D scene container
+    output_scene = trimesh.Scene()
+    # Add geometry to scene based on representation type
+    if as_mesh:
+        # Mesh representation
+        if target_frame_index is not None:
+            # Single frame mesh generation
+            frame_height, frame_width = point_cloud_3d.shape[1:3]
+            # Prepare unfiltered data for mesh construction
+            structured_points = point_cloud_3d.reshape(frame_height, frame_width, 3)
+            # Convert image data to proper format
+            if input_images.ndim == 4 and input_images.shape[1] == 3:  # NCHW format
+                structured_colors = np.transpose(input_images[0], (1, 2, 0))
+            else:  # Already in HWC format
+                structured_colors = input_images[0]
+            structured_colors *= 255
+            # Get structured mask for mesh creation
+            structured_mask = predictions["final_mask"][target_frame_index].reshape(
+                frame_height, frame_width
+            )
+            # Build filtering mask
+            mesh_filter_mask = structured_mask
+            # Check for normal data availability
+            mesh_normals = None
+            if "normal" in predictions and predictions["normal"] is not None:
+                # Extract normals for selected frame
+                frame_normal_data = (
+                    predictions["normal"][target_frame_index]
+                    if target_frame_index is not None
+                    else predictions["normal"][0]
+                )
+                # Generate mesh with normal information
+                mesh_faces, mesh_vertices, mesh_colors, mesh_normals = create_image_mesh(
+                    structured_points * np.array([1, -1, 1], dtype=np.float32),
+                    structured_colors / 255.0,
+                    frame_normal_data * np.array([1, -1, 1], dtype=np.float32),
+                    mask=mesh_filter_mask,
+                    triangulate=True,
+                    return_vertex_indices=False,
+                )
+                # Apply coordinate system transformation to normals
+                mesh_normals = mesh_normals * np.array([1, -1, 1], dtype=np.float32)
+            else:
+                # Generate mesh without normal information
+                mesh_faces, mesh_vertices, mesh_colors = create_image_mesh(
+                    structured_points * np.array([1, -1, 1], dtype=np.float32),
+                    structured_colors / 255.0,
+                    mask=mesh_filter_mask,
+                    triangulate=True,
+                    return_vertex_indices=False,
+                )
+            # Construct trimesh object with optional normals
+            geometry_mesh = trimesh.Trimesh(
+                vertices=mesh_vertices * np.array([1, -1, 1], dtype=np.float32),
+                faces=mesh_faces,
+                vertex_colors=(mesh_colors * 255).astype(np.uint8),
+                vertex_normals=(mesh_normals if mesh_normals is not None else None),
+                process=False,
+            )
+            output_scene.add_geometry(geometry_mesh)
+        else:
+            # Multi-frame mesh generation
+            print("Creating mesh for multi-frame data...")
+            for frame_idx in range(point_cloud_3d.shape[0]):
+                frame_height, frame_width = point_cloud_3d.shape[1:3]
+                # Extract per-frame data
+                frame_point_data = point_cloud_3d[frame_idx]
+                frame_ambiguity_mask = predictions["final_mask"][frame_idx]
+                frame_sky_mask = predictions["sky_mask"][frame_idx]
+                # Extract frame image data
+                if input_images.ndim == 4 and input_images.shape[1] == 3:  # NCHW format
+                    frame_image_data = np.transpose(input_images[frame_idx], (1, 2, 0))
+                else:  # Already in HWC format
+                    frame_image_data = input_images[frame_idx]
+                frame_image_data *= 255
+                # Build per-frame filtering mask
+                frame_filter_mask = np.ones((frame_height, frame_width), dtype=bool)
+                # Apply ambiguity filtering if enabled
+                if mask_ambiguous:
+                    frame_filter_mask = frame_filter_mask & frame_ambiguity_mask
+                # Apply sky filtering if enabled
+                if mask_sky_bg:
+                    frame_filter_mask = frame_filter_mask & frame_sky_mask
+                # Generate mesh for current frame
+                frame_faces, frame_vertices, frame_colors = create_image_mesh(
+                    frame_point_data * np.array([1, -1, 1], dtype=np.float32),
+                    frame_image_data / 255.0,
+                    mask=frame_filter_mask,
+                    triangulate=True,
+                    return_vertex_indices=False,
+                )
+                frame_vertices = frame_vertices * np.array([1, -1, 1], dtype=np.float32)
+                # Create trimesh object for current frame
+                frame_geometry = trimesh.Trimesh(
+                    vertices=frame_vertices,
+                    faces=frame_faces,
+                    vertex_colors=(frame_colors * 255).astype(np.uint8),
+                    process=False,
+                )
+                output_scene.add_geometry(frame_geometry)
+    else:
+        # Point cloud representation
+        point_cloud_geometry = trimesh.PointCloud(vertices=filtered_vertices, colors=filtered_colors)
+        output_scene.add_geometry(point_cloud_geometry)
+    # Add camera visualizations if requested
+    num_camera_views = len(extrinsic_matrices)
+    if show_camera:
+        # Iterate through all camera views
+        for camera_idx in range(num_camera_views):
+            camera_extrinsic = extrinsic_matrices[camera_idx]
+            camera_color_rgba = color_palette(camera_idx / num_camera_views)
+            camera_color_rgb = tuple(int(255 * x) for x in camera_color_rgba[:3])
+            integrate_camera_into_scene(
+                output_scene, camera_extrinsic, camera_color_rgb, scene_scale_factor
+            )
+    # Define coordinate system transformation matrices
+    opengl_transform = np.eye(4)
+    opengl_transform[1, 1] = -1  # Flip Y axis
+    opengl_transform[2, 2] = -1  # Flip Z axis
+    # Define alignment rotation about the Y-axis (angle is 0 degrees, so this is currently the identity)
+    alignment_rotation = np.eye(4)
+    alignment_rotation[:3, :3] = Rotation.from_euler("y", 0, degrees=True).as_matrix()
+    # Compute and apply final transformation
+    scene_transformation = (
+        np.linalg.inv(extrinsic_matrices[0])
+        @ opengl_transform
+        @ alignment_rotation
+    )
+    output_scene.apply_transform(scene_transformation)
+    print("GLB Scene built")
+    return output_scene
+def integrate_camera_into_scene(
+    scene: trimesh.Scene,
+    camera_transform: np.ndarray,
+    camera_color: tuple,
+    scale_factor: float,
+):
+    """
+    Adds a camera visualization mesh to the 3D scene.
+    Args:
+        scene (trimesh.Scene): The 3D scene to add the camera visualization.
+        camera_transform (np.ndarray): 4x4 transformation matrix for camera positioning.
+        camera_color (tuple): RGB color tuple for the camera mesh.
+        scale_factor (float): Scaling factor for the camera size relative to scene.
+    """
+    # Define camera dimensions based on scene scale
+    camera_base_width = scale_factor * 0.05
+    camera_cone_height = scale_factor * 0.1
+    # Create base cone geometry for camera representation
+    base_cone = trimesh.creation.cone(camera_base_width, camera_cone_height, sections=4)
+    # Setup rotation transformation (45 degrees around z-axis)
+    z_rotation_matrix = np.eye(4)
+    z_rotation_matrix[:3, :3] = Rotation.from_euler("z", 45, degrees=True).as_matrix()
+    z_rotation_matrix[2, 3] = -camera_cone_height
+    # Setup OpenGL coordinate system conversion
+    opengl_coord_transform = np.eye(4)
+    opengl_coord_transform[1, 1] = -1  # Flip Y axis
+    opengl_coord_transform[2, 2] = -1  # Flip Z axis
+    # Combine all transformations
+    final_transform = camera_transform @ opengl_coord_transform @ z_rotation_matrix
+    # Create slight rotation for mesh variation
+    minor_rotation = np.eye(4)
+    minor_rotation[:3, :3] = Rotation.from_euler("z", 2, degrees=True).as_matrix()
+    # Generate multiple vertex sets for complex camera geometry
+    original_vertices = base_cone.vertices
+    scaled_vertices = 0.95 * original_vertices
+    rotated_vertices = apply_transformation_to_points(minor_rotation, original_vertices)
+    # Combine all vertex sets
+    all_vertices = np.concatenate([
+        original_vertices,
+        scaled_vertices,
+        rotated_vertices
+    ])
+    # Transform vertices to final position
+    transformed_vertices = apply_transformation_to_points(final_transform, all_vertices)
+    # Generate faces for the complete camera mesh
+    camera_faces = generate_camera_mesh_faces(base_cone)
+    # Create and configure the camera mesh
+    camera_mesh = trimesh.Trimesh(
+        vertices=transformed_vertices,
+        faces=camera_faces
+    )
+    camera_mesh.visual.face_colors[:, :3] = camera_color
+    # Add the camera mesh to the scene
+    scene.add_geometry(camera_mesh)
+def apply_transformation_to_points(
+    transform_matrix: np.ndarray, point_array: np.ndarray, output_dim: int = None
+) -> np.ndarray:
+    """
+    Applies a 4x4 transformation matrix to a collection of 3D points.
+    Args:
+        transform_matrix (np.ndarray): 4x4 transformation matrix to apply.
+        point_array (np.ndarray): Array of points to transform.
+        output_dim (int, optional): Target dimension for output points.
+    Returns:
+        np.ndarray: Array of transformed points.
+    """
+    point_array = np.asarray(point_array)
+    original_shape = point_array.shape[:-1]
+    target_dim = output_dim or point_array.shape[-1]
+    # Transpose transformation matrix for matrix multiplication
+    transposed_transform = transform_matrix.swapaxes(-1, -2)
+    # Apply rotation/scaling and translation components
+    transformed_points = (
+        point_array @ transposed_transform[..., :-1, :] +
+        transposed_transform[..., -1:, :]
+    )
+    # Extract desired dimensions and restore original shape
+    final_result = transformed_points[..., :target_dim].reshape(*original_shape, target_dim)
+    return final_result
+def generate_camera_mesh_faces(base_cone_mesh: trimesh.Trimesh) -> np.ndarray:
+    """
+    Generates face indices for a complex camera mesh composed of multiple cone layers.
+    Args:
+        base_cone_mesh (trimesh.Trimesh): Base cone geometry used as template.
+    Returns:
+        np.ndarray: Array of face indices defining the camera mesh topology.
+    """
+    face_indices = []
+    vertex_count_per_cone = len(base_cone_mesh.vertices)
+    # Process each face of the base cone
+    for triangle_face in base_cone_mesh.faces:
+        # Skip faces that include the cone tip (vertex 0)
+        if 0 in triangle_face:
+            continue
+        # Get vertex indices for current triangle
+        vertex_a, vertex_b, vertex_c = triangle_face
+        # Calculate corresponding vertices in second and third cone layers
+        vertex_a_layer2, vertex_b_layer2, vertex_c_layer2 = triangle_face + vertex_count_per_cone
+        vertex_a_layer3, vertex_b_layer3, vertex_c_layer3 = triangle_face + 2 * vertex_count_per_cone
+        # Create connecting faces between cone layers
+        connecting_faces = [
+            (vertex_a, vertex_b, vertex_b_layer2),
+            (vertex_a, vertex_a_layer2, vertex_c),
+            (vertex_c_layer2, vertex_b, vertex_c),
+            (vertex_a, vertex_b, vertex_b_layer3),
+            (vertex_a, vertex_a_layer3, vertex_c),
+            (vertex_c_layer3, vertex_b, vertex_c),
+        ]
+        face_indices.extend(connecting_faces)
+    # Add reverse-winding faces for proper mesh closure
+    reversed_faces = [(vertex_c, vertex_b, vertex_a) for vertex_a, vertex_b, vertex_c in face_indices]
+    face_indices.extend(reversed_faces)
+    return np.array(face_indices)

hyworldmirror/utils/warnings.py ADDED Viewed

	@@ -0,0 +1,29 @@

+"""
+Wrapper utilities for warnings.
+"""
+import warnings
+from functools import wraps
+class no_warnings:
+    def __init__(self, action: str = "ignore", **kwargs):
+        self.action = action
+        self.filter_kwargs = kwargs
+    def __call__(self, fn):
+        @wraps(fn)
+        def wrapper(*args, **kwargs):
+            with warnings.catch_warnings():
+                warnings.simplefilter(self.action, **self.filter_kwargs)
+                return fn(*args, **kwargs)
+        return wrapper
+    def __enter__(self):
+        self.warnings_manager = warnings.catch_warnings()
+        self.warnings_manager.__enter__()
+        warnings.simplefilter(self.action, **self.filter_kwargs)
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.warnings_manager.__exit__(exc_type, exc_val, exc_tb)

pipeline.py ADDED Viewed

	@@ -0,0 +1,847 @@

+"""
+HunyuanWorld-Mirror Inference Pipeline
+Usage:
+    # Python API — Single GPU
+    from hyworld2.worldrecon.pipeline import WorldMirrorPipeline
+    pipeline = WorldMirrorPipeline.from_pretrained('tencent/HY-World-2.0')
+    result = pipeline('path/to/images')
+    # Python API — Multi-GPU (in a torchrun script)
+    pipeline = WorldMirrorPipeline.from_pretrained(
+        'tencent/HY-World-2.0', use_fsdp=True, enable_bf16=True)
+    result = pipeline('path/to/images')
+    # CLI — Single GPU
+    python -m hyworld2.worldrecon.pipeline --input_path path/to/images
+    # CLI — Multi-GPU
+    torchrun --nproc_per_node=2 -m hyworld2.worldrecon.pipeline --input_path path/to/images --use_fsdp --enable_bf16
+"""
+import argparse
+import functools
+import gc
+import os
+import time
+from datetime import datetime, timedelta
+from pathlib import Path
+import numpy as np
+import torch
+import torch.distributed as dist
+from omegaconf import OmegaConf
+from safetensors.torch import load_file as load_safetensors
+from torch.distributed.fsdp import (
+    FullyShardedDataParallel as FSDP,
+    ShardingStrategy,
+    CPUOffload,
+)
+from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
+from .hyworldmirror.models.models.worldmirror import WorldMirror
+from .hyworldmirror.models.layers.block import Block, DistBlock
+from .hyworldmirror.models.heads.dense_head import DPTHead
+from .hyworldmirror.models.heads.camera_head import CameraHead
+from .hyworldmirror.utils.inference_utils import (
+    prepare_images_to_tensor,
+    prepare_input,
+    compute_adaptive_target_size,
+    compute_preprocessing_transform,
+    load_prior_camera,
+    load_prior_depth,
+    compute_sky_mask,
+    compute_filter_mask,
+    save_results,
+    print_and_save_timings,
+)
+from .hyworldmirror.utils.render_utils import render_interpolated_video
+# ============================================================
+# Model loading helpers (checkpoint, config, selective load)
+# ============================================================
+def _get_model_config_from_yaml(cfg) -> dict:
+    if hasattr(cfg, "wrapper") and hasattr(cfg.wrapper, "model"):
+        model_cfg = cfg.wrapper.model
+    elif hasattr(cfg, "model"):
+        model_cfg = cfg.model
+    else:
+        raise ValueError("No model config found (expect wrapper.model or model).")
+    out = OmegaConf.to_container(model_cfg, resolve=True)
+    out.pop("_target_", None)
+    return out
+def _load_checkpoint_state_dict(ckpt_path: str) -> dict:
+    if ckpt_path.endswith(".safetensors"):
+        return load_safetensors(ckpt_path)
+    ckpt = torch.load(ckpt_path, map_location="cpu")
+    state = ckpt.get("state_dict", ckpt)
+    if "state_dict" in ckpt:
+        state = {k.replace("model.", ""): v for k, v in state.items()}
+    return state
+def _load_state_dict_selective(model, ckpt_state, source_name="checkpoint"):
+    current = model.state_dict()
+    for key in current:
+        if key in ckpt_state and current[key].shape == ckpt_state[key].shape:
+            current[key] = ckpt_state[key]
+    model.load_state_dict(current, strict=True)
+    matched = sum(1 for k in current if k in ckpt_state and current[k].shape == ckpt_state[k].shape)
+    print(f"  Loaded {matched}/{len(current)} keys from {source_name}")
+def _has_model_files(path: str) -> bool:
+    """Check whether a directory contains the expected model artifacts."""
+    has_weights = os.path.isfile(os.path.join(path, "model.safetensors"))
+    has_config = (os.path.isfile(os.path.join(path, "config.yaml"))
+                  or os.path.isfile(os.path.join(path, "config.json")))
+    return has_weights and has_config
+def _resolve_model_dir(model_path: str, subfolder: str) -> str:
+    """Resolve a local directory containing config + model.safetensors.
+    Resolution order:
+      1. {model_path}/{subfolder}  — local repo root with subfolder
+      2. {model_path}              — direct local path (backward compat)
+      3. HuggingFace download: snapshot_download(repo_id, allow_patterns=[subfolder/*])
+    """
+    candidate = os.path.join(model_path, subfolder)
+    if os.path.isdir(candidate) and _has_model_files(candidate):
+        print(f"[Init] Found local model at {candidate}")
+        return candidate
+    if os.path.isdir(model_path) and _has_model_files(model_path):
+        print(f"[Init] Found local model at {model_path}")
+        return model_path
+    print(f"[Init] Downloading from HuggingFace: {model_path} (subfolder={subfolder})")
+    from huggingface_hub import snapshot_download
+    repo_root = snapshot_download(
+        repo_id=model_path,
+        allow_patterns=[f"{subfolder}/*"],
+    )
+    resolved = os.path.join(repo_root, subfolder)
+    if not _has_model_files(resolved):
+        raise FileNotFoundError(
+            f"Downloaded repo '{model_path}' but subfolder '{subfolder}' "
+            f"does not contain model.safetensors + config. "
+            f"Check that the repo and subfolder name are correct."
+        )
+    return resolved
+def _load_model_config(model_dir: str) -> dict:
+    """Load model constructor kwargs from config.yaml or config.json in model_dir."""
+    import json as _json
+    yaml_path = os.path.join(model_dir, "config.yaml")
+    json_path = os.path.join(model_dir, "config.json")
+    if os.path.isfile(yaml_path):
+        cfg = OmegaConf.load(yaml_path)
+        return _get_model_config_from_yaml(cfg)
+    elif os.path.isfile(json_path):
+        with open(json_path) as f:
+            return _json.load(f)
+    else:
+        raise FileNotFoundError(f"No config.yaml or config.json in {model_dir}")
+# ============================================================
+# FSDP / bf16 helpers
+# ============================================================
+def _collect_fp32_critical_modules(model):
+    from .hyworldmirror.models.layers.mlp import MlpFP32
+    critical = set()
+    for _, module in model.named_modules():
+        if isinstance(module, MlpFP32) and hasattr(module, 'fc2'):
+            if any(p.dtype == torch.float32 for p in module.fc2.parameters()):
+                critical.add(module.fc2)
+        if hasattr(module, 'scratch') and hasattr(module.scratch, 'output_conv2'):
+            oc2 = module.scratch.output_conv2
+            if any(p.dtype == torch.float32 for p in oc2.parameters()):
+                critical.add(oc2)
+    return critical
+def _cast_noncritical_fp32_to_bf16(model, critical_modules):
+    critical_ids = {id(p) for mod in critical_modules for p in mod.parameters()}
+    cast = []
+    for name, param in model.named_parameters():
+        if param.dtype == torch.float32 and id(param) not in critical_ids:
+            param.data = param.data.to(torch.bfloat16)
+            cast.append(name)
+    for _, buf in model.named_buffers():
+        if buf.dtype == torch.float32:
+            buf.data = buf.data.to(torch.bfloat16)
+    def _hook(module, args):
+        if not args:
+            return args
+        dtype = next((p.dtype for p in module.parameters(recurse=False)), None)
+        if dtype is None:
+            return args
+        return tuple(a.to(dtype) if isinstance(a, torch.Tensor) and a.is_floating_point() and a.dtype != dtype else a
+                     for a in args)
+    for name, module in model.named_modules():
+        if not any(True for _ in module.children()):
+            own = list(module.named_parameters(recurse=False))
+            if own and all(p.dtype == torch.bfloat16 for _, p in own):
+                pfx = name + "." if name else ""
+                if any(c.startswith(pfx) for c in cast):
+                    module.register_forward_pre_hook(_hook)
+def _wrap_model_fsdp(model, sp_group, device, use_cpu_offload=False, enable_bf16=False):
+    wrap_cls = {DistBlock, Block, DPTHead, CameraHead}
+    if enable_bf16:
+        fp32_critical = _collect_fp32_critical_modules(model)
+        def policy(module, recurse, nonwrapped_numel, **kw):
+            if recurse:
+                return True
+            return isinstance(module, tuple(wrap_cls)) or module in fp32_critical
+        auto_wrap_policy = policy
+    else:
+        auto_wrap_policy = functools.partial(
+            transformer_auto_wrap_policy, transformer_layer_cls=wrap_cls)
+    fsdp_model = FSDP(
+        model, process_group=sp_group,
+        sharding_strategy=ShardingStrategy.FULL_SHARD,
+        auto_wrap_policy=auto_wrap_policy, mixed_precision=None,
+        cpu_offload=CPUOffload(offload_params=True) if use_cpu_offload else None,
+        device_id=device, use_orig_params=True, sync_module_states=True,
+        forward_prefetch=False,
+    )
+    rank = dist.get_rank()
+    if rank == 0:
+        total = sum(p.numel() for p in fsdp_model.parameters())
+        local = sum(getattr(p, '_local_tensor', p).numel() for p in fsdp_model.parameters())
+        print(f"[FSDP] total={total/1e6:.1f}M, local≈{local/1e6:.1f}M")
+    return fsdp_model
+# ============================================================
+# WorldMirrorPipeline
+# ============================================================
+class WorldMirrorPipeline:
+    """HunyuanWorld-Mirror inference pipeline.
+    Supports single-GPU and multi-GPU (Sequence Parallel) inference with
+    a unified API. Multi-GPU mode is auto-detected from torch.distributed.
+    """
+    def __init__(self, model, device, sp_size=1, sp_group=None, rank=0):
+        self.model = model
+        self.device = device
+        self.sp_size = sp_size
+        self.sp_group = sp_group
+        self.rank = rank
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: str = "tencent/HY-World-2.0",
+        *,
+        subfolder: str = "HY-WorldMirror-2.0",
+        config_path: str = None,
+        ckpt_path: str = None,
+        use_fsdp: bool = False,
+        enable_bf16: bool = False,
+        fsdp_cpu_offload: bool = False,
+        disable_heads: list = None,
+    ) -> "WorldMirrorPipeline":
+        """Load model and create pipeline instance.
+        Automatically detects distributed mode (torchrun sets WORLD_SIZE).
+        When both ``config_path`` and ``ckpt_path`` are given they take precedence
+        over ``pretrained_model_name_or_path``.
+        Args:
+            pretrained_model_name_or_path: HuggingFace repo ID or local path.
+                The model files are expected under ``{path}/{subfolder}/``.
+            subfolder: Subfolder inside the repo that contains the WorldMirror
+                checkpoint (model.safetensors + config).
+            config_path: Training config YAML (used with ckpt_path).
+            ckpt_path: Checkpoint file (.ckpt / .safetensors).
+            use_fsdp: Shard parameters across GPUs via FSDP. Ignored (with a
+                warning) when WORLD_SIZE is not greater than 1.
+            enable_bf16: Use bf16 precision (except critical layers).
+            fsdp_cpu_offload: Offload FSDP params to CPU.
+            disable_heads: List of heads to disable, e.g. ["camera", "depth"].
+        Returns:
+            WorldMirrorPipeline: Pipeline holding the model in eval mode.
+        """
+        # WORLD_SIZE > 1 selects the distributed path
+        is_distributed = int(os.environ.get("WORLD_SIZE", 1)) > 1
+        if is_distributed:
+            if not dist.is_initialized():
+                dist.init_process_group(backend="nccl")
+            rank = dist.get_rank()
+            world_size = dist.get_world_size()
+            local_rank = int(os.environ.get("LOCAL_RANK", rank))
+            torch.cuda.set_device(local_rank)
+            device = torch.device("cuda", local_rank)
+            # The sequence-parallel group spans every rank in the job
+            sp_size = world_size
+            sp_group = dist.new_group(ranks=list(range(sp_size)))
+            if rank == 0:
+                print(f"[Pipeline] Multi-GPU: world_size={world_size}, sp_size={sp_size}")
+        else:
+            rank, sp_size = 0, 1
+            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            sp_group = None
+            if use_fsdp:
+                print("[Pipeline] Warning: use_fsdp is ignored in single-GPU mode (FSDP requires torchrun with multiple GPUs)")
+                use_fsdp = False
+            print("[Pipeline] Single-GPU mode")
+        # Load model
+        t0 = time.perf_counter()
+        if config_path and ckpt_path:
+            # Explicit training config + checkpoint
+            print(f"[Init] config={config_path}, ckpt={ckpt_path}, sp_size={sp_size}")
+            cfg = OmegaConf.load(config_path)
+            model_cfg = _get_model_config_from_yaml(cfg)
+            if sp_size > 1:
+                model_cfg["sp_size"] = sp_size
+            if enable_bf16:
+                model_cfg["enable_bf16"] = True
+            model = WorldMirror(**model_cfg).to(device)
+            state = _load_checkpoint_state_dict(ckpt_path)
+            _load_state_dict_selective(model, state, source_name=ckpt_path)
+            # Drop the checkpoint copy before any further allocation
+            del state; gc.collect(); torch.cuda.empty_cache()
+        else:
+            # Local directory or HuggingFace repo containing config + model.safetensors
+            model_dir = _resolve_model_dir(pretrained_model_name_or_path, subfolder)
+            model_cfg = _load_model_config(model_dir)
+            if sp_size > 1:
+                model_cfg["sp_size"] = sp_size
+            if enable_bf16:
+                model_cfg["enable_bf16"] = True
+            model = WorldMirror(**model_cfg).to(device)
+            state = load_safetensors(os.path.join(model_dir, "model.safetensors"))
+            _load_state_dict_selective(model, state, source_name=model_dir)
+            del state; gc.collect(); torch.cuda.empty_cache()
+        # bf16 casting — two strategies depending on FSDP:
+        #
+        # Multi-GPU + FSDP: cast everything to bf16 uniformly (including fc2).
+        #   FSDP requires uniform dtype per flat-param unit.
+        #
+        # Single GPU (no FSDP): cast to bf16, then restore critical fp32
+        #   modules (MlpFP32.fc2, output_conv2) so their .float() calls work.
+        #   Register input-cast hooks on bf16 leaf modules for dtype boundaries.
+        if enable_bf16:
+            if use_fsdp and is_distributed:
+                model.to(torch.bfloat16)
+                # NOTE(review): after the blanket cast no float32 parameters should
+                # remain, so the two calls below look like no-ops — confirm.
+                crit = _collect_fp32_critical_modules(model)
+                _cast_noncritical_fp32_to_bf16(model, crit)
+            else:
+                # Collect critical modules while they still hold float32 parameters
+                crit = _collect_fp32_critical_modules(model)
+                model.to(torch.bfloat16)
+                # NOTE(review): these weights pass through bf16 before being cast
+                # back, so presumably they no longer carry full fp32 precision — confirm.
+                for mod in crit:
+                    mod.to(torch.float32)
+                def _input_cast_hook(module, args):
+                    # Cast floating-point tensor inputs to the module's own parameter dtype
+                    if not args:
+                        return args
+                    dtype = next((p.dtype for p in module.parameters(recurse=False)), None)
+                    if dtype is None:
+                        return args
+                    return tuple(
+                        a.to(dtype) if isinstance(a, torch.Tensor) and a.is_floating_point() and a.dtype != dtype else a
+                        for a in args
+                    )
+                # Hook every leaf module whose own parameters are all bf16
+                for _, module in model.named_modules():
+                    if not any(True for _ in module.children()):
+                        own = list(module.parameters(recurse=False))
+                        if own and all(p.dtype == torch.bfloat16 for p in own):
+                            module.register_forward_pre_hook(_input_cast_hook)
+        model.eval()
+        # Disable unused heads
+        if disable_heads:
+            _disable_heads(model, disable_heads)
+        # FSDP wrapping
+        if use_fsdp and is_distributed:
+            model = _wrap_model_fsdp(model, sp_group, device,
+                                     use_cpu_offload=fsdp_cpu_offload,
+                                     enable_bf16=enable_bf16)
+            if enable_bf16:
+                # Replace .to on the inner module with a no-op returning the module,
+                # so later .to(...) calls leave its dtype/device untouched
+                inner = model.module if hasattr(model, 'module') else model
+                inner.to = lambda *a, **kw: inner
+        if rank == 0:
+            print(f"[Init] Model ready in {time.perf_counter() - t0:.1f}s")
+            if torch.cuda.is_available():
+                alloc = torch.cuda.memory_allocated(device) / (1024**3)
+                print(f"[Memory] allocated={alloc:.2f}GB")
+        return cls(model, device, sp_size, sp_group, rank)
+    @torch.no_grad()
+    def __call__(
+        self,
+        input_path: str,
+        output_path: str = "inference_output",
+        *,
+        # Inference
+        target_size: int = 952,
+        fps: int = 1,
+        video_strategy: str = "new",
+        video_min_frames: int = 1,
+        video_max_frames: int = 32,
+        # Save
+        save_depth: bool = True,
+        save_normal: bool = True,
+        save_gs: bool = True,
+        save_camera: bool = True,
+        save_points: bool = True,
+        save_colmap: bool = False,
+        save_conf: bool = False,
+        # Mask
+        apply_sky_mask: bool = True,
+        apply_edge_mask: bool = True,
+        apply_confidence_mask: bool = False,
+        save_sky_mask: bool = False,
+        sky_mask_source: str = "auto",
+        model_sky_threshold: float = 0.45,
+        confidence_percentile: float = 10.0,
+        edge_normal_threshold: float = 1.0,
+        edge_depth_threshold: float = 0.03,
+        # Compression
+        compress_pts: bool = True,
+        compress_pts_max_points: int = 2_000_000,
+        compress_pts_voxel_size: float = 0.002,
+        max_resolution: int = 1920,
+        compress_gs_max_points: int = 5_000_000,
+        # Prior
+        prior_cam_path: str = None,
+        prior_depth_path: str = None,
+        # Rendered video
+        save_rendered: bool = False,
+        render_interp_per_pair: int = 15,
+        render_depth: bool = False,
+        # Misc
+        log_time: bool = True,
+        strict_output_path: str = None,
+    ) -> str:
+        """Run inference on images/video and save results.
+        Stages: prepare the input, pick the adaptive resolution, run the
+        forward pass, then (rank 0 only) compute masks, save the results and
+        optionally render an interpolated video. Every option after
+        ``output_path`` is keyword-only.
+        Args:
+            input_path: Directory of images or a video file.
+            output_path: Root output directory. Results are written to
+                ``output_path/<subdir_name>/<timestamp>`` unless
+                ``strict_output_path`` is given.
+            target_size: Requested size handed to ``prepare_input`` and
+                ``compute_adaptive_target_size``; the log line reports it as
+                the maximum of the adaptive resolution.
+            fps, video_strategy, video_min_frames, video_max_frames:
+                Forwarded to ``prepare_input``.
+            save_depth, save_normal, save_gs, save_camera, save_points,
+            save_colmap, save_conf, save_sky_mask: Forwarded to
+                ``save_results``. ``save_gs`` is additionally passed to
+                ``compute_filter_mask`` as ``use_gs_depth``.
+            apply_sky_mask: When true, ``compute_sky_mask`` is called and its
+                result is passed to ``compute_filter_mask`` and
+                ``save_results``; otherwise the sky mask is ``None``.
+            apply_edge_mask, apply_confidence_mask: Together with
+                ``apply_sky_mask`` decide whether ``compute_filter_mask`` runs
+                at all; each flag is forwarded to it.
+            sky_mask_source, model_sky_threshold: Forwarded to
+                ``compute_sky_mask`` as ``source`` / ``model_threshold``.
+            confidence_percentile, edge_normal_threshold,
+            edge_depth_threshold: Forwarded to ``compute_filter_mask``.
+            compress_pts, compress_pts_max_points, compress_pts_voxel_size,
+            max_resolution, compress_gs_max_points: Forwarded to
+                ``save_results``.
+            prior_cam_path, prior_depth_path: Optional prior locations,
+                forwarded to ``_run_inference``.
+            save_rendered: Render a video through the model's
+                ``gs_renderer`` when the predictions contain ``"splats"``.
+            render_interp_per_pair, render_depth: Forwarded to
+                ``render_interpolated_video``.
+            log_time: Collect per-stage timings (and, in multi-GPU mode, peak
+                GPU memory per rank) and hand them to
+                ``print_and_save_timings``.
+            strict_output_path: If not ``None``, used verbatim as the output
+                directory instead of ``output_path/<subdir_name>/<timestamp>``.
+        Returns:
+            Path to the output directory (str) on every rank, or None when
+            ``_run_inference`` raises ``ValueError`` and the case is skipped.
+        """
+        model = self.model
+        device = self.device
+        sp_size, sp_group, rank = self.sp_size, self.sp_group, self.rank
+        is_distributed = sp_size > 1
+        case_t0 = time.perf_counter()
+        timings = {}
+        # Used as the last path component of the default output directory.
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        # 1. Prepare input
+        t0 = time.perf_counter()
+        img_paths, subdir_name = prepare_input(
+            input_path, target_size=target_size, fps=fps,
+            video_strategy=video_strategy,
+            min_frames=video_min_frames, max_frames=video_max_frames,
+        )
+        if log_time:
+            timings["data_loading"] = time.perf_counter() - t0
+        # strict_output_path bypasses the <subdir_name>/<timestamp> layout.
+        if strict_output_path is not None:
+            outdir = Path(strict_output_path)
+        else:
+            outdir = Path(output_path) / subdir_name / timestamp
+        # 2. Adaptive resolution
+        effective = compute_adaptive_target_size(img_paths, target_size)
+        if rank == 0 and effective != target_size:
+            print(f"[Inference] Adaptive resolution: {effective} (max={target_size})")
+        # 3. Inference
+        if torch.cuda.is_available():
+            # Reset the peak counter so the memory stats below cover this run only.
+            torch.cuda.reset_peak_memory_stats(device)
+            torch.cuda.synchronize(device)
+        t0_all = time.perf_counter()
+        try:
+            predictions, imgs, infer_time = self._run_inference(
+                img_paths, effective, prior_cam_path, prior_depth_path)
+        except ValueError as e:
+            # E.g. fewer input images than GPUs in multi-GPU mode: skip this case.
+            if rank == 0:
+                print(f"[Pipeline] Skipping '{input_path}': {e}")
+            return None
+        if log_time:
+            timings["inference"] = infer_time
+            # Wall time spent inside _run_inference outside the timed forward pass.
+            timings["inference_preprocess"] = time.perf_counter() - t0_all - infer_time
+        # GPU memory stats (multi-GPU)
+        if log_time and torch.cuda.is_available() and is_distributed:
+            peak = torch.cuda.max_memory_allocated(device) / (1024**3)
+            peak_t = torch.tensor([peak], dtype=torch.float64, device=device)
+            gathered = [torch.zeros(1, dtype=torch.float64, device=device) for _ in range(sp_size)]
+            # Collective call: every rank in sp_group has to reach this line.
+            dist.all_gather(gathered, peak_t, group=sp_group)
+            timings["gpu_mem_peak_per_rank_gb"] = [t.item() for t in gathered]
+            timings["gpu_mem_peak_avg_gb"] = sum(timings["gpu_mem_peak_per_rank_gb"]) / sp_size
+        # 4. Post-processing and saving (rank 0 only)
+        if rank == 0:
+            B, S, C, H, W = imgs.shape
+            t0 = time.perf_counter()
+            sky_mask = (compute_sky_mask(
+                img_paths, H, W, S, predictions=predictions,
+                source=sky_mask_source, model_threshold=model_sky_threshold,
+                processed_aspect_ratio=W / H,
+            ) if apply_sky_mask else None)
+            filter_mask, gs_filter_mask = None, None
+            # The filter masks are only computed when at least one mask is requested.
+            if apply_confidence_mask or apply_edge_mask or apply_sky_mask:
+                filter_mask, gs_filter_mask = compute_filter_mask(
+                    predictions, imgs, img_paths, H, W, S,
+                    apply_confidence_mask=apply_confidence_mask,
+                    apply_edge_mask=apply_edge_mask,
+                    apply_sky_mask=apply_sky_mask,
+                    confidence_percentile=confidence_percentile,
+                    edge_normal_threshold=edge_normal_threshold,
+                    edge_depth_threshold=edge_depth_threshold,
+                    sky_mask=sky_mask, use_gs_depth=save_gs,
+                )
+            if log_time:
+                timings["compute_mask"] = time.perf_counter() - t0
+            t0 = time.perf_counter()
+            save_timings = save_results(
+                predictions, imgs, img_paths, outdir,
+                save_depth=save_depth, save_normal=save_normal,
+                save_gs=save_gs, save_camera=save_camera,
+                save_points=save_points, save_colmap=save_colmap,
+                save_sky_mask=save_sky_mask, save_conf=save_conf,
+                log_time=log_time, max_resolution=max_resolution,
+                filter_mask=filter_mask, gs_filter_mask=gs_filter_mask,
+                sky_mask=sky_mask,
+                compress_pts=compress_pts,
+                compress_pts_max_points=compress_pts_max_points,
+                compress_pts_voxel_size=compress_pts_voxel_size,
+                compress_gs_max_points=compress_gs_max_points,
+            )
+            if log_time:
+                # save_results may return None; treat that as "no timings".
+                timings.update(save_timings or {})
+                timings["save_total_wall"] = time.perf_counter() - t0
+            # Render interpolated video from Gaussian splats
+            if save_rendered and "splats" in predictions:
+                # Unwrap a wrapper that exposes the real model as `.module`.
+                inner_model = model.module if hasattr(model, 'module') else model
+                if hasattr(inner_model, 'gs_renderer'):
+                    t0_render = time.perf_counter()
+                    try:
+                        # Cast tensors to float32 before rendering; non-tensor entries pass through.
+                        splats_f32 = {k: v.float() if isinstance(v, torch.Tensor) else v
+                                      for k, v in predictions["splats"].items()}
+                        render_interpolated_video(
+                            inner_model.gs_renderer,
+                            splats_f32,
+                            predictions["camera_poses"].float(),
+                            predictions["camera_intrs"].float(),
+                            (H, W),
+                            outdir / "rendered",
+                            interp_per_pair=render_interp_per_pair,
+                            # NOTE(review): enabled only for 1-2 views; presumably to
+                            # lengthen very short trajectories - confirm in the renderer.
+                            loop_reverse=(S <= 2),
+                            render_depth=render_depth,
+                        )
+                        if log_time:
+                            timings["render_video"] = time.perf_counter() - t0_render
+                    except Exception as e:
+                        # Best-effort: a rendering failure must not discard the results saved above.
+                        print(f"[Pipeline] Warning: video rendering failed: {e}")
+            if not is_distributed:
+                # Single-process cleanup; the distributed path cleans up below on every rank.
+                del predictions
+                torch.cuda.empty_cache()
+            timings["case_total"] = time.perf_counter() - case_t0
+            if log_time:
+                print_and_save_timings(timings, outdir)
+            print(f"\n{'='*60}\n[Pipeline] Results saved to: {outdir}\n{'='*60}\n")
+        if is_distributed:
+            del predictions, imgs
+            gc.collect()
+            torch.cuda.empty_cache()
+            # Non-zero ranks wait here until rank 0 has finished saving.
+            dist.barrier()
+        return str(outdir)
+    def _run_inference(self, img_paths, target_size, prior_cam_path, prior_depth_path):
+        """Run model forward pass."""
+        device = self.device
+        imgs = prepare_images_to_tensor(
+            img_paths, target_size=target_size, resize_strategy="crop"
+        ).to(device)
+        views = {"img": imgs}
+        B, S, C, H, W = imgs.shape
+        if self.sp_size > 1 and S < self.sp_size:
+            raise ValueError(
+                f"Number of input images ({S}) must be >= number of GPUs ({self.sp_size}) "
+                f"in multi-GPU mode. Please provide at least {self.sp_size} images, "
+                f"or use fewer GPUs."
+            )
+        if self.rank == 0:
+            print(f"[Inference] {S} images, shape={imgs.shape}, sp_size={self.sp_size}")
+        pp_xform = compute_preprocessing_transform(img_paths, target_size)
+        cond_flags = [0, 0, 0]
+        if prior_cam_path and os.path.isfile(prior_cam_path):
+            extr, intr = load_prior_camera(prior_cam_path, img_paths, preprocess_transform=pp_xform)
+            if extr is not None:
+                first = extr[0, 0]
+                extr = torch.linalg.inv(first.float()).to(first.dtype).unsqueeze(0).unsqueeze(0) @ extr
+                views["camera_poses"] = extr.to(device)
+                cond_flags[0] = 1
+            if intr is not None:
+                views["camera_intrs"] = intr.to(device)
+                cond_flags[2] = 1
+        if prior_depth_path and os.path.isdir(prior_depth_path):
+            depth = load_prior_depth(prior_depth_path, img_paths, H, W, preprocess_transform=pp_xform)
+            if depth is not None:
+                views["depthmap"] = depth.to(device)
+                cond_flags[1] = 1
+        use_amp = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
+        inner = self.model.module if hasattr(self.model, 'module') else self.model
+        model_bf16 = getattr(inner, 'enable_bf16', False)
+        t0 = time.perf_counter()
+        with torch.amp.autocast("cuda", enabled=(not model_bf16 and use_amp), dtype=torch.bfloat16):
+            fwd_kw = dict(views=views, cond_flags=cond_flags, is_inference=True)
+            if self.sp_size > 1:
+                fwd_kw["sp_size"] = self.sp_size
+                fwd_kw["sp_group"] = self.sp_group
+            predictions = self.model(**fwd_kw)
+        if device.type == "cuda":
+            torch.cuda.synchronize()
+        infer_time = time.perf_counter() - t0
+        if self.rank == 0:
+            print(f"[Inference] Done in {infer_time:.2f}s")
+        return predictions, imgs, infer_time
+# ============================================================
+# Head disabling helper
+# ============================================================
+def _disable_heads(model, head_names):
+    """Disable and free specified heads. head_names: list of 'camera','depth','normal','points','gs'."""
+    mapping = {
+        "camera": ("enable_cam",   ["cam_head"]),
+        "depth":  ("enable_depth", ["depth_head"]),
+        "normal": ("enable_norm",  ["norm_head"]),
+        "points": ("enable_pts",   ["pts_head"]),
+        "gs":     ("enable_gs",    ["gs_head", "gs_renderer"]),
+    }
+    freed = 0
+    for name in head_names:
+        if name not in mapping:
+            continue
+        attr, modules = mapping[name]
+        setattr(model, attr, False)
+        for mod_name in modules:
+            if hasattr(model, mod_name):
+                mod = getattr(model, mod_name)
+                freed += sum(p.numel() for p in mod.parameters())
+                mod.cpu()
+                delattr(model, mod_name)
+                del mod
+    if freed:
+        gc.collect()
+        torch.cuda.empty_cache()
+        print(f"[Init] Disabled heads: {head_names}, freed ~{freed/1e6:.1f}M params")
+# ============================================================
+# CLI entry point
+# ============================================================
+def _broadcast_string(s, rank, src=0):
+    if rank == src:
+        data = s.encode("utf-8")
+        length = torch.tensor([len(data)], dtype=torch.long, device="cuda")
+    else:
+        length = torch.tensor([0], dtype=torch.long, device="cuda")
+    dist.broadcast(length, src=src)
+    n = length.item()
+    tensor = torch.tensor(list(data), dtype=torch.uint8, device="cuda") if rank == src else torch.empty(n, dtype=torch.uint8, device="cuda")
+    dist.broadcast(tensor, src=src)
+    return tensor.cpu().numpy().tobytes().decode("utf-8")
+def main():
+    parser = argparse.ArgumentParser(description="HunyuanWorld-Mirror Pipeline")
+    parser.add_argument("--input_path", type=str, required=True)
+    parser.add_argument("--output_path", type=str, default="inference_output")
+    parser.add_argument("--strict_output_path", type=str, default=None,
+                        help="If set, save results directly to this path (no subdir/timestamp)")
+    parser.add_argument("--pretrained_model_name_or_path", type=str, default="tencent/HY-World-2.0",
+                        help="HuggingFace repo ID or local path")
+    parser.add_argument("--subfolder", type=str, default="HY-WorldMirror-2.0",
+                        help="Subfolder inside the repo containing WorldMirror weights")
+    parser.add_argument("--config_path", type=str, default=None)
+    parser.add_argument("--ckpt_path", type=str, default=None)
+    parser.add_argument("--use_fsdp", action="store_true", default=False)
+    parser.add_argument("--enable_bf16", action="store_true", default=False)
+    parser.add_argument("--fsdp_cpu_offload", action="store_true", default=False)
+    parser.add_argument("--target_size", type=int, default=952)
+    parser.add_argument("--fps", type=int, default=1)
+    parser.add_argument("--video_strategy", type=str, default="new", choices=["old", "new"])
+    parser.add_argument("--video_min_frames", type=int, default=1)
+    parser.add_argument("--video_max_frames", type=int, default=32)
+    parser.add_argument("--no_save_depth", action="store_true")
+    parser.add_argument("--no_save_normal", action="store_true")
+    parser.add_argument("--no_save_gs", action="store_true")
+    parser.add_argument("--no_save_camera", action="store_true")
+    parser.add_argument("--no_save_points", action="store_true")
+    parser.add_argument("--save_colmap", action="store_true", default=False)
+    parser.add_argument("--save_conf", action="store_true", default=False)
+    parser.add_argument("--save_sky_mask", action="store_true", default=False)
+    parser.add_argument("--apply_sky_mask", action="store_true", default=True)
+    parser.add_argument("--no_sky_mask", dest="apply_sky_mask", action="store_false")
+    parser.add_argument("--apply_edge_mask", action="store_true", default=True)
+    parser.add_argument("--no_edge_mask", dest="apply_edge_mask", action="store_false")
+    parser.add_argument("--apply_confidence_mask", action="store_true", default=False)
+    parser.add_argument("--sky_mask_source", type=str, default="auto", choices=["auto", "model", "onnx"])
+    parser.add_argument("--model_sky_threshold", type=float, default=0.45)
+    parser.add_argument("--confidence_percentile", type=float, default=10.0)
+    parser.add_argument("--edge_normal_threshold", type=float, default=1.0)
+    parser.add_argument("--edge_depth_threshold", type=float, default=0.03)
+    parser.add_argument("--compress_pts", action="store_true", default=True)
+    parser.add_argument("--no_compress_pts", dest="compress_pts", action="store_false")
+    parser.add_argument("--compress_pts_max_points", type=int, default=2_000_000)
+    parser.add_argument("--compress_pts_voxel_size", type=float, default=0.002)
+    parser.add_argument("--max_resolution", type=int, default=1920)
+    parser.add_argument("--compress_gs_max_points", type=int, default=5_000_000)
+    parser.add_argument("--prior_cam_path", type=str, default=None)
+    parser.add_argument("--prior_depth_path", type=str, default=None)
+    parser.add_argument("--disable_heads", type=str, nargs="*", default=None,
+                        help="Heads to disable: camera depth normal points gs")
+    parser.add_argument("--save_rendered", action="store_true", default=False,
+                        help="Render interpolated video from Gaussian splats")
+    parser.add_argument("--render_interp_per_pair", type=int, default=15,
+                        help="Interpolated frames per camera pair for video rendering")
+    parser.add_argument("--render_depth", action="store_true", default=False,
+                        help="Also render depth video")
+    parser.add_argument("--log_time", action="store_true", default=True)
+    parser.add_argument("--no_log_time", dest="log_time", action="store_false")
+    parser.add_argument("--no_interactive", action="store_true")
+    args = parser.parse_args()
+    pipeline = WorldMirrorPipeline.from_pretrained(
+        pretrained_model_name_or_path=args.pretrained_model_name_or_path,
+        subfolder=args.subfolder,
+        config_path=args.config_path, ckpt_path=args.ckpt_path,
+        use_fsdp=args.use_fsdp, enable_bf16=args.enable_bf16,
+        fsdp_cpu_offload=args.fsdp_cpu_offload,
+        disable_heads=args.disable_heads,
+    )
+    call_kwargs = dict(
+        output_path=args.output_path,
+        target_size=args.target_size, fps=args.fps,
+        video_strategy=args.video_strategy,
+        video_min_frames=args.video_min_frames,
+        video_max_frames=args.video_max_frames,
+        save_depth=not args.no_save_depth,
+        save_normal=not args.no_save_normal,
+        save_gs=not args.no_save_gs,
+        save_camera=not args.no_save_camera,
+        save_points=not args.no_save_points,
+        save_colmap=args.save_colmap,
+        save_conf=args.save_conf,
+        save_sky_mask=args.save_sky_mask,
+        apply_sky_mask=args.apply_sky_mask,
+        apply_edge_mask=args.apply_edge_mask,
+        apply_confidence_mask=args.apply_confidence_mask,
+        sky_mask_source=args.sky_mask_source,
+        model_sky_threshold=args.model_sky_threshold,
+        confidence_percentile=args.confidence_percentile,
+        edge_normal_threshold=args.edge_normal_threshold,
+        edge_depth_threshold=args.edge_depth_threshold,
+        compress_pts=args.compress_pts,
+        compress_pts_max_points=args.compress_pts_max_points,
+        compress_pts_voxel_size=args.compress_pts_voxel_size,
+        max_resolution=args.max_resolution,
+        compress_gs_max_points=args.compress_gs_max_points,
+        prior_cam_path=args.prior_cam_path,
+        prior_depth_path=args.prior_depth_path,
+        save_rendered=args.save_rendered,
+        render_interp_per_pair=args.render_interp_per_pair,
+        render_depth=args.render_depth,
+        log_time=args.log_time,
+        strict_output_path=args.strict_output_path,
+    )
+    try:
+        pipeline(args.input_path, **call_kwargs)
+        if args.no_interactive:
+            return
+        rank = pipeline.rank
+        is_distributed = pipeline.sp_size > 1
+        if rank == 0:
+            print("\n[Interactive] Enter new input paths. Type 'quit' to stop.\n")
+        _INF_TIMEOUT = timedelta(days=365)
+        _DEF_TIMEOUT = timedelta(minutes=10)
+        while True:
+            if is_distributed:
+                dist.distributed_c10d._get_default_group()._get_backend(
+                    torch.device("cuda")).options._timeout = _INF_TIMEOUT
+            new_input = ""
+            if rank == 0:
+                try:
+                    new_input = input(">>> ").strip()
+                except (EOFError, KeyboardInterrupt):
+                    new_input = "quit"
+            if is_distributed:
+                new_input = _broadcast_string(new_input, rank, src=0)
+                dist.distributed_c10d._get_default_group()._get_backend(
+                    torch.device("cuda")).options._timeout = _DEF_TIMEOUT
+            if not new_input or new_input.lower() in ("quit", "exit", "q"):
+                break
+            if rank == 0 and not (Path(new_input).is_dir() or Path(new_input).is_file()):
+                print(f"  Invalid path: {new_input}")
+                continue
+            pipeline.model.to(pipeline.device)
+            pipeline.model.eval()
+            pipeline(new_input, **call_kwargs)
+    finally:
+        if dist.is_initialized():
+            dist.destroy_process_group()
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()