appRobotRender/pipeline/4_v8_4d_direct_hand_pose.py

#!/usr/bin/env python3
"""
4d_direct_hand_pose.py
----------------------
Estimate Hand/Palm/Finger joint angles (b, c, e) directly from ArUco
marker observations — WITHOUT relying on step-3 triangulated positions.

Why this is needed
------------------
Step 3 triangulates marker centres from multiple cameras.  For markers
far above the board (arm extended in Z), the depth estimation from
triangulation degrades: all reference Board markers lie in a flat plane,
so the cameras are poorly conditioned for Z estimation of out-of-plane points.

This script takes a completely different route:

  1. For each camera that sees a finger or hand marker, run  cv2.solvePnP
     on the four detected corners.  solvePnP uses the apparent size /
     aspect ratio of the marker to recover depth  — it is NOT affected
     by the flat-board reference plane problem.

  2. Use the known camera-in-world pose (from step 2) to convert the
     per-camera marker pose to world frame.

  3. Fuse all world-frame marker poses with quality weighting.

  4. Given world-frame positions for all visible finger markers, and the
     known local positions in the FingerA/FingerB frames, solve a small
     nonlinear LS problem for (b, c, e) using scipy.

What you need
-------------
  - robot.json
  - Step-2 camera-pose JSONs for all cameras
  - Step-1 aruco-detection JSONs for all cameras
  - Already-estimated x, y, z, a from 4a/4b (pass via --state-json or CLI)

Usage
-----
  python 4d_direct_hand_pose.py \\
      --robot  robot.json \\
      --det    cam1_aruco_detection.json cam2_aruco_detection.json ... \\
      --pose   cam1_camera_pose.json    cam2_camera_pose.json    ... \\
      --state  4b_arm2_result.json \\
      --output 4d_result.json

Output
------
  {
    "b_deg": ...,   "c_deg": ...,   "e_mm": ...,
    "rms_mm": ...,
    "accumulated_state": {"x":..., "y":..., "z":..., "a":..., "b":..., "c":..., "e":...}
  }
"""

from __future__ import annotations

import argparse
import json
import math
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple

import cv2
import numpy as np

try:
    from scipy.optimize import least_squares
    _HAS_SCIPY = True
except ImportError:
    _HAS_SCIPY = False

from robot_fk import RobotFK, STATE_KEYS


# ──────────────────────────────────────────────────────────────
# I/O
# ──────────────────────────────────────────────────────────────

def _load(path: Path) -> dict:
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)


def _load_state(path: Path) -> Dict[str, float]:
    raw = _load(path)
    state = raw.get("accumulated_state", raw.get("state", {}))
    return {k: float(v) for k, v in state.items() if k in STATE_KEYS}


# ──────────────────────────────────────────────────────────────
# ArUco / PnP helpers
# ──────────────────────────────────────────────────────────────

def _marker_local_corners(size_m: float) -> np.ndarray:
    """Four local corners of a marker (same order as OpenCV ArUco)."""
    h = size_m / 2.0
    return np.array([[-h, h, 0], [h, h, 0], [h, -h, 0], [-h, -h, 0]], dtype=np.float32)


def _camera_pose_from_json(pose_json: dict) -> Tuple[np.ndarray, np.ndarray]:
    """
    Load world-to-camera rotation and translation from step-2 JSON.
    Returns rvec (3,), tvec (3,) — the same convention used in step 2.
    """
    wtc = pose_json["camera_pose"]["world_to_camera"]
    R   = np.array(wtc["rotation_matrix"], dtype=np.float32).reshape(3, 3)
    t   = np.array(wtc["translation_m"],   dtype=np.float32).reshape(3)
    rvec, _ = cv2.Rodrigues(R)
    return rvec.reshape(3), t


def _intrinsics_from_detection(det: dict) -> Tuple[np.ndarray, np.ndarray]:
    cam = det["camera"]
    K   = np.array(cam["camera_matrix"],           dtype=np.float32).reshape(3, 3)
    D   = np.array(cam["distortion_coefficients"],  dtype=np.float32).reshape(-1, 1)
    return K, D


def _solve_marker_pose_in_camera(
    corners_px: np.ndarray,
    marker_size_m: float,
    K: np.ndarray,
    D: np.ndarray
) -> Optional[Tuple[np.ndarray, np.ndarray]]:
    """
    Estimate marker-in-camera pose via solvePnP (IPPE_SQUARE).
    Returns (rvec_cm, tvec_cm)  or  None on failure.
    """
    obj = _marker_local_corners(marker_size_m)
    img = corners_px.astype(np.float32)
    ok, rvec, tvec = cv2.solvePnP(obj, img, K, D, flags=cv2.SOLVEPNP_IPPE_SQUARE)
    if not ok:
        ok, rvec, tvec = cv2.solvePnP(obj, img, K, D, flags=cv2.SOLVEPNP_ITERATIVE)
    if not ok:
        return None
    return rvec.reshape(3), tvec.reshape(3)


def _marker_world_pose(
    rvec_cm: np.ndarray,
    tvec_cm: np.ndarray,
    rvec_wc: np.ndarray,
    tvec_wc: np.ndarray
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Convert marker-in-camera pose to marker-in-world pose.

    Convention (same as step 2):
        X_cam = R_wc @ X_world + t_wc
    So:
        R_wc = rvec→R(rvec_wc),  camera origin in world = -R_wc^T @ t_wc

    Marker pose in world:
        R_mw = R_wc^T @ R_cm   (world_to_marker rotation)
        t_mw = R_wc^T @ (tvec_cm - tvec_wc)
    → marker centre in world = -R_mw^T @ t_mw = R_mw^T @ (-t_mw)

    We return (pos_world_m, R_marker_in_world) where
        R_marker_in_world maps local marker axes to world axes.
    """
    R_wc, _ = cv2.Rodrigues(rvec_wc.reshape(3, 1))
    R_cm, _ = cv2.Rodrigues(rvec_cm.reshape(3, 1))

    R_cw = R_wc.T  # camera-to-world rotation

    # Marker centre in world frame
    pos_w = R_cw @ tvec_cm.reshape(3) - R_cw @ tvec_wc.reshape(3)

    # Marker orientation in world frame (columns = marker x,y,z axes in world)
    R_mw = R_cw @ R_cm

    return pos_w.astype(np.float64), R_mw.astype(np.float64)


# ──────────────────────────────────────────────────────────────
# Multi-camera marker pose fusion
# ──────────────────────────────────────────────────────────────

def _observation_weight(corners_px: np.ndarray, img_w: int, img_h: int) -> float:
    """
    Simple quality weight: larger markers, more central = higher weight.
    """
    area  = float(cv2.contourArea(corners_px.astype(np.float32)))
    cx, cy = corners_px.mean(axis=0)
    dx    = abs(cx - img_w / 2.0) / (img_w / 2.0)
    dy    = abs(cy - img_h / 2.0) / (img_h / 2.0)
    centrality = 1.0 - 0.5 * (dx + dy)
    return max(0.05, (area / 500.0) * centrality)


def collect_marker_world_poses(
    detection_files: List[Path],
    pose_files:      List[Path],
    target_marker_ids: set,
    marker_size_m: float,
    verbose: bool = True,
) -> Dict[int, List[Dict[str, Any]]]:
    """
    For each target marker, collect world-frame pose estimates from all cameras.

    Returns: marker_id → list of {pos_m, R, weight, camera_id}
    """
    result: Dict[int, List[Dict[str, Any]]] = {}

    for det_path, pose_path in zip(detection_files, pose_files):
        det       = _load(det_path)
        pose_json = _load(pose_path)

        K, D    = _intrinsics_from_detection(det)
        rvec_wc, tvec_wc = _camera_pose_from_json(pose_json)

        cam_id  = det.get("camera", {}).get("camera_id", str(det_path))
        img_w   = int(det.get("image", {}).get("width_px",  1280))
        img_h   = int(det.get("image", {}).get("height_px",  720))

        for d in det.get("detections", []):
            mid = int(d.get("marker_id", -1))
            if mid not in target_marker_ids:
                continue
            corners = np.array(d.get("image_points_px", []), dtype=np.float32).reshape(4, 2)
            if corners.shape != (4, 2):
                continue

            pnp = _solve_marker_pose_in_camera(corners, marker_size_m, K, D)
            if pnp is None:
                continue
            rvec_cm, tvec_cm = pnp

            pos_w, R_w = _marker_world_pose(rvec_cm, tvec_cm, rvec_wc, tvec_wc)
            w = _observation_weight(corners, img_w, img_h)

            result.setdefault(mid, []).append({
                "pos_m":     pos_w,
                "R":         R_w,
                "weight":    w,
                "camera_id": cam_id,
            })

    if verbose:
        for mid, obs in result.items():
            print(f"  M{mid}: {len(obs)} camera(s) — weights "
                  f"{[f'{o[\"weight\"]:.2f}' for o in obs]}")

    return result


def fuse_marker_positions(
    observations: Dict[int, List[Dict[str, Any]]]
) -> Dict[int, np.ndarray]:
    """
    Weighted-mean fusion of world-frame marker centre positions (metres).
    """
    result: Dict[int, np.ndarray] = {}
    for mid, obs_list in observations.items():
        total_w = sum(o["weight"] for o in obs_list)
        if total_w < 1e-12:
            continue
        pos = sum(o["pos_m"] * o["weight"] for o in obs_list) / total_w
        result[mid] = pos.astype(np.float64)
    return result


# ──────────────────────────────────────────────────────────────
# FK-based optimizer for b, c, e
# ──────────────────────────────────────────────────────────────

def _fk_finger_positions(
    fk: RobotFK,
    base_state: Dict[str, float],
    b: float,
    c: float,
    e: float
) -> Dict[int, np.ndarray]:
    """
    Compute world positions (m) of all FingerA and FingerB markers.
    base_state already contains x, y, z, a.
    """
    state = dict(base_state)
    state.update({"b": b, "c": c, "e": e})
    transforms = fk.compute(state)
    result = {}
    for link_name in ("FingerA", "FingerB", "Hand", "Palm"):
        if link_name not in fk.links:
            continue
        for m in fk.links[link_name].get("markers", []):
            mid = int(m.get("id", -1))
            if mid < 0 or "position" not in m:
                continue
            # position in METRES
            local_mm = np.array(m["position"], dtype=float)
            world_m  = fk.marker_world(transforms, link_name, local_mm) / 1000.0
            result[mid] = world_m
    return result


def optimize_bce(
    fk: RobotFK,
    base_state: Dict[str, float],
    observed_m: Dict[int, np.ndarray],
    b_init: float = 0.0,
    c_init: float = 0.0,
    e_init: float = 0.0,
    verbose: bool = True,
) -> dict:
    """
    Find (b, c, e) minimising reprojection of FK finger-marker positions
    to observed world positions.

    Parameters
    ----------
    fk          : RobotFK
    base_state  : already-estimated {x, y, z, a} (all in mm/deg)
    observed_m  : {marker_id: world_position_m}
    """

    if not _HAS_SCIPY:
        return {"status": "failed", "reason": "scipy not available (pip install scipy)"}

    def residuals(p):
        b, c, e = float(p[0]), float(p[1]), float(p[2])
        fk_pos  = _fk_finger_positions(fk, base_state, b, c, e)
        r = []
        for mid, obs_p in observed_m.items():
            if mid in fk_pos:
                diff = fk_pos[mid] - obs_p
                r.extend(diff.tolist())
        return r

    # Initial residual
    r0 = residuals([b_init, c_init, e_init])
    if len(r0) < 3:
        return {
            "status": "failed",
            "reason": f"Only {len(r0)//3} finger markers matched. Need ≥1.",
        }

    res = least_squares(
        residuals,
        x0=[b_init, c_init, e_init],
        method="lm",
        max_nfev=2000,
    )

    b_est, c_est, e_est = float(res.x[0]), float(res.x[1]), float(res.x[2])

    rms_m   = float(np.sqrt(np.mean(np.array(res.fun) ** 2)))
    rms_mm  = rms_m * 1000.0

    accum = dict(base_state)
    accum.update({"b": b_est, "c": c_est, "e": e_est})

    if verbose:
        print(f"\n── 4d: Hand/Palm/Finger (b, c, e) ──────────────────────")
        print(f"  Markers used:  {len(observed_m)} fused positions")
        print(f"  b = {b_est:+.2f} °   c = {c_est:+.2f} °   e = {e_est:+.2f} mm")
        print(f"  RMS position error: {rms_mm:.2f} mm")
        if not res.success:
            print(f"  [WARN] Optimizer: {res.message}")

    return {
        "status":            "ok",
        "b_deg":             b_est,
        "c_deg":             c_est,
        "e_mm":              e_est,
        "rms_mm":            rms_mm,
        "optimizer_success": bool(res.success),
        "num_markers":       len(observed_m),
        "accumulated_state": accum,
    }


# ──────────────────────────────────────────────────────────────
# Main
# ──────────────────────────────────────────────────────────────

def main() -> int:
    p = argparse.ArgumentParser(
        description="4d: Direct hand/finger pose from per-camera solvePnP")
    p.add_argument("--robot",      type=Path, required=True)
    p.add_argument("--det",  "-d", type=Path, nargs="+", required=True,
                   help="*_aruco_detection.json files (one per camera)")
    p.add_argument("--pose", "-p", type=Path, nargs="+", required=True,
                   help="*_camera_pose.json files (same order as --det)")
    p.add_argument("--state",      type=Path, default=None,
                   help="JSON from previous 4b step (accumulated_state)")
    for k in ("x", "y", "z", "a"):
        p.add_argument(f"--{k}",   type=float, default=None,
                       help=f"Known {k} value (overrides --state)")
    p.add_argument("--b-init",     type=float, default=0.0)
    p.add_argument("--c-init",     type=float, default=0.0)
    p.add_argument("--e-init",     type=float, default=0.0)
    p.add_argument("--output",     type=Path,  default=None)
    args = p.parse_args()

    if len(args.det) != len(args.pose):
        print("[ERROR] --det and --pose must have the same number of files")
        return 1

    # Base state from file or CLI
    base_state: Dict[str, float] = {}
    if args.state:
        base_state = _load_state(args.state)
        print(f"  Loaded state: {base_state}")
    for k in ("x", "y", "z", "a"):
        v = getattr(args, k, None)
        if v is not None:
            base_state[k] = float(v)

    # Load FK
    fk = RobotFK.from_file(args.robot)

    # Which markers belong to Hand/Palm/FingerA/FingerB?
    target_ids: set = set()
    for link_name in ("Hand", "Palm", "FingerA", "FingerB"):
        for m in fk.links.get(link_name, {}).get("markers", []):
            mid = int(m.get("id", -1))
            if mid >= 0:
                target_ids.add(mid)

    marker_size_m = float(
        fk.robot.get("vision_config", {}).get("MarkerSize", 0.025)
    )

    print(f"\n  Target marker IDs: {sorted(target_ids)}")
    print(f"  Marker size: {marker_size_m*1000:.1f} mm")
    print(f"  Cameras: {len(args.det)}")

    # Collect world-frame poses from all cameras via solvePnP
    print("\n  Per-camera solvePnP:")
    world_pose_obs = collect_marker_world_poses(
        args.det, args.pose, target_ids, marker_size_m, verbose=True
    )

    if not world_pose_obs:
        print("[ERROR] No target markers found in any detection file.")
        return 1

    # Fuse positions
    observed_m = fuse_marker_positions(world_pose_obs)
    print(f"\n  Fused {len(observed_m)} marker world positions")

    # Optimise b, c, e
    result = optimize_bce(
        fk          = fk,
        base_state  = base_state,
        observed_m  = observed_m,
        b_init      = args.b_init,
        c_init      = args.c_init,
        e_init      = args.e_init,
        verbose     = True,
    )

    if args.output and result["status"] == "ok":
        args.output.parent.mkdir(parents=True, exist_ok=True)
        with args.output.open("w", encoding="utf-8") as f:
            json.dump(result, f, indent=2)
        print(f"\n  Saved → {args.output}")

    return 0 if result["status"] == "ok" else 1


if __name__ == "__main__":
    raise SystemExit(main())