#!/usr/bin/env python3 """ scene_reconstruct.py — Phase A der Szenen-Kalibrierung ======================================================== Ankerlose, metrische Rekonstruktion ALLER Kamera- und Marker-Posen aus den ArUco-Eckbeobachtungen mehrerer Kameras — OHNE bekannten Welt-Anker, allein aus der bekannten Marker-Kantenlänge. Vollständig generisch: kein robot.json nötig, keine festen Marker-IDs/Namen. Warum nicht einfach Einzel-Marker-PnP? -------------------------------------- Ein planares Quadrat hat bei PnP eine 2-fache Flip-Ambiguität. Verkettet man solche Einzelposen naiv, sind ~1/3 der Knoten gespiegelt -> unbrauchbar. Stattdessen: 1) je (Kamera, Marker): PnP (IPPE_SQUARE) als TENTATIVE Startpose. 2) Referenzkamera c0 (meiste Beobachtungen) definiert den Szenen-Frame S (E_{c0}=I). 3) Iteration (flip-robust): a) Kameras per cv2.solvePnPRansac gegen die bereits platzierten 3D-Ecken neu bestimmen -> geflippte Marker fallen als RANSAC-Ausreißer raus. b) Marker-Ecken über alle platzierten Kameras TRIANGULIEREN (DLT, flip-frei) und die Marker-Pose per Kabsch aus den triangulierten Ecken neu setzen. 4) globale Bündelausgleichung (least_squares, Huber, dünnbesetzte Jacobi) über die Reprojektion ALLER Ecken. Gauge: c0 = Identität; Maßstab via Markergröße. Geometrie-Konventionen ---------------------- - E_c = world(S) -> camera c (X_cam = E_c X_S) - G_m = marker m local -> world(S) (X_S = G_m X_local) - M_{c<-m} = E_c @ G_m (local -> camera) - Ecken-Reihenfolge wie ArUco/Pipeline: TL, TR, BR, BL. Ein-/Ausgabe ------------ --evalDir : Ordner mit render_*_aruco_detection.json (Eckpunkte + Intrinsik) --out : scene_reconstruction.json (Default: /scene_reconstruction.json) Das Ergebnis (Posen in S) ist Eingang für Phase B (robot_register.py). """ from __future__ import annotations import argparse import glob import json import os import re import sys import time from typing import Dict, List, Optional, Tuple import numpy as np import cv2 try: from scipy.optimize import least_squares from scipy.sparse import lil_matrix HAVE_SCIPY = True except ImportError: # pragma: no cover HAVE_SCIPY = False # ────────────────────────────────────────────────────────────── # SE(3) / Geometrie-Helfer # ────────────────────────────────────────────────────────────── def rt_to_T(rvec, tvec) -> np.ndarray: R, _ = cv2.Rodrigues(np.asarray(rvec, dtype=float).reshape(3, 1)) T = np.eye(4) T[:3, :3] = R T[:3, 3] = np.asarray(tvec, dtype=float).reshape(3) return T def T_to_rt(T: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: rvec, _ = cv2.Rodrigues(np.ascontiguousarray(T[:3, :3])) return rvec.reshape(3), T[:3, 3].copy() def inv_T(T: np.ndarray) -> np.ndarray: R, t = T[:3, :3], T[:3, 3] Ti = np.eye(4) Ti[:3, :3] = R.T Ti[:3, 3] = -R.T @ t return Ti def local_corners(size_m: float) -> np.ndarray: """Marker-Ecken im lokalen Frame (TL, TR, BR, BL), z=0.""" h = size_m / 2.0 return np.array([[-h, h, 0.0], [h, h, 0.0], [h, -h, 0.0], [-h, -h, 0.0]], dtype=float) def kabsch(P: np.ndarray, Q: np.ndarray) -> np.ndarray: """R,t mit Q ≈ R P + t (ohne Skalierung); P,Q: Nx3. Liefert 4x4.""" pc, qc = P.mean(0), Q.mean(0) H = (P - pc).T @ (Q - qc) U, _, Vt = np.linalg.svd(H) d = np.sign(np.linalg.det(Vt.T @ U.T)) R = Vt.T @ np.diag([1.0, 1.0, d]) @ U.T T = np.eye(4) T[:3, :3] = R T[:3, 3] = qc - R @ pc return T def triangulate_point(obs: List[Tuple]) -> Optional[np.ndarray]: """Mehrbild-DLT eines 3D-Punkts. obs: [(K,D,R,t,uv), ...] mit X_cam=R X+t.""" A = [] for K, D, R, t, uv in obs: und = cv2.undistortPoints(np.array([[uv]], dtype=np.float64), K, D).reshape(2) x, y = float(und[0]), float(und[1]) P = np.hstack([R, t.reshape(3, 1)]) A.append(x * P[2] - P[0]) A.append(y * P[2] - P[1]) _, _, Vt = np.linalg.svd(np.asarray(A, dtype=float)) X = Vt[-1] return X[:3] / X[3] if abs(X[3]) > 1e-12 else None # ────────────────────────────────────────────────────────────── # Laden der Detektionen # ────────────────────────────────────────────────────────────── class Cam: __slots__ = ("id", "K", "D", "obs") def __init__(self, cid, K, D): self.id = cid self.K = K self.D = D self.obs: Dict[int, np.ndarray] = {} # marker_id -> (4,2) px def load_detections(eval_dir: str, cameras: Optional[List[str]] = None, default_size: Optional[float] = None ) -> Tuple[Dict[str, Cam], Dict[int, float]]: cams: Dict[str, Cam] = {} sizes: Dict[int, float] = {} for det_path in sorted(glob.glob(os.path.join(eval_dir, "*_aruco_detection.json"))): m = re.match(r"render_([A-Za-z0-9]+)_aruco_detection\.json", os.path.basename(det_path)) if not m: continue cid = m.group(1) if cameras and cid not in cameras: continue det = json.load(open(det_path, "r", encoding="utf-8")) K = np.array(det["camera"]["camera_matrix"], dtype=float).reshape(3, 3) D = np.array(det["camera"]["distortion_coefficients"], dtype=float).reshape(-1, 1) cam = Cam(cid, K, D) det_size = (det.get("vision_config", {}) or {}).get("MarkerSize", None) for d in det.get("detections", []): if d.get("type", "aruco") != "aruco": continue pts = d.get("image_points_px") if pts is None: continue mid = int(d["marker_id"]) cam.obs[mid] = np.array(pts, dtype=float).reshape(4, 2) s = d.get("marker_size_m", det_size if det_size is not None else default_size) if s is not None: sizes[mid] = float(s) if cam.obs: cams[cid] = cam return cams, sizes # ────────────────────────────────────────────────────────────── # Schritt 1: Per-Marker-PnP (tentative Startposen) # ────────────────────────────────────────────────────────────── def pnp_all(cams: Dict[str, Cam], sizes: Dict[int, float], default_size: float ) -> Dict[Tuple[str, int], Dict]: out: Dict[Tuple[str, int], Dict] = {} for cid, cam in cams.items(): for mid, corners_px in cam.obs.items(): size = sizes.get(mid, default_size) ok, rvec, tvec = cv2.solvePnP(local_corners(size), corners_px, cam.K, cam.D, flags=cv2.SOLVEPNP_IPPE_SQUARE) if not ok: ok, rvec, tvec = cv2.solvePnP(local_corners(size), corners_px, cam.K, cam.D, flags=cv2.SOLVEPNP_ITERATIVE) if ok: out[(cid, mid)] = {"M": rt_to_T(rvec, tvec)} return out # ────────────────────────────────────────────────────────────── # Schritt 2+3: flip-robuste Initialisierung # ────────────────────────────────────────────────────────────── def initialize_poses(cams, sizes, default_size, pnp, n_iter=6, min_pts_pnp=8, ransac_px=3.0) -> Tuple[Dict[str, np.ndarray], Dict[int, np.ndarray], str]: cam_ids = sorted(cams) all_mk = sorted({m for _, m in pnp}) # Referenzkamera = meiste Beobachtungen ref = max(cam_ids, key=lambda c: len(cams[c].obs)) E: Dict[str, np.ndarray] = {ref: np.eye(4)} G: Dict[int, np.ndarray] = {} for m in cams[ref].obs: # tentative Startmarker aus c0 if (ref, m) in pnp: G[m] = pnp[(ref, m)]["M"].copy() def corners3D(m): return (G[m][:3, :3] @ local_corners(sizes.get(m, default_size)).T).T + G[m][:3, 3] for _ in range(n_iter): # (a) Kameras gegen platzierte 3D-Ecken (RANSAC -> Flip-Ausreißer raus) for c in cam_ids: if c == ref: continue obj, img = [], [] for m in cams[c].obs: if m in G: obj.append(corners3D(m)) img.append(cams[c].obs[m]) if sum(len(o) for o in obj) < min_pts_pnp: continue O = np.vstack(obj).astype(np.float64) I = np.vstack(img).astype(np.float64) ok, rvec, tvec, inl = cv2.solvePnPRansac( O, I, cams[c].K, cams[c].D, reprojectionError=ransac_px, iterationsCount=200, flags=cv2.SOLVEPNP_ITERATIVE) if ok and inl is not None and len(inl) >= 6: E[c] = rt_to_T(rvec, tvec) # (b) Marker triangulieren (flip-frei) bzw. einfügen for m in all_mk: placed = [c for c in cam_ids if c in E and (c, m) in pnp] if len(placed) >= 2: tri, ok = [], True for ci in range(4): o = [(cams[c].K, cams[c].D, E[c][:3, :3], E[c][:3, 3], cams[c].obs[m][ci]) for c in placed] X = triangulate_point(o) if X is None: ok = False break tri.append(X) if ok: G[m] = kabsch(local_corners(sizes.get(m, default_size)), np.array(tri)) elif placed and m not in G: c = placed[0] G[m] = inv_T(E[c]) @ pnp[(c, m)]["M"] return E, G, ref # ────────────────────────────────────────────────────────────── # Schritt 4: Globale Bündelausgleichung (Referenzkamera fix) # ────────────────────────────────────────────────────────────── def reproj_stats(obs_pairs, cams, sizes, default_size, E, G) -> Tuple[float, float]: """(RMS, Median) der Per-Beobachtungs-Reprojektionsfehler (px).""" per = [] for (c, m) in obs_pairs: rvec, tvec = T_to_rt(E[c] @ G[m]) proj, _ = cv2.projectPoints(local_corners(sizes.get(m, default_size)), rvec, tvec, cams[c].K, cams[c].D) d = proj.reshape(4, 2) - cams[c].obs[m] per.append(float(np.sqrt(np.mean(d * d)))) if not per: return 0.0, 0.0 a = np.asarray(per) return float(np.sqrt(np.mean(a * a))), float(np.median(a)) def bundle_adjust(pnp, cams, sizes, default_size, E, G, ref, huber_px=2.0, max_nfev=120, verbose=0): if not HAVE_SCIPY: print("[WARN] scipy fehlt -> BA uebersprungen") return E, G cam_opt = [c for c in sorted(E) if c != ref] mk_ids = sorted(G) ci = {c: i for i, c in enumerate(cam_opt)} mi = {m: j for j, m in enumerate(mk_ids)} ncam, nmk = len(cam_opt), len(mk_ids) obs = [(c, m) for (c, m) in pnp if c in E and m in G] def pack(): x = np.zeros(6 * ncam + 6 * nmk) for c in cam_opt: r, t = T_to_rt(E[c]); x[6*ci[c]:6*ci[c]+3] = r; x[6*ci[c]+3:6*ci[c]+6] = t for m in mk_ids: r, t = T_to_rt(G[m]); o = 6*ncam + 6*mi[m]; x[o:o+3] = r; x[o+3:o+6] = t return x def unpack(x): Ee = {ref: np.eye(4)} for c in cam_opt: Ee[c] = rt_to_T(x[6*ci[c]:6*ci[c]+3], x[6*ci[c]+3:6*ci[c]+6]) Gg = {} for m in mk_ids: o = 6*ncam + 6*mi[m] Gg[m] = rt_to_T(x[o:o+3], x[o+3:o+6]) return Ee, Gg def residuals(x): Ee, Gg = unpack(x) res = np.empty(8 * len(obs)) for k, (c, m) in enumerate(obs): rvec, tvec = T_to_rt(Ee[c] @ Gg[m]) proj, _ = cv2.projectPoints(local_corners(sizes.get(m, default_size)), rvec, tvec, cams[c].K, cams[c].D) res[8*k:8*k+8] = (proj.reshape(4, 2) - cams[c].obs[m]).ravel() return res Sp = lil_matrix((8 * len(obs), 6 * ncam + 6 * nmk), dtype=int) for k, (c, m) in enumerate(obs): rows = slice(8*k, 8*k+8) if c in ci: Sp[rows, 6*ci[c]:6*ci[c]+6] = 1 o = 6*ncam + 6*mi[m] Sp[rows, o:o+6] = 1 sol = least_squares(residuals, pack(), jac_sparsity=Sp, method="trf", loss="huber", f_scale=huber_px, max_nfev=max_nfev, verbose=verbose) return unpack(sol.x) # ────────────────────────────────────────────────────────────── # Hauptlauf # ────────────────────────────────────────────────────────────── def reconstruct(eval_dir, cameras=None, default_size=0.025, huber_px=2.0, max_nfev=120, verbose=0) -> Dict: cams, sizes = load_detections(eval_dir, cameras, default_size) if len(cams) < 2: raise RuntimeError(f"brauche >=2 Kameras, gefunden: {sorted(cams)}") pnp = pnp_all(cams, sizes, default_size) E0, G0, ref = initialize_poses(cams, sizes, default_size, pnp) # Marker je #platzierter Kameras; nur >=2 sind triangulierbar/zuverlässig. ncams_of = {} for (c, m) in pnp: if c in E0 and m in G0: ncams_of[m] = ncams_of.get(m, 0) + 1 weak = sorted(m for m in G0 if ncams_of.get(m, 0) < 2) # Einzelbild -> unbeobachtbar G0 = {m: T for m, T in G0.items() if ncams_of.get(m, 0) >= 2} pairs0 = [(c, m) for (c, m) in pnp if c in E0 and m in G0] rms0, med0 = reproj_stats(pairs0, cams, sizes, default_size, E0, G0) E, G = bundle_adjust(pnp, cams, sizes, default_size, E0, G0, ref, huber_px, max_nfev, verbose) pairs = [(c, m) for (c, m) in pnp if c in E and m in G] rms1, med1 = reproj_stats(pairs, cams, sizes, default_size, E, G) dropped = [c for c in cams if c not in E] cameras_out = [] for c in sorted(E): Ec = E[c] rvec, _ = T_to_rt(Ec) cameras_out.append({ "camera_id": c, "world_to_camera": {"rotation_matrix": Ec[:3, :3].tolist(), "translation_m": Ec[:3, 3].tolist(), "rvec_rad": rvec.tolist()}, "center_m": (-Ec[:3, :3].T @ Ec[:3, 3]).tolist(), "num_markers": len(cams[c].obs), "is_reference": bool(c == ref), }) markers_out = [] for m in sorted(G): Gm = G[m] size = sizes.get(m, default_size) corners_S = (Gm[:3, :3] @ local_corners(size).T).T + Gm[:3, 3] rvec, _ = T_to_rt(Gm) markers_out.append({ "marker_id": int(m), "pose_in_S": {"rotation_matrix": Gm[:3, :3].tolist(), "translation_m": Gm[:3, 3].tolist(), "rvec_rad": rvec.tolist()}, "center_m": Gm[:3, 3].tolist(), "normal": (Gm[:3, :3] @ np.array([0.0, 0.0, 1.0])).tolist(), "corners_m": corners_S.tolist(), "num_cameras": int(ncams_of.get(m, 0)), }) return { "schema_version": "1.0", "stage": "scene_reconstruct_A", "created_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), "frame": "arbitrary S (reference camera pose = identity)", "reference_camera": ref, "summary": {"num_cameras": len(E), "num_markers": len(G), "num_observations": len(pairs), "reproj_rms_px_init": rms0, "reproj_median_px_init": med0, "reproj_rms_px_final": rms1, "reproj_median_px_final": med1, "dropped_cameras": dropped, "insufficient_view_markers": weak}, "cameras": cameras_out, "markers": markers_out, } def main() -> None: ap = argparse.ArgumentParser(description="Phase A: ankerlose metrische Szenen-Rekonstruktion") ap.add_argument("--evalDir", required=True, help="Ordner mit render_*_aruco_detection.json") ap.add_argument("--cameras", default=None, help="Kommagetrennte Kamera-IDs (Default: alle)") ap.add_argument("--markerSize", type=float, default=0.025, help="Fallback-Kantenlänge (m)") ap.add_argument("--huberPx", type=float, default=2.0) ap.add_argument("--maxNfev", type=int, default=120) ap.add_argument("--out", default=None) ap.add_argument("-v", "--verbose", action="count", default=0) args = ap.parse_args() cams = args.cameras.split(",") if args.cameras else None res = reconstruct(args.evalDir, cams, args.markerSize, args.huberPx, args.maxNfev, args.verbose) out = args.out or os.path.join(args.evalDir, "scene_reconstruction.json") json.dump(res, open(out, "w", encoding="utf-8"), indent=2) s = res["summary"] print(f"[INFO] Kameras={s['num_cameras']} Marker(>=2 Views)={s['num_markers']} " f"Obs={s['num_observations']} | RMS {s['reproj_rms_px_init']:.3f}->{s['reproj_rms_px_final']:.3f}px " f"(median {s['reproj_median_px_final']:.3f}px) | ref-Kamera={res['reference_camera']}") if s["dropped_cameras"]: print(f"[WARN] nicht verbundene Kameras verworfen: {s['dropped_cameras']}") if s["insufficient_view_markers"]: print(f"[INFO] {len(s['insufficient_view_markers'])} Marker mit <2 Views (unbeobachtbar, " f"nicht rekonstruiert): {s['insufficient_view_markers']}") print(f"[INFO] -> {out}") if __name__ == "__main__": try: main() except Exception as exc: print(f"[ERROR] {exc}", file=sys.stderr) sys.exit(1)