425 lines
18 KiB
Python
425 lines
18 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
scene_reconstruct.py — Phase A der Szenen-Kalibrierung
|
|
========================================================
|
|
Ankerlose, metrische Rekonstruktion ALLER Kamera- und Marker-Posen aus den
|
|
ArUco-Eckbeobachtungen mehrerer Kameras — OHNE bekannten Welt-Anker, allein aus
|
|
der bekannten Marker-Kantenlänge.
|
|
|
|
Vollständig generisch: kein robot.json nötig, keine festen Marker-IDs/Namen.
|
|
|
|
Warum nicht einfach Einzel-Marker-PnP?
|
|
--------------------------------------
|
|
Ein planares Quadrat hat bei PnP eine 2-fache Flip-Ambiguität. Verkettet man solche
|
|
Einzelposen naiv, sind ~1/3 der Knoten gespiegelt -> unbrauchbar. Stattdessen:
|
|
|
|
1) je (Kamera, Marker): PnP (IPPE_SQUARE) als TENTATIVE Startpose.
|
|
2) Referenzkamera c0 (meiste Beobachtungen) definiert den Szenen-Frame S (E_{c0}=I).
|
|
3) Iteration (flip-robust):
|
|
a) Kameras per cv2.solvePnPRansac gegen die bereits platzierten 3D-Ecken
|
|
neu bestimmen -> geflippte Marker fallen als RANSAC-Ausreißer raus.
|
|
b) Marker-Ecken über alle platzierten Kameras TRIANGULIEREN (DLT, flip-frei)
|
|
und die Marker-Pose per Kabsch aus den triangulierten Ecken neu setzen.
|
|
4) globale Bündelausgleichung (least_squares, Huber, dünnbesetzte Jacobi) über
|
|
die Reprojektion ALLER Ecken. Gauge: c0 = Identität; Maßstab via Markergröße.
|
|
|
|
Geometrie-Konventionen
|
|
----------------------
|
|
- E_c = world(S) -> camera c (X_cam = E_c X_S)
|
|
- G_m = marker m local -> world(S) (X_S = G_m X_local)
|
|
- M_{c<-m} = E_c @ G_m (local -> camera)
|
|
- Ecken-Reihenfolge wie ArUco/Pipeline: TL, TR, BR, BL.
|
|
|
|
Ein-/Ausgabe
|
|
------------
|
|
--evalDir : Ordner mit render_*_aruco_detection.json (Eckpunkte + Intrinsik)
|
|
--out : scene_reconstruction.json (Default: <evalDir>/scene_reconstruction.json)
|
|
|
|
Das Ergebnis (Posen in S) ist Eingang für Phase B (robot_register.py).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import glob
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
import time
|
|
from typing import Dict, List, Optional, Tuple
|
|
|
|
import numpy as np
|
|
import cv2
|
|
|
|
try:
|
|
from scipy.optimize import least_squares
|
|
from scipy.sparse import lil_matrix
|
|
HAVE_SCIPY = True
|
|
except ImportError: # pragma: no cover
|
|
HAVE_SCIPY = False
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────
|
|
# SE(3) / Geometrie-Helfer
|
|
# ──────────────────────────────────────────────────────────────
|
|
|
|
def rt_to_T(rvec, tvec) -> np.ndarray:
|
|
R, _ = cv2.Rodrigues(np.asarray(rvec, dtype=float).reshape(3, 1))
|
|
T = np.eye(4)
|
|
T[:3, :3] = R
|
|
T[:3, 3] = np.asarray(tvec, dtype=float).reshape(3)
|
|
return T
|
|
|
|
|
|
def T_to_rt(T: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
|
|
rvec, _ = cv2.Rodrigues(np.ascontiguousarray(T[:3, :3]))
|
|
return rvec.reshape(3), T[:3, 3].copy()
|
|
|
|
|
|
def inv_T(T: np.ndarray) -> np.ndarray:
|
|
R, t = T[:3, :3], T[:3, 3]
|
|
Ti = np.eye(4)
|
|
Ti[:3, :3] = R.T
|
|
Ti[:3, 3] = -R.T @ t
|
|
return Ti
|
|
|
|
|
|
def local_corners(size_m: float) -> np.ndarray:
|
|
"""Marker-Ecken im lokalen Frame (TL, TR, BR, BL), z=0."""
|
|
h = size_m / 2.0
|
|
return np.array([[-h, h, 0.0], [h, h, 0.0], [h, -h, 0.0], [-h, -h, 0.0]], dtype=float)
|
|
|
|
|
|
def kabsch(P: np.ndarray, Q: np.ndarray) -> np.ndarray:
|
|
"""R,t mit Q ≈ R P + t (ohne Skalierung); P,Q: Nx3. Liefert 4x4."""
|
|
pc, qc = P.mean(0), Q.mean(0)
|
|
H = (P - pc).T @ (Q - qc)
|
|
U, _, Vt = np.linalg.svd(H)
|
|
d = np.sign(np.linalg.det(Vt.T @ U.T))
|
|
R = Vt.T @ np.diag([1.0, 1.0, d]) @ U.T
|
|
T = np.eye(4)
|
|
T[:3, :3] = R
|
|
T[:3, 3] = qc - R @ pc
|
|
return T
|
|
|
|
|
|
def triangulate_point(obs: List[Tuple]) -> Optional[np.ndarray]:
|
|
"""Mehrbild-DLT eines 3D-Punkts. obs: [(K,D,R,t,uv), ...] mit X_cam=R X+t."""
|
|
A = []
|
|
for K, D, R, t, uv in obs:
|
|
und = cv2.undistortPoints(np.array([[uv]], dtype=np.float64), K, D).reshape(2)
|
|
x, y = float(und[0]), float(und[1])
|
|
P = np.hstack([R, t.reshape(3, 1)])
|
|
A.append(x * P[2] - P[0])
|
|
A.append(y * P[2] - P[1])
|
|
_, _, Vt = np.linalg.svd(np.asarray(A, dtype=float))
|
|
X = Vt[-1]
|
|
return X[:3] / X[3] if abs(X[3]) > 1e-12 else None
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────
|
|
# Laden der Detektionen
|
|
# ──────────────────────────────────────────────────────────────
|
|
|
|
class Cam:
|
|
__slots__ = ("id", "K", "D", "obs")
|
|
|
|
def __init__(self, cid, K, D):
|
|
self.id = cid
|
|
self.K = K
|
|
self.D = D
|
|
self.obs: Dict[int, np.ndarray] = {} # marker_id -> (4,2) px
|
|
|
|
|
|
def load_detections(eval_dir: str, cameras: Optional[List[str]] = None,
|
|
default_size: Optional[float] = None
|
|
) -> Tuple[Dict[str, Cam], Dict[int, float]]:
|
|
cams: Dict[str, Cam] = {}
|
|
sizes: Dict[int, float] = {}
|
|
for det_path in sorted(glob.glob(os.path.join(eval_dir, "*_aruco_detection.json"))):
|
|
m = re.match(r"render_([A-Za-z0-9]+)_aruco_detection\.json", os.path.basename(det_path))
|
|
if not m:
|
|
continue
|
|
cid = m.group(1)
|
|
if cameras and cid not in cameras:
|
|
continue
|
|
det = json.load(open(det_path, "r", encoding="utf-8"))
|
|
K = np.array(det["camera"]["camera_matrix"], dtype=float).reshape(3, 3)
|
|
D = np.array(det["camera"]["distortion_coefficients"], dtype=float).reshape(-1, 1)
|
|
cam = Cam(cid, K, D)
|
|
det_size = (det.get("vision_config", {}) or {}).get("MarkerSize", None)
|
|
for d in det.get("detections", []):
|
|
if d.get("type", "aruco") != "aruco":
|
|
continue
|
|
pts = d.get("image_points_px")
|
|
if pts is None:
|
|
continue
|
|
mid = int(d["marker_id"])
|
|
cam.obs[mid] = np.array(pts, dtype=float).reshape(4, 2)
|
|
s = d.get("marker_size_m", det_size if det_size is not None else default_size)
|
|
if s is not None:
|
|
sizes[mid] = float(s)
|
|
if cam.obs:
|
|
cams[cid] = cam
|
|
return cams, sizes
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────
|
|
# Schritt 1: Per-Marker-PnP (tentative Startposen)
|
|
# ──────────────────────────────────────────────────────────────
|
|
|
|
def pnp_all(cams: Dict[str, Cam], sizes: Dict[int, float], default_size: float
|
|
) -> Dict[Tuple[str, int], Dict]:
|
|
out: Dict[Tuple[str, int], Dict] = {}
|
|
for cid, cam in cams.items():
|
|
for mid, corners_px in cam.obs.items():
|
|
size = sizes.get(mid, default_size)
|
|
ok, rvec, tvec = cv2.solvePnP(local_corners(size), corners_px, cam.K, cam.D,
|
|
flags=cv2.SOLVEPNP_IPPE_SQUARE)
|
|
if not ok:
|
|
ok, rvec, tvec = cv2.solvePnP(local_corners(size), corners_px, cam.K, cam.D,
|
|
flags=cv2.SOLVEPNP_ITERATIVE)
|
|
if ok:
|
|
out[(cid, mid)] = {"M": rt_to_T(rvec, tvec)}
|
|
return out
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────
|
|
# Schritt 2+3: flip-robuste Initialisierung
|
|
# ──────────────────────────────────────────────────────────────
|
|
|
|
def initialize_poses(cams, sizes, default_size, pnp, n_iter=6, min_pts_pnp=8,
|
|
ransac_px=3.0) -> Tuple[Dict[str, np.ndarray], Dict[int, np.ndarray], str]:
|
|
cam_ids = sorted(cams)
|
|
all_mk = sorted({m for _, m in pnp})
|
|
|
|
# Referenzkamera = meiste Beobachtungen
|
|
ref = max(cam_ids, key=lambda c: len(cams[c].obs))
|
|
E: Dict[str, np.ndarray] = {ref: np.eye(4)}
|
|
G: Dict[int, np.ndarray] = {}
|
|
for m in cams[ref].obs: # tentative Startmarker aus c0
|
|
if (ref, m) in pnp:
|
|
G[m] = pnp[(ref, m)]["M"].copy()
|
|
|
|
def corners3D(m):
|
|
return (G[m][:3, :3] @ local_corners(sizes.get(m, default_size)).T).T + G[m][:3, 3]
|
|
|
|
for _ in range(n_iter):
|
|
# (a) Kameras gegen platzierte 3D-Ecken (RANSAC -> Flip-Ausreißer raus)
|
|
for c in cam_ids:
|
|
if c == ref:
|
|
continue
|
|
obj, img = [], []
|
|
for m in cams[c].obs:
|
|
if m in G:
|
|
obj.append(corners3D(m))
|
|
img.append(cams[c].obs[m])
|
|
if sum(len(o) for o in obj) < min_pts_pnp:
|
|
continue
|
|
O = np.vstack(obj).astype(np.float64)
|
|
I = np.vstack(img).astype(np.float64)
|
|
ok, rvec, tvec, inl = cv2.solvePnPRansac(
|
|
O, I, cams[c].K, cams[c].D, reprojectionError=ransac_px,
|
|
iterationsCount=200, flags=cv2.SOLVEPNP_ITERATIVE)
|
|
if ok and inl is not None and len(inl) >= 6:
|
|
E[c] = rt_to_T(rvec, tvec)
|
|
|
|
# (b) Marker triangulieren (flip-frei) bzw. einfügen
|
|
for m in all_mk:
|
|
placed = [c for c in cam_ids if c in E and (c, m) in pnp]
|
|
if len(placed) >= 2:
|
|
tri, ok = [], True
|
|
for ci in range(4):
|
|
o = [(cams[c].K, cams[c].D, E[c][:3, :3], E[c][:3, 3], cams[c].obs[m][ci])
|
|
for c in placed]
|
|
X = triangulate_point(o)
|
|
if X is None:
|
|
ok = False
|
|
break
|
|
tri.append(X)
|
|
if ok:
|
|
G[m] = kabsch(local_corners(sizes.get(m, default_size)), np.array(tri))
|
|
elif placed and m not in G:
|
|
c = placed[0]
|
|
G[m] = inv_T(E[c]) @ pnp[(c, m)]["M"]
|
|
|
|
return E, G, ref
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────
|
|
# Schritt 4: Globale Bündelausgleichung (Referenzkamera fix)
|
|
# ──────────────────────────────────────────────────────────────
|
|
|
|
def reproj_stats(obs_pairs, cams, sizes, default_size, E, G) -> Tuple[float, float]:
|
|
"""(RMS, Median) der Per-Beobachtungs-Reprojektionsfehler (px)."""
|
|
per = []
|
|
for (c, m) in obs_pairs:
|
|
rvec, tvec = T_to_rt(E[c] @ G[m])
|
|
proj, _ = cv2.projectPoints(local_corners(sizes.get(m, default_size)),
|
|
rvec, tvec, cams[c].K, cams[c].D)
|
|
d = proj.reshape(4, 2) - cams[c].obs[m]
|
|
per.append(float(np.sqrt(np.mean(d * d))))
|
|
if not per:
|
|
return 0.0, 0.0
|
|
a = np.asarray(per)
|
|
return float(np.sqrt(np.mean(a * a))), float(np.median(a))
|
|
|
|
|
|
def bundle_adjust(pnp, cams, sizes, default_size, E, G, ref,
|
|
huber_px=2.0, max_nfev=120, verbose=0):
|
|
if not HAVE_SCIPY:
|
|
print("[WARN] scipy fehlt -> BA uebersprungen")
|
|
return E, G
|
|
cam_opt = [c for c in sorted(E) if c != ref]
|
|
mk_ids = sorted(G)
|
|
ci = {c: i for i, c in enumerate(cam_opt)}
|
|
mi = {m: j for j, m in enumerate(mk_ids)}
|
|
ncam, nmk = len(cam_opt), len(mk_ids)
|
|
obs = [(c, m) for (c, m) in pnp if c in E and m in G]
|
|
|
|
def pack():
|
|
x = np.zeros(6 * ncam + 6 * nmk)
|
|
for c in cam_opt:
|
|
r, t = T_to_rt(E[c]); x[6*ci[c]:6*ci[c]+3] = r; x[6*ci[c]+3:6*ci[c]+6] = t
|
|
for m in mk_ids:
|
|
r, t = T_to_rt(G[m]); o = 6*ncam + 6*mi[m]; x[o:o+3] = r; x[o+3:o+6] = t
|
|
return x
|
|
|
|
def unpack(x):
|
|
Ee = {ref: np.eye(4)}
|
|
for c in cam_opt:
|
|
Ee[c] = rt_to_T(x[6*ci[c]:6*ci[c]+3], x[6*ci[c]+3:6*ci[c]+6])
|
|
Gg = {}
|
|
for m in mk_ids:
|
|
o = 6*ncam + 6*mi[m]
|
|
Gg[m] = rt_to_T(x[o:o+3], x[o+3:o+6])
|
|
return Ee, Gg
|
|
|
|
def residuals(x):
|
|
Ee, Gg = unpack(x)
|
|
res = np.empty(8 * len(obs))
|
|
for k, (c, m) in enumerate(obs):
|
|
rvec, tvec = T_to_rt(Ee[c] @ Gg[m])
|
|
proj, _ = cv2.projectPoints(local_corners(sizes.get(m, default_size)),
|
|
rvec, tvec, cams[c].K, cams[c].D)
|
|
res[8*k:8*k+8] = (proj.reshape(4, 2) - cams[c].obs[m]).ravel()
|
|
return res
|
|
|
|
Sp = lil_matrix((8 * len(obs), 6 * ncam + 6 * nmk), dtype=int)
|
|
for k, (c, m) in enumerate(obs):
|
|
rows = slice(8*k, 8*k+8)
|
|
if c in ci:
|
|
Sp[rows, 6*ci[c]:6*ci[c]+6] = 1
|
|
o = 6*ncam + 6*mi[m]
|
|
Sp[rows, o:o+6] = 1
|
|
|
|
sol = least_squares(residuals, pack(), jac_sparsity=Sp, method="trf",
|
|
loss="huber", f_scale=huber_px, max_nfev=max_nfev, verbose=verbose)
|
|
return unpack(sol.x)
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────
|
|
# Hauptlauf
|
|
# ──────────────────────────────────────────────────────────────
|
|
|
|
def reconstruct(eval_dir, cameras=None, default_size=0.025,
|
|
huber_px=2.0, max_nfev=120, verbose=0) -> Dict:
|
|
cams, sizes = load_detections(eval_dir, cameras, default_size)
|
|
if len(cams) < 2:
|
|
raise RuntimeError(f"brauche >=2 Kameras, gefunden: {sorted(cams)}")
|
|
pnp = pnp_all(cams, sizes, default_size)
|
|
E0, G0, ref = initialize_poses(cams, sizes, default_size, pnp)
|
|
|
|
# Marker je #platzierter Kameras; nur >=2 sind triangulierbar/zuverlässig.
|
|
ncams_of = {}
|
|
for (c, m) in pnp:
|
|
if c in E0 and m in G0:
|
|
ncams_of[m] = ncams_of.get(m, 0) + 1
|
|
weak = sorted(m for m in G0 if ncams_of.get(m, 0) < 2) # Einzelbild -> unbeobachtbar
|
|
G0 = {m: T for m, T in G0.items() if ncams_of.get(m, 0) >= 2}
|
|
|
|
pairs0 = [(c, m) for (c, m) in pnp if c in E0 and m in G0]
|
|
rms0, med0 = reproj_stats(pairs0, cams, sizes, default_size, E0, G0)
|
|
E, G = bundle_adjust(pnp, cams, sizes, default_size, E0, G0, ref, huber_px, max_nfev, verbose)
|
|
pairs = [(c, m) for (c, m) in pnp if c in E and m in G]
|
|
rms1, med1 = reproj_stats(pairs, cams, sizes, default_size, E, G)
|
|
dropped = [c for c in cams if c not in E]
|
|
|
|
cameras_out = []
|
|
for c in sorted(E):
|
|
Ec = E[c]
|
|
rvec, _ = T_to_rt(Ec)
|
|
cameras_out.append({
|
|
"camera_id": c,
|
|
"world_to_camera": {"rotation_matrix": Ec[:3, :3].tolist(),
|
|
"translation_m": Ec[:3, 3].tolist(), "rvec_rad": rvec.tolist()},
|
|
"center_m": (-Ec[:3, :3].T @ Ec[:3, 3]).tolist(),
|
|
"num_markers": len(cams[c].obs),
|
|
"is_reference": bool(c == ref),
|
|
})
|
|
markers_out = []
|
|
for m in sorted(G):
|
|
Gm = G[m]
|
|
size = sizes.get(m, default_size)
|
|
corners_S = (Gm[:3, :3] @ local_corners(size).T).T + Gm[:3, 3]
|
|
rvec, _ = T_to_rt(Gm)
|
|
markers_out.append({
|
|
"marker_id": int(m),
|
|
"pose_in_S": {"rotation_matrix": Gm[:3, :3].tolist(),
|
|
"translation_m": Gm[:3, 3].tolist(), "rvec_rad": rvec.tolist()},
|
|
"center_m": Gm[:3, 3].tolist(),
|
|
"normal": (Gm[:3, :3] @ np.array([0.0, 0.0, 1.0])).tolist(),
|
|
"corners_m": corners_S.tolist(),
|
|
"num_cameras": int(ncams_of.get(m, 0)),
|
|
})
|
|
|
|
return {
|
|
"schema_version": "1.0",
|
|
"stage": "scene_reconstruct_A",
|
|
"created_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
|
|
"frame": "arbitrary S (reference camera pose = identity)",
|
|
"reference_camera": ref,
|
|
"summary": {"num_cameras": len(E), "num_markers": len(G), "num_observations": len(pairs),
|
|
"reproj_rms_px_init": rms0, "reproj_median_px_init": med0,
|
|
"reproj_rms_px_final": rms1, "reproj_median_px_final": med1,
|
|
"dropped_cameras": dropped,
|
|
"insufficient_view_markers": weak},
|
|
"cameras": cameras_out,
|
|
"markers": markers_out,
|
|
}
|
|
|
|
|
|
def main() -> None:
|
|
ap = argparse.ArgumentParser(description="Phase A: ankerlose metrische Szenen-Rekonstruktion")
|
|
ap.add_argument("--evalDir", required=True, help="Ordner mit render_*_aruco_detection.json")
|
|
ap.add_argument("--cameras", default=None, help="Kommagetrennte Kamera-IDs (Default: alle)")
|
|
ap.add_argument("--markerSize", type=float, default=0.025, help="Fallback-Kantenlänge (m)")
|
|
ap.add_argument("--huberPx", type=float, default=2.0)
|
|
ap.add_argument("--maxNfev", type=int, default=120)
|
|
ap.add_argument("--out", default=None)
|
|
ap.add_argument("-v", "--verbose", action="count", default=0)
|
|
args = ap.parse_args()
|
|
|
|
cams = args.cameras.split(",") if args.cameras else None
|
|
res = reconstruct(args.evalDir, cams, args.markerSize, args.huberPx, args.maxNfev, args.verbose)
|
|
out = args.out or os.path.join(args.evalDir, "scene_reconstruction.json")
|
|
json.dump(res, open(out, "w", encoding="utf-8"), indent=2)
|
|
s = res["summary"]
|
|
print(f"[INFO] Kameras={s['num_cameras']} Marker(>=2 Views)={s['num_markers']} "
|
|
f"Obs={s['num_observations']} | RMS {s['reproj_rms_px_init']:.3f}->{s['reproj_rms_px_final']:.3f}px "
|
|
f"(median {s['reproj_median_px_final']:.3f}px) | ref-Kamera={res['reference_camera']}")
|
|
if s["dropped_cameras"]:
|
|
print(f"[WARN] nicht verbundene Kameras verworfen: {s['dropped_cameras']}")
|
|
if s["insufficient_view_markers"]:
|
|
print(f"[INFO] {len(s['insufficient_view_markers'])} Marker mit <2 Views (unbeobachtbar, "
|
|
f"nicht rekonstruiert): {s['insufficient_view_markers']}")
|
|
print(f"[INFO] -> {out}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
try:
|
|
main()
|
|
except Exception as exc:
|
|
print(f"[ERROR] {exc}", file=sys.stderr)
|
|
sys.exit(1)
|