Overview
Camera pose estimation determines the position and orientation of a camera relative to a scene or object:
- Camera Calibration: Determine intrinsic camera parameters
- PnP (Perspective-n-Point): Estimate pose from 2D-3D correspondences
- Homography-based: Estimate pose from planar objects
- Augmented Reality: Overlay 3D graphics on video
- Visual Odometry: Track camera motion over time
Camera Calibration
Calibrate the camera to obtain the intrinsic parameters needed for accurate pose estimation.

import numpy as np
import cv2 as cv
import glob
def calibrate_camera(images, pattern_size, square_size, *, show=True):
    """
    Calibrate a camera from chessboard pattern images.

    Args:
        images: List of calibration image paths
        pattern_size: Chessboard size (width, height) in inner corners
        square_size: Size of a chessboard square in mm or cm
        show: Keyword-only; if True (default, preserving original behavior),
            briefly display each detected corner set in a window

    Returns:
        (camera_matrix, dist_coefs, rms): intrinsic matrix, distortion
        coefficients, and RMS reprojection error in pixels

    Raises:
        ValueError: if no image yields a detected chessboard
    """
    # One shared grid of 3D object points on the Z=0 plane, scaled to
    # real-world units so translation vectors come out in those units.
    objp = np.zeros((pattern_size[0] * pattern_size[1], 3), np.float32)
    objp[:, :2] = np.indices(pattern_size).T.reshape(-1, 2)
    objp *= square_size

    obj_points = []  # 3D points in real world space (same objp per view)
    img_points = []  # 2D points in image plane
    image_size = None  # (w, h) of the last successfully read image

    # Find chessboard corners in every image
    for fname in images:
        img = cv.imread(fname)
        if img is None:
            # Unreadable/missing file: skip instead of crashing in cvtColor.
            print(f"Warning: could not read {fname}, skipping")
            continue
        gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
        # shape is (h, w); calibrateCamera wants (w, h)
        image_size = (gray.shape[1], gray.shape[0])

        ret, corners = cv.findChessboardCorners(gray, pattern_size, None)
        if ret:
            # Refine corner locations to sub-pixel accuracy
            criteria = (cv.TERM_CRITERIA_EPS + cv.TERM_CRITERIA_MAX_ITER, 30, 0.001)
            corners_refined = cv.cornerSubPix(gray, corners, (11, 11), (-1, -1), criteria)
            obj_points.append(objp)
            img_points.append(corners_refined)
            if show:
                # Draw and display corners for visual inspection
                cv.drawChessboardCorners(img, pattern_size, corners_refined, ret)
                cv.imshow('Calibration', img)
                cv.waitKey(100)
    if show:
        cv.destroyAllWindows()

    if len(obj_points) == 0:
        raise ValueError("No valid calibration images found")

    # Calibrate camera from all collected 3D-2D correspondences
    rms, camera_matrix, dist_coefs, rvecs, tvecs = cv.calibrateCamera(
        obj_points, img_points, image_size, None, None
    )
    print(f"\nCalibration RMS error: {rms:.3f}")
    print(f"\nCamera Matrix:\n{camera_matrix}")
    print(f"\nDistortion Coefficients:\n{dist_coefs.ravel()}")
    return camera_matrix, dist_coefs, rms
# Example usage: calibrate from a directory of chessboard photos.
images = glob.glob('calibration_images/*.jpg')
pattern_size = (9, 6)  # 9x6 inner corners
square_size = 25.0  # 25mm squares

K, dist, rms = calibrate_camera(images, pattern_size, square_size)

# Save calibration so later examples can np.load() the same keys.
np.savez('camera_calibration.npz',
         camera_matrix=K,
         dist_coefs=dist,
         rms=rms)
Pose Estimation with solvePnP
Estimate camera pose from known 3D-2D point correspondences.

import numpy as np
import cv2 as cv
def estimate_pose_pnp(object_points, image_points, camera_matrix, dist_coefs):
    """
    Estimate camera pose using PnP.

    Args:
        object_points: 3D points in world coordinates (Nx3)
        image_points: Corresponding 2D points in image (Nx2)
        camera_matrix: Camera intrinsic matrix
        dist_coefs: Distortion coefficients

    Returns:
        Rotation vector, translation vector, success flag
    """
    # Solve the Perspective-n-Point problem for the world->camera transform.
    ok, rot_vec, trans_vec = cv.solvePnP(
        object_points, image_points, camera_matrix, dist_coefs,
        flags=cv.SOLVEPNP_ITERATIVE,
    )
    if not ok:
        return None, None, False

    # Expand the compact axis-angle vector into a full 3x3 rotation matrix.
    rot_mat = cv.Rodrigues(rot_vec)[0]
    print("Rotation vector:", rot_vec.ravel(), sep="\n")
    print("\nTranslation vector:", trans_vec.ravel(), sep="\n")
    print("\nRotation matrix:", rot_mat, sep="\n")
    return rot_vec, trans_vec, True
# Example: Define 3D object points (e.g., corners of a square).
# Units (mm here) must match the units used during calibration.
object_points = np.array([
    [0, 0, 0],      # Origin
    [100, 0, 0],    # 100mm along X
    [100, 100, 0],  # 100mm along X and Y
    [0, 100, 0]     # 100mm along Y
], dtype=np.float32)

# Corresponding 2D image points (detected in image), same order as above.
image_points = np.array([
    [320, 240],
    [420, 240],
    [420, 340],
    [320, 340]
], dtype=np.float32)

# Load camera calibration saved by the calibration step.
calib = np.load('camera_calibration.npz')
K = calib['camera_matrix']
dist = calib['dist_coefs']

# Estimate pose
rvec, tvec, success = estimate_pose_pnp(object_points, image_points, K, dist)
Augmented Reality Application
Overlay 3D graphics on tracked planar objects.
import numpy as np
import cv2 as cv
from plane_tracker import PlaneTracker
import video
# Define 3D model (unit cube with pyramid roof) in plane coordinates;
# it is scaled and translated onto the tracked rectangle at draw time.
ar_verts = np.float32([
    [0, 0, 0], [0, 1, 0], [1, 1, 0], [1, 0, 0],  # Base
    [0, 0, 1], [0, 1, 1], [1, 1, 1], [1, 0, 1],  # Top
    [0, 0.5, 2], [1, 0.5, 2]                     # Roof ridge
])
# Vertex-index pairs forming the wireframe edges of the model.
ar_edges = [
    (0, 1), (1, 2), (2, 3), (3, 0),              # Base edges
    (4, 5), (5, 6), (6, 7), (7, 4),              # Top edges
    (0, 4), (1, 5), (2, 6), (3, 7),              # Vertical edges
    (4, 8), (5, 8), (6, 9), (7, 9), (8, 9)       # Roof edges
]
class ARApp:
    """Augmented-reality demo: tracks a planar target and overlays a 3D wireframe model."""

    def __init__(self, video_src=0):
        self.cap = video.create_capture(video_src)
        self.tracker = PlaneTracker()
        self.frame = None
        self.paused = False
        cv.namedWindow('AR Demo')
        # Trackbar lets the user tune the assumed focal length interactively
        # (no calibration is used in this demo).
        cv.createTrackbar('focal', 'AR Demo', 25, 50, lambda x: None)

    def draw_3d_overlay(self, img, tracked):
        """Draw 3D model on tracked plane.

        Args:
            img: BGR frame to draw into (modified in place)
            tracked: tracker result; assumes it has .target.rect (reference
                rectangle) and .quad (current corner positions) — provided
                by PlaneTracker, not visible here; verify against plane_tracker.
        """
        x0, y0, x1, y1 = tracked.target.rect
        # Reference rectangle placed on the Z=0 plane in "world" coordinates.
        quad_3d = np.float32([
            [x0, y0, 0], [x1, y0, 0],
            [x1, y1, 0], [x0, y1, 0]
        ])
        # Estimate camera intrinsics from the trackbar focal setting:
        # fx in units of image width, principal point at the image center.
        fx = 0.5 + cv.getTrackbarPos('focal', 'AR Demo') / 50.0
        h, w = img.shape[:2]
        K = np.float64([
            [fx*w, 0, 0.5*(w-1)],
            [0, fx*w, 0.5*(h-1)],
            [0.0, 0.0, 1.0]
        ])
        dist_coef = np.zeros(4)  # assume no lens distortion
        # Solve PnP: pose of the plane from the 4 corner correspondences
        # (reference rectangle -> tracked quad in the current frame).
        _ret, rvec, tvec = cv.solvePnP(
            quad_3d, tracked.quad, K, dist_coef
        )
        # Scale the unit model to the rectangle size (negative Z factor so
        # the model rises out of the plane toward the camera) and translate
        # it onto the rectangle origin.
        verts = ar_verts * [(x1-x0), (y1-y0), -(x1-x0)*0.3] + (x0, y0, 0)
        verts_2d = cv.projectPoints(
            verts, rvec, tvec, K, dist_coef
        )[0].reshape(-1, 2)
        # Draw the wireframe edges of the projected model.
        for i, j in ar_edges:
            pt1 = tuple(map(int, verts_2d[i]))
            pt2 = tuple(map(int, verts_2d[j]))
            cv.line(img, pt1, pt2, (255, 255, 0), 2)

    def run(self):
        """Main AR loop: grab frames, track planes, render overlays.

        Keys: SPACE toggles pause, ESC quits.
        """
        while True:
            if not self.paused:
                ret, self.frame = self.cap.read()
                if not ret:
                    break
            vis = self.frame.copy()
            if not self.paused:
                # Track planar objects in the new frame.
                tracked = self.tracker.track(self.frame)
                for tr in tracked:
                    # Draw tracking quad
                    cv.polylines(vis, [np.int32(tr.quad)],
                                 True, (255, 255, 255), 2)
                    # Draw 3D overlay
                    self.draw_3d_overlay(vis, tr)
            cv.imshow('AR Demo', vis)
            ch = cv.waitKey(1)
            if ch == ord(' '):
                self.paused = not self.paused
            if ch == 27:  # ESC
                break
        self.cap.release()
        cv.destroyAllWindows()
# Run AR application only when executed as a script (not on import).
if __name__ == '__main__':
    app = ARApp(0)
    app.run()
Pose from Homography
Extract pose information from a planar object's homography.

import numpy as np
import cv2 as cv
def decompose_homography_to_pose(H, K):
    """
    Decompose a homography into candidate (rotation, translation, normal) poses.

    Args:
        H: Homography matrix (3x3) mapping plane points to image points
        K: Camera intrinsic matrix (3x3)

    Returns:
        List of plausible (R, t, n) tuples. Up to four solutions exist;
        extra constraints (e.g. visible points must lie in front of the
        camera) are needed to select the physically correct one.
    """
    # decomposeHomographyMat normalizes H with K internally, so H and K are
    # passed directly.  (The previous manual "H_norm = inv(K) @ H @ K" was
    # both unused and incorrect: metric normalization would be inv(K) @ H.)
    num_solutions, Rs, ts, normals = cv.decomposeHomographyMat(H, K)
    print(f"Found {num_solutions} possible solutions")

    solutions = []
    for i in range(num_solutions):
        R = Rs[i]
        t = ts[i]
        n = normals[i]
        # Keep only proper rotations (det(R) = +1, not a reflection).
        if np.linalg.det(R) > 0:
            solutions.append((R, t, n))
            print(f"\nSolution {len(solutions)}:")
            print(f"Rotation:\n{R}")
            print(f"Translation:\n{t.ravel()}")
            print(f"Normal:\n{n.ravel()}")
    return solutions
# Example usage
# Compute a homography from four hand-picked point correspondences (in
# practice these would come from feature matching).
src_pts = np.float32([[0, 0], [100, 0], [100, 100], [0, 100]])
dst_pts = np.float32([[120, 150], [250, 140], [260, 280], [110, 290]])
H, mask = cv.findHomography(src_pts, dst_pts)

# Load camera matrix from the saved calibration.
calib = np.load('camera_calibration.npz')
K = calib['camera_matrix']

# Decompose the homography into candidate poses.
solutions = decompose_homography_to_pose(H, K)
Visual Odometry
Track camera motion over time using feature tracking.

import numpy as np
import cv2 as cv
class VisualOdometry:
    """Monocular visual odometry via ORB matching and essential-matrix pose.

    Accumulates relative camera motion frame to frame.  Note: monocular VO
    recovers translation only up to an unknown scale (recoverPose returns a
    unit-length t), so the trajectory is in arbitrary units.
    """

    def __init__(self, camera_matrix, dist_coefs):
        self.K = camera_matrix
        self.dist = dist_coefs
        self.detector = cv.ORB_create(1000)
        # Created once here instead of per frame: BFMatcher is stateless,
        # rebuilding it in process_frame was wasted work.
        self.matcher = cv.BFMatcher(cv.NORM_HAMMING)
        self.prev_frame = None
        self.prev_kp = None
        self.prev_des = None
        # Camera pose (accumulated over all processed frames)
        self.R = np.eye(3)
        self.t = np.zeros((3, 1))

    def process_frame(self, frame):
        """Process a new BGR frame and return the updated (R, t) pose.

        On any failure (no features, too few matches, degenerate essential
        matrix) the previous pose is returned unchanged.
        """
        gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)
        # Detect features
        kp, des = self.detector.detectAndCompute(gray, None)

        # First frame, or no reference descriptors yet: (re)initialize.
        if self.prev_frame is None or self.prev_des is None:
            self.prev_frame = gray
            self.prev_kp = kp
            self.prev_des = des
            return self.R, self.t

        # detectAndCompute returns des=None on a featureless frame; knnMatch
        # would crash on it, so keep the previous reference and pose.
        if des is None:
            print("No features detected")
            return self.R, self.t

        # Match features against the previous frame
        matches = self.matcher.knnMatch(self.prev_des, des, k=2)

        # Filter matches with Lowe's ratio test
        good = []
        for match in matches:
            if len(match) == 2:
                m, n = match
                if m.distance < 0.7 * n.distance:
                    good.append(m)
        if len(good) < 10:
            print("Not enough matches")
            return self.R, self.t

        # Extract matched point coordinates
        pts1 = np.float32([self.prev_kp[m.queryIdx].pt for m in good])
        pts2 = np.float32([kp[m.trainIdx].pt for m in good])

        # Compute essential matrix with RANSAC.  findEssentialMat can return
        # None or a stack of candidate matrices (shape (3k, 3)); recoverPose
        # needs exactly one 3x3, so skip the update in degenerate cases.
        E, mask = cv.findEssentialMat(pts2, pts1, self.K, method=cv.RANSAC)
        if E is None or E.shape != (3, 3):
            print("Essential matrix estimation failed")
            return self.R, self.t

        # Recover relative pose (cheirality check selects the valid solution)
        _, R, t, mask = cv.recoverPose(E, pts2, pts1, self.K, mask=mask)

        # Chain the relative motion onto the accumulated pose
        self.t = self.t + self.R @ t
        self.R = R @ self.R

        # Update previous frame reference
        self.prev_frame = gray
        self.prev_kp = kp
        self.prev_des = des
        return self.R, self.t
# Example usage: run VO over a video file and plot the trajectory live.
calib = np.load('camera_calibration.npz')
vo = VisualOdometry(calib['camera_matrix'], calib['dist_coefs'])

cap = cv.VideoCapture('video.mp4')
trajectory = []
while True:
    ret, frame = cap.read()
    if not ret:
        break
    R, t = vo.process_frame(frame)
    trajectory.append(t.copy())
    # Visualize trajectory top-down (X right, Z forward); the whole path is
    # redrawn from scratch each frame.  NOTE(review): t is a (3,1) array, so
    # trajectory[i][0] is a 1-element array passed to int() — works but is
    # deprecated in recent NumPy; confirm before upgrading.
    traj_img = np.zeros((600, 600, 3), dtype=np.uint8)
    for i in range(1, len(trajectory)):
        pt1 = (int(trajectory[i-1][0]) + 300, int(trajectory[i-1][2]) + 500)
        pt2 = (int(trajectory[i][0]) + 300, int(trajectory[i][2]) + 500)
        cv.line(traj_img, pt1, pt2, (0, 255, 0), 2)
    cv.imshow('Trajectory', traj_img)
    if cv.waitKey(1) == 27:  # ESC quits early
        break
cv.destroyAllWindows()
PnP Algorithms Comparison
| Algorithm | Speed | Accuracy | Min Points | Use Case |
|---|---|---|---|---|
| ITERATIVE | Medium | Good | 4 | General purpose |
| P3P | Fast | Good | 3 | Minimal case |
| EPNP | Fast | Good | 4+ | Many points |
| DLS | Medium | Very Good | 4+ | High accuracy |
| UPNP | Fast | Good | 4+ | Fast processing |
| IPPE | Fast | Good | 4 (planar) | Planar objects |
| SQPNP | Medium | Excellent | 3+ | Best accuracy |
Best Practices
Calibrate Your Camera
Always calibrate for accurate pose estimation:
K, dist = calibrate_camera(images, (9, 6), 25.0)
Use Enough Points
More points = better accuracy:
- Minimum: 4 points for general case
- Recommended: 10+ points
- Use RANSAC for outlier rejection
Handle Ambiguities
Some configurations have multiple solutions:
# Check determinant of rotation matrix
if np.linalg.det(R) < 0:
R = -R # Flip if improper rotation
Coordinate Systems: OpenCV uses right-handed coordinate system:
- X-axis: right
- Y-axis: down
- Z-axis: forward (into scene)
Convert between rotation vectors and rotation matrices with cv.Rodrigues().
Calibration Quality: Poor calibration leads to inaccurate pose estimation. Always:
- Use at least 10-20 calibration images
- Vary target orientation and position
- Check RMS error (should be < 1.0 pixel)
- Test on held-out validation images
Troubleshooting
Unstable Pose
# Use more robust PnP algorithm
success, rvec, tvec = cv.solvePnP(
obj_pts, img_pts, K, dist,
flags=cv.SOLVEPNP_SQPNP # Most accurate
)
# Or use RANSAC for outliers
success, rvec, tvec, inliers = cv.solvePnPRansac(
obj_pts, img_pts, K, dist,
reprojectionError=8.0
)
Incorrect Pose
# Verify point correspondences
for i, (obj_pt, img_pt) in enumerate(zip(obj_pts, img_pts)):
projected = cv.projectPoints(
obj_pt.reshape(1, 1, 3), rvec, tvec, K, dist
)[0].ravel()
error = np.linalg.norm(img_pt - projected)
print(f"Point {i}: error = {error:.2f} pixels")
Next Steps
- Learn Camera Calibration in detail
- Explore 3D Reconstruction techniques
- Check Feature Matching for point correspondences
- See Video Stabilization for motion estimation
