Skip to main content

Overview

Camera pose estimation determines the position and orientation of a camera relative to a scene or object:
  • Camera Calibration: Determine intrinsic camera parameters
  • PnP (Perspective-n-Point): Estimate pose from 2D-3D correspondences
  • Homography-based: Estimate pose from planar objects
  • Augmented Reality: Overlay 3D graphics on video
  • Visual Odometry: Track camera motion over time

Camera Calibration

Calibrate camera to obtain intrinsic parameters needed for accurate pose estimation.
import numpy as np
import cv2 as cv
import glob

def calibrate_camera(images, pattern_size, square_size, show=True):
    """
    Calibrate camera using a chessboard pattern.

    Args:
        images: List of calibration image paths
        pattern_size: Chessboard size (width, height) in inner corners
        square_size: Size of chessboard square in mm or cm
        show: If True, display detected corners for each image (default
            preserves the original behavior)

    Returns:
        Camera matrix, distortion coefficients, RMS error

    Raises:
        ValueError: If no image yields a detected chessboard.
    """
    # Prepare 3D object points: a planar grid at Z=0, scaled to
    # physical units by square_size.
    objp = np.zeros((pattern_size[0] * pattern_size[1], 3), np.float32)
    objp[:, :2] = np.indices(pattern_size).T.reshape(-1, 2)
    objp *= square_size

    obj_points = []  # 3D points in real world space
    img_points = []  # 2D points in image plane
    image_size = None  # (width, height), recorded from the first readable image

    # Find chessboard corners in every calibration image
    for fname in images:
        img = cv.imread(fname)
        if img is None:
            # Skip unreadable files instead of crashing in cvtColor.
            print(f"Warning: could not read {fname}, skipping")
            continue
        gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
        # Record size explicitly rather than relying on the loop
        # variable leaking out of the for-loop.
        image_size = (gray.shape[1], gray.shape[0])

        ret, corners = cv.findChessboardCorners(gray, pattern_size, None)
        if not ret:
            continue

        # Refine corner locations to sub-pixel accuracy
        criteria = (cv.TERM_CRITERIA_EPS + cv.TERM_CRITERIA_MAX_ITER, 30, 0.001)
        corners_refined = cv.cornerSubPix(gray, corners, (11, 11), (-1, -1), criteria)

        obj_points.append(objp)
        img_points.append(corners_refined)

        if show:
            # Draw and display detected corners for visual inspection
            cv.drawChessboardCorners(img, pattern_size, corners_refined, ret)
            cv.imshow('Calibration', img)
            cv.waitKey(100)

    # Only tear down GUI windows if any were actually created.
    if show and image_size is not None:
        cv.destroyAllWindows()

    if len(obj_points) == 0:
        raise ValueError("No valid calibration images found")

    # Calibrate camera from all detected views
    rms, camera_matrix, dist_coefs, rvecs, tvecs = cv.calibrateCamera(
        obj_points, img_points, image_size, None, None
    )

    print(f"\nCalibration RMS error: {rms:.3f}")
    print(f"\nCamera Matrix:\n{camera_matrix}")
    print(f"\nDistortion Coefficients:\n{dist_coefs.ravel()}")

    return camera_matrix, dist_coefs, rms

# Example usage: calibrate from a folder of chessboard photos.
pattern_size = (9, 6)  # chessboard with 9x6 inner corners
square_size = 25.0  # each square is 25 mm on a side
images = glob.glob('calibration_images/*.jpg')

K, dist, rms = calibrate_camera(images, pattern_size, square_size)

# Persist the calibration so later scripts can reload it with np.load().
np.savez('camera_calibration.npz',
         camera_matrix=K,
         dist_coefs=dist,
         rms=rms)

Pose Estimation with solvePnP

Estimate camera pose from known 3D-2D point correspondences.
import numpy as np
import cv2 as cv

def estimate_pose_pnp(object_points, image_points, camera_matrix, dist_coefs):
    """
    Estimate camera pose using PnP.

    Args:
        object_points: 3D points in world coordinates (Nx3, array-like)
        image_points: Corresponding 2D points in image (Nx2, array-like)
        camera_matrix: Camera intrinsic matrix (3x3)
        dist_coefs: Distortion coefficients

    Returns:
        (rvec, tvec, True) on success, (None, None, False) on failure.

    Raises:
        ValueError: If the point sets differ in length or fewer than
            4 correspondences are provided.
    """
    # Coerce to the float32 arrays solvePnP expects, so callers may
    # pass plain Python lists as in the examples below.
    object_points = np.asarray(object_points, dtype=np.float32).reshape(-1, 3)
    image_points = np.asarray(image_points, dtype=np.float32).reshape(-1, 2)

    if len(object_points) != len(image_points):
        raise ValueError("object_points and image_points must have the same length")
    if len(object_points) < 4:
        # SOLVEPNP_ITERATIVE needs at least 4 correspondences.
        raise ValueError("At least 4 point correspondences are required")

    # Solve PnP
    success, rvec, tvec = cv.solvePnP(
        object_points,
        image_points,
        camera_matrix,
        dist_coefs,
        flags=cv.SOLVEPNP_ITERATIVE
    )

    if not success:
        return None, None, False

    # Rodrigues converts the compact rotation vector to a 3x3 matrix
    R, _ = cv.Rodrigues(rvec)

    print(f"Rotation vector:\n{rvec.ravel()}")
    print(f"\nTranslation vector:\n{tvec.ravel()}")
    print(f"\nRotation matrix:\n{R}")

    return rvec, tvec, True

# Example: the four corners of a 100 mm square in the Z=0 world plane.
object_points = np.float32([
    [0, 0, 0],      # origin
    [100, 0, 0],    # 100 mm along X
    [100, 100, 0],  # 100 mm along X and Y
    [0, 100, 0],    # 100 mm along Y
])

# Matching 2D pixel locations detected in the image.
image_points = np.float32([
    [320, 240],
    [420, 240],
    [420, 340],
    [320, 340],
])

# Reload the intrinsics produced by the calibration step.
calib = np.load('camera_calibration.npz')
K, dist = calib['camera_matrix'], calib['dist_coefs']

# Recover the camera pose relative to the square.
rvec, tvec, success = estimate_pose_pnp(object_points, image_points, K, dist)

Augmented Reality Application

Overlay 3D graphics on tracked planar objects.
import numpy as np
import cv2 as cv
from plane_tracker import PlaneTracker
import video

# 3D model for the AR overlay: a unit cube topped with a ridge roof
# ("house" shape), defined in the tracked plane's local frame.
ar_verts = np.float32([
    (0, 0, 0), (0, 1, 0), (1, 1, 0), (1, 0, 0),  # base quad (Z=0)
    (0, 0, 1), (0, 1, 1), (1, 1, 1), (1, 0, 1),  # top quad (Z=1)
    (0, 0.5, 2), (1, 0.5, 2),                    # roof ridge (Z=2)
])

# Wireframe edges as index pairs into ar_verts.
_base_edges = [(0, 1), (1, 2), (2, 3), (3, 0)]
_top_edges = [(4, 5), (5, 6), (6, 7), (7, 4)]
_pillar_edges = [(0, 4), (1, 5), (2, 6), (3, 7)]
_roof_edges = [(4, 8), (5, 8), (6, 9), (7, 9), (8, 9)]
ar_edges = _base_edges + _top_edges + _pillar_edges + _roof_edges

class ARApp:
    """Augmented-reality demo: track a planar target in a video stream
    and overlay the wireframe 3D model defined by ar_verts/ar_edges.

    NOTE(review): depends on the project-local `plane_tracker` and
    `video` helper modules (OpenCV samples); not runnable standalone.
    """

    def __init__(self, video_src=0):
        # video_src: camera index or file path, handled by create_capture.
        self.cap = video.create_capture(video_src)
        self.tracker = PlaneTracker()
        self.frame = None    # most recent frame grabbed in run()
        self.paused = False  # toggled with the space bar
        
        cv.namedWindow('AR Demo')
        # Trackbar lets the user tune the assumed focal length at runtime.
        # The callback is a no-op because the value is polled in
        # draw_3d_overlay instead of being pushed through the callback.
        cv.createTrackbar('focal', 'AR Demo', 25, 50, lambda x: None)
    
    def draw_3d_overlay(self, img, tracked):
        """Draw 3D model on tracked plane"""
        # The target's rectangle in its reference image defines a Z=0
        # quad in "world" coordinates for solvePnP.
        x0, y0, x1, y1 = tracked.target.rect
        quad_3d = np.float32([
            [x0, y0, 0], [x1, y0, 0],
            [x1, y1, 0], [x0, y1, 0]
        ])
        
        # Estimate camera intrinsics from focal length:
        # fx sweeps [0.5, 1.5] (times image width) as the trackbar moves 0..50.
        fx = 0.5 + cv.getTrackbarPos('focal', 'AR Demo') / 50.0
        h, w = img.shape[:2]
        K = np.float64([
            [fx*w, 0, 0.5*(w-1)],
            [0, fx*w, 0.5*(h-1)],
            [0.0, 0.0, 1.0]
        ])
        dist_coef = np.zeros(4)  # assume no lens distortion
        
        # Solve PnP to get camera pose relative to the tracked quad
        _ret, rvec, tvec = cv.solvePnP(
            quad_3d, tracked.quad, K, dist_coef
        )
        
        # Transform and project 3D points: scale the unit model to the
        # target rectangle; the negative Z factor raises the model out of
        # the plane toward the camera.
        verts = ar_verts * [(x1-x0), (y1-y0), -(x1-x0)*0.3] + (x0, y0, 0)
        verts_2d = cv.projectPoints(
            verts, rvec, tvec, K, dist_coef
        )[0].reshape(-1, 2)
        
        # Draw 3D model as a cyan wireframe
        for i, j in ar_edges:
            pt1 = tuple(map(int, verts_2d[i]))
            pt2 = tuple(map(int, verts_2d[j]))
            cv.line(img, pt1, pt2, (255, 255, 0), 2)
    
    def run(self):
        """Main AR loop"""
        while True:
            if not self.paused:
                ret, self.frame = self.cap.read()
                if not ret:
                    break
            
            # Draw on a copy so the stored frame stays pristine while paused.
            vis = self.frame.copy()
            
            if not self.paused:
                # Track planar objects
                tracked = self.tracker.track(self.frame)
                
                for tr in tracked:
                    # Draw tracking quad
                    cv.polylines(vis, [np.int32(tr.quad)], 
                               True, (255, 255, 255), 2)
                    
                    # Draw 3D overlay
                    self.draw_3d_overlay(vis, tr)
            
            cv.imshow('AR Demo', vis)
            ch = cv.waitKey(1)
            
            if ch == ord(' '):  # space toggles pause
                self.paused = not self.paused
            if ch == 27:  # ESC
                break
        
        self.cap.release()
        cv.destroyAllWindows()

# Run AR application
if __name__ == '__main__':
    app = ARApp(0)
    app.run()

Pose from Homography

Extract pose information from planar object homography.
import numpy as np
import cv2 as cv

def decompose_homography_to_pose(H, K):
    """
    Decompose a homography into candidate rotations and translations.

    Args:
        H: Homography matrix (3x3), in pixel units as returned by
            cv.findHomography
        K: Camera intrinsic matrix (3x3)

    Returns:
        List of possible (R, t, n) tuples: rotation matrix, translation
        (up to scale), and plane normal.

    Note:
        decomposeHomographyMat returns up to 4 mathematically valid
        solutions; fully disambiguating them needs extra information
        (e.g. cheirality checks on known visible points).
    """
    # decomposeHomographyMat normalizes H with K internally, so H is
    # passed directly. (A previously computed manual normalization was
    # dead code and has been removed.)
    num_solutions, Rs, ts, normals = cv.decomposeHomographyMat(H, K)

    print(f"Found {num_solutions} possible solutions")

    solutions = []
    for i in range(num_solutions):
        R, t, n = Rs[i], ts[i], normals[i]

        # Keep only proper rotations (det(R) = +1, not a reflection).
        if np.linalg.det(R) > 0:
            solutions.append((R, t, n))
            print(f"\nSolution {len(solutions)}:")
            print(f"Rotation:\n{R}")
            print(f"Translation:\n{t.ravel()}")
            print(f"Normal:\n{n.ravel()}")

    return solutions

# Example usage
# Estimate a homography from four hand-picked correspondences.
src_pts = np.float32([[0, 0], [100, 0], [100, 100], [0, 100]])
dst_pts = np.float32([[120, 150], [250, 140], [260, 280], [110, 290]])
H, mask = cv.findHomography(src_pts, dst_pts)

# Reload the camera intrinsics saved during calibration.
calib = np.load('camera_calibration.npz')
K = calib['camera_matrix']

# Enumerate the candidate (R, t, n) decompositions.
solutions = decompose_homography_to_pose(H, K)

Visual Odometry

Track camera motion over time using feature tracking.
import numpy as np
import cv2 as cv

class VisualOdometry:
    """Monocular visual odometry via ORB matching and essential-matrix
    pose recovery.

    Note: monocular VO recovers translation only up to an unknown
    per-step scale, so the accumulated trajectory has arbitrary scale.
    """

    def __init__(self, camera_matrix, dist_coefs):
        """
        Args:
            camera_matrix: 3x3 intrinsic matrix from calibration.
            dist_coefs: Distortion coefficients (stored for future
                undistortion; not applied in the current pipeline).
        """
        self.K = camera_matrix
        self.dist = dist_coefs
        self.detector = cv.ORB_create(1000)
        self.prev_frame = None
        self.prev_kp = None
        self.prev_des = None

        # Accumulated camera pose in the first frame's coordinate system.
        self.R = np.eye(3)
        self.t = np.zeros((3, 1))

    def process_frame(self, frame):
        """Process a new BGR frame and return the accumulated (R, t)."""
        gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)

        # Detect ORB features
        kp, des = self.detector.detectAndCompute(gray, None)

        # First frame (or the previous frame had no descriptors):
        # store it as the reference and keep the current pose.
        if self.prev_frame is None or self.prev_des is None:
            self.prev_frame = gray
            self.prev_kp = kp
            self.prev_des = des
            return self.R, self.t

        # A textureless frame can yield no descriptors; skip matching
        # rather than crashing inside knnMatch.
        if des is None or len(kp) < 2:
            print("Not enough features")
            return self.R, self.t

        # Match features (Hamming norm for ORB's binary descriptors)
        matcher = cv.BFMatcher(cv.NORM_HAMMING)
        matches = matcher.knnMatch(self.prev_des, des, k=2)

        # Lowe's ratio test to discard ambiguous matches
        good = []
        for match in matches:
            if len(match) == 2:
                m, n = match
                if m.distance < 0.7 * n.distance:
                    good.append(m)

        if len(good) < 10:
            print("Not enough matches")
            return self.R, self.t

        # Extract matched point coordinates
        pts1 = np.float32([self.prev_kp[m.queryIdx].pt for m in good])
        pts2 = np.float32([kp[m.trainIdx].pt for m in good])

        # Compute essential matrix. It can fail (None) or return several
        # stacked 3x3 candidates for degenerate configurations; either
        # would crash recoverPose, so guard explicitly.
        E, mask = cv.findEssentialMat(pts2, pts1, self.K, method=cv.RANSAC)
        if E is None or E.shape != (3, 3):
            print("Essential matrix estimation failed")
            # Still advance the reference frame so tracking can recover.
            self.prev_frame, self.prev_kp, self.prev_des = gray, kp, des
            return self.R, self.t

        # Recover relative pose (translation is unit-norm / unknown scale)
        _, R, t, mask = cv.recoverPose(E, pts2, pts1, self.K, mask=mask)

        # Compose the relative motion onto the accumulated pose
        self.t = self.t + self.R @ t
        self.R = R @ self.R

        # Update reference frame
        self.prev_frame = gray
        self.prev_kp = kp
        self.prev_des = des

        return self.R, self.t

# Example usage: run VO on a video file and plot the X/Z trajectory.
calib = np.load('camera_calibration.npz')
vo = VisualOdometry(calib['camera_matrix'], calib['dist_coefs'])

cap = cv.VideoCapture('video.mp4')
trajectory = []

# Persistent canvas: draw only the newest segment each frame instead of
# recreating the image and redrawing the whole path every iteration
# (which made the loop quadratic in the number of frames).
traj_img = np.zeros((600, 600, 3), dtype=np.uint8)

while True:
    ret, frame = cap.read()
    if not ret:
        break

    R, t = vo.process_frame(frame)
    trajectory.append(t.copy())

    if len(trajectory) >= 2:
        # Top-down view: world X maps to image x, world Z to image y,
        # offset so the start sits near the bottom-center of the canvas.
        prev, curr = trajectory[-2], trajectory[-1]
        pt1 = (int(prev[0]) + 300, int(prev[2]) + 500)
        pt2 = (int(curr[0]) + 300, int(curr[2]) + 500)
        cv.line(traj_img, pt1, pt2, (0, 255, 0), 2)

    cv.imshow('Trajectory', traj_img)
    if cv.waitKey(1) == 27:  # ESC quits
        break

# Release the capture (the original leaked it) before closing windows.
cap.release()
cv.destroyAllWindows()

PnP Algorithms Comparison

| Algorithm | Speed  | Accuracy  | Min Points | Use Case        |
|-----------|--------|-----------|------------|-----------------|
| ITERATIVE | Medium | Good      | 4          | General purpose |
| P3P       | Fast   | Good      | 3          | Minimal case    |
| EPNP      | Fast   | Good      | 4+         | Many points     |
| DLS       | Medium | Very Good | 4+         | High accuracy   |
| UPNP      | Fast   | Good      | 4+         | Fast processing |
| IPPE      | Fast   | Good      | 4 (planar) | Planar objects  |
| SQPNP     | Medium | Excellent | 3+         | Best accuracy   |

Best Practices

1. Calibrate Your Camera

Always calibrate for accurate pose estimation:
K, dist, rms = calibrate_camera(images, (9, 6), 25.0)
2. Use Enough Points

More points = better accuracy:
  • Minimum: 4 points for general case
  • Recommended: 10+ points
  • Use RANSAC for outlier rejection
3. Handle Ambiguities

Some configurations have multiple solutions:
# Check determinant of rotation matrix
if np.linalg.det(R) < 0:
    R = -R  # Flip if improper rotation
4. Validate Results

Check reprojection error:
projected, _ = cv.projectPoints(object_pts, rvec, tvec, K, dist)
error = cv.norm(image_pts, projected.reshape(-1, 2), cv.NORM_L2) / len(projected)
Coordinate Systems: OpenCV uses right-handed coordinate system:
  • X-axis: right
  • Y-axis: down
  • Z-axis: forward (into scene)
Rotation vectors use Rodrigues representation, convertible to matrices with cv.Rodrigues().
Calibration Quality: Poor calibration leads to inaccurate pose estimation. Always:
  • Use at least 10-20 calibration images
  • Vary target orientation and position
  • Check RMS error (should be < 1.0 pixel)
  • Test on held-out validation images

Troubleshooting

Unstable Pose

# Use a more robust PnP algorithm (SQPnP trades a little speed for accuracy)
success, rvec, tvec = cv.solvePnP(
    obj_pts, img_pts, K, dist,
    flags=cv.SOLVEPNP_SQPNP  # Most accurate
)

# Or use RANSAC for outliers; `inliers` holds the indices of the
# correspondences consistent with the returned pose.
success, rvec, tvec, inliers = cv.solvePnPRansac(
    obj_pts, img_pts, K, dist,
    reprojectionError=8.0
)

Incorrect Pose

# Verify point correspondences: reproject each 3D point with the
# estimated pose and compare against its measured image location.
for i, (obj_pt, img_pt) in enumerate(zip(obj_pts, img_pts)):
    projected = cv.projectPoints(
        obj_pt.reshape(1, 1, 3), rvec, tvec, K, dist
    )[0].ravel()
    # A large per-point error indicates a mismatched correspondence.
    error = np.linalg.norm(img_pt - projected)
    print(f"Point {i}: error = {error:.2f} pixels")

Next Steps