Skip to main content

Documentation Index

Fetch the complete documentation index at: https://mintlify.com/opencv/opencv/llms.txt

Use this file to discover all available pages before exploring further.

Deep Learning with OpenCV DNN Module

Learn how to use OpenCV’s DNN (Deep Neural Networks) module to load and run pre-trained models for object detection, classification, and more.

Introduction to OpenCV DNN

OpenCV’s DNN module allows you to:
  • Load models from TensorFlow, PyTorch, Caffe, ONNX, and Darknet
  • Run inference without installing deep learning frameworks
  • Deploy on CPU, GPU (CUDA), or OpenVINO backends
  • Use pre-trained models for various tasks

Supported Frameworks

  • ONNX (.onnx) - Universal format, recommended
  • TensorFlow (.pb, .pbtxt)
  • PyTorch (via ONNX export)
  • Caffe (.caffemodel, .prototxt)
  • Darknet (.weights, .cfg) - YOLO models
  • TensorFlow Lite (.tflite)

Loading and Running Models

Basic Model Loading

import cv2 as cv
import numpy as np

# Path of the network file to load (an ONNX model in this example)
model_path = 'model.onnx'
net = cv.dnn.readNet(model_path)

# Format-specific loaders can be used instead:
# net = cv.dnn.readNetFromTensorflow('model.pb', 'model.pbtxt')
# net = cv.dnn.readNetFromCaffe('deploy.prototxt', 'model.caffemodel')
# net = cv.dnn.readNetFromDarknet('yolov3.cfg', 'yolov3.weights')

# Choose where inference runs: the OpenCV backend on the CPU
backend, target = cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_CPU
net.setPreferableBackend(backend)
net.setPreferableTarget(target)

# To run on the GPU instead, use the CUDA backend/target pair:
# net.setPreferableBackend(cv.dnn.DNN_BACKEND_CUDA)
# net.setPreferableTarget(cv.dnn.DNN_TARGET_CUDA)

YOLO Object Detection

YOLO (You Only Look Once) is a popular real-time object detection system.

YOLOv3/YOLOv4 Detection

import cv2 as cv
import numpy as np

# Detect objects in a single image with a Darknet YOLO model, filter the
# candidates with non-maximum suppression, and draw the surviving boxes.

# Load YOLO network
net = cv.dnn.readNetFromDarknet('yolov4.cfg', 'yolov4.weights')
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)

# Load class names, one label per line
with open('coco.names', 'r') as f:
    classes = [line.strip() for line in f]

# Load image. cv.imread returns None instead of raising when the file is
# missing or cannot be decoded, so fail here with a clear message rather
# than with an AttributeError on img.shape.
image_path = 'street.jpg'
img = cv.imread(image_path)
if img is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
height, width = img.shape[:2]

# Create blob from image: scale pixels to 0-1, resize to 416x416, BGR -> RGB
blob = cv.dnn.blobFromImage(img, 1/255.0, (416, 416),
                            swapRB=True, crop=False)

# Set input and run a forward pass through all unconnected output layers
net.setInput(blob)
output_layers = net.getUnconnectedOutLayersNames()
outputs = net.forward(output_layers)

# Process detections
boxes = []
confidences = []
class_ids = []

for output in outputs:
    for detection in output:
        # Values from index 5 onward are treated as per-class scores
        scores = detection[5:]
        class_id = int(np.argmax(scores))
        confidence = float(scores[class_id])

        if confidence <= 0.5:
            continue

        # The first four values are used as box center and size relative
        # to the image, so scale them back to pixel units
        center_x = int(detection[0] * width)
        center_y = int(detection[1] * height)
        w = int(detection[2] * width)
        h = int(detection[3] * height)

        # Get top-left corner
        x = int(center_x - w / 2)
        y = int(center_y - h / 2)

        boxes.append([x, y, w, h])
        confidences.append(confidence)
        class_ids.append(class_id)

# Apply Non-Maximum Suppression (score threshold 0.5, NMS threshold 0.4)
indices = cv.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)

# Draw detections; the length check skips the loop when nothing survived
if len(indices) > 0:
    color = (0, 255, 0)
    for i in np.array(indices).flatten():
        x, y, w, h = boxes[i]
        label = f"{classes[class_ids[i]]}: {confidences[i]:.2f}"

        cv.rectangle(img, (x, y), (x+w, y+h), color, 2)
        cv.putText(img, label, (x, y-10),
                   cv.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

cv.imshow('YOLO Detection', img)
cv.waitKey(0)

YOLOv8 with ONNX

Modern YOLO versions export to ONNX format:
import cv2 as cv
import numpy as np

# Detect objects in a single image with a YOLOv8 ONNX model, filter the
# candidates with non-maximum suppression, and draw the surviving boxes.

# Load YOLOv8 model (ONNX format)
net = cv.dnn.readNetFromONNX('yolov8n.onnx')
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)

# Load image. cv.imread returns None instead of raising when the file is
# missing or cannot be decoded, so fail here with a clear message.
image_path = 'image.jpg'
img = cv.imread(image_path)
if img is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
img_height, img_width = img.shape[:2]

# Preprocess: scale pixels to 0-1, resize to a square input, BGR -> RGB
input_size = 640
blob = cv.dnn.blobFromImage(img, 1/255.0, (input_size, input_size),
                            swapRB=True, crop=False)

# Run inference
net.setInput(blob)
output = net.forward()

# YOLOv8 outputs shape: [1, 84, 8400] for COCO
# Format: [x, y, w, h, class_scores...]
output = output[0].transpose()  # [8400, 84]

# Process detections
boxes = []
confidences = []
class_ids = []

# Box values are in input-blob pixels; these factors map them back to
# the original image, which was stretched (not letterboxed) to the input size
x_scale = img_width / input_size
y_scale = img_height / input_size

for detection in output:
    # Extract box center and size
    cx, cy, w, h = detection[:4]

    # Get class scores and find max
    class_scores = detection[4:]
    class_id = int(np.argmax(class_scores))
    confidence = float(class_scores[class_id])

    if confidence <= 0.5:
        continue

    # Convert center/size to a top-left box in original-image pixels
    x = int((cx - w/2) * x_scale)
    y = int((cy - h/2) * y_scale)
    w = int(w * x_scale)
    h = int(h * y_scale)

    boxes.append([x, y, w, h])
    confidences.append(confidence)
    class_ids.append(class_id)

# Apply NMS (score threshold 0.5, NMS threshold 0.4)
indices = cv.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)

# Draw results. When nothing survives, NMSBoxes can return an empty tuple,
# which has no .flatten(); check the length first.
if len(indices) > 0:
    for i in np.array(indices).flatten():
        x, y, w, h = boxes[i]
        cv.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), 2)
        label = f"Class {class_ids[i]}: {confidences[i]:.2f}"
        cv.putText(img, label, (x, y-10),
                   cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

cv.imshow('YOLOv8 Detection', img)
cv.waitKey(0)

SSD Object Detection

Use SSD (Single Shot MultiBox Detector) with a MobileNet backbone for faster detection:
import cv2 as cv
import numpy as np

# Detect objects in a single image with a MobileNet-SSD Caffe model and
# draw every detection whose confidence exceeds 0.5.

# Load MobileNet-SSD model
net = cv.dnn.readNetFromCaffe(
    'MobileNetSSD_deploy.prototxt',
    'MobileNetSSD_deploy.caffemodel'
)

# PASCAL VOC class names (20 object classes plus background), indexed by
# the class id the network reports. These are not the COCO labels.
classes = ["background", "aeroplane", "bicycle", "bird", "boat",
           "bottle", "bus", "car", "cat", "chair", "cow",
           "diningtable", "dog", "horse", "motorbike", "person",
           "pottedplant", "sheep", "sofa", "train", "tvmonitor"]

# Load image. cv.imread returns None instead of raising when the file is
# missing or cannot be decoded, so fail here with a clear message.
image_path = 'image.jpg'
img = cv.imread(image_path)
if img is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
height, width = img.shape[:2]

# Prepare input: 300x300 blob, mean 127.5 subtracted, scaled by 0.007843
blob = cv.dnn.blobFromImage(img, 0.007843, (300, 300), 127.5)

# Run detection
net.setInput(blob)
detections = net.forward()

# Box coordinates are used as fractions of the image size; this factor
# converts (x1, y1, x2, y2) to pixels and is the same for every detection
box_scale = np.array([width, height, width, height])

# Process detections: each row is read as [_, class_id, confidence, x1, y1, x2, y2]
for i in range(detections.shape[2]):
    confidence = detections[0, 0, i, 2]

    if confidence > 0.5:
        class_id = int(detections[0, 0, i, 1])

        # Get box coordinates
        box = detections[0, 0, i, 3:7] * box_scale
        (x1, y1, x2, y2) = box.astype("int")

        # Draw detection
        label = f"{classes[class_id]}: {confidence:.2f}"
        cv.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv.putText(img, label, (x1, y1-10),
                   cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

cv.imshow('SSD Detection', img)
cv.waitKey(0)

Image Classification

import cv2 as cv
import numpy as np

# Build the ResNet-50 classifier from its Caffe definition and weights
prototxt_path = 'ResNet-50-deploy.prototxt'
weights_path = 'ResNet-50-model.caffemodel'
net = cv.dnn.readNetFromCaffe(prototxt_path, weights_path)

# Read the ImageNet class labels, one per line
with open('imagenet_classes.txt', 'r') as f:
    classes = [line.strip() for line in f]

# Image to classify
img = cv.imread('dog.jpg')

# 224x224 blob with per-channel mean subtraction; channel order is kept
input_size = (224, 224)
channel_means = (104, 117, 123)
blob = cv.dnn.blobFromImage(img, 1.0, input_size,
                            channel_means, swapRB=False, crop=False)

# Forward pass
net.setInput(blob)
predictions = net.forward()

# Rank class indices from highest to lowest score and keep the first five
scores = predictions[0]
ranked = np.argsort(scores)[::-1]
top5_indices = ranked[:5]

print("Top 5 predictions:")
for i, idx in enumerate(top5_indices):
    label = classes[idx]
    confidence = scores[idx]
    print(f"{i+1}. {label}: {confidence*100:.2f}%")

# Overlay the best label on the image and show it
top_label = classes[top5_indices[0]]
text_origin = (10, 30)
green = (0, 255, 0)
cv.putText(img, top_label, text_origin,
           cv.FONT_HERSHEY_SIMPLEX, 1, green, 2)
cv.imshow('Classification', img)
cv.waitKey(0)

Face Detection with DNN

Deep learning-based face detection (more accurate than Haar cascades):
import cv2 as cv
import numpy as np  # needed for the box scaling below

# Detect faces in a single image with the res10 SSD Caffe model and draw
# every detection whose confidence exceeds 0.5.

# Load face detection model
net = cv.dnn.readNetFromCaffe(
    'deploy.prototxt',
    'res10_300x300_ssd_iter_140000.caffemodel'
)

# Load image. cv.imread returns None instead of raising when the file is
# missing or cannot be decoded, so fail here with a clear message.
image_path = 'faces.jpg'
img = cv.imread(image_path)
if img is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
height, width = img.shape[:2]

# Preprocess: 300x300 blob with per-channel mean subtraction
blob = cv.dnn.blobFromImage(img, 1.0, (300, 300),
                            (104.0, 177.0, 123.0))

# Detect faces
net.setInput(blob)
detections = net.forward()

# Box coordinates are used as fractions of the image size; this factor
# converts (x1, y1, x2, y2) to pixels and is the same for every detection
box_scale = np.array([width, height, width, height])

# Draw detections
for i in range(detections.shape[2]):
    confidence = detections[0, 0, i, 2]

    if confidence > 0.5:
        box = detections[0, 0, i, 3:7] * box_scale
        (x1, y1, x2, y2) = box.astype("int")

        # Draw box and confidence
        text = f"{confidence*100:.2f}%"
        cv.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv.putText(img, text, (x1, y1-10),
                   cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

cv.imshow('Face Detection', img)
cv.waitKey(0)

Video Processing with DNN

import time

import cv2 as cv
import numpy as np  # needed for the box scaling below

# Run MobileNet-SSD on every frame of a video, draw detections above 0.5
# confidence, and overlay the inference-only FPS. Press 'q' to stop early.

# Load model
net = cv.dnn.readNetFromCaffe(
    'MobileNetSSD_deploy.prototxt',
    'MobileNetSSD_deploy.caffemodel'
)

# Open video and fail early if it cannot be opened; otherwise the loop
# below would exit immediately without any indication of the problem
video_path = 'video.mp4'
cap = cv.VideoCapture(video_path)
if not cap.isOpened():
    raise IOError(f"Could not open video: {video_path}")

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        height, width = frame.shape[:2]
        # Converts fractional (x1, y1, x2, y2) boxes to pixels for this frame
        box_scale = np.array([width, height, width, height])

        # Prepare input
        blob = cv.dnn.blobFromImage(frame, 0.007843, (300, 300), 127.5)

        # Measure inference time with a monotonic, high-resolution clock
        start = time.perf_counter()
        net.setInput(blob)
        detections = net.forward()
        elapsed = time.perf_counter() - start

        # Process detections
        for i in range(detections.shape[2]):
            confidence = detections[0, 0, i, 2]

            if confidence > 0.5:
                box = detections[0, 0, i, 3:7] * box_scale
                (x1, y1, x2, y2) = box.astype("int")
                cv.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # Display FPS; guard against a zero interval to avoid ZeroDivisionError
        fps = 1 / elapsed if elapsed > 0 else float('inf')
        cv.putText(frame, f'FPS: {fps:.1f}', (10, 30),
                   cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        cv.imshow('Detection', frame)

        if cv.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the capture and close windows even if a frame raised an error
    cap.release()
    cv.destroyAllWindows()

Performance Optimization

import cv2 as cv

# Load the network once; the sections below then configure and run it.
# NOTE(review): the three sections that follow read like alternative recipes
# (CUDA / OpenVINO / batch processing) rather than one script. If run top to
# bottom, the later setPreferable* calls presumably replace the earlier
# ones -- confirm, and keep only the section that matches your hardware.
net = cv.dnn.readNet('model.onnx')

# --- Option: NVIDIA GPU ---
# CUDA backend (requires OpenCV built with CUDA)
net.setPreferableBackend(cv.dnn.DNN_BACKEND_CUDA)
net.setPreferableTarget(cv.dnn.DNN_TARGET_CUDA)

# Or CUDA with FP16 (faster, slightly less accurate)
net.setPreferableTarget(cv.dnn.DNN_TARGET_CUDA_FP16)
# --- Option: Intel hardware ---
# Intel OpenVINO for optimized inference on Intel hardware
net.setPreferableBackend(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE)
net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)

# Or use Intel GPU
net.setPreferableTarget(cv.dnn.DNN_TARGET_OPENCL)
# --- Batch processing ---
# Process multiple images at once
# img1..img4 are not defined in this snippet; they are assumed to be
# images already loaded by the caller.
images = [img1, img2, img3, img4]

# Create batch blob: one blob built from all images, each scaled by 1/255
# and sized to 640x640
blob = cv.dnn.blobFromImages(images, 1/255.0, (640, 640))

# A single forward pass over the batch blob
net.setInput(blob)
outputs = net.forward()
Backend and target options:
  • DNN_BACKEND_OPENCV + DNN_TARGET_CPU: Default, works everywhere
  • DNN_BACKEND_CUDA + DNN_TARGET_CUDA: NVIDIA GPU acceleration
  • DNN_BACKEND_INFERENCE_ENGINE + DNN_TARGET_CPU: Intel OpenVINO
  • DNN_TARGET_OPENCL: OpenCL acceleration
  • DNN_TARGET_CUDA_FP16: Half-precision for faster inference
Common issues:
  • The blob size passed to blobFromImage must match the input size the model expects (usually the size it was trained or exported with)
  • Check if the model expects RGB or BGR input (use swapRB parameter)
  • Normalize input values correctly (typically 0-1 or mean subtraction)
  • Ensure OpenCV is built with the desired backend support

Downloading Pre-trained Models

OpenCV provides scripts to download common models:
# NOTE(review): confirm the --name / --all flags against
# `python opencv/samples/dnn/download_models.py --help` for your OpenCV
# checkout before relying on them.

# Download YOLOv3
python opencv/samples/dnn/download_models.py --name yolo

# Download all models
python opencv/samples/dnn/download_models.py --all
Common model sources:

Next Steps