Semantic segmentation assigns a class label to every pixel in an image, enabling detailed scene understanding. OpenCV’s DNN module supports various segmentation architectures trained on datasets like PASCAL VOC, Cityscapes, and COCO.
Supported Models
- FCN (Fully Convolutional Networks) - FCN-8s, FCN-ResNet101
- ENet - Efficient neural network for real-time segmentation
- DeepLab - State-of-the-art segmentation with atrous convolution
- U-Net - Popular architecture for medical image segmentation
- PSPNet - Pyramid Scene Parsing Network
Python Implementation
Import Libraries
import cv2 as cv
import numpy as np
Load the Model
# Load the segmentation network (ENet example) and run it with the
# stock OpenCV backend on the CPU.
model = 'Enet-model-best.net'
net = cv.dnn.readNet(model)
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)

# Load class names, one label per line.
classes = None
with open('enet-classes.txt', 'rt') as f:
    raw = f.read()
classes = raw.rstrip('\n').split('\n')
Generate or Load Colors
# Palette placeholder: filled in below, either generated per class or
# loaded from a palette file.
colors = None
# Fixed seed so the random class colors are reproducible between runs.
np.random.seed(324)
# Option 1: Generate colors automatically
def generate_colors(num_classes):
    """Build one BGR color per class.

    Index 0 (background) is always black; each subsequent color is the
    average of the previous color and a fresh random color, which keeps
    neighbouring class colors visually distinct.

    Args:
        num_classes: desired palette length (the black background entry
            is always present, even for ``num_classes <= 1``).

    Returns:
        List of ``np.uint8`` arrays of shape ``(3,)``, one per class.
    """
    colors = [np.array([0, 0, 0], np.uint8)]
    for i in range(1, num_classes):
        # Average in int space: adding two uint8 arrays wraps modulo
        # 256, which corrupts the mean whenever the sum exceeds 255.
        prev = colors[i - 1].astype(np.int32)
        rand = np.random.randint(0, 256, [3])
        colors.append(((prev + rand) // 2).astype(np.uint8))
    return colors
# Option 2: load a predefined palette from a file, one space-separated
# BGR triple per line.
colors_file = 'colors.txt'
with open(colors_file, 'rt') as f:
    palette_lines = f.read().rstrip('\n').split('\n')
colors = [np.array(line.split(' '), np.uint8) for line in palette_lines]
Prepare Input Image
# Load the image to segment; keep its original size for later upsampling.
frame = cv.imread('image.jpg')
frameHeight, frameWidth = frame.shape[0], frame.shape[1]

# ENet preprocessing: resize to 512x256, scale pixels to [0, 1]
# (factor 1/255), swap BGR->RGB, no mean subtraction, no center crop.
mean = [0, 0, 0]
blob = cv.dnn.blobFromImage(frame, 1.0/255.0, (512, 256), mean, True, crop=False)
Different segmentation models require different input sizes:
- ENet: 512x256
- FCN-8s: 500x500
- FCN-ResNet101: 500x500
Run Inference
# Run the forward pass; the output is a score map of shape
# [1, num_classes, height, width].
net.setInput(blob)
score = net.forward()
_, numClasses, height, width = score.shape
Post-process Segmentation Map
# Fall back to a generated palette when none was loaded from disk.
if not colors:
    colors = [np.array([0, 0, 0], np.uint8)]
    for i in range(1, numClasses):
        # Average in int space: adding two uint8 arrays would wrap
        # modulo 256 and corrupt the mean.
        prev = colors[i - 1].astype(np.int32)
        rand = np.random.randint(0, 256, [3])
        colors.append(((prev + rand) // 2).astype(np.uint8))

# Winning class id per pixel: argmax across the class axis.
classIds = np.argmax(score[0], axis=0)

# Colorize with a single vectorized LUT lookup instead of a per-pixel
# Python loop (the loop costs O(H*W) interpreter work per frame).
# Any float palette entries are quantized to uint8 here.
palette = np.asarray(colors, dtype=np.uint8)  # (numClasses, 3)
segm = palette[classIds]                      # (height, width, 3)

# Upsample with nearest-neighbour so class boundaries stay crisp
# (no interpolation between class colors).
segm = cv.resize(segm, (frameWidth, frameHeight), interpolation=cv.INTER_NEAREST)
Overlay and Display
# Compose the output: 10% original frame + 90% segmentation mask.
frame = (0.1 * frame + 0.9 * segm).astype(np.uint8)

# Stamp the DNN profiler's inference time onto the frame.
t, _ = net.getPerfProfile()
elapsed_ms = t * 1000.0 / cv.getTickFrequency()
cv.putText(frame, f'Inference time: {elapsed_ms:.2f} ms', (0, 15),
           cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))

# Show the blended result and wait for a key press.
cv.imshow('Semantic Segmentation', frame)
cv.waitKey(0)
C++ Implementation
Complete Example
Video Processing (a video-processing variant follows the complete single-image example below)
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <fstream>
#include <iostream>
using namespace cv;
using namespace dnn;
// Class labels (one per network output channel) and the BGR color used
// to paint each class in the segmentation mask. Shared by
// colorizeSegmentation() and showLegend().
std::vector<std::string> classes;
std::vector<Vec3b> colors;
// Convert the network output ("score", a [1 x numClasses x H x W] float
// blob, NCHW layout) into a BGR image "segm" where every pixel is
// painted with the color of its highest-scoring class.
//
// Uses the file-global `colors` palette; when it is empty a random
// palette is generated once (class 0 stays black).
//
// NOTE(review): `maxVal` is constructed on top of `score.data`, i.e. it
// aliases class channel 0 without copying, and the running maximum is
// written back through `ptrMaxVal` — so the blob is modified despite
// the const reference. Confirm callers do not reuse `score` afterwards.
void colorizeSegmentation(const Mat &score, Mat &segm) {
    const int rows = score.size[2];  // output height
    const int cols = score.size[3];  // output width
    const int chns = score.size[1];  // one channel per class
    if (colors.empty()) {
        // Generate colors: black for class 0, then each entry is the
        // average of the previous color and a random one.
        colors.push_back(Vec3b());
        for (int i = 1; i < chns; ++i) {
            Vec3b color;
            for (int j = 0; j < 3; ++j)
                color[j] = (colors[i - 1][j] + rand() % 256) / 2;
            colors.push_back(color);
        }
    }
    // Find class with maximum score for each pixel; channel 0 acts as
    // the implicit starting maximum (see NOTE above).
    Mat maxCl = Mat::zeros(rows, cols, CV_8UC1);
    Mat maxVal(rows, cols, CV_32FC1, score.data);
    for (int ch = 1; ch < chns; ch++) {
        for (int row = 0; row < rows; row++) {
            const float *ptrScore = score.ptr<float>(0, ch, row);
            uint8_t *ptrMaxCl = maxCl.ptr<uint8_t>(row);
            float *ptrMaxVal = maxVal.ptr<float>(row);
            for (int col = 0; col < cols; col++) {
                if (ptrScore[col] > ptrMaxVal[col]) {
                    ptrMaxVal[col] = ptrScore[col];
                    ptrMaxCl[col] = (uchar)ch;
                }
            }
        }
    }
    // Create colored segmentation mask: map each winning class id to
    // its palette color.
    segm.create(rows, cols, CV_8UC3);
    for (int row = 0; row < rows; row++) {
        const uchar *ptrMaxCl = maxCl.ptr<uchar>(row);
        Vec3b *ptrSegm = segm.ptr<Vec3b>(row);
        for (int col = 0; col < cols; col++) {
            ptrSegm[col] = colors[ptrMaxCl[col]];
        }
    }
}
// Run ENet on a single image and display the blended segmentation.
int main(int argc, char** argv) {
    // Load the network and pin it to the default OpenCV CPU backend.
    String model = "Enet-model-best.net";
    Net net = readNet(model);
    net.setPreferableBackend(DNN_BACKEND_OPENCV);
    net.setPreferableTarget(DNN_TARGET_CPU);

    // Read the input image; bail out early instead of crashing inside
    // blobFromImage when the file is missing or unreadable.
    Mat frame = imread("image.jpg");
    if (frame.empty()) {
        std::cerr << "Cannot read input image" << std::endl;
        return 1;
    }

    // ENet preprocessing: 512x256 input, pixels scaled to [0,1], RGB
    // order (swapRB=true), no mean subtraction, no center crop.
    Mat blob;
    Scalar mean(0, 0, 0);
    blobFromImage(frame, blob, 1.0/255.0, Size(512, 256), mean, true, false);

    // Forward pass: output is a [1 x numClasses x H x W] score map.
    net.setInput(blob);
    Mat score = net.forward();

    // Paint each pixel with the color of its winning class.
    Mat segm;
    colorizeSegmentation(score, segm);

    // Back to the original resolution; nearest-neighbour keeps class
    // boundaries crisp.
    resize(segm, segm, frame.size(), 0, 0, INTER_NEAREST);

    // Blend: 10% original image, 90% segmentation mask.
    addWeighted(frame, 0.1, segm, 0.9, 0.0, frame);

    imshow("Semantic Segmentation", frame);
    waitKey(0);
    return 0;
}
// Video-processing variant of the single-image example above.
// NOTE(review): this is a fragment, not a complete function — `net`,
// `scale`, `inpWidth`, `inpHeight`, `mean` and `swapRB` must already be
// in scope (typically read from the model configuration) before it runs.
// Open video capture (device 0 here; pass a filename for a video file).
VideoCapture cap;
cap.open(0); // or video file
Mat frame, blob;
// Process frames until any key is pressed.
while (waitKey(1) < 0) {
    cap >> frame;
    if (frame.empty())
        break;
    // Create blob with the model's preprocessing parameters.
    blobFromImage(frame, blob, scale, Size(inpWidth, inpHeight),
                  mean, swapRB, false);
    // Forward pass
    net.setInput(blob);
    Mat score = net.forward();
    // Colorize: one palette color per winning class.
    Mat segm;
    colorizeSegmentation(score, segm);
    // Resize to frame resolution and blend (10% frame, 90% mask).
    resize(segm, segm, frame.size(), 0, 0, INTER_NEAREST);
    addWeighted(frame, 0.1, segm, 0.9, 0.0, frame);
    // Display the inference time reported by the DNN profiler.
    std::vector<double> layersTimes;
    double freq = getTickFrequency() / 1000;
    double t = net.getPerfProfile(layersTimes) / freq;
    std::string label = format("Inference time: %.2f ms", t);
    putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX,
            0.5, Scalar(0, 255, 0));
    imshow("Semantic Segmentation", frame);
}
Creating a Legend
Display a legend showing class names and colors:
def showLegend(classes, colors):
    """Render a 'Legend' window mapping each class name to its color.

    Args:
        classes: list of class-name strings (may be None or empty).
        colors: list of BGR colors, parallel to ``classes``.
    """
    if not classes:
        return
    blockHeight = 30
    # Only rows that have both a name and a color can be drawn; sizing
    # the legend by len(colors) while indexing by len(classes) would
    # raise IndexError when the lists disagree.
    numRows = min(len(classes), len(colors))
    if numRows == 0:
        return
    legend = np.zeros((blockHeight * numRows, 200, 3), np.uint8)
    for i in range(numRows):
        block = legend[i * blockHeight:(i + 1) * blockHeight]
        block[:, :] = colors[i]
        cv.putText(block, classes[i], (0, blockHeight // 2),
                   cv.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))
    cv.namedWindow('Legend', cv.WINDOW_NORMAL)
    cv.imshow('Legend', legend)

# Call in main loop
showLegend(classes, colors)
C++ version:
// Render a "Legend" window mapping each class name to its mask color.
// The legend image is built once and cached in a static Mat.
void showLegend() {
    static const int kBlockHeight = 30;
    static Mat legend;
    if (legend.empty()) {
        // Draw only rows that have both a name and a color: indexing
        // colors[i] with i bounded by classes.size() reads out of
        // bounds whenever the two lists disagree in length.
        const int numClasses =
            (classes.size() < colors.size()) ? (int)classes.size()
                                             : (int)colors.size();
        if (numClasses == 0)
            return;
        legend.create(kBlockHeight * numClasses, 200, CV_8UC3);
        for (int i = 0; i < numClasses; i++) {
            Mat block = legend.rowRange(i * kBlockHeight, (i + 1) * kBlockHeight);
            block.setTo(colors[i]);
            putText(block, classes[i], Point(0, kBlockHeight / 2),
                    FONT_HERSHEY_SIMPLEX, 0.5, Vec3b(255, 255, 255));
        }
        namedWindow("Legend", WINDOW_NORMAL);
        imshow("Legend", legend);
    }
}
Model Configurations
ENet (Torch)
enet:
model: "Enet-model-best.net"
mean: [0, 0, 0]
scale: 0.00392 # 1/255
width: 512
height: 256
rgb: true
classes: "enet-classes.txt"
Download: https://github.com/e-lab/ENet-training
Classes: 20 road scene classes (Cityscapes-style)
FCN-8s (Caffe)
fcn8s:
model: "fcn8s-heavy-pascal.caffemodel"
config: "fcn8s-heavy-pascal.prototxt"
mean: [0, 0, 0]
scale: 1.0
width: 500
height: 500
rgb: false
Download: http://dl.caffe.berkeleyvision.org/fcn8s-heavy-pascal.caffemodel
Classes: 21 PASCAL VOC classes
FCN-ResNet101 (ONNX)
fcnresnet101:
model: "fcn-resnet101-11.onnx"
mean: [103.5, 116.2, 123.6]
scale: 0.019
width: 500
height: 500
rgb: false
Download: https://github.com/onnx/models (ONNX Model Zoo)
Common Segmentation Classes
Cityscapes (ENet)
PASCAL VOC (FCN)
Road scene segmentation with 20 classes (the 19 listed below plus an unlabeled/background class):
- road
- sidewalk
- building
- wall
- fence
- pole
- traffic light
- traffic sign
- vegetation
- terrain
- sky
- person
- rider
- car
- truck
- bus
- train
- motorcycle
- bicycle
General object segmentation with 21 classes:
- background
- aeroplane
- bicycle
- bird
- boat
- bottle
- bus
- car
- cat
- chair
- cow
- dining table
- dog
- horse
- motorbike
- person
- potted plant
- sheep
- sofa
- train
- tv/monitor
Use GPU Acceleration
# Route inference through CUDA. NOTE(review): requires an OpenCV build
# compiled with CUDA support — confirm before shipping, otherwise the
# DNN module falls back or errors at runtime.
net.setPreferableBackend(cv.dnn.DNN_BACKEND_CUDA)
net.setPreferableTarget(cv.dnn.DNN_TARGET_CUDA)
Reduce Input Size
Smaller input sizes process faster but may lose detail (ENet's original input size is 512x256):
# Default ENet input size (512x256), pixels scaled to [0,1], RGB order.
blob = cv.dnn.blobFromImage(frame, 1.0/255.0, (512, 256), [0, 0, 0], True)
# Faster: 256x128 — half the resolution per axis, so roughly a quarter
# of the work, at the cost of coarser masks.
blob = cv.dnn.blobFromImage(frame, 1.0/255.0, (256, 128), [0, 0, 0], True)
Use Efficient Models
Choose models based on speed/accuracy tradeoff:
- ENet: Real-time, good for road scenes
- FCN-8s: Moderate speed, high accuracy
- DeepLab: Best accuracy, slower
Blending Segmentation with Original Image
Adjust the blend ratio for different visualization effects:
# Heavy segmentation overlay (90% segmentation, 10% original)
frame = (0.1 * frame + 0.9 * segm).astype(np.uint8)
# Balanced overlay (50% each)
frame = (0.5 * frame + 0.5 * segm).astype(np.uint8)
# Light segmentation overlay (30% segmentation, 70% original)
frame = (0.7 * frame + 0.3 * segm).astype(np.uint8)
# Using OpenCV addWeighted (C++/Python) — avoids the intermediate
# float array created by the NumPy expressions above.
output = cv.addWeighted(frame, 0.3, segm, 0.7, 0.0)
Complete Example with Video
import cv2 as cv
import numpy as np
def main():
    """Run ENet semantic segmentation on a video stream and display it."""
    # Load the network and pin it to the default OpenCV CPU backend.
    model = 'Enet-model-best.net'
    net = cv.dnn.readNet(model)
    net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
    net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)

    # One class name per line.
    with open('enet-classes.txt', 'rt') as f:
        classes = f.read().rstrip('\n').split('\n')

    # Reproducible palette: black background, then each color is the
    # average of the previous color and a random one. Averaging is done
    # in int space because adding two uint8 arrays wraps modulo 256 and
    # corrupts the mean.
    np.random.seed(324)
    colors = [np.array([0, 0, 0], np.uint8)]
    for i in range(1, len(classes)):
        prev = colors[i - 1].astype(np.int32)
        rand = np.random.randint(0, 256, [3])
        colors.append(((prev + rand) // 2).astype(np.uint8))
    palette = np.asarray(colors, dtype=np.uint8)  # (numClasses, 3) LUT

    cap = cv.VideoCapture(0)  # or video file
    cv.namedWindow('Segmentation', cv.WINDOW_NORMAL)
    try:
        while cv.waitKey(1) < 0:
            hasFrame, frame = cap.read()
            if not hasFrame:
                break
            frameHeight, frameWidth = frame.shape[:2]

            # ENet preprocessing: 512x256, [0,1] scale, RGB, no crop.
            blob = cv.dnn.blobFromImage(frame, 1.0/255.0, (512, 256),
                                        [0, 0, 0], True, crop=False)

            # Forward pass: [1, numClasses, H, W] score map.
            net.setInput(blob)
            score = net.forward()

            # Vectorized colorization: argmax over the class axis, then
            # one LUT lookup instead of a per-pixel Python loop.
            classIds = np.argmax(score[0], axis=0)
            segm = palette[classIds]
            segm = cv.resize(segm, (frameWidth, frameHeight),
                             interpolation=cv.INTER_NEAREST)

            # Blend: 10% original frame, 90% segmentation mask.
            frame = (0.1 * frame + 0.9 * segm).astype(np.uint8)

            # Show the inference time of the last forward pass.
            t, _ = net.getPerfProfile()
            label = f'Inference time: {t * 1000.0 / cv.getTickFrequency():.2f} ms'
            cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX,
                       0.5, (0, 255, 0))
            cv.imshow('Segmentation', frame)
    finally:
        # Release the camera/file handle even if the loop raises.
        cap.release()

if __name__ == '__main__':
    main()
Semantic segmentation is computationally expensive. For real-time applications on CPU, use lightweight models like ENet or reduce input resolution.
Source Code
Complete source code for semantic segmentation:
- Python:
samples/dnn/segmentation.py
- C++:
samples/dnn/segmentation.cpp