Introduction to GPU-Accelerated Image Processing with CuPy and cuCIM#

In this brief tutorial, weโ€™ll explore how to use CuPy and cuCIM for high-performance image processing on NVIDIA GPUs.

Prerequisites#

Ensure you have CuPy and cuCIM installed in your environment. If not, you can install them for your specific CUDA version:

Install for CUDA 12:

pip install cucim-cu12

Alternatively install for CUDA 11:

pip install cucim-cu11
from skimage import io
import cucim.skimage
import cupy as cp
import matplotlib.pyplot as plt
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[1], line 2
      1 from skimage import io
----> 2 import cucim.skimage
      3 import cupy as cp
      4 import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'cucim'

Load and Display an Image#

image = io.imread('EPFL_aereal.jpeg')
# Display the image
plt.imshow(image)
plt.title('Original Image')
plt.axis('off') 
plt.show()
../../../_images/397a1ecd225317c13559e42d60ab3d1f28ec8536e35f6f371b2ea7fb58161005.png

Convert Image to Grayscale using CuPy#

Here we utilize CuPy to convert the image to grayscale, demonstrating the GPUโ€™s capability to handle pixel-wise operations.

def rgb2gray_cupy(result, image):    
    result[...] = (0.21 * image[..., 0] + 0.72 * image[..., 1] + 0.07 * image[..., 2])
    return result


# Send image to device
image_device = cp.asarray(image)
gray_image_device = cp.zeros_like(image_device[...,0])

# Convert and time the operation
start_gpu = cp.cuda.Event(); end_gpu = cp.cuda.Event()
start_gpu.record()
gray_image_device = rgb2gray_cupy(gray_image_device, image_device)
end_gpu.record()
end_gpu.synchronize()
gpu_time = cp.cuda.get_elapsed_time(start_gpu, end_gpu)

gray_image = gray_image_device.get()

# Display the grayscale image
plt.imshow(gray_image, cmap='gray')  # .get() transfers data back to host
plt.title('Grayscale Image')
plt.axis('off')
plt.show()

print(f"Conversion Time on GPU: {gpu_time:.5f} ms")
../../../_images/732511f66ed8f9e5117e02da4bcd07ad41755b9cdf1f64f83176df73df9bc533.png
Conversion Time on GPU: 0.69427 ms

Advanced Image Processing: Edge Detection using cuCIM#

(Example edited from cuCIMโ€™s documentation)

Next, letโ€™s apply an edge detection filter using cuCIM, which utilizes optimized GPU routines.

from cucim.skimage import filters

# Apply edge detection
edges = filters.sobel(gray_image_device)

# Display the edges
plt.imshow(edges.get(), cmap='gray')
plt.title('Edge Detection')
plt.axis('off')
plt.show()
../../../_images/6ebf1e4fe1223dbdce49ec4445d027e81fa5c58a59a9f626cf0fa9f669d5b121.png

Conclusion

This tutorial introduced you to basic and slightly advanced GPU-accelerated image processing techniques using CuPy and cuCIM. By offloading computationally intensive tasks to the GPU, significant performance improvements can be achieved, as demonstrated in the grayscale conversion and edge detection examples.

import time

import cupy as cp
import matplotlib.pyplot as plt
import numpy as np
from skimage import data

durations = {}
for use_gpu in (False, True):

    if use_gpu:
        from cupyx.scipy import ndimage as ndi
        from cucim.skimage.util import img_as_float32
        from cucim.skimage.filters import gabor_kernel
        xp = cp
        asnumpy = cp.asnumpy
        device_name = "gpu"
    else:
        from scipy import ndimage as ndi
        from skimage.util import img_as_float32
        from skimage.filters import gabor_kernel
        xp = np
        asnumpy = np.asarray
        device_name = "cpu"

        
    def compute_feats(image, kernels):
        feats = xp.zeros((len(kernels), 2), dtype=np.double)
        for k, kernel in enumerate(kernels):
            filtered = ndi.convolve(image, kernel, mode='wrap')
            feats[k, 0] = filtered.mean()
            feats[k, 1] = filtered.var()
        return feats


    def match(feats, ref_feats):
        min_error = np.inf
        min_i = None
        for i in range(ref_feats.shape[0]):
            error = xp.sum((feats - ref_feats[i, :])**2)
            if error < min_error:
                min_error = error
                min_i = i
        return min_i

    tstart = time.time()

    # prepare filter bank kernels
    kernels = []
    for theta in range(4):
        theta = theta / 4. * np.pi
        for sigma in (1, 3):
            for frequency in (0.05, 0.25):
                kernel = gabor_kernel(frequency, theta=theta,
                                      sigma_x=sigma, sigma_y=sigma)
                kernels.append(kernel.real)


    brick = img_as_float32(xp.asarray(data.brick()))  
    grass = img_as_float32(xp.asarray(data.grass()))
    gravel = img_as_float32(xp.asarray(data.gravel()))
    image_names = ('brick', 'grass', 'gravel')
    images = (brick, grass, gravel)

    # prepare reference features
    ref_feats = xp.zeros((3, len(kernels), 2), dtype=np.double)
    ref_feats[0, :, :] = compute_feats(brick, kernels)
    ref_feats[1, :, :] = compute_feats(grass, kernels)
    ref_feats[2, :, :] = compute_feats(gravel, kernels)

    print('Rotated images matched against references using Gabor filter banks:')

    print('original: brick, rotated: 30deg, match result: ', end='')
    feats = compute_feats(ndi.rotate(brick, angle=190, reshape=False), kernels)
    print(image_names[match(feats, ref_feats)])

    print('original: brick, rotated: 70deg, match result: ', end='')
    feats = compute_feats(ndi.rotate(brick, angle=70, reshape=False), kernels)
    print(image_names[match(feats, ref_feats)])

    print('original: grass, rotated: 145deg, match result: ', end='')
    feats = compute_feats(ndi.rotate(grass, angle=145, reshape=False), kernels)
    print(image_names[match(feats, ref_feats)])


    def power(image, kernel):
        # Normalize images for better comparison.
        image = (image - image.mean()) / image.std()
        return xp.sqrt(ndi.convolve(image, kernel.real, mode='wrap')**2 +
                       ndi.convolve(image, kernel.imag, mode='wrap')**2)

    # Plot a selection of the filter bank kernels and their responses.
    results = []
    kernel_params = []
    for theta in (0, 1):
        theta = theta / 4. * np.pi
        for frequency in (0.1, 0.4):
            kernel = gabor_kernel(frequency, theta=theta)
            params = 'theta=%d,\nfrequency=%.2f' % (theta * 180 / np.pi, frequency)
            kernel_params.append(params)
            # Save kernel and the power image for each image
            results.append((kernel, xp.stack([power(img, kernel) for img in images])))
            
    dur = time.time() - tstart
    print(f"Duration {device_name} = {dur} s")
    durations[device_name] = dur

    fig, axes = plt.subplots(nrows=5, ncols=4, figsize=(12, 14))
    plt.gray()

    fig.suptitle('Image responses for Gabor filter kernels', fontsize=12)

    axes[0][0].axis('off')

    # Plot original images
    for label, img, ax in zip(image_names, images, axes[0][1:]):
        ax.imshow(asnumpy(img))
        ax.set_title(label, fontsize=9)
        ax.axis('off')

    for label, (kernel, powers), ax_row in zip(kernel_params, results, axes[1:]):
        # Plot Gabor kernel
        ax = ax_row[0]
        ax.imshow(asnumpy(kernel.real))
        ax.set_ylabel(label, fontsize=7)
        ax.set_xticks([])
        ax.set_yticks([])

        # Plot Gabor responses with the contrast normalized for each filter
        vmin = float(powers.min())
        vmax = float(powers.max())
        for patch, ax in zip(powers, ax_row[1:]):
            ax.imshow(asnumpy(patch), vmin=vmin, vmax=vmax)
            ax.axis('off')
    plt.tight_layout()
    plt.show()
    
print(f"GPU Acceleration = {durations['cpu']/durations['gpu']:0.4f}")
Rotated images matched against references using Gabor filter banks:
original: brick, rotated: 30deg, match result: brick
original: brick, rotated: 70deg, match result: brick
original: grass, rotated: 145deg, match result: brick
Duration cpu = 6.299292802810669 s
../../../_images/82e9f4f3c28c0635ee5e09c4c97e27db7d53c3e7fa896324f4eacf5118a2d8a3.png
Rotated images matched against references using Gabor filter banks:
original: brick, rotated: 30deg, match result: brick
original: brick, rotated: 70deg, match result: brick
original: grass, rotated: 145deg, match result: brick
Duration gpu = 0.13782525062561035 s
../../../_images/78c93e8837a1477b405b53e9df833c142b9d5c678268815ae335a0cceed6ff70.png
GPU Acceleration = 45.7049