conv2d_v2

Deadline

136 days 12 hours remaining (2025-12-30 00:00 UTC)

Language

Python

GPU Types

A100, B200, H100, L4

Description

Implement a 2D convolution kernel that matches the reference implementation. The kernel should perform 2D convolution with the given specifications. We will benchmark different sizes, kernel sizes, channel counts, and batch sizes; they will all be even numbers, with the exception of batch size, which can sometimes be 1. We assume no padding and no striding, and instead vary the size of the input and kernel, the number of channels, and the batch size. Input: Tuple of (input_tensor, kernel) - input_tensor: 4D tensor of shape (batch, channels, height, width) with arbitrary values - kernel: 4D tensor of shape (channels, channels, kernelsize, kernelsize) with arbitrary values Output: 4D tensor of shape (batch, channels, height-kernelsize+1, width-kernelsize+1) with convolved values

Reference Implementation

from utils import make_match_reference, DeterministicContext
import torch
import torch.nn.functional as F
from task import input_t, output_t


def ref_kernel(data: input_t) -> output_t:
    """
    Reference implementation of 2D convolution using PyTorch.

    Args:
        data: Tuple of (input tensor, kernel tensor, preallocated output
            tensor). The preallocated output is ignored here — F.conv2d
            allocates its own result; it is carried in the tuple for
            submissions that write their result in place.
    Returns:
        Output tensor of shape
        (batch, channels, height - kernelsize + 1, width - kernelsize + 1).
    """
    with DeterministicContext():
        # The third element (output buffer) is deliberately unused.
        input_tensor, kernel, _ = data
        return F.conv2d(
            input_tensor,
            kernel,
            # No padding and no striding
            stride=1,
            padding=0,
        )


def generate_input(
    size: int, kernelsize: int, channels: int, batch: int, seed: int
) -> input_t:
    """
    Build a seeded random (input, kernel, output) triple on the GPU.

    Returns:
        Tuple of (input tensor, kernel tensor, preallocated output tensor).
    """
    rng = torch.Generator(device="cuda")
    rng.manual_seed(seed)

    def _randn(*shape):
        # All draws share `rng`, so the call order below fixes the RNG stream.
        return torch.randn(
            *shape, device="cuda", dtype=torch.float32, generator=rng
        ).contiguous()

    # Input: [batch, in_channels, height, width] — drawn first.
    input_tensor = _randn(batch, channels, size, size)

    # Kernel: [out_channels, in_channels, kernel_height, kernel_width];
    # output channels match input channels for simplicity.
    kernel = _randn(channels, channels, kernelsize, kernelsize)

    # Spatial extent of a "valid" (no padding, stride 1) convolution.
    out_edge = size - kernelsize + 1
    output_tensor = torch.empty(
        batch,
        channels,
        out_edge,
        out_edge,
        device="cuda",
        dtype=torch.float32,
    )

    return input_tensor, kernel, output_tensor


# Checker used by the harness: compares a submission's output against
# ref_kernel elementwise within the given relative/absolute tolerances.
check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-3)

No submissions yet

Be the first to submit a solution for this challenge!