conv2d

Deadline

ended (2025-06-30 00:00 UTC)

Language

Python

GPU Types

A100, H100, L4, T4

Description

Implement a 2D convolution kernel that matches the reference implementation. The kernel should perform 2D convolution with the given specifications. We will benchmark different input sizes, kernel sizes, channel counts and batch sizes; all of them will be even numbers, with the exception of the batch size, which can sometimes be 1. We assume no padding and no striding (stride 1), and instead vary the size of the input and kernel, the number of channels, and the batch size.
Input: Tuple of (input_tensor, kernel)
- input_tensor: 4D tensor of shape (batch, channels, height, width) with arbitrary values
- kernel: 4D tensor of shape (channels, channels, kernelsize, kernelsize) with arbitrary values
Output: 4D tensor of shape (batch, channels, height-kernelsize+1, width-kernelsize+1) with convolved values

Reference Implementation

from utils import make_match_reference
import torch
import torch.nn.functional as F
from task import input_t, output_t

class DisableCuDNNTF32:
    """Context manager that disables cuDNN TF32 and enables cuDNN determinism.

    On entry, ``torch.backends.cudnn.allow_tf32`` is set to ``False`` and
    ``torch.backends.cudnn.deterministic`` is set to ``True``. On exit, both
    flags are restored to the values they had when the ``with`` block was
    entered, so an instance created ahead of time (or reused) never restores
    stale settings.

    Attributes:
        allow_tf32: Saved value of ``torch.backends.cudnn.allow_tf32``.
        deterministic: Saved value of ``torch.backends.cudnn.deterministic``.
    """

    def __init__(self):
        # Take an initial snapshot so both attributes exist right after
        # construction; __enter__ refreshes them before changing anything.
        self.allow_tf32 = torch.backends.cudnn.allow_tf32
        self.deterministic = torch.backends.cudnn.deterministic

    def __enter__(self):
        # Capture the flags as they are *now*: they may have been changed
        # since the instance was constructed.
        self.allow_tf32 = torch.backends.cudnn.allow_tf32
        self.deterministic = torch.backends.cudnn.deterministic
        torch.backends.cudnn.allow_tf32 = False
        torch.backends.cudnn.deterministic = True
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Restore unconditionally; returning None lets any exception raised
        # inside the ``with`` block propagate.
        torch.backends.cudnn.allow_tf32 = self.allow_tf32
        torch.backends.cudnn.deterministic = self.deterministic


def ref_kernel(data: input_t) -> output_t:
    """
    Compute the reference 2D convolution with PyTorch.

    The convolution runs inside ``DisableCuDNNTF32`` so that cuDNN TF32 is
    turned off and cuDNN determinism is turned on for the duration of the call.

    Args:
        data: Tuple of (input tensor, kernel tensor)
    Returns:
        Result of ``F.conv2d`` applied with stride 1 and no padding
    """
    with DisableCuDNNTF32():
        activations, weights = data
        # This problem fixes stride and padding; only the tensor sizes vary.
        # TODO: Can revisit this in future problems
        convolved = F.conv2d(activations, weights, stride=1, padding=0)
    return convolved


def generate_input(size: int, kernelsize: int, channels: int, batch: int, seed: int) -> input_t:
    """
    Build a seeded random (input, kernel) pair on the CUDA device.

    Both tensors are float32 and drawn with ``torch.randn`` from a single
    generator seeded with ``seed``; the input is drawn first, then the kernel.

    Args:
        size: Height and width of the square input
        kernelsize: Height and width of the square kernel
        channels: Channel count, used for both input and output channels
        batch: Number of samples in the batch
        seed: Seed for the CUDA random generator
    Returns:
        Tuple of (input tensor, kernel tensor)
    """
    rng = torch.Generator(device='cuda').manual_seed(seed)
    randn_opts = {'device': 'cuda', 'dtype': torch.float32, 'generator': rng}

    # Layout: [batch, in_channels, height, width]
    input_shape = (batch, channels, size, size)
    # Layout: [out_channels, in_channels, kernel_height, kernel_width];
    # out_channels equals in_channels to keep the problem simple.
    kernel_shape = (channels, channels, kernelsize, kernelsize)

    # Order matters: both draws consume the same generator stream.
    input_tensor = torch.randn(*input_shape, **randn_opts).contiguous()
    kernel = torch.randn(*kernel_shape, **randn_opts).contiguous()

    return (input_tensor, kernel)


# Checker built by utils.make_match_reference from ref_kernel, with relative
# and absolute tolerances of 1e-3.
# NOTE(review): presumably compares a submission's output against ref_kernel
# within those tolerances — confirm against utils.make_match_reference.
check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-3)

Rankings

L4

Chadlet 🥇	69302.430μs	AIDE_out.py
az 🥈	291155.807μs +221853.377μs	submission.py
ajhinh 🥉	311041.958μs +19886.151μs	l4.py

T4

az 🥇	971584.539μs	submission.py
ajhinh 🥈	1043712.628μs +72128.089μs	t4.py
Chadlet 🥉	1066887.741μs +23175.113μs	submission.py

A100

Chadlet 🥇	16719.655μs	AIDE_out.py
ajhinh 🥈	23116.982μs +6397.327μs	a100.py
yia_perf 🥉	23401.315μs +284.333μs	my_conv2d_kernel.py
siro	23420.895μs +19.580μs	submission.py
mancala	23458.855μs +37.960μs	ref.py
Phil Butler	23603.107μs +144.252μs	faafo.py
az	119748.801μs +96145.694μs	submission.py

H100

ajhinh 🥇	7574.126μs	h100.py
siro 🥈	7582.660μs +8.534μs	poopoo.py
Chadlet 🥉	7614.341μs +31.681μs	conv2d_AIDE_out.py
yechenzhi	47755.610μs +40141.269μs	ref.py
az	47778.981μs +23.371μs	submission.py