vectoradd

Deadline

41 days 20 hours (2025-06-30 00:00 UTC)

Language

Python

GPU Types

A100, H100, L4, T4

Description

Implement a float16 vector addition kernel that computes the element-wise sum of two tensors. Input: tuple(torch.Tensor, torch.Tensor) with tensors of shape (N, N) and type torch.float16. The tensor values are drawn from a normal distribution with mean 0 and variance 1. Output: torch.Tensor of shape (N, N) and type torch.float16.

Reference Implementation

from utils import make_match_reference
import torch
from task import input_t, output_t


def ref_kernel(data: input_t) -> output_t:
    """
    Compute the reference result by adding the two inputs with PyTorch.

    Args:
        data: Pair (A, B) holding the two operands of the addition.
    Returns:
        The result of ``A + B``.
    """
    lhs, rhs = data
    total = lhs + rhs
    return total


def generate_input(size: int, seed: int) -> input_t:
    """
    Build a reproducible pair of random square float16 tensors on the GPU.

    Args:
        size: Length of each side; both tensors have shape (size, size).
        seed: Seed for the CUDA random generator shared by both draws.
    Returns:
        Tuple (A, B) of contiguous float16 CUDA tensors sampled with
        ``torch.randn``.
    """
    rng = torch.Generator(device='cuda').manual_seed(seed)
    shape = (size, size)
    # Both draws share one generator: A is sampled first, then B, so a given
    # seed always yields the same pair.
    A, B = (
        torch.randn(*shape, device='cuda', dtype=torch.float16, generator=rng).contiguous()
        for _ in range(2)
    )
    return (A, B)


# Checker built from the reference kernel via utils.make_match_reference.
# NOTE(review): presumably this compares a submission's output against
# ref_kernel's output — confirm against utils, which is not visible here.
check_implementation = make_match_reference(ref_kernel)

Rankings

L4

FourCore 🥇 6515.646μs vectoradd.py
Brent 🥈 6610.478μs   +94.832μs submission_tuned_L4.py
ajhinh 🥉 6711.567μs   +101.089μs l4.py
Snektron 6803.561μs   +91.995μs a.py
everettknag 6843.141μs   +39.580μs vectoradd_triton_1d.py
truk@PLT 6851.278μs   +8.137μs submission.py
Ali Panahi 6853.392μs   +2.114μs submission.py
arund42 7029.181μs   +175.789μs vectoradd_128b_generic.py
Sagar 7114.675μs   +85.494μs vectorAdd_half2.py
mashisong 7242.812μs   +128.137μs vectoradd_triton.py
Trax 9081.337μs   +1838.525μs custom_kernel.py

T4

ajhinh 🥇 6270.295μs t4.py
akitaka_99 🥈 6308.868μs   +38.573μs vectoradd.py
Shlok 🥉 6325.110μs   +16.242μs vectoradd.py
Darshan 6332.942μs   +7.832μs vector_add_torch.py
Moses 6355.233μs   +22.291μs vector_sum_baseline.py
everettknag 6399.588μs   +44.355μs vectoradd_triton_1d.py
siclait 6407.439μs   +7.851μs submission.py
Dhanshre 6410.973μs   +3.535μs vectoradd.py
Haw 6499.274μs   +88.301μs submission_cuda_inline.py
Brent 6515.912μs   +16.637μs submission_tuned_T4.py
salad 6528.092μs   +12.180μs ref.py
FourCore 6535.435μs   +7.343μs vectoraddAsyncCopy.py
truk@PLT 6561.160μs   +25.725μs submission.py
arund42 6625.695μs   +64.535μs vectoradd_128b_generic.py
Snektron 6665.361μs   +39.666μs b.py
mashisong 7515.279μs   +849.918μs vectoradd_triton.py
Trax 9046.479μs   +1531.200μs custom_kernel.py
Gokul 9162.481μs   +116.002μs submission.py

A100

mancala 🥇 927.572μs submission.py
ajhinh 🥈 930.154μs   +2.582μs a100.py
pooya 🥉 960.983μs   +30.829μs vectoradd.py
Snektron 967.257μs   +6.274μs a.py
FourCore 977.404μs   +10.147μs vectoradd.py
truk@PLT 978.570μs   +1.166μs submission.py
Brent 987.544μs   +8.974μs submission_tuned_A100.py
mashisong 1032.990μs   +45.446μs vectoradd_triton.py
teddy_chou 1045.286μs   +12.296μs vecadd.py
everettknag 1118.644μs   +73.357μs vectoradd_triton_1d.py
arund42 1212.637μs   +93.993μs vectoradd_128b_generic.py
DizzleRama 1251.470μs   +38.834μs submission_cuda_inline.py
Trax 1538.904μs   +287.433μs custom_kernel.py
Gokul 2118.579μs   +579.675μs submission.py
veer 3704.028μs   +1585.450μs submission.py

H100

DUMBPANDABEAR 🥇 539.177μs submission_2.py
Snektron 🥈 539.416μs   +0.239μs b.py
mancala 🥉 539.580μs   +0.164μs submission.py
truk@PLT 540.120μs   +0.540μs submission.py
ajhinh 540.823μs   +0.703μs h100.py
arund42 541.140μs   +0.317μs vectoradd_128b_H100.py
Xavier Init 543.016μs   +1.875μs dimension_check.py
FourCore 550.080μs   +7.065μs vectoradd.py
.zos 557.373μs   +7.293μs i.py
osehn 575.002μs   +17.629μs vectoradd_triton_1d.py
Brent 603.834μs   +28.832μs submission.py
akitaka_99 607.936μs   +4.102μs vectoradd.py
jaynux_56616 613.304μs   +5.368μs triton_cuda_vector_add_kernel.py
mashisong 618.988μs   +5.684μs vectoradd_triton.py
everettknag 693.877μs   +74.889μs vectoradd_triton_1d.py
Trax 732.034μs   +38.157μs custom_kernel.py
Gokul 1321.964μs   +589.930μs submission.py