vectorsum

Deadline

ended (2025-06-30 00:00 UTC)

Language

Python

GPU Types

A100, H100, L4, T4

Description

Implement a vector sum reduction kernel. This kernel computes the sum of all elements in the input tensor. Input: A tensor of shape `(N,)` whose values are drawn from a normal distribution with mean 0 and variance 1 and then, as in the reference `generate_input` below, multiplied by a random scale and shifted by a random offset. Output: A scalar value equal to the sum of all elements in the input tensor.

Reference Implementation

from utils import make_match_reference
import torch
from task import input_t, output_t


def ref_kernel(data: input_t) -> output_t:
    """
    Sum every element of ``data`` with PyTorch; serves as the reference result.
    Args:
        data: Tensor whose elements are added together
    Returns:
        Tensor holding the total, cast to float32
    """
    # Accumulate in 64 bit to limit rounding error, then narrow the
    # finished total back to 32 bit.
    widened = data.to(torch.float64)
    total = widened.sum()
    return total.to(torch.float32)


def generate_input(size: int, seed: int, device: str = 'cuda') -> input_t:
    """
    Generates a reproducible random input tensor with random offset and scale.
    The data is first generated as standard normal, then scaled and offset
    to prevent trivial solutions.

    Args:
        size: Number of elements to generate
        seed: Seed for the base data; ``seed + 1`` and ``seed + 2`` seed the
            offset and scale draws respectively
        device: Device on which the generators and tensors are created.
            Defaults to ``'cuda'``, the value that was previously hard-coded,
            so existing two-argument callers behave as before.

    Returns:
        Tensor to be reduced
    """
    def _seeded(value: int) -> torch.Generator:
        # Build a fresh generator on the target device with a fixed seed.
        rng = torch.Generator(device=device)
        rng.manual_seed(value)
        return rng

    # Generate base random data
    data = torch.randn(size, device=device, dtype=torch.float32, generator=_seeded(seed))

    # Offset and scale come from separately seeded generators to avoid
    # correlation with the base data.
    # Random offset in [-100, 100)
    offset = (torch.rand(1, device=device, generator=_seeded(seed + 1)) * 200 - 100).item()
    # Random scale in [0.1, 10)
    scale = (torch.rand(1, device=device, generator=_seeded(seed + 2)) * 9.9 + 0.1).item()

    # Apply scale and offset
    return (data * scale + offset).contiguous()


# Checker built by passing the reference kernel to utils.make_match_reference.
# NOTE(review): presumably the returned callable compares a submission's output
# against ref_kernel's output -- confirm the exact contract/tolerances in utils.
check_implementation = make_match_reference(ref_kernel)

Rankings

L4

tomaszki 🥇	665.082μs	vectorsum.py
FourCore 🥈	941.813μs +276.731μs	submission.py
Snektron 🥉	957.137μs +15.324μs	a.py
Nader	957.155μs +0.018μs	submission.py
Karang	964.620μs +7.465μs	vecsum_cuda.py
Darshan	971.431μs +6.811μs	vectorsum.py
ajhinh	983.189μs +11.758μs	l4.py
Sagar	1010.415μs +27.226μs	vectorsum_vannila_torch.py

T4

tomaszki 🥇	258.937μs	vectorsum.py
Nader 🥈	781.245μs +522.308μs	submission.py
Karang 🥉	793.310μs +12.066μs	vecsum_cuda.py
ajhinh	810.232μs +16.922μs	t4.py
Darshan	812.712μs +2.480μs	vectorsum.py
FourCore	815.884μs +3.172μs	vectorsum.py
Sagar	817.336μs +1.453μs	vectorsum_vannila_torch.py
az	265327.568μs +264510.232μs	submission_tinygrad.py

A100

tomaszki 🥇	99.117μs	vectorsum.py
Nader 🥈	150.511μs +51.394μs	submission.py
Snektron 🥉	154.198μs +3.687μs	a.py
FourCore	158.628μs +4.431μs	vectorsum.py
ajhinh	158.875μs +0.247μs	a100.py
LankyLad	183.723μs +24.848μs	submission.py
Karang	193.351μs +9.628μs	vecsum_cuda.py
Darshan	200.625μs +7.274μs	vectorsum.py
Bob	254.836μs +54.212μs	vectorsum.py

H100

tomaszki 🥇	73.976μs	vectorsum.py
Nader 🥈	90.218μs +16.242μs	submission.py
FourCore 🥉	93.859μs +3.641μs	vectorsum.py
Snektron	97.200μs +3.340μs	a.py
Darshan	97.452μs +0.252μs	vectorsum.py
Karang	98.200μs +0.748μs	vecsum_cuda.py
yechenzhi	98.985μs +0.786μs	submission_cuda.py
ajhinh	99.894μs +0.909μs	h100.py
DariusM	144.368μs +44.475μs	submission.py