vectorsum

Deadline

41 days 17 hours (2025-06-30 00:00 UTC)

Language

Python

GPU Types

A100, H100, L4, T4

Description

Implement a vector sum reduction kernel. This kernel computes the sum of all elements in the input tensor. Input: A float32 tensor of shape `(N,)` whose values are drawn from a standard normal distribution (mean 0, variance 1) and then multiplied by a random scale between 0.1 and 10 and shifted by a random offset between -100 and 100. Output: A scalar value equal to the sum of all elements in the input tensor.

Reference Implementation

from utils import make_match_reference
import torch
from task import input_t, output_t


def ref_kernel(data: input_t) -> output_t:
    """
    Sum every element of ``data`` with PyTorch; this is the ground-truth result.
    Args:
        data: Input tensor to be reduced
    Returns:
        Scalar float32 tensor holding the sum of all elements
    """
    # Accumulate in float64 to keep rounding error small, then narrow the
    # final result back to float32.
    widened = data.to(torch.float64)
    total = widened.sum()
    return total.to(torch.float32)


def generate_input(size: int, seed: int) -> input_t:
    """
    Build the benchmark input: standard-normal samples put through a random
    scale and offset, so that trivial solutions are prevented.

    Three separate CUDA generators are used, seeded with ``seed`` (base
    samples), ``seed + 1`` (offset) and ``seed + 2`` (scale).

    Args:
        size: Number of elements in the returned 1-D tensor
        seed: Base seed for the random generators
    Returns:
        Contiguous float32 CUDA tensor to be reduced
    """
    def seeded(value: int) -> torch.Generator:
        # Fresh CUDA generator with its own seed, independent of the others.
        rng = torch.Generator(device='cuda')
        rng.manual_seed(value)
        return rng

    # Offset is drawn uniformly between -100 and 100
    shift = (torch.rand(1, device='cuda', generator=seeded(seed + 1)) * 200 - 100).item()
    # Scale is drawn uniformly between 0.1 and 10
    factor = (torch.rand(1, device='cuda', generator=seeded(seed + 2)) * 9.9 + 0.1).item()

    # Base standard-normal samples
    samples = torch.randn(size, device='cuda', dtype=torch.float32, generator=seeded(seed))

    # Scale first, then shift
    return (samples * factor + shift).contiguous()


# Module-level checker built by utils.make_match_reference from ref_kernel.
# NOTE(review): presumably compares a submission's output against ref_kernel's
# result — confirm the exact contract and tolerances in utils.
check_implementation = make_match_reference(ref_kernel)

Rankings

L4

tomaszki 🥇 665.082μs vectorsum.py
FourCore 🥈 941.813μs   +276.731μs submission.py
Snektron 🥉 957.137μs   +15.324μs a.py
Karang 964.620μs   +7.483μs vecsum_cuda.py
Darshan 971.431μs   +6.811μs vectorsum.py
ajhinh 983.189μs   +11.758μs l4.py
Sagar 1010.415μs   +27.226μs vectorsum_vannila_torch.py

T4

tomaszki 🥇 258.937μs vectorsum.py
Karang 🥈 793.310μs   +534.373μs vecsum_cuda.py
ajhinh 🥉 810.232μs   +16.922μs t4.py
Darshan 812.712μs   +2.480μs vectorsum.py
FourCore 815.884μs   +3.172μs vectorsum.py
Sagar 817.336μs   +1.453μs vectorsum_vannila_torch.py
az 265327.568μs   +264510.232μs submission_tinygrad.py

A100

tomaszki 🥇 99.117μs vectorsum.py
Snektron 🥈 154.198μs   +55.081μs a.py
FourCore 🥉 158.628μs   +4.431μs vectorsum.py
ajhinh 158.875μs   +0.247μs a100.py
LankyLad 183.723μs   +24.848μs submission.py
Karang 193.351μs   +9.628μs vecsum_cuda.py
Darshan 200.625μs   +7.274μs vectorsum.py
Bob 254.836μs   +54.212μs vectorsum.py

H100

tomaszki 🥇 73.976μs vectorsum.py
FourCore 🥈 93.859μs   +19.883μs vectorsum.py
Snektron 🥉 97.200μs   +3.340μs a.py
Darshan 97.452μs   +0.252μs vectorsum.py
Karang 98.200μs   +0.748μs vecsum_cuda.py
ajhinh 99.894μs   +1.694μs h100.py
DariusM 144.368μs   +44.475μs submission.py