histogram

Deadline

41 days 17 hours (2025-06-30 00:00 UTC)

Language

Python

GPU Types

A100, H100, L4, T4

Description

Implement a histogram kernel that counts the number of elements falling into each bin across the specified range. The minimum and maximum values of the range are fixed to 0 and 100 respectively. All sizes are multiples of 16 and the number of bins is set to the size of the input tensor divided by 16. Input: `data` — a tensor of shape (size,).

Reference Implementation

from utils import verbose_allequal
import torch
from task import input_t, output_t


def ref_kernel(data: input_t) -> output_t:
    """
    Reference histogram implemented with PyTorch.

    Args:
        data: 1-D tensor of non-negative integer values, shape (size,).
    Returns:
        Tensor of per-bin counts (at least 256 bins, dtype int64).
    """
    # torch.bincount counts occurrences of each integer value; minlength
    # pads the result so all 256 possible uint8 bins are always present.
    num_bins = 256
    counts = torch.bincount(data, minlength=num_bins)
    return counts


def generate_input(size: int, contention: float, seed: int) -> input_t:
    """
    Build a random uint8 input tensor for the histogram task.

    Args:
        size: Number of elements (must be a multiple of 16)
        contention: Percentage in [0, 100] of positions overwritten with a
            single shared value, increasing atomic contention
        seed: Seed for the CUDA random generator
    Returns:
        Contiguous tensor of shape (size,) with values in [0, 255]
    """
    rng = torch.Generator(device='cuda')
    rng.manual_seed(seed)

    # Uniformly random bytes in [0, 255].
    values = torch.randint(0, 256, (size,), device='cuda', dtype=torch.uint8, generator=rng)

    # Overwrite roughly `contention` percent of positions with one shared
    # value so that atomic-based kernels experience contended updates.
    hot_value = torch.randint(0, 256, (), device='cuda', dtype=torch.uint8, generator=rng)
    hot_mask = torch.rand((size,), device='cuda', dtype=torch.float32, generator=rng) < (contention / 100.0)
    values[hot_mask] = hot_value

    return values.contiguous()


def check_implementation(data, output):
    """
    Compare a submission's histogram against the reference implementation.

    Args:
        data: The input tensor that was fed to the submission
        output: The submission's bin-count tensor
    Returns:
        '' on an exact match, otherwise a message listing the mismatches
        reported by verbose_allequal.
    """
    reference = ref_kernel(data)
    mismatches = verbose_allequal(output, reference)

    if mismatches:
        return "mismatch found! custom implementation doesn't match reference: " + " ".join(mismatches)

    return ''

Rankings

L4

FourCore 🥇 79.095μs histogram.py
tomaszki 🥈 87.895μs   +8.800μs histogram_2.py
Darshan 🥉 96.719μs   +8.824μs baseline.py
Multivac 116.011μs   +19.292μs submission_shared_mem.py
ajhinh 117.647μs   +1.636μs l4.py
Karang 153.385μs   +35.738μs histo_v2_cuda.py

T4

tomaszki 🥇 115.777μs histogram.py
FourCore 🥈 129.195μs   +13.419μs histogram.py
Darshan 🥉 130.399μs   +1.203μs baseline.py
ajhinh 212.836μs   +82.438μs t4.py
Karang 314.760μs   +101.924μs histo_v2_cuda.py
Multivac 393.546μs   +78.785μs submission_shared_mem.py

A100

mancala 🥇 38.125μs submission.py
tomaszki 🥈 44.217μs   +6.092μs histogram_2.py
Darshan 🥉 44.764μs   +0.547μs baseline.py
FourCore 46.678μs   +1.914μs histogram.py
Snektron 47.872μs   +1.193μs hist.py
Multivac 79.181μs   +31.309μs submission_shared_mem.py
ajhinh 81.788μs   +2.608μs a100.py
Shihab 100.419μs   +18.631μs submission.py
Karang 117.395μs   +16.976μs histo_v2_cuda.py

H100

mancala 🥇 26.493μs submission.py
FourCore 🥈 31.467μs   +4.973μs histogram.py
tomaszki 🥉 34.097μs   +2.630μs histogram.py
Darshan 44.387μs   +10.291μs baseline.py
Snektron 48.661μs   +4.274μs hist.py
Multivac 57.008μs   +8.347μs submission_shared_mem.py
ajhinh 58.318μs   +1.310μs h100.py
Karang 68.507μs   +10.189μs histo_v2_cuda.py
Shihab 79.570μs   +11.063μs submission.py