grayscale

Deadline

41 days 19 hours (2025-06-30 00:00 UTC)

Language

Python

GPU Types

A100, H100, L4, T4

Description

Implement an RGB-to-grayscale conversion kernel that matches the reference implementation. The kernel should convert square RGB images with even side lengths to grayscale using the standard coefficients: Y = 0.2989 R + 0.5870 G + 0.1140 B. Input: RGB tensor of shape (H, W, 3) with values in [0, 1]. Output: grayscale tensor of shape (H, W) with values in [0, 1].

Reference Implementation

from utils import make_match_reference
import torch
from task import input_t, output_t


def ref_kernel(data: input_t) -> output_t:
    """
    Convert an RGB image tensor to grayscale with PyTorch (reference version).

    Each pixel's channels are combined as Y = 0.2989 R + 0.5870 G + 0.1140 B.

    Args:
        data: RGB tensor of shape (H, W, 3) with values in [0, 1]
    Returns:
        Grayscale tensor of shape (H, W) with values in [0, 1]
    """
    # Build the per-channel coefficients with the input's dtype and device so
    # the broadcasted multiply needs no implicit conversion or transfer.
    luma_coeffs = torch.tensor(
        (0.2989, 0.5870, 0.1140),
        dtype=data.dtype,
        device=data.device,
    )
    # Scale every channel, then collapse the trailing (channel) axis.
    weighted = data * luma_coeffs
    return weighted.sum(dim=-1)


def generate_input(size: int, seed: int) -> input_t:
    """
    Create a reproducible random square RGB image on the CUDA device.

    Args:
        size: Height and width of the generated image.
        seed: Seed for the CUDA random generator; the same seed and size
            produce the same tensor.
    Returns:
        Contiguous float32 tensor of shape (size, size, 3) holding uniform
        samples from torch.rand (values in [0, 1))
    """
    # A dedicated generator keeps the result independent of global RNG state.
    rng = torch.Generator(device='cuda').manual_seed(seed)
    image = torch.rand(
        (size, size, 3),
        generator=rng,
        dtype=torch.float32,
        device='cuda',
    )
    return image.contiguous()


# Checker for submissions: built by make_match_reference from ref_kernel,
# with relative and absolute tolerances both set to 1e-4.
check_implementation = make_match_reference(
    ref_kernel,
    rtol=1e-4,
    atol=1e-4,
)

Rankings

L4

nikhilap 🥇 9109.776μs grayscale.py
cudawarped 🥈 16225.609μs   +7115.833μs submission_cuda_inline3.py
Shinsato Masumi 🥉 16465.310μs   +239.701μs submission3.py
geohot 16908.299μs   +442.990μs tinygrad.py
Karang 16921.183μs   +12.884μs grayscale_v1_cuda.py
pongtsu 16955.365μs   +34.182μs grayscale_fused.py
truk@PLT 17039.918μs   +84.554μs submission.py
Snektron 17067.446μs   +27.528μs aaa.py
mobicham 17073.246μs   +5.800μs grayscale_cuda_v3.py
NJR 17120.590μs   +47.344μs drjit_v1.py
blueblue 17136.907μs   +16.317μs sub4.py
ajhinh 17180.198μs   +43.291μs l4.py
Quantizr 17180.371μs   +0.173μs grayscale_L4.py
FourCore 17369.045μs   +188.674μs grayscale.py
salykova 17395.948μs   +26.903μs v2.py
gau.nernst 17496.732μs   +100.784μs submission_float12.py
Chadlet 17624.908μs   +128.176μs AIDE_out_grayscale.py
charles_irl 17647.015μs   +22.107μs triton.py
Karan Jakhar 17869.179μs   +222.164μs triton_sub.py
Trax 18322.243μs   +453.065μs custom_kernel.py
rb 19241.309μs   +919.066μs submission.py
siro 43783.646μs   +24542.337μs submission.py

T4

nikhilap 🥇 8748.413μs grayscale.py
cudawarped 🥈 16143.984μs   +7395.572μs submission_cuda_inline_base.py
Shinsato Masumi 🥉 16146.785μs   +2.801μs submission3.py
pongtsu 16253.513μs   +106.728μs grayscale_inline.py
truk@PLT 16297.267μs   +43.753μs submission.py
Joshua Swartz 16468.595μs   +171.328μs submission.py
ajhinh 16582.775μs   +114.180μs t4.py
blueblue 16692.272μs   +109.496μs submission.py
Karang 16807.300μs   +115.029μs grayscale_v1_cuda.py
NJR 17147.234μs   +339.934μs drjit_v2.py
charles_irl 17232.580μs   +85.346μs triton.py
FourCore 17245.283μs   +12.703μs grayscale.py
Karan Jakhar 17258.813μs   +13.529μs triton_sub.py
Chadlet 17270.829μs   +12.016μs AIDE_out_grayscale.py
Trax 17316.619μs   +45.790μs custom_kernel.py
tomaszki 17338.018μs   +21.399μs grayscale.py
Anthony 17366.782μs   +28.764μs submission.py
gau.nernst 17926.891μs   +560.109μs submission.py
Vlad 19992.128μs   +2065.237μs g_autotune_triton.py
siro 48176.454μs   +28184.326μs submission.py
Sharon 48336.262μs   +159.808μs from_task_import_input_t_output_t.py
Blue×Kill 76614.551μs   +28278.289μs grayscale_lb5.py

A100

nikhilap 🥇 1399.930μs grayscale.py
Shinsato Masumi 🥈 2428.769μs   +1028.840μs submission.py
salykova 🥉 2437.997μs   +9.227μs inline_cuda_ptx.py
cudawarped 2441.012μs   +3.016μs submission_cuda_inline1.py
tomaszki 2464.851μs   +23.839μs grayscale.py
FourCore 2530.905μs   +66.054μs grayscale.py
truk@PLT 2536.422μs   +5.518μs submission.py
dejavucoder 2541.482μs   +5.060μs fastest.py
Snektron 2555.310μs   +13.828μs aaa.py
Chadlet 2582.038μs   +26.728μs AIDE_out_grayscale.py
ajhinh 2606.225μs   +24.186μs a100.py
NJR 2670.595μs   +64.371μs drjit_v2.py
pongtsu 2697.218μs   +26.623μs grayscale_fused.py
mobicham 3068.234μs   +371.015μs grayscale_v3.7_a100_.py
Leiko 3082.762μs   +14.528μs lined.py
Quantizr 3087.569μs   +4.807μs grayscale_a100.py
geohot 3089.071μs   +1.502μs tinygrad.py
Karang 3094.217μs   +5.147μs grayscale_v1_cuda.py
f14 3096.536μs   +2.319μs submission.py
blueblue 3146.586μs   +50.050μs submission.py
dumball 3155.315μs   +8.729μs kernel.py
Anthony 3160.102μs   +4.787μs submission.py
gau.nernst 3163.628μs   +3.526μs submission.py
jack 3177.263μs   +13.635μs submission.py
Karan Jakhar 3184.078μs   +6.815μs triton_sub.py
Joshua Swartz 3185.983μs   +1.905μs tr2.py
Trax 3209.451μs   +23.468μs custom_kernel.py
charles_irl 3271.022μs   +61.571μs triton.py
Anne Ouyang 3289.363μs   +18.341μs submission.py
artem 3338.851μs   +49.487μs test.py
Tuna Tuncer 6325.797μs   +2986.947μs submission.py
osborn0016 9210.789μs   +2884.992μs submission.py
parrotsky 9262.040μs   +51.251μs submission.py
siclait 9267.880μs   +5.840μs submission.py
Art Moskvin 9280.346μs   +12.466μs submission.py
david_li_55686 9285.460μs   +5.114μs submission.py
mooglevich 9290.226μs   +4.766μs submission.py
gauravgokhale 9293.438μs   +3.212μs submission.py
cloudysky123_18954 9300.627μs   +7.190μs submission.py
youyc22_78608 9311.323μs   +10.695μs submission.py
_kernelfolw_ 9321.665μs   +10.342μs submission.py
egghao 9379.731μs   +58.066μs submission.py
legendary_fawn_56575 9381.815μs   +2.084μs submission.py
sridharnandigam 9412.135μs   +30.320μs submission.py
sahanp 9416.182μs   +4.047μs submission.py
Seraphim 10131.550μs   +715.368μs submission.py
siro 10159.300μs   +27.751μs submission.py
roby1805 11392.746μs   +1233.445μs submission.py
Smexy 14157.763μs   +2765.018μs submission.py
shikhar 18465.732μs   +4307.969μs lossfunk.py

H100

nikhilap 🥇 797.931μs grayscale.py
charles_irl 🥈 1042.208μs   +244.276μs triton.py
jack 🥉 1046.473μs   +4.266μs submission.py
Karang 1392.425μs   +345.952μs grayscale_v1_cuda.py
Snektron 1393.435μs   +1.010μs aaa.py
cudawarped 1394.261μs   +0.825μs submission_cuda_inline6a.py
Shinsato Masumi 1395.372μs   +1.112μs submission.py
mobicham 1399.076μs   +3.704μs grayscale_v3.5_h100_.py
tomaszki 1402.417μs   +3.340μs grayscale.py
salykova 1407.314μs   +4.897μs inline_cuda_ptx.py
truk@PLT 1407.989μs   +0.675μs submission.py
geohot 1408.142μs   +0.153μs tinygrad.py
Quantizr 1410.377μs   +2.235μs grayscale_h100.py
Nathan Wang 1419.196μs   +8.819μs grayscale.py
blueblue 1420.670μs   +1.474μs submission.py
az 1431.599μs   +10.929μs submission.py
dejavucoder 1432.831μs   +1.232μs fastest_h100.py
FourCore 1444.351μs   +11.520μs grayscale.py
Chadlet 1447.537μs   +3.186μs AIDE_out_grayscale.py
Joshua Swartz 1453.805μs   +6.268μs tr0.py
gau.nernst 1457.972μs   +4.167μs submission.py
Karan Jakhar 1459.655μs   +1.683μs triton_sub.py
pongtsu 1466.360μs   +6.705μs grayscale_triton.py
ajhinh 1469.029μs   +2.669μs h100.py
Trax 1473.298μs   +4.269μs custom_kernel.py
Anne Ouyang 1487.140μs   +13.842μs submission.py
Anthony 1491.799μs   +4.659μs submission.py
mancala 1542.927μs   +51.129μs submission.py
NJR 1590.000μs   +47.073μs drjit_v2.py
Leiko 1766.470μs   +176.470μs autotune_abuse.py
Vlad 1931.559μs   +165.089μs g_autotune_triton.py
Darshan 3454.750μs   +1523.191μs grayscale_triton.py
Seraphim 6084.559μs   +2629.809μs submission.py
siro 6118.796μs   +34.237μs submission.py
rcmalli 6283.132μs   +164.336μs submission.py