vectoradd
Deadline
10 days 14 hours remaining (2025-06-30 00:00 UTC)
GPU Types
A100, H100, L4, T4
Description
Implement a float16 vector addition kernel.
Input: tuple(torch.Tensor, torch.Tensor) holding two tensors, each of shape (N, N) and type torch.float16. Their elements are
sampled independently from a standard normal distribution (mean 0, variance 1).
Output: torch.Tensor of shape (N, N) and type torch.float16
Show
Copy
Reference Implementation
from utils import make_match_reference
import torch
from task import input_t, output_t
def ref_kernel(data: input_t) -> output_t:
    """
    Reference implementation of vector addition using PyTorch.

    Args:
        data: Tuple of tensors [A, B] to be added.

    Returns:
        Tensor containing the element-wise sums A + B.
    """
    # Unpacking enforces that exactly two operands were supplied.
    A, B = data
    return A + B
def generate_input(size: int, seed: int) -> input_t:
    """
    Generates a reproducible pair of random input tensors on the GPU.

    Args:
        size: Side length; each tensor is created with shape (size, size).
        seed: Seed for the CUDA random generator, so the same seed yields
            the same pair of tensors.

    Returns:
        Tuple of tensors [A, B] to be added, both float16 on the 'cuda' device.
    """
    # A dedicated generator keeps the draw independent of the global RNG state.
    gen = torch.Generator(device='cuda')
    gen.manual_seed(seed)
    # A is drawn before B from the same generator; swapping the order would
    # change which values end up in which tensor for a given seed.
    A = torch.randn(size, size, device='cuda', dtype=torch.float16, generator=gen).contiguous()
    B = torch.randn(size, size, device='cuda', dtype=torch.float16, generator=gen).contiguous()
    return (A, B)
# Checker built from the reference kernel.
# NOTE(review): presumably compares a submission's output against ref_kernel's
# output — confirm against utils.make_match_reference.
check_implementation = make_match_reference(ref_kernel)
Rankings
L4
Show All (13)
Nader 🥇
6487.993μs
submission.py
FourCore 🥈
6515.646μs
+27.653μs
vectoradd.py
Brent 🥉
6610.478μs
+94.832μs
submission_tuned_L4.py
ajhinh
6711.567μs
+101.089μs
l4.py
Snektron
6803.561μs
+91.995μs
a.py
lydodt
6813.978μs
+10.417μs
submission.py
everettknag
6843.141μs
+29.163μs
vectoradd_triton_1d.py
truk@PLT
6851.278μs
+8.137μs
submission.py
Ali Panahi
6853.392μs
+2.114μs
submission.py
arund42
7029.181μs
+175.789μs
vectoradd_128b_generic.py
Sagar
7114.675μs
+85.494μs
vectorAdd_half2.py
mashisong
7242.812μs
+128.137μs
vectoradd_triton.py
Trax
9081.337μs
+1838.525μs
custom_kernel.py
T4
Show All (20)
ajhinh 🥇
6270.295μs
t4.py
akitaka_99 🥈
6308.868μs
+38.573μs
vectoradd.py
lydodt 🥉
6309.171μs
+0.303μs
submission.py
Shlok
6325.110μs
+15.939μs
vectoradd.py
Darshan
6332.942μs
+7.832μs
vector_add_torch.py
Moses
6355.233μs
+22.291μs
vector_sum_baseline.py
everettknag
6399.588μs
+44.355μs
vectoradd_triton_1d.py
siclait
6407.439μs
+7.851μs
submission.py
Dhanshre
6410.973μs
+3.535μs
vectoradd.py
yechenzhi
6494.568μs
+83.594μs
submission_cuda_inline.py
Haw
6499.274μs
+4.707μs
submission_cuda_inline.py
Brent
6515.912μs
+16.637μs
submission_tuned_T4.py
salad
6528.092μs
+12.180μs
ref.py
FourCore
6535.435μs
+7.343μs
vectoraddAsyncCopy.py
truk@PLT
6561.160μs
+25.725μs
submission.py
arund42
6625.695μs
+64.535μs
vectoradd_128b_generic.py
Snektron
6665.361μs
+39.666μs
b.py
mashisong
7515.279μs
+849.918μs
vectoradd_triton.py
Trax
9046.479μs
+1531.200μs
custom_kernel.py
Gokul
9162.481μs
+116.002μs
submission.py
A100
Show All (17)
Nader 🥇
925.027μs
submission.py
mancala 🥈
927.572μs
+2.544μs
submission.py
ajhinh 🥉
930.154μs
+2.582μs
a100.py
pooya
960.983μs
+30.829μs
vectoradd.py
Snektron
967.257μs
+6.274μs
a.py
lydodt
973.853μs
+6.597μs
submission.py
FourCore
977.404μs
+3.551μs
vectoradd.py
truk@PLT
978.570μs
+1.166μs
submission.py
Brent
987.544μs
+8.974μs
submission_tuned_A100.py
mashisong
1032.990μs
+45.446μs
vectoradd_triton.py
teddy_chou
1045.286μs
+12.296μs
vecadd.py
everettknag
1118.644μs
+73.357μs
vectoradd_triton_1d.py
arund42
1212.637μs
+93.993μs
vectoradd_128b_generic.py
DizzleRama
1251.470μs
+38.834μs
submission_cuda_inline.py
Trax
1538.904μs
+287.433μs
custom_kernel.py
Gokul
2118.579μs
+579.675μs
submission.py
veer
3704.028μs
+1585.450μs
submission.py
H100
Show All (20)
DUMBPANDABEAR 🥇
539.177μs
submission_2.py
Snektron 🥈
539.416μs
+0.239μs
b.py
mancala 🥉
539.580μs
+0.164μs
submission.py
truk@PLT
540.120μs
+0.540μs
submission.py
jung
540.380μs
+0.260μs
submission.py
ajhinh
540.823μs
+0.443μs
h100.py
arund42
541.140μs
+0.317μs
vectoradd_128b_H100.py
Xavier Init
543.016μs
+1.875μs
dimension_check.py
FourCore
550.080μs
+7.065μs
vectoradd.py
lydodt
551.293μs
+1.213μs
submission.py
.zos
557.373μs
+6.080μs
i.py
osehn
575.002μs
+17.629μs
vectoradd_triton_1d.py
Brent
603.834μs
+28.832μs
submission.py
akitaka_99
607.936μs
+4.102μs
vectoradd.py
jaynux_56616
613.304μs
+5.368μs
triton_cuda_vector_add_kernel.py
mashisong
618.988μs
+5.684μs
vectoradd_triton.py
everettknag
693.877μs
+74.889μs
vectoradd_triton_1d.py
Trax
732.034μs
+38.157μs
custom_kernel.py
ALI
864.271μs
+132.238μs
VectorAdd.py
Gokul
1321.964μs
+457.692μs
submission.py