matmul_v2

Deadline

136 days 12 hours remaining (2025-12-30 00:00 UTC)

Language

Python

GPU Types

A100, B200, H100, L4

Description

Implement a custom matmul function that matches the reference implementation. The function should handle a tuple of input tensors and apply matmul. The shapes of all outer and inner dimensions of the tensors are multiples of 16.

Reference Implementation

import torch
from task import input_t, output_t
from utils import make_match_reference, DeterministicContext


def generate_input(m: int, n: int, k: int, seed: int) -> input_t:
    """Build one seeded matmul test case on the CUDA device.

    Args:
        m: Row count of the left operand and of the output buffer.
        n: Column count of the right operand and of the output buffer.
        k: Shared inner dimension of the two operands.
        seed: Seed for the CUDA generator used to fill the operands.

    Returns:
        A tuple ``(a, b, c)`` of float16 CUDA tensors: ``a`` is ``(m, k)`` and
        ``b`` is ``(k, n)``, both filled via ``uniform_(0, 1)``; ``c`` is an
        ``(m, n)`` tensor from ``torch.empty`` whose contents are left
        uninitialised.
    """
    rng = torch.Generator(device='cuda')
    rng.manual_seed(seed)

    def _alloc(rows: int, cols: int) -> torch.Tensor:
        # Every tensor in a test case shares the same device and dtype.
        return torch.empty(rows, cols, device='cuda', dtype=torch.float16)

    # Fill order matters: both draws come from the same generator, so the
    # left operand must be sampled before the right one.
    lhs = _alloc(m, k).uniform_(0, 1, generator=rng)
    rhs = _alloc(k, n).uniform_(0, 1, generator=rng)
    out = _alloc(m, n)
    return lhs, rhs, out


def ref_kernel(data: input_t) -> output_t:
    """Compute the reference matmul result for one generated input.

    Args:
        data: Tuple whose first two entries are the left and right operands.
            Any further entries are ignored (``generate_input`` also returns
            a third tensor ``c``).

    Returns:
        The matrix product ``a @ b``.
    """
    with DeterministicContext():
        # generate_input yields (a, b, c); a strict two-name unpack raises
        # ValueError on that 3-tuple, so take the operands and drop the rest.
        a, b, *_ = data
        return a @ b


# Module-level checker built from ref_kernel. make_match_reference comes from
# utils and is not visible here — presumably it returns a callable that
# compares a submission's output against ref_kernel's; TODO confirm.
check_implementation = make_match_reference(ref_kernel)

No submissions yet

Be the first to submit a solution for this challenge!