danieldk (HF Staff) committed
Commit f701ae7 · 1 Parent(s): a1bca63

Add sources

build.toml ADDED
@@ -0,0 +1,16 @@
+ [general]
+ name = "relu_metal"
+ universal = false
+
+ [torch]
+ src = [
+ "torch-ext/torch_binding.cpp",
+ "torch-ext/torch_binding.h"
+ ]
+
+ [kernel.activation]
+ backend = "metal"
+ src = [
+ "relu/relu.mm",
+ ]
+ depends = [ "torch" ]
flake.lock ADDED
@@ -0,0 +1,167 @@
+ {
+ "nodes": {
+ "flake-compat": {
+ "locked": {
+ "lastModified": 1747046372,
+ "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
+ "owner": "edolstra",
+ "repo": "flake-compat",
+ "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
+ "type": "github"
+ },
+ "original": {
+ "owner": "edolstra",
+ "repo": "flake-compat",
+ "type": "github"
+ }
+ },
+ "flake-compat_2": {
+ "locked": {
+ "lastModified": 1733328505,
+ "narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=",
+ "owner": "edolstra",
+ "repo": "flake-compat",
+ "rev": "ff81ac966bb2cae68946d5ed5fc4994f96d0ffec",
+ "type": "github"
+ },
+ "original": {
+ "owner": "edolstra",
+ "repo": "flake-compat",
+ "type": "github"
+ }
+ },
+ "flake-utils": {
+ "inputs": {
+ "systems": "systems"
+ },
+ "locked": {
+ "lastModified": 1731533236,
+ "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+ "type": "github"
+ },
+ "original": {
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "type": "github"
+ }
+ },
+ "flake-utils_2": {
+ "inputs": {
+ "systems": "systems_2"
+ },
+ "locked": {
+ "lastModified": 1731533236,
+ "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+ "type": "github"
+ },
+ "original": {
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "type": "github"
+ }
+ },
+ "hf-nix": {
+ "inputs": {
+ "flake-compat": "flake-compat_2",
+ "flake-utils": "flake-utils_2",
+ "nixpkgs": "nixpkgs"
+ },
+ "locked": {
+ "lastModified": 1750234878,
+ "narHash": "sha256-q9DRC9zdpzUf88qqg1qbhP1qgJbE2cMtn8oUmosuyT8=",
+ "owner": "huggingface",
+ "repo": "hf-nix",
+ "rev": "c7132f90763d756da3e77da62e01be0a4546dc57",
+ "type": "github"
+ },
+ "original": {
+ "owner": "huggingface",
+ "repo": "hf-nix",
+ "type": "github"
+ }
+ },
+ "kernel-builder": {
+ "inputs": {
+ "flake-compat": "flake-compat",
+ "flake-utils": "flake-utils",
+ "hf-nix": "hf-nix",
+ "nixpkgs": [
+ "kernel-builder",
+ "hf-nix",
+ "nixpkgs"
+ ]
+ },
+ "locked": {
+ "lastModified": 1751910742,
+ "owner": "huggingface",
+ "repo": "kernel-builder",
+ "rev": "f1099723e3df41950b073051839bc2c5b088c380",
+ "type": "github"
+ },
+ "original": {
+ "owner": "huggingface",
+ "repo": "kernel-builder",
+ "type": "github"
+ }
+ },
+ "nixpkgs": {
+ "locked": {
+ "lastModified": 1747820358,
+ "narHash": "sha256-fTqsZsUX6M3yeEvgyQvXcbGmT2CaRVyVwsi8eK29Oj4=",
+ "owner": "danieldk",
+ "repo": "nixpkgs",
+ "rev": "d3c1681180717528068082103bf323147de6ab0b",
+ "type": "github"
+ },
+ "original": {
+ "owner": "danieldk",
+ "ref": "cudatoolkit-12.9-kernel-builder",
+ "repo": "nixpkgs",
+ "type": "github"
+ }
+ },
+ "root": {
+ "inputs": {
+ "kernel-builder": "kernel-builder"
+ }
+ },
+ "systems": {
+ "locked": {
+ "lastModified": 1681028828,
+ "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+ "owner": "nix-systems",
+ "repo": "default",
+ "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+ "type": "github"
+ },
+ "original": {
+ "owner": "nix-systems",
+ "repo": "default",
+ "type": "github"
+ }
+ },
+ "systems_2": {
+ "locked": {
+ "lastModified": 1681028828,
+ "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+ "owner": "nix-systems",
+ "repo": "default",
+ "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+ "type": "github"
+ },
+ "original": {
+ "owner": "nix-systems",
+ "repo": "default",
+ "type": "github"
+ }
+ }
+ },
+ "root": "root",
+ "version": 7
+ }
flake.nix ADDED
@@ -0,0 +1,17 @@
+ {
+ description = "Flake for Metal ReLU test kernel";
+
+ inputs = {
+ kernel-builder.url = "github:huggingface/kernel-builder";
+ };
+
+ outputs =
+ {
+ self,
+ kernel-builder,
+ }:
+ kernel-builder.lib.genFlakeOutputs {
+ path = ./.;
+ rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;
+ };
+ }
relu/relu.mm ADDED
@@ -0,0 +1,92 @@
+ #include <torch/torch.h>
+
+ #import <Foundation/Foundation.h>
+ #import <Metal/Metal.h>
+ #include <string>
+
+ char const *CUSTOM_KERNEL = R"(
+ #include <metal_stdlib>
+ using namespace metal;
+
+ kernel void relu_forward_kernel_float(device const float *inA [[buffer(0)]],
+ device float *outC [[buffer(1)]],
+ uint index [[thread_position_in_grid]]) {
+ // Explicitly write to output
+ outC[index] = max(0.0f, inA[index]);
+ }
+
+ kernel void relu_forward_kernel_half(device const half *inA [[buffer(0)]],
+ device half *outC [[buffer(1)]],
+ uint index [[thread_position_in_grid]]) {
+ // Explicitly write to output
+ outC[index] = max(static_cast<half>(0.0), inA[index]);
+ }
+ )";
+
+ static inline id<MTLBuffer> getMTLBufferStorage(const torch::Tensor& tensor) {
+ return __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
+ }
+
+ torch::Tensor &dispatchReluKernel(torch::Tensor const &input, torch::Tensor &output) {
+ @autoreleasepool {
+ id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+ NSError *error = nil;
+
+ NSUInteger numThreads = input.numel();
+
+ id<MTLLibrary> customKernelLibrary = [device newLibraryWithSource:[NSString stringWithUTF8String:CUSTOM_KERNEL]
+ options:nil
+ error:&error];
+ TORCH_CHECK(customKernelLibrary, "Failed to create custom kernel library, error: ", error.localizedDescription.UTF8String);
+
+ std::string kernel_name = std::string("relu_forward_kernel_") + (input.scalar_type() == torch::kFloat ? "float" : "half");
+ id<MTLFunction> customReluFunction = [customKernelLibrary newFunctionWithName:[NSString stringWithUTF8String:kernel_name.c_str()]];
+ TORCH_CHECK(customReluFunction, "Failed to create function state object for ", kernel_name.c_str());
+
+ id<MTLComputePipelineState> reluPSO = [device newComputePipelineStateWithFunction:customReluFunction error:&error];
+ TORCH_CHECK(reluPSO, error.localizedDescription.UTF8String);
+
+ id<MTLCommandBuffer> commandBuffer = torch::mps::get_command_buffer();
+ TORCH_CHECK(commandBuffer, "Failed to retrieve command buffer reference");
+
+ dispatch_queue_t serialQueue = torch::mps::get_dispatch_queue();
+
+ dispatch_sync(serialQueue, ^(){
+ id<MTLComputeCommandEncoder> computeEncoder = [commandBuffer computeCommandEncoder];
+ TORCH_CHECK(computeEncoder, "Failed to create compute command encoder");
+
+ [computeEncoder setComputePipelineState:reluPSO];
+ [computeEncoder setBuffer:getMTLBufferStorage(input) offset:input.storage_offset() * input.element_size() atIndex:0];
+ [computeEncoder setBuffer:getMTLBufferStorage(output) offset:output.storage_offset() * output.element_size() atIndex:1];
+
+ MTLSize gridSize = MTLSizeMake(numThreads, 1, 1);
+
+ NSUInteger threadGroupSize = reluPSO.maxTotalThreadsPerThreadgroup;
+ if (threadGroupSize > numThreads) {
+ threadGroupSize = numThreads;
+ }
+ MTLSize threadgroupSize = MTLSizeMake(threadGroupSize, 1, 1);
+
+ [computeEncoder dispatchThreads:gridSize
+ threadsPerThreadgroup:threadgroupSize];
+
+ [computeEncoder endEncoding];
+
+ torch::mps::commit();
+ });
+ }
+
+ return output;
+ }
+
+ torch::Tensor mps_relu(const torch::Tensor &input) {
+ TORCH_CHECK(input.device().is_mps(), "input must be an MPS tensor");
+ TORCH_CHECK(input.is_contiguous(), "input must be contiguous");
+
+ TORCH_CHECK(input.scalar_type() == torch::kFloat ||
+ input.scalar_type() == torch::kHalf, "Unsupported data type: ", input.scalar_type());
+
+ torch::Tensor output = torch::empty_like(input);
+
+ return dispatchReluKernel(input, output);
+ }
torch-ext/registration.h ADDED
@@ -0,0 +1,30 @@
+ // Registration macros from vLLM:
+ // https://github.com/vllm-project/vllm/blob/main/csrc/core/registration.h
+
+ #pragma once
+
+ #include <Python.h>
+
+ #define _CONCAT(A, B) A##B
+ #define CONCAT(A, B) _CONCAT(A, B)
+
+ #define _STRINGIFY(A) #A
+ #define STRINGIFY(A) _STRINGIFY(A)
+
+ // A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME
+ // could be a macro instead of a literal token.
+ #define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE)
+
+ // A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME
+ // could be a macro instead of a literal token.
+ #define TORCH_LIBRARY_IMPL_EXPAND(NAME, DEVICE, MODULE) \
+ TORCH_LIBRARY_IMPL(NAME, DEVICE, MODULE)
+
+ // REGISTER_EXTENSION allows the shared library to be loaded and initialized
+ // via python's import statement.
+ #define REGISTER_EXTENSION(NAME) \
+ PyMODINIT_FUNC CONCAT(PyInit_, NAME)() { \
+ static struct PyModuleDef module = {PyModuleDef_HEAD_INIT, \
+ STRINGIFY(NAME), nullptr, 0, nullptr}; \
+ return PyModule_Create(&module); \
+ }
torch-ext/relu_metal/__init__.py ADDED
@@ -0,0 +1,7 @@
+ import torch
+
+ from ._ops import ops
+
+
+ def relu(input: torch.Tensor) -> torch.Tensor:
+ return ops.relu(input)
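
For reference, a minimal usage sketch (not part of the commit): it assumes the extension has been built with kernel-builder and the resulting relu_metal package is importable, and that an MPS device is available. It exercises the custom kernel and checks it against the reference torch.relu.

import torch
import relu_metal

# Requires a Metal-capable Mac with PyTorch's MPS backend available.
x = torch.randn(1024, device="mps", dtype=torch.float16)
y = relu_metal.relu(x)

# ReLU is exact (max(0, x)), so the custom kernel should match the reference.
torch.testing.assert_close(y, torch.relu(x))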
torch-ext/torch_binding.cpp ADDED
@@ -0,0 +1,11 @@
+ #include <torch/library.h>
+
+ #include "registration.h"
+ #include "torch_binding.h"
+
+ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
+ ops.def("relu(Tensor input) -> Tensor");
+ ops.impl("relu", torch::kMPS, mps_relu);
+ }
+
+ REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
torch-ext/torch_binding.h ADDED
@@ -0,0 +1,3 @@
+ #include <torch/torch.h>
+
+ torch::Tensor mps_relu(const torch::Tensor &input);