Add support for ROCm
Files changed:
- build.toml +6 -0
- flake.lock +12 -12
- flake.nix +1 -1
- torch-ext/torch_binding.cpp +11 -0
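This commit lets the quantization kernels build for AMD GPUs. In build.toml, the fp8 and int8 kernels are marked for HIPIFY translation and given a list of ROCm target architectures; flake.nix now pulls kernel-builder from its public GitHub repository (with flake.lock re-locked accordingly); and torch_binding.cpp guards the CUTLASS- and Marlin-based ops with #ifndef USE_ROCM so that only the portable fp8/int8 quantization ops are registered on ROCm builds.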
build.toml CHANGED

@@ -46,8 +46,12 @@ include = [ "." ]
 depends = [ "cutlass_3_6", "torch" ]
 
 [kernel.fp8_common]
+language = "cuda-hipify"
 cuda-capabilities = [ "7.5", "8.0", "8.6", "8.7", "8.9", "9.0", "9.0a" ]
+rocm-archs = [ "gfx906", "gfx908", "gfx90a", "gfx940", "gfx941", "gfx942", "gfx1030", "gfx1100", "gfx1101" ]
 src = [
+  "fp8/amd/hip_float8.h",
+  "fp8/amd/hip_float8_impl.h",
   "fp8/common.cu",
   "fp8/common.cuh",
   "dispatch_utils.h",

@@ -66,7 +70,9 @@ src = [
 depends = [ "torch" ]
 
 [kernel.int8_common]
+language = "cuda-hipify"
 cuda-capabilities = [ "7.5", "8.0", "8.6", "8.7", "8.9", "9.0", "9.0a" ]
+rocm-archs = [ "gfx906", "gfx908", "gfx90a", "gfx940", "gfx941", "gfx942", "gfx1030", "gfx1100", "gfx1101" ]
 src = [
   "compressed_tensors/int8_quant_kernels.cu",
   "dispatch_utils.h"
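The new language = "cuda-hipify" setting marks these kernels for HIPIFY translation when kernel-builder targets ROCm, and rocm-archs selects the AMD targets (gfx906/gfx908/gfx90a/gfx94x cover the MI50 through MI300 datacenter parts, gfx1030 and gfx1100/gfx1101 the RDNA2/RDNA3 consumer GPUs). The fp8/amd headers added to the fp8_common source list supply an fp8 type implementation for AMD GPUs in place of the NVIDIA fp8 intrinsics. As a rough illustration of what the hipify pass handles, the sketch below is a self-contained toy int8-quantization kernel, not code from this repository: every CUDA runtime call in it has a mechanical HIP counterpart (cudaMalloc -> hipMalloc, cudaMemcpy -> hipMemcpy, <cuda_runtime.h> -> <hip/hip_runtime.h>), which is exactly the kind of rewriting the cuda-hipify setting relies on.

// rocm_hipify_example.cu -- minimal, hypothetical sketch (not part of this
// repository) of CUDA code that also builds for ROCm after HIPIFY rewrites
// the runtime API calls and headers.
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

// Toy per-tensor int8 quantization: out[i] = clamp(round(in[i] / scale)).
__global__ void scale_to_int8(const float* in, int8_t* out, float scale, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    float v = roundf(in[i] / scale);
    v = fminf(fmaxf(v, -128.0f), 127.0f);
    out[i] = static_cast<int8_t>(v);
  }
}

int main() {
  const int n = 1024;
  float* d_in = nullptr;
  int8_t* d_out = nullptr;
  cudaMalloc(&d_in, n * sizeof(float));
  cudaMalloc(&d_out, n * sizeof(int8_t));
  cudaMemset(d_in, 0, n * sizeof(float));

  // Triple-chevron launches are supported by HIP as well.
  scale_to_int8<<<(n + 255) / 256, 256>>>(d_in, d_out, 0.05f, n);
  cudaDeviceSynchronize();

  cudaFree(d_in);
  cudaFree(d_out);
  printf("launched toy quantization kernel for %d elements\n", n);
  return 0;
}

The point of the approach is that everything AMD-specific stays on the build-system side; the kernel sources themselves remain plain CUDA.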
flake.lock CHANGED

@@ -41,17 +41,17 @@
         "rocm-nix": "rocm-nix"
       },
       "locked": {
-        "lastModified":
-        "narHash": "sha256-
-        "
-        "
-        "
-        "type": "
-        "url": "ssh://[email protected]/huggingface/kernel-builder"
+        "lastModified": 1743416390,
+        "narHash": "sha256-Krrrq9asF2d5SVWGJQIhQA8UxVcTpiCor8hQU4G5J38=",
+        "owner": "huggingface",
+        "repo": "kernel-builder",
+        "rev": "e57cbde93f29032d32bbab8e32a1c86def6e9365",
+        "type": "github"
       },
       "original": {
-        "
-        "
+        "owner": "huggingface",
+        "repo": "kernel-builder",
+        "type": "github"
       }
     },
     "nixpkgs": {

@@ -78,11 +78,11 @@
         ]
       },
       "locked": {
-        "lastModified":
-        "narHash": "sha256-
+        "lastModified": 1743085847,
+        "narHash": "sha256-uWG29p+nhZmGRV1LffWwRGjwtPIXeu1F0YTQbXgB+GU=",
         "owner": "huggingface",
         "repo": "rocm-nix",
-        "rev": "
+        "rev": "245cdc9bfb4bfafa818711c5f5e0b889afe1ba39",
         "type": "github"
       },
       "original": {
flake.nix CHANGED

@@ -2,7 +2,7 @@
   description = "Flake for quantization kernels";
 
   inputs = {
-    kernel-builder.url = "
+    kernel-builder.url = "github:huggingface/kernel-builder";
   };
 
   outputs =
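Together, the two flake changes switch the kernel-builder input from an SSH URL to the public github:huggingface/kernel-builder flake and re-lock it, and they also bump the pinned revision of rocm-nix, the input that packages the ROCm toolchain for the Nix build.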
torch-ext/torch_binding.cpp CHANGED

@@ -4,6 +4,8 @@
 #include "torch_binding.h"
 
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
+#ifndef USE_ROCM
+
   // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
   // quantization, as well as bias
   ops.def(

@@ -26,6 +28,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.def("cutlass_scaled_mm_supports_fp8(int cuda_device_capability) -> bool");
   ops.impl("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
 
+#endif
+
   // Compute FP8 quantized tensor for given scaling factor.
   ops.def(
       "static_scaled_fp8_quant(Tensor! result, Tensor input, Tensor scale) -> "

@@ -60,6 +64,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.impl("dynamic_scaled_int8_quant", torch::kCUDA,
            &dynamic_scaled_int8_quant);
 
+#ifndef USE_ROCM
+
   // fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
   ops.def(
       "fp8_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "

@@ -103,8 +109,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "Tensor s_tok, Tensor s_ch, Tensor s_group, "
       "Tensor! workspace, SymInt size_m, SymInt size_n, "
       "SymInt size_k) -> Tensor");
+#endif
 }
 
+#ifndef USE_ROCM
+
 TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, ops) {
   ops.impl("awq_marlin_repack", &awq_marlin_repack);
   ops.impl("fp8_marlin_gemm", &fp8_marlin_gemm);

@@ -120,4 +129,6 @@ TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, Meta, ops) {
   ops.impl("gptq_marlin_repack", &gptq_marlin_repack_meta);
 }
 
+#endif
+
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
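With these guards, ROCm builds register only the static/dynamic fp8 and int8 quantization ops; the CUTLASS scaled-mm and Marlin ops are compiled out entirely. Code that must work on both backends can therefore probe the dispatcher before calling them. The sketch below is illustrative only: "quantization" stands in for the real extension namespace, which is derived from TORCH_EXTENSION_NAME at build time and is not known here.

// probe_ops.cpp -- minimal sketch (not part of this commit) that checks at
// runtime whether an op was registered, so callers can fall back when the
// CUTLASS/Marlin ops above were compiled out by the USE_ROCM guards.
#include <ATen/core/dispatch/Dispatcher.h>
#include <iostream>
#include <string>

// Returns true if an operator with the given qualified name is registered.
bool has_op(const std::string& qualified_name) {
  auto handle = c10::Dispatcher::singleton().findSchema(
      c10::OperatorName(qualified_name, /*overload_name=*/""));
  return handle.has_value();
}

int main() {
  // "quantization" is a placeholder namespace; the real one comes from
  // TORCH_EXTENSION_NAME at build time.
  std::cout << "cutlass_scaled_mm available: "
            << has_op("quantization::cutlass_scaled_mm") << std::endl;
  return 0;
}

Note that the op only becomes visible after the extension's shared library has been loaded (for example via torch.ops.load_library from Python), so in practice this probe usually lives on the Python side; the C++ version just shows which dispatcher call is involved.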