{"hash": "d2a8e4fda0aed3a2ab06067bed517a6a3c39da97a986bc1884de8d3262d30294", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 3, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "enable_fp_fusion": true, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/home/export/base/ycsc_1/1/.conda/envs/omni_vllm/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "shared": 1088, "name": "rotary_kernel"}