// Minimal reproduction of cuDNN Graph API conv+bias fusion failure
// Demonstrates that convolution-only works but conv+bias fusion fails
// Tested on: Windows 11, CUDA 12.4, cuDNN 9.0, RTX 4080
//
// Compile:
//   cl minimal_reproduction.cpp /I"C:\Program Files\NVIDIA\CUDNN\v9.0_cuda12.4\include" ^
//      /I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\include" ^
//      /link cudnn.lib cudart.lib

#include <cudnn.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdint>
#include <vector>

// Helper to check cuDNN status
#define CHECK_CUDNN(call) \
    do { \
        cudnnStatus_t status = call; \
        if (status != CUDNN_STATUS_SUCCESS) { \
            printf("CUDNN Error at %s:%d - %d\n", __FILE__, __LINE__, status); \
            return -1; \
        } \
    } while (0)

// Helper to check CUDA status
#define CHECK_CUDA(call) \
    do { \
        cudaError_t status = call; \
        if (status != cudaSuccess) { \
            printf("CUDA Error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(status)); \
            return -1; \
        } \
    } while (0)

// RAII wrapper for backend descriptors
class BackendDescriptor {
public:
    cudnnBackendDescriptor_t desc = nullptr;
    ~BackendDescriptor() { if (desc) cudnnBackendDestroyDescriptor(desc); }
    cudnnBackendDescriptor_t* get() { return &desc; }
    cudnnBackendDescriptor_t raw() const { return desc; }
};

// Create tensor descriptor
cudnnStatus_t createTensorDescriptor(
    BackendDescriptor& descWrapper,
    int64_t uid,
    const std::vector<int64_t>& dims,
    const std::vector<int64_t>& strides,
    bool isVirtual)
{
    cudnnStatus_t status;
    status = cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, descWrapper.get());
    if (status != CUDNN_STATUS_SUCCESS) return status;

    cudnnDataType_t dataType = CUDNN_DATA_FLOAT;
    int64_t ndims = dims.size();
    int64_t alignment = 16;

    cudnnBackendSetAttribute(descWrapper.raw(), CUDNN_ATTR_TENSOR_UNIQUE_ID,
                             CUDNN_TYPE_INT64, 1, &uid);
    cudnnBackendSetAttribute(descWrapper.raw(), CUDNN_ATTR_TENSOR_DATA_TYPE,
                             CUDNN_TYPE_DATA_TYPE, 1, &dataType);
    cudnnBackendSetAttribute(descWrapper.raw(), CUDNN_ATTR_TENSOR_DIMENSIONS,
                             CUDNN_TYPE_INT64, ndims, dims.data());
    cudnnBackendSetAttribute(descWrapper.raw(), CUDNN_ATTR_TENSOR_STRIDES,
                             CUDNN_TYPE_INT64, ndims, strides.data());
    cudnnBackendSetAttribute(descWrapper.raw(), CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT,
                             CUDNN_TYPE_INT64, 1, &alignment);
    cudnnBackendSetAttribute(descWrapper.raw(), CUDNN_ATTR_TENSOR_IS_VIRTUAL,
                             CUDNN_TYPE_BOOLEAN, 1, &isVirtual);

    return cudnnBackendFinalize(descWrapper.raw());
}

// Test convolution + bias fusion (EXPECTED TO FAIL)
int test_conv_bias_fusion(cudnnHandle_t handle) {
    printf("\n=== TEST 1: Convolution + Bias Fusion ===\n");
    printf("Configuration: N=16, C=16, H=16, W=16, kH=3, kW=3\n");
    printf("Expected: Should work but FAILS with all engines rejecting (3000)\n\n");

    // Create descriptors
    BackendDescriptor inputDesc, weightDesc, virtualConvDesc, biasDesc, outputDesc;
    BackendDescriptor convDesc, pointwiseDesc, convOp, biasOp, opGraph;

    // Input: [16, 16, 16, 16] with canonical NCHW strides
    CHECK_CUDNN(createTensorDescriptor(inputDesc, 100, {16, 16, 16, 16},
                                       {4096, 256, 16, 1}, false));
    printf("✓ Input tensor created\n");

    // Weight: [16, 16, 3, 3] with canonical strides
    CHECK_CUDNN(createTensorDescriptor(weightDesc, 101, {16, 16, 3, 3},
                                       {144, 9, 3, 1}, false));
    printf("✓ Weight tensor created\n");

    // Virtual conv output: [16, 16, 16, 16] - ENABLES FUSION
    CHECK_CUDNN(createTensorDescriptor(virtualConvDesc, 102, {16, 16, 16, 16},
                                       {4096, 256, 16, 1}, true));  // isVirtual=true
    printf("✓ Virtual conv output created (enables fusion)\n");

    // Bias: [1, 16, 1, 1]
    CHECK_CUDNN(createTensorDescriptor(biasDesc, 103, {1, 16, 1, 1},
                                       {16, 1, 1, 1}, false));
    printf("✓ Bias tensor created\n");

    // Output: [16, 16, 16, 16]
    CHECK_CUDNN(createTensorDescriptor(outputDesc, 104, {16, 16, 16, 16},
                                       {4096, 256, 16, 1}, false));
    printf("✓ Output tensor created\n");

    // Convolution descriptor
    CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR, convDesc.get()));
    cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
    cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
    int64_t spatialDims = 2;
    std::vector<int64_t> convStrides = {1, 1};
    std::vector<int64_t> prePad = {1, 1};
    std::vector<int64_t> postPad = {1, 1};
    std::vector<int64_t> dilation = {1, 1};

    cudnnBackendSetAttribute(convDesc.raw(), CUDNN_ATTR_CONVOLUTION_COMP_TYPE,
                             CUDNN_TYPE_DATA_TYPE, 1, &computeType);
    cudnnBackendSetAttribute(convDesc.raw(), CUDNN_ATTR_CONVOLUTION_CONV_MODE,
                             CUDNN_TYPE_CONVOLUTION_MODE, 1, &mode);
    cudnnBackendSetAttribute(convDesc.raw(), CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS,
                             CUDNN_TYPE_INT64, 1, &spatialDims);
    cudnnBackendSetAttribute(convDesc.raw(), CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES,
                             CUDNN_TYPE_INT64, 2, convStrides.data());
    cudnnBackendSetAttribute(convDesc.raw(), CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS,
                             CUDNN_TYPE_INT64, 2, prePad.data());
    cudnnBackendSetAttribute(convDesc.raw(), CUDNN_ATTR_CONVOLUTION_POST_PADDINGS,
                             CUDNN_TYPE_INT64, 2, postPad.data());
    cudnnBackendSetAttribute(convDesc.raw(), CUDNN_ATTR_CONVOLUTION_DILATIONS,
                             CUDNN_TYPE_INT64, 2, dilation.data());
    CHECK_CUDNN(cudnnBackendFinalize(convDesc.raw()));
    printf("✓ Convolution descriptor created\n");

    // Pointwise descriptor (for bias addition)
    CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_POINTWISE_DESCRIPTOR, pointwiseDesc.get()));
    cudnnPointwiseMode_t pwMode = CUDNN_POINTWISE_ADD;
    cudnnBackendSetAttribute(pointwiseDesc.raw(), CUDNN_ATTR_POINTWISE_MODE,
                             CUDNN_TYPE_POINTWISE_MODE, 1, &pwMode);
    cudnnBackendSetAttribute(pointwiseDesc.raw(), CUDNN_ATTR_POINTWISE_MATH_PREC,
                             CUDNN_TYPE_DATA_TYPE, 1, &computeType);
    CHECK_CUDNN(cudnnBackendFinalize(pointwiseDesc.raw()));
    printf("✓ Pointwise descriptor created\n");

    // Convolution operation
    CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, convOp.get()));
    double alpha = 1.0, beta = 0.0;
    auto convDescPtr = convDesc.raw();
    auto inputDescPtr = inputDesc.raw();
    auto weightDescPtr = weightDesc.raw();
    auto virtualConvPtr = virtualConvDesc.raw();

    cudnnBackendSetAttribute(convOp.raw(), CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA,
                             CUDNN_TYPE_DOUBLE, 1, &alpha);
    cudnnBackendSetAttribute(convOp.raw(), CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA,
                             CUDNN_TYPE_DOUBLE, 1, &beta);
    cudnnBackendSetAttribute(convOp.raw(), CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC,
                             CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &convDescPtr);
    cudnnBackendSetAttribute(convOp.raw(), CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X,
                             CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &inputDescPtr);
    cudnnBackendSetAttribute(convOp.raw(), CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W,
                             CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &weightDescPtr);
    cudnnBackendSetAttribute(convOp.raw(), CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y,
                             CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &virtualConvPtr);
    CHECK_CUDNN(cudnnBackendFinalize(convOp.raw()));
    printf("✓ Convolution operation created\n");

    // Bias operation
    CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR, biasOp.get()));
    auto pointwiseDescPtr = pointwiseDesc.raw();
    auto biasDescPtr = biasDesc.raw();
    auto outputDescPtr = outputDesc.raw();
    double alpha1 = 1.0, alpha2 = 1.0;

    cudnnBackendSetAttribute(biasOp.raw(), CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR,
                             CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &pointwiseDescPtr);
    cudnnBackendSetAttribute(biasOp.raw(), CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1,
                             CUDNN_TYPE_DOUBLE, 1, &alpha1);
    cudnnBackendSetAttribute(biasOp.raw(), CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2,
                             CUDNN_TYPE_DOUBLE, 1, &alpha2);
    cudnnBackendSetAttribute(biasOp.raw(), CUDNN_ATTR_OPERATION_POINTWISE_XDESC,
                             CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &virtualConvPtr);
    cudnnBackendSetAttribute(biasOp.raw(), CUDNN_ATTR_OPERATION_POINTWISE_BDESC,
                             CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &biasDescPtr);
    cudnnBackendSetAttribute(biasOp.raw(), CUDNN_ATTR_OPERATION_POINTWISE_YDESC,
                             CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &outputDescPtr);
    CHECK_CUDNN(cudnnBackendFinalize(biasOp.raw()));
    printf("✓ Bias operation created\n");

    // Operation graph with both operations
    CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR, opGraph.get()));
    cudnnBackendSetAttribute(opGraph.raw(), CUDNN_ATTR_OPERATIONGRAPH_HANDLE,
                             CUDNN_TYPE_HANDLE, 1, &handle);
    cudnnBackendDescriptor_t ops[2] = {convOp.raw(), biasOp.raw()};
    int64_t opCount = 2;
    cudnnBackendSetAttribute(opGraph.raw(), CUDNN_ATTR_OPERATIONGRAPH_OPS,
                             CUDNN_TYPE_BACKEND_DESCRIPTOR, opCount, ops);
    CHECK_CUDNN(cudnnBackendFinalize(opGraph.raw()));
    printf("✓ Operation graph created (conv + bias fusion)\n");

    // Try to get execution plan
    printf("\nAttempting to create execution plan...\n");

    BackendDescriptor heurDesc;
    CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR, heurDesc.get()));
    auto opGraphPtr = opGraph.raw();
    cudnnBackendSetAttribute(heurDesc.raw(), CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH,
                             CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &opGraphPtr);
    cudnnBackendHeurMode_t heurMode = CUDNN_HEUR_MODE_INSTANT;
    cudnnBackendSetAttribute(heurDesc.raw(), CUDNN_ATTR_ENGINEHEUR_MODE,
                             CUDNN_TYPE_HEUR_MODE, 1, &heurMode);
    CHECK_CUDNN(cudnnBackendFinalize(heurDesc.raw()));

    int64_t engineCount = 0;
    CHECK_CUDNN(cudnnBackendGetAttribute(heurDesc.raw(), CUDNN_ATTR_ENGINEHEUR_RESULTS,
                                         CUDNN_TYPE_BACKEND_DESCRIPTOR, 0, &engineCount, nullptr));
    printf("Found %lld engines\n", engineCount);

    if (engineCount == 0) {
        printf("✗ FAILURE: No engines found for fusion operation\n");
        return -1;
    }

    // Try each engine
    std::vector<cudnnBackendDescriptor_t> engineConfigs(engineCount);
    for (int64_t i = 0; i < engineCount; ++i) {
        cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINECFG_DESCRIPTOR, &engineConfigs[i]);
    }
    int64_t retrievedCount = 0;
    CHECK_CUDNN(cudnnBackendGetAttribute(heurDesc.raw(), CUDNN_ATTR_ENGINEHEUR_RESULTS,
                                         CUDNN_TYPE_BACKEND_DESCRIPTOR, engineCount,
                                         &retrievedCount, engineConfigs.data()));
    printf("Retrieved %lld engine configs\n\n", retrievedCount);

    bool anySuccess = false;
    for (int64_t i = 0; i < retrievedCount; ++i) {
        printf("Testing engine %lld/%lld...\n", i + 1, retrievedCount);

        BackendDescriptor execPlan;
        cudnnStatus_t status = cudnnBackendCreateDescriptor(CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR, execPlan.get());
        if (status != CUDNN_STATUS_SUCCESS) {
            printf("  ✗ Failed to create plan descriptor: %d\n", status);
            continue;
        }

        cudnnBackendSetAttribute(execPlan.raw(), CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG,
                                 CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &engineConfigs[i]);
        cudnnBackendSetAttribute(execPlan.raw(), CUDNN_ATTR_EXECUTION_PLAN_HANDLE,
                                 CUDNN_TYPE_HANDLE, 1, &handle);

        status = cudnnBackendFinalize(execPlan.raw());
        if (status != CUDNN_STATUS_SUCCESS) {
            printf("  ✗ Engine %lld finalize failed: %d (NOT_SUPPORTED=%s)\n",
                   i + 1, status, status == 3000 ? "YES" : "NO");
        } else {
            printf("  ✓ Engine %lld SUCCESS!\n", i + 1);
            anySuccess = true;
            break;
        }
    }

    // Clean up engine configs
    for (auto& ec : engineConfigs) {
        cudnnBackendDestroyDescriptor(ec);
    }

    if (!anySuccess) {
        printf("\n✗✗✗ FAILURE: ALL %lld ENGINES REJECTED FUSION ✗✗✗\n", retrievedCount);
        printf("Error code 3000 = CUDNN_STATUS_NOT_SUPPORTED\n");
        return -1;
    }

    printf("\n✓ TEST PASSED\n");
    return 0;
}

// Test convolution only (EXPECTED TO SUCCEED)
int test_conv_only(cudnnHandle_t handle) {
    printf("\n=== TEST 2: Convolution Only (No Fusion) ===\n");
    printf("Configuration: Same as Test 1 but without bias fusion\n");
    printf("Expected: Should work (and DOES work)\n\n");

    BackendDescriptor inputDesc, weightDesc, outputDesc, convDesc, convOp, opGraph;

    // Same tensors but output is NOT virtual
    CHECK_CUDNN(createTensorDescriptor(inputDesc, 100, {16, 16, 16, 16},
                                       {4096, 256, 16, 1}, false));
    printf("✓ Input tensor created\n");

    CHECK_CUDNN(createTensorDescriptor(weightDesc, 101, {16, 16, 3, 3},
                                       {144, 9, 3, 1}, false));
    printf("✓ Weight tensor created\n");

    // Output is REAL, not virtual - no fusion
    CHECK_CUDNN(createTensorDescriptor(outputDesc, 104, {16, 16, 16, 16},
                                       {4096, 256, 16, 1}, false));  // isVirtual=false
    printf("✓ Output tensor created (NOT virtual - no fusion)\n");

    // Convolution descriptor (same as before)
    CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR, convDesc.get()));
    cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
    cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
    int64_t spatialDims = 2;
    std::vector<int64_t> convStrides = {1, 1};
    std::vector<int64_t> prePad = {1, 1};
    std::vector<int64_t> postPad = {1, 1};
    std::vector<int64_t> dilation = {1, 1};

    cudnnBackendSetAttribute(convDesc.raw(), CUDNN_ATTR_CONVOLUTION_COMP_TYPE,
                             CUDNN_TYPE_DATA_TYPE, 1, &computeType);
    cudnnBackendSetAttribute(convDesc.raw(), CUDNN_ATTR_CONVOLUTION_CONV_MODE,
                             CUDNN_TYPE_CONVOLUTION_MODE, 1, &mode);
    cudnnBackendSetAttribute(convDesc.raw(), CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS,
                             CUDNN_TYPE_INT64, 1, &spatialDims);
    cudnnBackendSetAttribute(convDesc.raw(), CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES,
                             CUDNN_TYPE_INT64, 2, convStrides.data());
    cudnnBackendSetAttribute(convDesc.raw(), CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS,
                             CUDNN_TYPE_INT64, 2, prePad.data());
    cudnnBackendSetAttribute(convDesc.raw(), CUDNN_ATTR_CONVOLUTION_POST_PADDINGS,
                             CUDNN_TYPE_INT64, 2, postPad.data());
    cudnnBackendSetAttribute(convDesc.raw(), CUDNN_ATTR_CONVOLUTION_DILATIONS,
                             CUDNN_TYPE_INT64, 2, dilation.data());
    CHECK_CUDNN(cudnnBackendFinalize(convDesc.raw()));
    printf("✓ Convolution descriptor created\n");

    // Convolution operation (output to real tensor)
    CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, convOp.get()));
    double alpha = 1.0, beta = 0.0;
    auto convDescPtr = convDesc.raw();
    auto inputDescPtr = inputDesc.raw();
    auto weightDescPtr = weightDesc.raw();
    auto outputDescPtr = outputDesc.raw();

    cudnnBackendSetAttribute(convOp.raw(), CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA,
                             CUDNN_TYPE_DOUBLE, 1, &alpha);
    cudnnBackendSetAttribute(convOp.raw(), CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA,
                             CUDNN_TYPE_DOUBLE, 1, &beta);
    cudnnBackendSetAttribute(convOp.raw(), CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC,
                             CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &convDescPtr);
    cudnnBackendSetAttribute(convOp.raw(), CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X,
                             CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &inputDescPtr);
    cudnnBackendSetAttribute(convOp.raw(), CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W,
                             CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &weightDescPtr);
    cudnnBackendSetAttribute(convOp.raw(), CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y,
                             CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &outputDescPtr);
    CHECK_CUDNN(cudnnBackendFinalize(convOp.raw()));
    printf("✓ Convolution operation created\n");

    // Operation graph with only convolution
    CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR, opGraph.get()));
    cudnnBackendSetAttribute(opGraph.raw(), CUDNN_ATTR_OPERATIONGRAPH_HANDLE,
                             CUDNN_TYPE_HANDLE, 1, &handle);
    cudnnBackendDescriptor_t ops[1] = {convOp.raw()};
    int64_t opCount = 1;
    cudnnBackendSetAttribute(opGraph.raw(), CUDNN_ATTR_OPERATIONGRAPH_OPS,
                             CUDNN_TYPE_BACKEND_DESCRIPTOR, opCount, ops);
    CHECK_CUDNN(cudnnBackendFinalize(opGraph.raw()));
    printf("✓ Operation graph created (convolution only)\n");

    // Try to get execution plan
    printf("\nAttempting to create execution plan...\n");

    BackendDescriptor heurDesc;
    CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR, heurDesc.get()));
    auto opGraphPtr = opGraph.raw();
    cudnnBackendSetAttribute(heurDesc.raw(), CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH,
                             CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &opGraphPtr);
    cudnnBackendHeurMode_t heurMode = CUDNN_HEUR_MODE_INSTANT;
    cudnnBackendSetAttribute(heurDesc.raw(), CUDNN_ATTR_ENGINEHEUR_MODE,
                             CUDNN_TYPE_HEUR_MODE, 1, &heurMode);
    CHECK_CUDNN(cudnnBackendFinalize(heurDesc.raw()));

    int64_t engineCount = 0;
    CHECK_CUDNN(cudnnBackendGetAttribute(heurDesc.raw(), CUDNN_ATTR_ENGINEHEUR_RESULTS,
                                         CUDNN_TYPE_BACKEND_DESCRIPTOR, 0, &engineCount, nullptr));
    printf("Found %lld engines\n", engineCount);

    std::vector<cudnnBackendDescriptor_t> engineConfigs(engineCount);
    for (int64_t i = 0; i < engineCount; ++i) {
        cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINECFG_DESCRIPTOR, &engineConfigs[i]);
    }
    int64_t retrievedCount = 0;
    CHECK_CUDNN(cudnnBackendGetAttribute(heurDesc.raw(), CUDNN_ATTR_ENGINEHEUR_RESULTS,
                                         CUDNN_TYPE_BACKEND_DESCRIPTOR, engineCount,
                                         &retrievedCount, engineConfigs.data()));
    printf("Retrieved %lld engine configs\n\n", retrievedCount);

    // Try first engine
    BackendDescriptor execPlan;
    CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR, execPlan.get()));
    cudnnBackendSetAttribute(execPlan.raw(), CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG,
                             CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &engineConfigs[0]);
    cudnnBackendSetAttribute(execPlan.raw(), CUDNN_ATTR_EXECUTION_PLAN_HANDLE,
                             CUDNN_TYPE_HANDLE, 1, &handle);
    CHECK_CUDNN(cudnnBackendFinalize(execPlan.raw()));
    printf("✓ Engine 1 finalized successfully\n");

    // Clean up
    for (auto& ec : engineConfigs) {
        cudnnBackendDestroyDescriptor(ec);
    }

    printf("\n✓ TEST PASSED\n");
    return 0;
}

int main() {
    printf("========================================\n");
    printf("cuDNN Graph API Conv+Bias Fusion Test\n");
    printf("========================================\n\n");

    // Initialize CUDA
    CHECK_CUDA(cudaSetDevice(0));

    // Initialize cuDNN
    cudnnHandle_t handle;
    CHECK_CUDNN(cudnnCreate(&handle));
    printf("CUDA device and cuDNN initialized\n");

    // Run both tests
    int result1 = test_conv_bias_fusion(handle);
    int result2 = test_conv_only(handle);

    // Cleanup
    cudnnDestroy(handle);

    printf("\n========================================\n");
    printf("SUMMARY\n");
    printf("========================================\n");
    printf("Test 1 (Conv+Bias Fusion): %s\n", result1 == 0 ? "PASSED" : "FAILED");
    printf("Test 2 (Conv Only): %s\n", result2 == 0 ? "PASSED" : "FAILED");
    printf("\n");

    if (result1 != 0 && result2 == 0) {
        printf("DIAGNOSIS: Graph API works but fusion is not supported\n");
        printf("This confirms the bug is specific to conv+bias fusion\n");
    }

    return (result1 == 0 && result2 == 0) ? 0 : -1;
}