.version 1.2 .target sm_11, map_f64_to_f32 // compiled with /usr/local/cuda/open64/lib//be // nvopencc built on 2008-06-19 .reg .u32 %ra<17>; .reg .u64 %rda<17>; .reg .f32 %fa<17>; .reg .f64 %fda<17>; .reg .u32 %rv<5>; .reg .u64 %rdv<5>; .reg .f32 %fv<5>; .reg .f64 %fdv<5>; //----------------------------------------------------------- // Compiling /tmp/tmpxft_00004c8d_00000000-7_cuda_kernel.cpp3.i (/tmp/ccBI#.2sNNOT) //----------------------------------------------------------- //----------------------------------------------------------- // Options: //----------------------------------------------------------- // Target:ptx, ISA:sm_11, Endian:little, Pointer Size:64 // -O3 (Optimization level) // -g0 (Debug level) // -m2 (Report advisories) //----------------------------------------------------------- .file 1 "/tmp/tmpxft_00004c8d_00000000-6_cuda_kernel.cudafe2.gpu" .file 2 "/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include/stddef.h" .file 3 "/usr/local/cuda/bin/../include/crt/device_runtime.h" .file 4 "/usr/local/cuda/bin/../include/crt/../host_defines.h" .file 5 "/usr/local/cuda/bin/../include/crt/../builtin_types.h" .file 6 "/usr/local/cuda/bin/../include/crt/../device_types.h" .file 7 "/usr/local/cuda/bin/../include/crt/../driver_types.h" .file 8 "/usr/local/cuda/bin/../include/crt/../texture_types.h" .file 9 "/usr/local/cuda/bin/../include/crt/../vector_types.h" .file 10 "/usr/local/cuda/bin/../include/crt/../device_launch_parameters.h" .file 11 "/usr/local/cuda/bin/../include/crt/storage_class.h" .file 12 "/usr/include/bits/types.h" .file 13 "/usr/include/time.h" .file 14 "cuda_kernel.cu" .file 15 "/usr/local/cuda/bin/../include/common_functions.h" .file 16 "/usr/local/cuda/bin/../include/crt/func_macro.h" .file 17 "/usr/local/cuda/bin/../include/math_functions.h" .file 18 "/usr/local/cuda/bin/../include/device_functions.h" .file 19 "/usr/local/cuda/bin/../include/math_constants.h" .file 20 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h" .file 21 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h" .file 22 "/usr/local/cuda/bin/../include/sm_13_double_functions.h" .file 23 "/usr/local/cuda/bin/../include/texture_fetch_functions.h" .file 24 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h" .tex .u64 input_texture1; .entry __globfunc__Z9my_kerneljtP5uint4 { .reg .u16 %rh<7>; .reg .u32 %r<42>; .reg .u64 %rd<3>; .reg .pred %p<7>; .param .u32 __cudaparm___globfunc__Z9my_kerneljtP5uint4_tex_id; .param .u16 __cudaparm___globfunc__Z9my_kerneljtP5uint4_index_range; .param .u64 __cudaparm___globfunc__Z9my_kerneljtP5uint4_output_data; // result = 16 // flag = 32 // ch = 0 .loc 14 31 0 // 27 } // 28 // 29 // 30 // 31 __global__ void my_kernel(unsigned int tex_id, unsigned short index_range, uint4 * output_data) $LBB1___globfunc__Z9my_kerneljtP5uint4: .loc 14 43 0 // 39 //flags to indicate whether a 'A' is detected // 40 unsigned int flag; // 41 // 42 //current index location in texture // 43 unsigned int input_index = start_index; cvt.u32.u16 %r1, %ntid.x; // cvt.u32.u16 %r2, %ctaid.x; // mul.lo.u32 %r3, %r2, %r1; // cvt.u32.u16 %r4, %tid.x; // add.u32 %r5, %r3, %r4; // ld.param.u16 %r6, [__cudaparm___globfunc__Z9my_kerneljtP5uint4_index_range]; // id:81 __cudaparm___globfunc__Z9my_kerneljtP5uint4_index_range+0x0 mul.lo.u32 %r7, %r5, %r6; // mov.s32 %r8, %r7; // mov.u32 %r9, 0; // setp.eq.u32 %p1, %r6, %r9; // @%p1 bra $Lt_0_14; // add.u32 %r10, %r6, 3; // shr.s32 %r11, %r10, 31; // mov.s32 %r12, 3; // and.b32 %r13, %r11, %r12; // add.s32 %r14, %r13, %r10; // shr.s32 %r15, %r14, 2; // ld.param.u32 %r16, [__cudaparm___globfunc__Z9my_kerneljtP5uint4_tex_id]; // id:85 __cudaparm___globfunc__Z9my_kerneljtP5uint4_tex_id+0x0 mov.u32 %r17, 1; // setp.eq.u32 %p2, %r16, %r17; // mov.u32 %r18, 0; // mov.s32 %r19, %r15; // $Lt_0_16: // Loop body line 43, nesting depth: 1, estimated iterations: unknown @!%p2 bra $Lt_0_18; // // Part of loop body line 43, head labeled $Lt_0_16 mov.s32 %r20, %r8; // mov.s32 %r21, 0; // mov.s32 %r22, 0; // mov.s32 %r23, 0; // tex.1d.v4.u32.s32 {%r24,%r25,%r26,%r27},[input_texture1,{%r20,%r21,%r22,%r23}]; .loc 14 64 0 // 60 * // 61 */ // 62 if (tex_id == 1) // 63 { // 64 ch = tex1Dfetch(input_texture1,input_index); mov.s32 %r28, %r24; // cvt.u8.u32 %r29, %r28; // cvt.u8.u32 %rh1, %r29; // bra.uni $Lt_0_17; // $Lt_0_18: // Part of loop body line 43, head labeled $Lt_0_16 cvt.u32.u8 %r29, %rh1; // $Lt_0_17: // Part of loop body line 43, head labeled $Lt_0_16 mov.u32 %r30, 65; // setp.ne.u32 %p3, %r29, %r30; // @%p3 bra $Lt_0_20; // // Part of loop body line 43, head labeled $Lt_0_16 .loc 14 81 0 // 77 * This set of values will always work, regardless of the test (tex_id == 1) condition above. // 78 * Adding ANY constant term to ANY of the 4 fields (as in the '3' below) results in correct operation // 79 * HOWEVER, if ALL fields are based on ANY variable available to the kernel, operation fails. (see second comment) // 80 */ // 81 result.x = blockDim.x; mov.s32 %r31, %r1; // .loc 14 82 0 // 82 result.y = 3; mov.u32 %r32, 3; // mov.s32 %r33, %r32; // .loc 14 83 0 // 83 result.z = threadIdx.x; mov.s32 %r34, %r4; // .loc 14 84 0 // 84 result.w = start_index + bytes_processed; add.u32 %r35, %r7, %r18; // mov.s32 %r36, %r35; // .loc 14 85 0 // 85 flag = 1; mov.u16 %rh2, 1; // bra.uni $Lt_0_19; // $Lt_0_20: // Part of loop body line 43, head labeled $Lt_0_16 mov.s16 %rh2, %rh3; // $Lt_0_19: // Part of loop body line 43, head labeled $Lt_0_16 .loc 14 109 0 // 105 * // 106 */ // 107 } // 108 // 109 write_result(flag,&result,0,output_data); mov.u16 %rh4, 0; // setp.ne.u16 %p4, %rh2, %rh4; // @%p4 bra $Lt_0_22; // // Part of loop body line 43, head labeled $Lt_0_16 bra.uni $Lt_0_21; // $Lt_0_22: // Part of loop body line 43, head labeled $Lt_0_16 .loc 14 25 0 ld.param.u64 %rd1, [__cudaparm___globfunc__Z9my_kerneljtP5uint4_output_data]; // id:77 __cudaparm___globfunc__Z9my_kerneljtP5uint4_output_data+0x0 mov.s32 %r37, %r31; // mov.s32 %r38, %r33; // mov.s32 %r39, %r34; // mov.s32 %r40, %r36; // st.global.v4.u32 [%rd1+0], {%r37,%r38,%r39,%r40}; // $Lt_0_21: // Part of loop body line 43, head labeled $Lt_0_16 .loc 14 110 0 // 110 flag = 0; mov.u16 %rh5, 0; // mov.s16 %rh3, %rh5; // .loc 14 113 0 // 111 // 112 //update input index // 113 input_index++; add.u32 %r8, %r8, 1; // add.u32 %r18, %r18, 4; // setp.lt.u32 %p5, %r18, %r6; // @%p5 bra $Lt_0_16; // $Lt_0_14: .loc 14 118 0 // 114 // 115 //update progress through buffer // 116 bytes_processed+=4; // 117 } // 118 } exit; // $LDWend___globfunc__Z9my_kerneljtP5uint4: } // __globfunc__Z9my_kerneljtP5uint4