// Dual-GPU transfer benchmark: one host thread per "GeForce 9800 GX2" device
// copies a buffer host->device and back while the main thread times the
// barrier-delimited window and reports the aggregate rate in MB/s.
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#include <cuda_runtime.h>

// Terminates the process with a readable message when a CUDA runtime call
// fails. Exiting (rather than returning from the thread) also guarantees that
// no other thread is left blocked forever on the shared barrier.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(1);                                                          \
        }                                                                     \
    } while (0)

// Number of bytes each worker copies in each direction (300 MiB).
int size = 1024*1024*300;

// Returns the current wall-clock time in seconds (microsecond resolution),
// as reported by gettimeofday().
double gettime() {
    timeval t;
    gettimeofday(&t, NULL);
    return t.tv_sec + (1.e-6)*t.tv_usec;
}

// Rendezvous point shared by the worker threads and main(); it is crossed
// once right before the copies start and once right after they finish.
pthread_barrier_t barrier;

// Worker thread entry point.
//
// varg points to an int holding the CUDA device ordinal this thread uses.
// The thread allocates a page-locked host buffer and a device buffer of
// `size` bytes, then performs one host->device and one device->host copy
// between the two barrier waits so that main() can time them.
// Always returns NULL; any CUDA failure terminates the process.
void *test(void *varg) {
    int arg = *((int *)varg);
    CUDA_CHECK(cudaSetDevice(arg));

    char *h_buf = NULL;
    CUDA_CHECK(cudaMallocHost((void **)&h_buf, size));
    char *d_buf = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_buf, size));

    // Start of the timed region: released together with main() and the
    // other worker.
    pthread_barrier_wait(&barrier);

    CUDA_CHECK(cudaMemcpy(d_buf, h_buf, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(h_buf, d_buf, size, cudaMemcpyDeviceToHost));
    // Kept from the original for compatibility with old toolkits; newer CUDA
    // releases deprecate this call in favour of cudaDeviceSynchronize().
    CUDA_CHECK(cudaThreadSynchronize());

    // End of the timed region.
    pthread_barrier_wait(&barrier);

    // Release the buffers outside the timed window.
    CUDA_CHECK(cudaFree(d_buf));
    CUDA_CHECK(cudaFreeHost(h_buf));
    return NULL;
}

// Finds the first two devices named "GeForce 9800 GX2", runs one worker
// thread on each, and prints the combined transfer rate.
// Exits with status 1 if fewer than two matching devices are found or a
// CUDA call fails, and with -1 if a thread or the barrier cannot be created.
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    pthread_t threads[2];
    int args[2];
    int cards = 0;

    int deviceCount = 0;
    CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
    for (int dev = 0; dev < deviceCount && cards < 2; ++dev) {
        cudaDeviceProp deviceProp;
        CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev));
        if (strcmp(deviceProp.name, "GeForce 9800 GX2") == 0) {
            args[cards++] = dev;
        }
    }

    if (cards != 2) {
        printf("Only detected %i GX2s, exiting.\n", cards);
        exit(1);
    }
    printf("Testing %i and %i\n", args[0], args[1]);

    // cards workers + this thread all meet at the barrier.
    int rc = pthread_barrier_init(&barrier, NULL, cards + 1);
    if (rc) {
        fprintf(stderr, "pthread_barrier_init: %s\n", strerror(rc));
        exit(-1);
    }

    for (int i = 0; i < cards; i++) {
        rc = pthread_create(&threads[i], NULL, test, (void *)(args + i));
        if (rc) {
            // pthread_create reports failure through its return value, not
            // errno, so perror() would print an unrelated message.
            fprintf(stderr, "pthread_create: %s\n", strerror(rc));
            exit(-1);
        }
    }

    pthread_barrier_wait(&barrier);
    double time = -1*gettime();
    pthread_barrier_wait(&barrier);
    time += gettime();

    // Do the byte count in floating point so it cannot overflow int if
    // `size` or the number of cards is increased.
    double bytes = double(size) * 2.0 * double(cards);
    printf("%lf MB/s\n", bytes/time/1024.0/1024.0);

    // Wait for the workers to release their buffers before tearing down.
    for (int i = 0; i < cards; i++) {
        pthread_join(threads[i], NULL);
    }
    pthread_barrier_destroy(&barrier);
    return 0;
}