@@ -34,9 +34,9 @@ namespace plugin {
34
34
static constexpr int kNumCUDAThreads = 512 ;
35
35
static constexpr int kNumMaximumNumBlocks = 4096 ;
36
36
37
- static inline int NumBlocks (const int64_t N) {
38
- return std::min< int64_t > ((N + kNumCUDAThreads - 1 ) / kNumCUDAThreads ,
39
- static_cast < int64_t >( kNumMaximumNumBlocks ) );
37
+ static inline int NumBlocks (const int N) {
38
+ return std::min ((N + kNumCUDAThreads - 1 ) / kNumCUDAThreads ,
39
+ kNumMaximumNumBlocks );
40
40
}
41
41
42
42
static inline int ConvOutputSize (
@@ -367,66 +367,66 @@ __device__ half DmcnIm2colBilinear<half>(const half* bottom_data,
367
367
368
368
template <typename T>
369
369
__global__ void ModulatedDeformableIm2colGpuKernel (
370
- const int64_t nthreads,
370
+ const int nthreads,
371
371
const T* data_im,
372
372
const T* data_offset,
373
373
const T* data_mask,
374
- const int64_t height,
375
- const int64_t width,
376
- const int64_t kernel_h,
377
- const int64_t kernel_w,
378
- const int64_t pad_h,
379
- const int64_t pad_w,
380
- const int64_t stride_h,
381
- const int64_t stride_w,
382
- const int64_t dilation_h,
383
- const int64_t dilation_w,
384
- const int64_t channel_per_deformable_group,
385
- const int64_t batch_size,
386
- const int64_t num_channels,
387
- const int64_t deformable_group,
388
- const int64_t height_col,
389
- const int64_t width_col,
374
+ const int height,
375
+ const int width,
376
+ const int kernel_h,
377
+ const int kernel_w,
378
+ const int pad_h,
379
+ const int pad_w,
380
+ const int stride_h,
381
+ const int stride_w,
382
+ const int dilation_h,
383
+ const int dilation_w,
384
+ const int channel_per_deformable_group,
385
+ const int batch_size,
386
+ const int num_channels,
387
+ const int deformable_group,
388
+ const int height_col,
389
+ const int width_col,
390
390
T* data_col);
391
391
392
392
template <>
393
393
__global__ void ModulatedDeformableIm2colGpuKernel<float >(
394
- const int64_t nthreads,
394
+ const int nthreads,
395
395
const float * data_im,
396
396
const float * data_offset,
397
397
const float * data_mask,
398
- const int64_t height,
399
- const int64_t width,
400
- const int64_t kernel_h,
401
- const int64_t kernel_w,
402
- const int64_t pad_h,
403
- const int64_t pad_w,
404
- const int64_t stride_h,
405
- const int64_t stride_w,
406
- const int64_t dilation_h,
407
- const int64_t dilation_w,
408
- const int64_t channel_per_deformable_group,
409
- const int64_t batch_size,
410
- const int64_t num_channels,
411
- const int64_t deformable_group,
412
- const int64_t height_col,
413
- const int64_t width_col,
398
+ const int height,
399
+ const int width,
400
+ const int kernel_h,
401
+ const int kernel_w,
402
+ const int pad_h,
403
+ const int pad_w,
404
+ const int stride_h,
405
+ const int stride_w,
406
+ const int dilation_h,
407
+ const int dilation_w,
408
+ const int channel_per_deformable_group,
409
+ const int batch_size,
410
+ const int num_channels,
411
+ const int deformable_group,
412
+ const int height_col,
413
+ const int width_col,
414
414
float * data_col) {
415
- int64_t index = static_cast < int64_t >( blockIdx .x ) * blockDim .x + threadIdx .x ;
416
- int64_t offset = blockDim .x * static_cast < int64_t >( gridDim .x ) ;
415
+ int index = blockIdx .x * blockDim .x + threadIdx .x ;
416
+ int offset = blockDim .x * gridDim .x ;
417
417
418
418
float minus_one = -1 .0f , height_t = height, width_t = width;
419
- for (int64_t i = index; i < nthreads; i += offset) {
420
- const int64_t w_col = i % width_col;
421
- const int64_t h_col = (i / width_col) % height_col;
422
- const int64_t b_col = (i / width_col) / height_col % batch_size;
423
- const int64_t c_im = (i / width_col / height_col) / batch_size;
424
- const int64_t c_col = c_im * kernel_h * kernel_w;
419
+ for (size_t i = index; i < nthreads; i += offset) {
420
+ const int w_col = i % width_col;
421
+ const int h_col = (i / width_col) % height_col;
422
+ const int b_col = (i / width_col) / height_col % batch_size;
423
+ const int c_im = (i / width_col / height_col) / batch_size;
424
+ const int c_col = c_im * kernel_h * kernel_w;
425
425
426
- const int64_t deformable_group_index = c_im / channel_per_deformable_group;
426
+ const int deformable_group_index = c_im / channel_per_deformable_group;
427
427
428
- const int64_t h_in = h_col * stride_h - pad_h;
429
- const int64_t w_in = w_col * stride_w - pad_w;
428
+ const int h_in = h_col * stride_h - pad_h;
429
+ const int w_in = w_col * stride_w - pad_w;
430
430
431
431
float * data_col_ptr =
432
432
data_col +
@@ -440,14 +440,14 @@ __global__ void ModulatedDeformableIm2colGpuKernel<float>(
440
440
data_mask + (b_col * deformable_group + deformable_group_index) *
441
441
kernel_h * kernel_w * height_col * width_col;
442
442
443
- for (int64_t i = 0 ; i < kernel_h; ++i) {
444
- for (int64_t j = 0 ; j < kernel_w; ++j) {
445
- const int64_t data_offset_h_ptr =
443
+ for (int i = 0 ; i < kernel_h; ++i) {
444
+ for (int j = 0 ; j < kernel_w; ++j) {
445
+ const int data_offset_h_ptr =
446
446
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
447
- const int64_t data_offset_w_ptr =
447
+ const int data_offset_w_ptr =
448
448
((2 * (i * kernel_w + j) + 1 ) * height_col + h_col) * width_col +
449
449
w_col;
450
- const int64_t data_mask_hw_ptr =
450
+ const int data_mask_hw_ptr =
451
451
((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
452
452
453
453
const float offset_h = data_offset_ptr[data_offset_h_ptr];
@@ -471,45 +471,43 @@ __global__ void ModulatedDeformableIm2colGpuKernel<float>(
471
471
472
472
template <>
473
473
__global__ void ModulatedDeformableIm2colGpuKernel<half>(
474
- const int64_t nthreads,
474
+ const int nthreads,
475
475
const half* data_im,
476
476
const half* data_offset,
477
477
const half* data_mask,
478
- const int64_t height,
479
- const int64_t width,
480
- const int64_t kernel_h,
481
- const int64_t kernel_w,
482
- const int64_t pad_h,
483
- const int64_t pad_w,
484
- const int64_t stride_h,
485
- const int64_t stride_w,
486
- const int64_t dilation_h,
487
- const int64_t dilation_w,
488
- const int64_t channel_per_deformable_group,
489
- const int64_t batch_size,
490
- const int64_t num_channels,
491
- const int64_t deformable_group,
492
- const int64_t height_col,
493
- const int64_t width_col,
478
+ const int height,
479
+ const int width,
480
+ const int kernel_h,
481
+ const int kernel_w,
482
+ const int pad_h,
483
+ const int pad_w,
484
+ const int stride_h,
485
+ const int stride_w,
486
+ const int dilation_h,
487
+ const int dilation_w,
488
+ const int channel_per_deformable_group,
489
+ const int batch_size,
490
+ const int num_channels,
491
+ const int deformable_group,
492
+ const int height_col,
493
+ const int width_col,
494
494
half* data_col) {
495
495
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
496
- int64_t index = static_cast < int64_t >( blockIdx .x ) * blockDim .x + threadIdx .x ;
497
- int64_t offset = blockDim .x * static_cast < int64_t >( gridDim .x ) ;
496
+ int index = blockIdx .x * blockDim .x + threadIdx .x ;
497
+ int offset = blockDim .x * gridDim .x ;
498
498
499
- half minus_one = -1 .0f ,
500
- height_t = static_cast <half>(static_cast <float >(height)),
501
- width_t = static_cast <half>(static_cast <float >(width));
499
+ half minus_one = -1 .0f , height_t = height, width_t = width;
502
500
for (size_t i = index; i < nthreads; i += offset) {
503
- const int64_t w_col = i % width_col;
504
- const int64_t h_col = (i / width_col) % height_col;
505
- const int64_t b_col = (i / width_col) / height_col % batch_size;
506
- const int64_t c_im = (i / width_col / height_col) / batch_size;
507
- const int64_t c_col = c_im * kernel_h * kernel_w;
501
+ const int w_col = i % width_col;
502
+ const int h_col = (i / width_col) % height_col;
503
+ const int b_col = (i / width_col) / height_col % batch_size;
504
+ const int c_im = (i / width_col / height_col) / batch_size;
505
+ const int c_col = c_im * kernel_h * kernel_w;
508
506
509
- const int64_t deformable_group_index = c_im / channel_per_deformable_group;
507
+ const int deformable_group_index = c_im / channel_per_deformable_group;
510
508
511
- const int64_t h_in = h_col * stride_h - pad_h;
512
- const int64_t w_in = w_col * stride_w - pad_w;
509
+ const int h_in = h_col * stride_h - pad_h;
510
+ const int w_in = w_col * stride_w - pad_w;
513
511
514
512
half* data_col_ptr =
515
513
data_col +
@@ -523,22 +521,21 @@ __global__ void ModulatedDeformableIm2colGpuKernel<half>(
523
521
data_mask + (b_col * deformable_group + deformable_group_index) *
524
522
kernel_h * kernel_w * height_col * width_col;
525
523
526
- for (int64_t i = 0 ; i < kernel_h; ++i) {
527
- for (int64_t j = 0 ; j < kernel_w; ++j) {
528
- const int64_t data_offset_h_ptr =
524
+ for (int i = 0 ; i < kernel_h; ++i) {
525
+ for (int j = 0 ; j < kernel_w; ++j) {
526
+ const int data_offset_h_ptr =
529
527
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
530
- const int64_t data_offset_w_ptr =
528
+ const int data_offset_w_ptr =
531
529
((2 * (i * kernel_w + j) + 1 ) * height_col + h_col) * width_col +
532
530
w_col;
533
- const int64_t data_mask_hw_ptr =
531
+ const int data_mask_hw_ptr =
534
532
((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
535
533
536
534
const half offset_h = data_offset_ptr[data_offset_h_ptr];
537
535
const half offset_w = data_offset_ptr[data_offset_w_ptr];
538
536
const half mask = data_mask_ptr[data_mask_hw_ptr];
539
537
half val = 0 ;
540
- half h_im_t = static_cast <float >(h_in) + i * dilation_h,
541
- w_im_t = static_cast <float >(w_in) + j * dilation_w;
538
+ half h_im_t = h_in + i * dilation_h, w_im_t = w_in + j * dilation_w;
542
539
const half h_im = h_im_t + offset_h;
543
540
const half w_im = w_im_t + offset_w;
544
541
if (h_im > minus_one && w_im > minus_one && h_im < height_t &&
0 commit comments