Commit 7730c8b

Revert "Fix paddle.vision.ops.deform_conv2d API big Tensor (#74058)" (#74430)
* Revert "Fix paddle.vision.ops.deform_conv2d API big Tensor (#74058)" This reverts commit 3067726. * test=document_fix (#74337) --------- Co-authored-by: tianshuo78520a <[email protected]>
1 parent a2ba7bb commit 7730c8b

File tree

9 files changed (+372, -381 lines)


paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu

Lines changed: 87 additions & 90 deletions
@@ -34,9 +34,9 @@ namespace plugin {
 static constexpr int kNumCUDAThreads = 512;
 static constexpr int kNumMaximumNumBlocks = 4096;
 
-static inline int NumBlocks(const int64_t N) {
-  return std::min<int64_t>((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
-                           static_cast<int64_t>(kNumMaximumNumBlocks));
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaximumNumBlocks);
 }
 
 static inline int ConvOutputSize(
@@ -367,66 +367,66 @@ __device__ half DmcnIm2colBilinear<half>(const half* bottom_data,
 
 template <typename T>
 __global__ void ModulatedDeformableIm2colGpuKernel(
-    const int64_t nthreads,
+    const int nthreads,
     const T* data_im,
     const T* data_offset,
     const T* data_mask,
-    const int64_t height,
-    const int64_t width,
-    const int64_t kernel_h,
-    const int64_t kernel_w,
-    const int64_t pad_h,
-    const int64_t pad_w,
-    const int64_t stride_h,
-    const int64_t stride_w,
-    const int64_t dilation_h,
-    const int64_t dilation_w,
-    const int64_t channel_per_deformable_group,
-    const int64_t batch_size,
-    const int64_t num_channels,
-    const int64_t deformable_group,
-    const int64_t height_col,
-    const int64_t width_col,
+    const int height,
+    const int width,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int channel_per_deformable_group,
+    const int batch_size,
+    const int num_channels,
+    const int deformable_group,
+    const int height_col,
+    const int width_col,
     T* data_col);
 
 template <>
 __global__ void ModulatedDeformableIm2colGpuKernel<float>(
-    const int64_t nthreads,
+    const int nthreads,
     const float* data_im,
     const float* data_offset,
     const float* data_mask,
-    const int64_t height,
-    const int64_t width,
-    const int64_t kernel_h,
-    const int64_t kernel_w,
-    const int64_t pad_h,
-    const int64_t pad_w,
-    const int64_t stride_h,
-    const int64_t stride_w,
-    const int64_t dilation_h,
-    const int64_t dilation_w,
-    const int64_t channel_per_deformable_group,
-    const int64_t batch_size,
-    const int64_t num_channels,
-    const int64_t deformable_group,
-    const int64_t height_col,
-    const int64_t width_col,
+    const int height,
+    const int width,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int channel_per_deformable_group,
+    const int batch_size,
+    const int num_channels,
+    const int deformable_group,
+    const int height_col,
+    const int width_col,
     float* data_col) {
-  int64_t index = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
-  int64_t offset = blockDim.x * static_cast<int64_t>(gridDim.x);
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
 
   float minus_one = -1.0f, height_t = height, width_t = width;
-  for (int64_t i = index; i < nthreads; i += offset) {
-    const int64_t w_col = i % width_col;
-    const int64_t h_col = (i / width_col) % height_col;
-    const int64_t b_col = (i / width_col) / height_col % batch_size;
-    const int64_t c_im = (i / width_col / height_col) / batch_size;
-    const int64_t c_col = c_im * kernel_h * kernel_w;
+  for (size_t i = index; i < nthreads; i += offset) {
+    const int w_col = i % width_col;
+    const int h_col = (i / width_col) % height_col;
+    const int b_col = (i / width_col) / height_col % batch_size;
+    const int c_im = (i / width_col / height_col) / batch_size;
+    const int c_col = c_im * kernel_h * kernel_w;
 
-    const int64_t deformable_group_index = c_im / channel_per_deformable_group;
+    const int deformable_group_index = c_im / channel_per_deformable_group;
 
-    const int64_t h_in = h_col * stride_h - pad_h;
-    const int64_t w_in = w_col * stride_w - pad_w;
+    const int h_in = h_col * stride_h - pad_h;
+    const int w_in = w_col * stride_w - pad_w;
 
     float* data_col_ptr =
         data_col +
@@ -440,14 +440,14 @@ __global__ void ModulatedDeformableIm2colGpuKernel<float>(
         data_mask + (b_col * deformable_group + deformable_group_index) *
                         kernel_h * kernel_w * height_col * width_col;
 
-    for (int64_t i = 0; i < kernel_h; ++i) {
-      for (int64_t j = 0; j < kernel_w; ++j) {
-        const int64_t data_offset_h_ptr =
+    for (int i = 0; i < kernel_h; ++i) {
+      for (int j = 0; j < kernel_w; ++j) {
+        const int data_offset_h_ptr =
             ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
-        const int64_t data_offset_w_ptr =
+        const int data_offset_w_ptr =
             ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
             w_col;
-        const int64_t data_mask_hw_ptr =
+        const int data_mask_hw_ptr =
             ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
 
         const float offset_h = data_offset_ptr[data_offset_h_ptr];
@@ -471,45 +471,43 @@ __global__ void ModulatedDeformableIm2colGpuKernel<float>(
 
 template <>
 __global__ void ModulatedDeformableIm2colGpuKernel<half>(
-    const int64_t nthreads,
+    const int nthreads,
     const half* data_im,
     const half* data_offset,
     const half* data_mask,
-    const int64_t height,
-    const int64_t width,
-    const int64_t kernel_h,
-    const int64_t kernel_w,
-    const int64_t pad_h,
-    const int64_t pad_w,
-    const int64_t stride_h,
-    const int64_t stride_w,
-    const int64_t dilation_h,
-    const int64_t dilation_w,
-    const int64_t channel_per_deformable_group,
-    const int64_t batch_size,
-    const int64_t num_channels,
-    const int64_t deformable_group,
-    const int64_t height_col,
-    const int64_t width_col,
+    const int height,
+    const int width,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int channel_per_deformable_group,
+    const int batch_size,
+    const int num_channels,
+    const int deformable_group,
+    const int height_col,
+    const int width_col,
     half* data_col) {
 #if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
-  int64_t index = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
-  int64_t offset = blockDim.x * static_cast<int64_t>(gridDim.x);
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
 
-  half minus_one = -1.0f,
-       height_t = static_cast<half>(static_cast<float>(height)),
-       width_t = static_cast<half>(static_cast<float>(width));
+  half minus_one = -1.0f, height_t = height, width_t = width;
   for (size_t i = index; i < nthreads; i += offset) {
-    const int64_t w_col = i % width_col;
-    const int64_t h_col = (i / width_col) % height_col;
-    const int64_t b_col = (i / width_col) / height_col % batch_size;
-    const int64_t c_im = (i / width_col / height_col) / batch_size;
-    const int64_t c_col = c_im * kernel_h * kernel_w;
+    const int w_col = i % width_col;
+    const int h_col = (i / width_col) % height_col;
+    const int b_col = (i / width_col) / height_col % batch_size;
+    const int c_im = (i / width_col / height_col) / batch_size;
+    const int c_col = c_im * kernel_h * kernel_w;
 
-    const int64_t deformable_group_index = c_im / channel_per_deformable_group;
+    const int deformable_group_index = c_im / channel_per_deformable_group;
 
-    const int64_t h_in = h_col * stride_h - pad_h;
-    const int64_t w_in = w_col * stride_w - pad_w;
+    const int h_in = h_col * stride_h - pad_h;
+    const int w_in = w_col * stride_w - pad_w;
 
     half* data_col_ptr =
         data_col +
@@ -523,22 +521,21 @@ __global__ void ModulatedDeformableIm2colGpuKernel<half>(
         data_mask + (b_col * deformable_group + deformable_group_index) *
                         kernel_h * kernel_w * height_col * width_col;
 
-    for (int64_t i = 0; i < kernel_h; ++i) {
-      for (int64_t j = 0; j < kernel_w; ++j) {
-        const int64_t data_offset_h_ptr =
+    for (int i = 0; i < kernel_h; ++i) {
+      for (int j = 0; j < kernel_w; ++j) {
+        const int data_offset_h_ptr =
             ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
-        const int64_t data_offset_w_ptr =
+        const int data_offset_w_ptr =
             ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
             w_col;
-        const int64_t data_mask_hw_ptr =
+        const int data_mask_hw_ptr =
             ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
 
         const half offset_h = data_offset_ptr[data_offset_h_ptr];
         const half offset_w = data_offset_ptr[data_offset_w_ptr];
         const half mask = data_mask_ptr[data_mask_hw_ptr];
         half val = 0;
-        half h_im_t = static_cast<float>(h_in) + i * dilation_h,
-             w_im_t = static_cast<float>(w_in) + j * dilation_w;
+        half h_im_t = h_in + i * dilation_h, w_im_t = w_in + j * dilation_w;
         const half h_im = h_im_t + offset_h;
         const half w_im = w_im_t + offset_w;
         if (h_im > minus_one && w_im > minus_one && h_im < height_t &&
0 commit comments
