Skip to content

Commit 67a68c2

Browse files
eyalroz authored and eyalroz-gehc committed
Fixes #711, fixes #723: Update static tables with (micro)architecture capabilities in device_properties.hpp
1 parent 93b6c48 commit 67a68c2

File tree

1 file changed

+54
-46
lines changed

1 file changed

+54
-46
lines changed

src/cuda/api/detail/device_properties.hpp

Lines changed: 54 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -159,33 +159,38 @@ namespace detail_ {
159159
inline constexpr unsigned max_in_flight_threads_per_processor(const compute_capability_t& cc)
160160
{
161161
return
162-
(cc.architecture.major == 1) ? 8 :
163-
(cc.architecture.major == 2) ? 32 :
164-
(cc.architecture.major == 3) ? 192 :
162+
(cc.architecture.major == 1) ? 8 :
163+
(cc.architecture.major == 2) ? 32 :
164+
(cc.architecture.major == 3) ? 192 :
165165
// Note: No architecture number 4!
166-
(cc.architecture.major == 5) ? 128 :
167-
(cc.as_combined_number() == 60) ? 64 :
168-
(cc.architecture.major == 6) ? 128 :
169-
(cc.architecture.major == 7) ? 64 :
170-
(cc.as_combined_number() == 80) ? 64 :
171-
(cc.architecture.major == 8) ? 128 :
172-
(cc.architecture.major == 9) ? 128 :
166+
(cc.architecture.major == 5) ? 128 :
167+
(cc.as_combined_number() == 60) ? 64 :
168+
(cc.architecture.major == 6) ? 128 :
169+
(cc.architecture.major == 7) ? 64 :
170+
(cc.as_combined_number() == 80) ? 64 :
171+
(cc.architecture.major == 8) ? 128 :
172+
(cc.architecture.major == 9) ? 128 :
173+
(cc.architecture.major == 10) ? 128 :
174+
(cc.as_combined_number() == 120) ? 128 :
173175
invalid_compute_capability_return;
174176
}
175177

176178
inline constexpr unsigned max_warp_schedulings_per_processor_cycle(const compute_capability_t& cc)
177179
{
178180
return
179-
(cc.architecture.major == 1) ? 1 :
180-
(cc.architecture.major == 2) ? 2 :
181-
(cc.architecture.major == 3) ? 4 :
181+
(cc.architecture.major == 1) ? 1 :
182+
(cc.architecture.major == 2) ? 2 :
183+
(cc.architecture.major == 3) ? 4 :
182184
// Note: No architecture number 4!
183-
(cc.architecture.major == 5) ? 4 :
184-
(cc.as_combined_number() == 60) ? 2 :
185-
(cc.architecture.major == 6) ? 4 :
186-
(cc.architecture.major == 7) ? 4 :
187-
(cc.architecture.major == 8) ? 4 :
188-
(cc.architecture.major == 9) ? 4 :
185+
(cc.architecture.major == 5) ? 4 :
186+
(cc.as_combined_number() == 60) ? 2 :
187+
(cc.architecture.major == 6) ? 4 :
188+
(cc.architecture.major == 7) ? 4 :
189+
(cc.architecture.major == 8) ? 4 :
190+
(cc.architecture.major == 9) ? 4 :
191+
(cc.architecture.major == 10) ? 4 :
192+
// Note: No architecture number 11!
193+
(cc.as_combined_number() == 120) ? 4 :
189194
invalid_compute_capability_return;
190195
}
191196

@@ -201,46 +206,49 @@ inline constexpr unsigned max_warp_schedulings_per_processor_cycle(const compute
201206
* cudaSharedmemCarveoutMaxShared
202207
* );
203208
*
204-
* for details, see the CUDA Programming Guide, section K.7.3
209+
* for details, see the CUDA C++ Programming Guide v12.8, section 16.6.4
205210
*/
206211
inline constexpr unsigned max_shared_memory_per_block(const compute_capability_t& cc)
207212
{
213+
// Based on table 24 in the CUDA C++ Programming Guide
208214
return
209-
(cc.architecture.major == 1) ? 16 * KiB :
210-
(cc.architecture.major == 2) ? 48 * KiB :
211-
(cc.architecture.major == 3) ? 48 * KiB :
215+
(cc.architecture.major == 1) ? 16 * KiB :
216+
(cc.architecture.major == 2) ? 48 * KiB :
217+
(cc.architecture.major == 3) ? 48 * KiB :
212218
// Note: No architecture number 4!
213-
(cc.architecture.major == 5) ? 48 * KiB :
214-
(cc.architecture.major == 6) ? 48 * KiB :
215-
(cc.as_combined_number() == 7) ? 64 * KiB : // of 128
216-
(cc.as_combined_number() == 72) ? 48 * KiB : // of 128
217-
(cc.as_combined_number() == 75) ? 64 * KiB : // of 96
218-
(cc.architecture.major == 7) ? 96 * KiB : // of 128
219-
(cc.as_combined_number() == 80) ? 163 * KiB : // of 192
220-
(cc.as_combined_number() == 86) ? 99 * KiB : // of 128
221-
(cc.as_combined_number() == 87) ? 163 * KiB : // of 192
222-
(cc.as_combined_number() == 89) ? 99 * KiB : // of 100
223-
(cc.as_combined_number() == 90) ? 227 * KiB : // of 256
224-
(cc.architecture.major == 10) ? 99 * KiB : // of 256
225-
(cc.architecture.major == 12) ? 99 * KiB : // of 256
219+
(cc.architecture.major == 5) ? 48 * KiB :
220+
(cc.architecture.major == 6) ? 48 * KiB :
221+
(cc.as_combined_number() == 70) ? 96 * KiB : // of 96
222+
(cc.as_combined_number() == 72) ? 96 * KiB : // of 96
223+
(cc.as_combined_number() == 75) ? 64 * KiB : // of 64
224+
(cc.as_combined_number() == 80) ? 163 * KiB : // of 164
225+
(cc.as_combined_number() == 86) ? 99 * KiB : // of 100
226+
(cc.as_combined_number() == 87) ? 163 * KiB : // of 164
227+
(cc.as_combined_number() == 89) ? 99 * KiB : // of 100
228+
(cc.as_combined_number() == 90) ? 227 * KiB : // of 228
229+
(cc.architecture.major == 10) ? 99 * KiB : // of 256
230+
(cc.as_combined_number() == 120) ? 99 * KiB : // of 128
226231
invalid_compute_capability_return;
227232
}
228233

229234

230235
inline constexpr unsigned max_resident_warps_per_processor(const compute_capability_t& cc) noexcept
231236
{
237+
// Based on table 24 in the CUDA C++ Programming Guide
232238
return
233-
(cc.architecture.major == 1) ? 24 :
234-
(cc.architecture.major == 2) ? 48 :
235-
(cc.architecture.major == 3) ? 64 :
239+
(cc.architecture.major == 1) ? 24 :
240+
(cc.architecture.major == 2) ? 48 :
241+
(cc.architecture.major == 3) ? 64 :
236242
// Note: No architecture number 4!
237-
(cc.architecture.major == 5) ? 64 :
238-
(cc.architecture.major == 6) ? 64 :
239-
(cc.as_combined_number() == 75) ? 32 :
240-
(cc.architecture.major == 7) ? 64 :
241-
(cc.as_combined_number() == 80) ? 64 :
242-
(cc.architecture.major == 8) ? 48 :
243-
(cc.as_combined_number() == 90) ? 64 :
243+
(cc.architecture.major == 5) ? 64 :
244+
(cc.architecture.major == 6) ? 64 :
245+
(cc.as_combined_number() == 75) ? 32 :
246+
(cc.architecture.major == 7) ? 64 :
247+
(cc.as_combined_number() == 80) ? 64 :
248+
(cc.architecture.major == 8) ? 48 :
249+
(cc.as_combined_number() == 90) ? 64 :
250+
(cc.architecture.major == 10) ? 64 :
251+
(cc.as_combined_number() == 120) ? 48 :
244252
invalid_compute_capability_return;
245253
}
246254

0 commit comments

Comments
 (0)