@@ -159,33 +159,38 @@ namespace detail_ {
159
159
inline constexpr unsigned max_in_flight_threads_per_processor (const compute_capability_t & cc)
160
160
{
161
161
return
162
- (cc.architecture .major == 1 ) ? 8 :
163
- (cc.architecture .major == 2 ) ? 32 :
164
- (cc.architecture .major == 3 ) ? 192 :
162
+ (cc.architecture .major == 1 ) ? 8 :
163
+ (cc.architecture .major == 2 ) ? 32 :
164
+ (cc.architecture .major == 3 ) ? 192 :
165
165
// Note: No architecture number 4!
166
- (cc.architecture .major == 5 ) ? 128 :
167
- (cc.as_combined_number () == 60 ) ? 64 :
168
- (cc.architecture .major == 6 ) ? 128 :
169
- (cc.architecture .major == 7 ) ? 64 :
170
- (cc.as_combined_number () == 80 ) ? 64 :
171
- (cc.architecture .major == 8 ) ? 128 :
172
- (cc.architecture .major == 9 ) ? 128 :
166
+ (cc.architecture .major == 5 ) ? 128 :
167
+ (cc.as_combined_number () == 60 ) ? 64 :
168
+ (cc.architecture .major == 6 ) ? 128 :
169
+ (cc.architecture .major == 7 ) ? 64 :
170
+ (cc.as_combined_number () == 80 ) ? 64 :
171
+ (cc.architecture .major == 8 ) ? 128 :
172
+ (cc.architecture .major == 9 ) ? 128 :
173
+ (cc.architecture .major == 10 ) ? 128 :
174
+ (cc.as_combined_number () == 120 ) ? 128 :
173
175
invalid_compute_capability_return;
174
176
}
175
177
176
178
inline constexpr unsigned max_warp_schedulings_per_processor_cycle (const compute_capability_t & cc)
177
179
{
178
180
return
179
- (cc.architecture .major == 1 ) ? 1 :
180
- (cc.architecture .major == 2 ) ? 2 :
181
- (cc.architecture .major == 3 ) ? 4 :
181
+ (cc.architecture .major == 1 ) ? 1 :
182
+ (cc.architecture .major == 2 ) ? 2 :
183
+ (cc.architecture .major == 3 ) ? 4 :
182
184
// Note: No architecture number 4!
183
- (cc.architecture .major == 5 ) ? 4 :
184
- (cc.as_combined_number () == 60 ) ? 2 :
185
- (cc.architecture .major == 6 ) ? 4 :
186
- (cc.architecture .major == 7 ) ? 4 :
187
- (cc.architecture .major == 8 ) ? 4 :
188
- (cc.architecture .major == 9 ) ? 4 :
185
+ (cc.architecture .major == 5 ) ? 4 :
186
+ (cc.as_combined_number () == 60 ) ? 2 :
187
+ (cc.architecture .major == 6 ) ? 4 :
188
+ (cc.architecture .major == 7 ) ? 4 :
189
+ (cc.architecture .major == 8 ) ? 4 :
190
+ (cc.architecture .major == 9 ) ? 4 :
191
+ (cc.architecture .major == 10 ) ? 4 :
192
+ // Note: No architecture number 11!
193
+ (cc.as_combined_number () == 120 ) ? 4 :
189
194
invalid_compute_capability_return;
190
195
}
191
196
@@ -201,46 +206,49 @@ inline constexpr unsigned max_warp_schedulings_per_processor_cycle(const compute
201
206
* cudaSharedmemCarveoutMaxShared
202
207
* );
203
208
*
204
- * for details, see the CUDA Programming Guide, section K.7.3
209
+ * for details, see the CUDA C++ Programming Guide v12.8 , section 16.6.4
205
210
*/
206
211
inline constexpr unsigned max_shared_memory_per_block (const compute_capability_t & cc)
207
212
{
213
+ // Based on table 24 in the CUDA C++ Programming Guide
208
214
return
209
- (cc.architecture .major == 1 ) ? 16 * KiB :
210
- (cc.architecture .major == 2 ) ? 48 * KiB :
211
- (cc.architecture .major == 3 ) ? 48 * KiB :
215
+ (cc.architecture .major == 1 ) ? 16 * KiB :
216
+ (cc.architecture .major == 2 ) ? 48 * KiB :
217
+ (cc.architecture .major == 3 ) ? 48 * KiB :
212
218
// Note: No architecture number 4!
213
- (cc.architecture .major == 5 ) ? 48 * KiB :
214
- (cc.architecture .major == 6 ) ? 48 * KiB :
215
- (cc.as_combined_number () == 7 ) ? 64 * KiB : // of 128
216
- (cc.as_combined_number () == 72 ) ? 48 * KiB : // of 128
217
- (cc.as_combined_number () == 75 ) ? 64 * KiB : // of 96
218
- (cc.architecture .major == 7 ) ? 96 * KiB : // of 128
219
- (cc.as_combined_number () == 80 ) ? 163 * KiB : // of 192
220
- (cc.as_combined_number () == 86 ) ? 99 * KiB : // of 128
221
- (cc.as_combined_number () == 87 ) ? 163 * KiB : // of 192
222
- (cc.as_combined_number () == 89 ) ? 99 * KiB : // of 100
223
- (cc.as_combined_number () == 90 ) ? 227 * KiB : // of 256
224
- (cc.architecture .major == 10 ) ? 99 * KiB : // of 256
225
- (cc.architecture .major == 12 ) ? 99 * KiB : // of 256
219
+ (cc.architecture .major == 5 ) ? 48 * KiB :
220
+ (cc.architecture .major == 6 ) ? 48 * KiB :
221
+ (cc.as_combined_number () == 70 ) ? 96 * KiB : // of 96
222
+ (cc.as_combined_number () == 72 ) ? 96 * KiB : // of 96
223
+ (cc.as_combined_number () == 75 ) ? 64 * KiB : // of 64
224
+ (cc.as_combined_number () == 80 ) ? 163 * KiB : // of 164
225
+ (cc.as_combined_number () == 86 ) ? 99 * KiB : // of 100
226
+ (cc.as_combined_number () == 87 ) ? 163 * KiB : // of 164
227
+ (cc.as_combined_number () == 89 ) ? 99 * KiB : // of 100
228
+ (cc.as_combined_number () == 90 ) ? 227 * KiB : // of 228
229
+ (cc.architecture .major == 10 ) ? 99 * KiB : // of 256
230
+ (cc.as_combined_number () == 120 ) ? 99 * KiB : // of 128
226
231
invalid_compute_capability_return;
227
232
}
228
233
229
234
230
235
inline constexpr unsigned max_resident_warps_per_processor (const compute_capability_t & cc) noexcept
231
236
{
237
+ // Based on table 24 in the CUDA C++ Programming Guide
232
238
return
233
- (cc.architecture .major == 1 ) ? 24 :
234
- (cc.architecture .major == 2 ) ? 48 :
235
- (cc.architecture .major == 3 ) ? 64 :
239
+ (cc.architecture .major == 1 ) ? 24 :
240
+ (cc.architecture .major == 2 ) ? 48 :
241
+ (cc.architecture .major == 3 ) ? 64 :
236
242
// Note: No architecture number 4!
237
- (cc.architecture .major == 5 ) ? 64 :
238
- (cc.architecture .major == 6 ) ? 64 :
239
- (cc.as_combined_number () == 75 ) ? 32 :
240
- (cc.architecture .major == 7 ) ? 64 :
241
- (cc.as_combined_number () == 80 ) ? 64 :
242
- (cc.architecture .major == 8 ) ? 48 :
243
- (cc.as_combined_number () == 90 ) ? 64 :
243
+ (cc.architecture .major == 5 ) ? 64 :
244
+ (cc.architecture .major == 6 ) ? 64 :
245
+ (cc.as_combined_number () == 75 ) ? 32 :
246
+ (cc.architecture .major == 7 ) ? 64 :
247
+ (cc.as_combined_number () == 80 ) ? 64 :
248
+ (cc.architecture .major == 8 ) ? 48 :
249
+ (cc.as_combined_number () == 90 ) ? 64 :
250
+ (cc.architecture .major == 10 ) ? 64 :
251
+ (cc.as_combined_number () == 120 ) ? 48 :
244
252
invalid_compute_capability_return;
245
253
}
246
254
0 commit comments