Merge remote-tracking branch 'upstream/master'

anthonix · anthonix · commit 293c40034c69 · 2024-07-30T12:00:23.000-07:00
diff --git a/dev/cuda/Makefile b/dev/cuda/Makefile
@@ -30,7 +30,7 @@ MPI_PATHS = -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux-
 	$(NVCC) $(CFLAGS) $(NVCCFLAGS) $< -o $@
 
 # Build all targets
-TARGETS = adamw attention_backward attention_forward classifier_fused crossentropy_forward crossentropy_softmax_backward encoder_backward encoder_forward gelu_backward gelu_forward layernorm_backward layernorm_forward matmul_backward matmul_backward_bias matmul_forward nccl_all_reduce residual_forward softmax_forward trimat_forward fused_residual_forward  global_norm
+TARGETS = adamw attention_backward attention_forward classifier_fused crossentropy_forward crossentropy_softmax_backward encoder_backward encoder_forward gelu_backward gelu_forward layernorm_backward layernorm_forward matmul_backward matmul_backward_bias matmul_forward nccl_all_reduce residual_forward softmax_forward trimat_forward fused_residual_forward  global_norm permute
 all: $(TARGETS)
 all_ptx:  $(TARGETS:%=%.ptx)
 all_sass: $(TARGETS:%=%.sass)
@@ -64,6 +64,8 @@ matmul_backward: matmul_backward.cu
 adamw: adamw.cu
 global_norm: global_norm.cu
 
+permute: permute.cu
+
 # NCCL communication kernels
 nccl_all_reduce: nccl_all_reduce.cu
 	$(NVCC) -lmpi -lnccl $(NVCCFLAGS) $(MPI_PATHS) nccl_all_reduce.cu -o nccl_all_reduce
diff --git a/dev/cuda/attention_backward.cu b/dev/cuda/attention_backward.cu
@@ -1137,6 +1137,7 @@ int main(int argc, char **argv) {
     free(dinp);
     free(dpreatt);
     free(datt);
+    free(h_dinp);
     cudaCheck(cudaFree(d_inp));
     cudaCheck(cudaFree(d_qkvr));
     cudaCheck(cudaFree(d_preatt));
diff --git a/dev/cuda/attention_forward.cu b/dev/cuda/attention_forward.cu
@@ -1377,6 +1377,7 @@ int main(int argc, char **argv) {
     cudaCheck(cudaFree(d_preatt));
     cudaCheck(cudaFree(d_att));
     cudaCheck(cudaFree(d_inp));
+    cudaCheck(cudaFree(d_stats));
     cublasDestroy(cublas_handle);
 
     #ifdef ENABLE_CUDNN
diff --git a/dev/cuda/classifier_fused.cu b/dev/cuda/classifier_fused.cu
@@ -766,6 +766,7 @@ int main(int argc, char **argv) {
     cudaCheck(cudaFree(d_logits));
     cudaCheck(cudaFree(d_dlosses));
     cudaCheck(cudaFree(d_targets));
+    cudaCheck(cudaFree(d_dlogits_no_pad));
 
     return 0;
 }
diff --git a/dev/cuda/nccl_all_reduce.cu b/dev/cuda/nccl_all_reduce.cu
@@ -193,5 +193,6 @@ int main(int argc, char **argv) {
 
   free(all_reduce_buffer_host);
   cudaCheck(cudaFree(all_reduce_buffer));
+  cudaCheck(cudaFree(all_reduce_buffer_recv));
   multi_gpu_config_free(&multi_gpu_config);
 }
diff --git a/dev/cuda/permute.cu b/dev/cuda/permute.cu
@@ -0,0 +1,199 @@
+/*
+Kernels to demonstrate permute operation.
+
+Compile example:
+nvcc -O3 permute.cu -o permute
+
+The goal is to permute a 4D matrix from its original shape (dim1, dim2, dim3, dim4) to a new shape (dim4, dim3, dim1, dim2).
+
+Before permutation, we need to understand how to access elements in a flattened (linear) form of the matrix.
+
+Given:
+
+dim1 = size of the 1st dimension
+dim2 = size of the 2nd dimension
+dim3 = size of the 3rd dimension
+dim4 = size of the 4th dimension
+
+For any element in a 4D matrix at position (i1, i2, i3, i4), where:
+
+i1 is the index in dimension 1
+i2 is the index in dimension 2
+i3 is the index in dimension 3
+i4 is the index in dimension 4
+
+If you find it challenging to calculate the indices i1, i2, i3, and i4, observe the pattern in the index calculations.
+Initially, it might take some time to grasp, but with practice, you'll develop a mental model for it.
+
+To calculate the indices, use the following formulas:
+
+i1 = (idx / (dim2 * dim3 * dim4)) % dim1;
+i2 = (idx / (dim3 * dim4)) % dim2;
+i3 = (idx / dim4) % dim3;
+i4 = idx % dim4;
+
+Pattern Explanation:
+To find the index for any dimension, integer-divide the linear index (idx, which is the thread ID in the CUDA kernel) by the product of the sizes of all subsequent dimensions.
+Then take the result modulo the size of the current dimension.
+
+
+
+The linear index in a flattened 1D array is calculated as:
+linear_idx = i1 × ( dim2 × dim3 × dim4 ) + i2 × ( dim3 × dim4 ) + i3 × dim4 + i4
+This linear index uniquely identifies the position of the element in the 1D array.
+
+To permute the matrix, we need to rearrange the indices according to the new shape. 
+In this case, we are permuting from (dim1, dim2, dim3, dim4) to (dim4, dim3, dim1, dim2).
+
+After permutation, the dimensions are arranged as follows:
+
+dim1 becomes the new 3rd dimension.
+dim2 becomes the new 4th dimension.
+dim3 becomes the new 2nd dimension.
+dim4 becomes the new 1st dimension.
+
+permuted_idx = i4 * (dim3 * dim1 * dim2) + i3 * (dim1 * dim2) + i1 * dim2 + i2;
+
+Here's how this works:
+
+i4 * (dim3 * dim1 * dim2): This accounts for how many complete dim3 × dim1 × dim2 blocks fit before the current i4 block.
+i3 * (dim1 * dim2): This accounts for the offset within the current i4 block, specifying which i3 block we are in.
+i1 * dim2: This accounts for the offset within the current i3 block, specifying which i1 block we are in.
+i2: This gives the offset within the current i1 block.
+
+Finally, we store the value found at index idx of the original matrix at index permuted_idx of the permuted matrix.
+
+
+--------------------------------------------------------------------------------------------------------------------------------------------------------
+
+Similarly we can follow the above approach to permute matrices of any dimensions.
+
+*/
+
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <cmath> 
+
+// CPU function to permute a 4D matrix
+void permute_cpu(const float* matrix, float* out_matrix, int dim1, int dim2, int dim3, int dim4) {
+    int total_threads = dim1 * dim2 * dim3 * dim4;
+
+    for (int idx = 0; idx < total_threads; idx++) {
+        // Calculate the 4D indices from the linear index
+        int i1 = (idx / (dim2 * dim3 * dim4)) % dim1;
+        int i2 = (idx / (dim3 * dim4)) % dim2;
+        int i3 = (idx / dim4) % dim3;
+        int i4 = idx % dim4;
+
+        // Compute the new index for the permuted matrix
+        // Transpose from (dim1, dim2, dim3, dim4) to (dim4, dim3, dim1, dim2)
+        int permuted_idx = i4 * (dim3 * dim1 * dim2) + i3 * (dim1 * dim2) + i1 * dim2 + i2;
+        out_matrix[permuted_idx] = matrix[idx];
+    }
+}
+
+// CUDA kernel to permute a 4D matrix
+__global__ void permute_cuda(const float* matrix, float* out_matrix, int dim1, int dim2, int dim3, int dim4) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    
+    // Ensure index is within bounds
+    if (idx < dim1 * dim2 * dim3 * dim4) {
+        // Calculate the 4D indices from the linear index
+        int i1 = (idx / (dim2 * dim3 * dim4)) % dim1;
+        int i2 = (idx / (dim3 * dim4)) % dim2;
+        int i3 = (idx / dim4) % dim3;
+        int i4 = idx % dim4;
+
+        // Compute the new index for the permuted matrix
+        // Transpose from (dim1, dim2, dim3, dim4) to (dim4, dim3, dim1, dim2)
+        int permuted_idx = i4 * (dim3 * dim1 * dim2) + i3 * (dim1 * dim2) + i1 * dim2 + i2;
+        out_matrix[permuted_idx] = matrix[idx];
+    }
+}
+
+// Function to check if the CUDA permutation result matches the CPU result
+bool verify_results(const float* permuted_matrix_cuda, const float* permuted_matrix_cpu, int totalElements) {
+    bool success = true;
+    for (int i = 0; i < totalElements; i++) {
+        // Allow a small tolerance for floating-point comparison
+        if (fabs(permuted_matrix_cuda[i] - permuted_matrix_cpu[i]) > 1e-5) {
+            success = false;
+            printf("Permute Operation Failed\n");
+            printf("CPU: %f\n", permuted_matrix_cpu[i]);
+            printf("CUDA: %f\n", permuted_matrix_cuda[i]);
+            break; // Exit early on the first failure
+        }
+    }
+    if (success) {
+        printf("Permute Operation Passed\n");
+    }
+    return success;
+}
+
+// Function to initialize the matrix with random values
+void initialize_matrix(float* mat, int dim_1, int dim_2, int dim_3, int dim_4) {
+    for (int i = 0; i < dim_1 * dim_2 * dim_3 * dim_4; ++i) {
+        mat[i] = static_cast<float>(rand()) / RAND_MAX;
+    }
+    printf("Matrix Initialized\n");
+}
+
+int main() {
+    int dim_1 = 24;
+    int dim_2 = 42;
+    int dim_3 = 20;
+    int dim_4 = 32;
+
+    // Set up the device
+    int deviceIdx = 0;
+    cudaSetDevice(deviceIdx);
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, deviceIdx);
+    printf("Device %d: %s\n", deviceIdx, deviceProp.name);
+
+    // Allocate host memory
+    float* matrix = (float*)malloc(dim_1 * dim_2 * dim_3 * dim_4 * sizeof(float));
+    float* permuted_matrix = (float*)malloc(dim_1 * dim_2 * dim_3 * dim_4 * sizeof(float));
+    float* permuted_matrix_cpu = (float*)malloc(dim_1 * dim_2 * dim_3 * dim_4 * sizeof(float));
+
+    // Initialize the matrix with random values
+    initialize_matrix(matrix, dim_1, dim_2, dim_3, dim_4);
+
+    // Allocate device memory
+    float *d_matrix, *d_permuted_matrix;
+    cudaMalloc(&d_matrix, dim_1 * dim_2 * dim_3 * dim_4 * sizeof(float));
+    cudaMalloc(&d_permuted_matrix, dim_1 * dim_2 * dim_3 * dim_4 * sizeof(float));
+
+    // Copy matrix from host to device
+    cudaMemcpy(d_matrix, matrix, dim_1 * dim_2 * dim_3 * dim_4 * sizeof(float), cudaMemcpyHostToDevice);
+
+    // Perform permutation on CPU
+    permute_cpu(matrix, permuted_matrix_cpu, dim_1, dim_2, dim_3, dim_4);
+
+    // Define block and grid sizes
+    dim3 blockSize(256); 
+    int totalThreads = dim_1 * dim_2 * dim_3 * dim_4;
+    int gridSize = (totalThreads + blockSize.x - 1) / blockSize.x; // Compute grid size
+
+    // Launch CUDA kernel to perform permutation
+    permute_cuda<<<gridSize, blockSize>>>(d_matrix, d_permuted_matrix, dim_1, dim_2, dim_3, dim_4);
+    cudaDeviceSynchronize(); // Ensure kernel execution is complete
+
+    // Copy the result from device to host
+    cudaMemcpy(permuted_matrix, d_permuted_matrix, dim_1 * dim_2 * dim_3 * dim_4 * sizeof(float), cudaMemcpyDeviceToHost);
+
+    // Verify results
+    verify_results(permuted_matrix, permuted_matrix_cpu, dim_1 * dim_2 * dim_3 * dim_4);
+
+    // Free allocated memory
+    free(matrix);
+    free(permuted_matrix);
+    free(permuted_matrix_cpu);
+    cudaFree(d_matrix);
+    cudaFree(d_permuted_matrix);
+
+    return 0;
+}
+
diff --git a/dev/cuda/trimat_forward.cu b/dev/cuda/trimat_forward.cu
@@ -643,6 +643,7 @@ int main(int argc, char **argv) {
     free(inp);
     cudaCheck(cudaFree(d_out));
     cudaCheck(cudaFree(d_inp));
+    cudaCheck(cudaFree(d_qkvr));
     cublasDestroy(cublas_handle);
 
     return 0;
diff --git a/dev/unistd.h b/dev/unistd.h
@@ -13,6 +13,8 @@
 #include <string.h>
 #include <direct.h> // for _mkdir and _stat
 #include <io.h> // needed for _access below and _findfirst, _findnext, _findclose
+#pragma comment(lib, "Ws2_32.lib")  // Link Ws2_32.lib for socket functions
+#include <winsock2.h>
 
 #define CLOCK_MONOTONIC 0
 static inline int clock_gettime(int ignore_variable, struct timespec* tv)
diff --git a/llmc/cuda_utils.cuh b/llmc/cuda_utils.cuh
@@ -79,6 +79,36 @@ __device__ void store128cg(ElementType* target, Packed128<ElementType> value) {
 typedef Packed128<float> f128;
 typedef Packed128<floatX> x128;
 
+// ----------------------------------------------------------------------------
+// DType support
+
+// enumerator to identify the datatype of a tensor.
+enum class DType : uint8_t {
+    FP32, FP16, BF16
+};
+
+// Given a datatype enum, returns the underlying number of bytes
+// for a scalar of that type
+size_t sizeof_dtype(DType type) {
+    switch (type) {
+        case DType::FP32:
+            return sizeof(float);
+        case DType::FP16:
+            return sizeof(half);
+        case DType::BF16:
+            return sizeof(nv_bfloat16);
+        default: // handle or get compiler warning
+            fprintf(stderr, "Unknown datatype\n");
+            exit(EXIT_FAILURE);
+    }
+}
+
+DType dtype_of(float* f) { return DType::FP32; }
+DType dtype_of(nv_bfloat16 * f) { return DType::BF16; }
+DType dtype_of(half * f) { return DType::FP16; }
+
+
+
 // ----------------------------------------------------------------------------
 // Copy, cast functions
 
diff --git a/llmc/cudnn_att.cpp b/llmc/cudnn_att.cpp
@@ -3,6 +3,7 @@
 // TODO this currently duplicates some of the utilities from the main file
 
 #define NOMINMAX
+#include <unistd.h>
 #include "cudnn_att.h"
 #include <cudnn_frontend.h>
 
diff --git a/llmc/utils.h b/llmc/utils.h
@@ -16,9 +16,6 @@
 #ifndef _WIN32
 #include <dirent.h>
 #include <arpa/inet.h>
-#else
-#pragma comment(lib, "Ws2_32.lib")  // Link Ws2_32.lib for socket functions
-#include <winsock2.h>
 #endif
 
 // ----------------------------------------------------------------------------
diff --git a/profile_gpt2.cu b/profile_gpt2.cu
@@ -58,6 +58,7 @@ int main(int argc, char *argv[]) {
     model.config.num_layers = 1;
     set_zero_configs(&multi_gpu_config, 0, model.num_parameters);
 
+    gpt2_allocate_state(&model, B, T);
     // do a training step
     gpt2_forward(&model, x, B, T);
     gpt2_backward_and_reduce(&model, x, y, 1, 0);
diff --git a/scripts/run_gpt3_125M.sh b/scripts/run_gpt3_125M.sh
@@ -1,14 +1,14 @@
-# GPT-3 (125M) repro on FineWeb
+# GPT-3 (125M) repro, but using FineWeb
 # 125M parameter model on 300B tokens
 # note context length: 1024 -> 2048 for GPT-3
-# => 6 * 124e6 * 300e9 = 7.44e18 ~= 2.2e20 capability model
-# 565,950 steps of 524,288 tokens/step
-# on 8X A100 80GB SXM ($14/hr) steps in ~300ms/iter
-# => training time 565,950 * 300ms ~= 47 hours ~= $658
+# => 6 * 125e6 * 300e9 = 2.25e20 capability model
+# 572,204 steps of 524,288 tokens/step => 300B
+# on 8X A100 80GB SXM ($14/hr) steps in ~150ms/iter
+# => training time 572,204 * 150ms ~= 24 hours ~= $336
 
 make train_gpt2cu USE_CUDNN=1
-out_dir="log_gpt3_124M"
-done_file="$out_dir/DONE_00565950"
+out_dir="log_gpt3_125M"
+done_file="$out_dir/DONE_00572204"
 
 while true; do
 
@@ -18,8 +18,6 @@ while true; do
         break
     fi
 
-    # run python dev/data/fineweb.py --version 10B to prepro data
-    # run python dev/data/hellaswag.py to prepro hellaswag eval
     mpirun -np 8 ./train_gpt2cu \
                 -i "dev/data/fineweb100B/fineweb_train_*.bin" \
                 -j "dev/data/fineweb100B/fineweb_val_*.bin" \
@@ -32,11 +30,16 @@ while true; do
                 -z 1 \
                 -c 0.1 \
                 -l 0.0006 \
-                -q 0.0 \
+                -q 0.1 \
                 -u 700 \
                 -n 10000 \
+                -nk 5 \
+                -nm 50000 \
+                -ge 1 \
+                -sl 7.0 \
+                -sg 7.0 \
                 -y 1 \
-                -x 565950 \
+                -x 572204 \
                 -e "gpt3:c768"
 
     sleep 1
diff --git a/test_gpt2.cu b/test_gpt2.cu
@@ -168,6 +168,8 @@ int main(int argc, char *argv[]) {
     // overall OK signal for the test
     int allok = 1;
 
+    gpt2_allocate_state(&model, B, T);
+
     // First, do target-free forward pass to validate logits
     gpt2_forward(&model, x, B, T);
     // at this point, target should be equal to expected_logits, let's compare
@@ -346,6 +348,7 @@ int main(int argc, char *argv[]) {
     gpt2_free(&model);
     gpt2_build_from_checkpoint(&model, "test_gpt2cu_model.ckpt");
     int ld_step;
+    gpt2_allocate_state(&model, B, T);
     load_state(&ld_step, &model, &loader, "test_gpt2cu_state.ckpt");
     for (int step = 0; step < 10; step++) {
         dataloader_next_batch(&loader);
diff --git a/train_gpt2.cu b/train_gpt2.cu

Original file line number	Diff line number	Diff line change
`@@ -766,6 +766,7 @@ int main(int argc, char **argv) {`
`766`	`766`	`cudaCheck(cudaFree(d_logits));`
`767`	`767`	`cudaCheck(cudaFree(d_dlosses));`
`768`	`768`	`cudaCheck(cudaFree(d_targets));`
	`769`	`+ cudaCheck(cudaFree(d_dlogits_no_pad));`
`769`	`770`
`770`	`771`	`return 0;`
`771`	`772`	`}`
Original file line number	Diff line number	Diff line change
`@@ -193,5 +193,6 @@ int main(int argc, char **argv) {`
`193`	`193`
`194`	`194`	`free(all_reduce_buffer_host);`
`195`	`195`	`cudaCheck(cudaFree(all_reduce_buffer));`
	`196`	`+ cudaCheck(cudaFree(all_reduce_buffer_recv));`
`196`	`197`	`multi_gpu_config_free(&multi_gpu_config);`
`197`	`198`	`}`