diff --git a/cmake/test_macros.cmake b/cmake/test_macros.cmake
index 1505d5f8..4b96db17 100644
--- a/cmake/test_macros.cmake
+++ b/cmake/test_macros.cmake
@@ -28,8 +28,9 @@ include(GoogleTest)
 # Features:
 #   1. Create executable target
 #   2. Configure compile options, link libraries, and include paths
-#   3. Use gtest_discover_tests to auto-discover test cases
-#   4. Set test labels
+#   3. Use gtest_discover_tests to auto-discover CPU test cases
+#   4. Register CUDA tests at binary granularity with CTest GPU resources
+#   5. Set test labels
 #
 # Arguments:
 #   SOURCES: Source file list (required)
@@ -73,7 +74,7 @@ macro(infini_train_add_test)
     # 5. Link project library (reuses framework linking strategy)
     link_infini_train_exe(${ARG_TEST_NAME})
 
-    # 6. Auto-discover gtest cases and register as ctest tests
+    # 6. Register tests
     set(labels "cpu")
     if(ARG_LABELS)
         set(labels "${ARG_LABELS}")
@@ -84,16 +85,30 @@ macro(infini_train_add_test)
         set(test_timeout ${ARG_TEST_TIMEOUT})
     endif()
 
-    if(ARG_TEST_FILTER)
+    list(FIND labels cuda _has_cuda_label)
+    if(NOT _has_cuda_label EQUAL -1)
+        set(_cuda_test_args)
+        if(ARG_TEST_FILTER)
+            list(APPEND _cuda_test_args --gtest_filter=${ARG_TEST_FILTER})
+        endif()
+
+        add_test(
+            NAME ${ARG_TEST_NAME}
+            COMMAND $<TARGET_FILE:${ARG_TEST_NAME}> ${_cuda_test_args}
+        )
+        set_tests_properties(${ARG_TEST_NAME}
+            PROPERTIES
+                LABELS "${labels}"
+                TIMEOUT ${test_timeout}
+        )
+    elseif(ARG_TEST_FILTER)
         gtest_discover_tests(${ARG_TEST_NAME}
-            EXTRA_ARGS --gtest_output=xml:%T.xml
             TEST_FILTER "${ARG_TEST_FILTER}"
             DISCOVERY_TIMEOUT 10
             PROPERTIES LABELS "${labels}" TIMEOUT ${test_timeout}
         )
     else()
         gtest_discover_tests(${ARG_TEST_NAME}
-            EXTRA_ARGS --gtest_output=xml:%T.xml
             PROPERTIES LABELS "${labels}" TIMEOUT ${test_timeout}
         )
     endif()
diff --git a/scripts/compare_utils.py b/scripts/compare_utils.py
index 0831f7be..f29ea49d 100644
--- a/scripts/compare_utils.py
+++ b/scripts/compare_utils.py
@@ -8,7 +8,9 @@ def collect_log_files(base_dir: Path):
     duplicates = {}
 
     for path in base_dir.rglob("*.log"):
-        if path.name.startswith("build") or path.name.endswith("_profile.log"):
+        if not path.name.startswith(("gpt2_", "llama3_")):
+            continue
+        if path.name.endswith("_profile.log"):
             continue
 
         key = path.name
diff --git a/scripts/run_models_and_profile.bash b/scripts/run_models_and_profile.bash
index e3c67293..1a58c681 100755
--- a/scripts/run_models_and_profile.bash
+++ b/scripts/run_models_and_profile.bash
@@ -71,7 +71,6 @@ LOG_DIR="$(read_var LOG_DIR)"; : "${LOG_DIR:=logs}"
 PROFILE_LOG_DIR="$(read_var PROFILE_LOG_DIR)"; : "${PROFILE_LOG_DIR:=./profile_logs}"
 COMPARE_LOG_DIR="$(read_var COMPARE_LOG_DIR)"; : "${COMPARE_LOG_DIR:=}"
 RUN_CTEST="$(read_var RUN_CTEST)"; : "${RUN_CTEST:=true}"
-CTEST_CMD="$(read_var CTEST_CMD)"; : "${CTEST_CMD:=ctest --output-on-failure -LE cuda -j$(nproc) && ctest --output-on-failure -L cuda -j1}"
 
 mkdir -p "$BUILD_DIR" "$LOG_DIR" "$PROFILE_LOG_DIR"
 
@@ -114,6 +113,74 @@ clean_build_dir() {
     rm -rf "${BUILD_DIR:?}/"*
 }
 
+run_ctest() {
+    local gpu_list=()
+    local cuda_tests=()
+
+    if [[ -n "${CTEST_CUDA_GPUS:-}" ]]; then
+        IFS=',' read -r -a gpu_list <<< "$CTEST_CUDA_GPUS"
+    elif command -v nvidia-smi >/dev/null 2>&1; then
+        mapfile -t gpu_list < <(nvidia-smi --query-gpu=index --format=csv,noheader 2>/dev/null || true)
+    fi
+
+    if [[ ${#gpu_list[@]} -eq 0 ]]; then
+        gpu_list=(0)
+    fi
+
+    local filtered_gpu_list=()
+    local gpu
+    for gpu in "${gpu_list[@]}"; do
+        gpu="${gpu//[[:space:]]/}"
+        [[ -z "$gpu" ]] && continue
+        filtered_gpu_list+=("$gpu")
+    done
+
+    if [[ ${#filtered_gpu_list[@]} -eq 0 ]]; then
+        filtered_gpu_list=(0)
+    fi
+
+    ctest --output-on-failure -LE cuda -j"$(nproc)"
+
+    mapfile -t cuda_tests < <(ctest -N -L cuda | sed -n 's/^ *Test *#[0-9][0-9]*: //p')
+    if [[ ${#cuda_tests[@]} -eq 0 ]]; then
+        return 0
+    fi
+
+    local worker_count="${#filtered_gpu_list[@]}"
+    local pids=()
+    local worker_idx
+    for ((worker_idx = 0; worker_idx < worker_count; worker_idx++)); do
+        (
+            local worker_failed=0
+            local test_idx="$worker_idx"
+            local test_name
+            local assigned_gpu="${filtered_gpu_list[$worker_idx]}"
+
+            while ((test_idx < ${#cuda_tests[@]})); do
+                test_name="${cuda_tests[$test_idx]}"
+                echo "[CUDA GPU ${assigned_gpu}] ${test_name}"
+                if ! CUDA_VISIBLE_DEVICES="$assigned_gpu" ctest --output-on-failure -R "^${test_name}$" -j1; then
+                    worker_failed=1
+                fi
+                test_idx=$((test_idx + worker_count))
+            done
+
+            exit "$worker_failed"
+        ) &
+        pids+=("$!")
+    done
+
+    local failed=0
+    local pid
+    for pid in "${pids[@]}"; do
+        if ! wait "$pid"; then
+            failed=1
+        fi
+    done
+
+    return "$failed"
+}
+
 # Run a command and log output
 run_and_log() {
     local cmd="$1"
@@ -247,7 +314,7 @@ for ((id=0; id