From cde045869bbbf1b3b77f773323e19d3e7d59668c Mon Sep 17 00:00:00 2001 From: cx Date: Tue, 9 Jun 2026 08:40:07 +0000 Subject: [PATCH 1/2] Organize benchmark logs by date/branch/commit and propagate run metadata to Feishu Restructure run_models_and_profile.bash to write logs under {date}/{branch}_{commit}/ and emit a run_metadata.log plus per-log metadata headers (branch, commit, start time). write_to_feishu_sheet.py now reads this metadata so each sheet row reflects the actual run instead of the current git state, and accepts --log-dir/--profile-log-dir. compare_utils.py skips the new run_metadata.log when collecting logs. --- scripts/compare_utils.py | 2 + scripts/run_models_and_profile.bash | 54 +++++++++++++++++-- scripts/write_to_feishu_sheet.py | 82 ++++++++++++++++++++++++++--- 3 files changed, 128 insertions(+), 10 deletions(-) diff --git a/scripts/compare_utils.py b/scripts/compare_utils.py index 0831f7be..8083bec2 100644 --- a/scripts/compare_utils.py +++ b/scripts/compare_utils.py @@ -8,6 +8,8 @@ def collect_log_files(base_dir: Path): duplicates = {} for path in base_dir.rglob("*.log"): + if path.name == "run_metadata.log": + continue if path.name.startswith("build") or path.name.endswith("_profile.log"): continue diff --git a/scripts/run_models_and_profile.bash b/scripts/run_models_and_profile.bash index e3c67293..110d2a69 100755 --- a/scripts/run_models_and_profile.bash +++ b/scripts/run_models_and_profile.bash @@ -73,10 +73,7 @@ COMPARE_LOG_DIR="$(read_var COMPARE_LOG_DIR)"; : "${COMPARE_LOG_DIR:=}" RUN_CTEST="$(read_var RUN_CTEST)"; : "${RUN_CTEST:=true}" CTEST_CMD="$(read_var CTEST_CMD)"; : "${CTEST_CMD:=ctest --output-on-failure -LE cuda -j$(nproc) && ctest --output-on-failure -L cuda -j1}" -mkdir -p "$BUILD_DIR" "$LOG_DIR" "$PROFILE_LOG_DIR" - -# export custom PATHs -export BUILD_DIR LOG_DIR PROFILE_LOG_DIR +# export custom variables from config first. LOG_DIR/PROFILE_LOG_DIR are normalized below. 
while IFS="=" read -r k v; do [[ -z "$k" || "$k" == "null" ]] && continue export "$k"="$v" @@ -86,6 +83,50 @@ done < <(jq -r '.variables | to_entries[] | "\(.key)=\(.value)"' "$CONFIG_FILE") LAST_CMAKE_CMD="" declare -A SELECTED_TAGS=() +RUN_STARTED_AT="$(date '+%Y-%m-%d %H:%M:%S')" +RUN_ID="$(date '+%Y%m%d_%H%M%S')" +RUN_DATE="$(date '+%Y%m%d')" +GIT_BRANCH="$(git rev-parse --abbrev-ref HEAD 2>/dev/null || true)" +: "${GIT_BRANCH:=unknown}" +GIT_COMMIT_FULL="$(git rev-parse HEAD 2>/dev/null || true)" +: "${GIT_COMMIT_FULL:=unknown}" +GIT_COMMIT_SHORT="${GIT_COMMIT_FULL:0:7}" +SAFE_GIT_BRANCH="${GIT_BRANCH//\//_}" +SAFE_GIT_BRANCH="${SAFE_GIT_BRANCH//[[:space:]]/_}" +SAFE_GIT_BRANCH="$(printf '%s' "$SAFE_GIT_BRANCH" | tr -cd '[:alnum:]_.-')" +: "${SAFE_GIT_BRANCH:=unknown}" + +LOG_DIR_BASENAME="$(basename "$LOG_DIR")" +PROFILE_LOG_DIR_BASENAME="$(basename "$PROFILE_LOG_DIR")" +LOG_DIR_PARENT="$(dirname "$LOG_DIR")" +if [[ "$LOG_DIR_PARENT" == "." ]]; then + RUN_OUTPUT_DIR="${RUN_DATE}/${SAFE_GIT_BRANCH}_${GIT_COMMIT_SHORT}" +else + RUN_OUTPUT_DIR="${LOG_DIR_PARENT}/${RUN_DATE}/${SAFE_GIT_BRANCH}_${GIT_COMMIT_SHORT}" +fi +LOG_DIR="${RUN_OUTPUT_DIR}/${LOG_DIR_BASENAME}" +PROFILE_LOG_DIR="${RUN_OUTPUT_DIR}/${PROFILE_LOG_DIR_BASENAME}" + +mkdir -p "$BUILD_DIR" "$LOG_DIR" "$PROFILE_LOG_DIR" +export BUILD_DIR LOG_DIR PROFILE_LOG_DIR + +RUN_METADATA_FILE="${LOG_DIR}/run_metadata.log" +: > "$RUN_METADATA_FILE" +RUN_METADATA_FILE="$(realpath "$RUN_METADATA_FILE")" +{ + echo "[RUN_STARTED_AT] $RUN_STARTED_AT" + echo "[RUN_ID] $RUN_ID" + echo "[GIT_BRANCH] $GIT_BRANCH" + echo "[GIT_COMMIT] $GIT_COMMIT_FULL" + echo "[GIT_COMMIT_SHORT] $GIT_COMMIT_SHORT" + echo "[CONFIG_FILE] $CONFIG_FILE" + echo "[LOG_DIR] $(realpath "$LOG_DIR")" + echo "[PROFILE_LOG_DIR] $(realpath "$PROFILE_LOG_DIR")" +} > "$RUN_METADATA_FILE" +echo -e "\033[1;33mRun metadata:\033[0m $RUN_METADATA_FILE" +echo -e "\033[1;33mRun log dir:\033[0m $(realpath "$LOG_DIR")" +echo -e "\033[1;33mRun profile log 
dir:\033[0m $(realpath "$PROFILE_LOG_DIR")" + normalize_tag() { local raw="$1" raw="${raw#"${raw%%[![:space:]]*}"}" @@ -157,6 +198,11 @@ run_and_log() { fi # Write the current run command to the log + echo "[RUN_METADATA] $RUN_METADATA_FILE" >> "$log_path" + echo "[RUN_STARTED_AT] $RUN_STARTED_AT" >> "$log_path" + echo "[GIT_BRANCH] $GIT_BRANCH" >> "$log_path" + echo "[GIT_COMMIT] $GIT_COMMIT_FULL" >> "$log_path" + echo "[GIT_COMMIT_SHORT] $GIT_COMMIT_SHORT" >> "$log_path" echo "[COMMAND] $cmd" >> "$log_path" # Run the command and append both stdout and stderr to the log file diff --git a/scripts/write_to_feishu_sheet.py b/scripts/write_to_feishu_sheet.py index c9055cbf..e6735696 100644 --- a/scripts/write_to_feishu_sheet.py +++ b/scripts/write_to_feishu_sheet.py @@ -336,6 +336,66 @@ def parse_command_args(log_content: str, start_flag="--dtype"): return None return None +def parse_metadata_lines(log_content: str): + """Parse run metadata lines written by run_models_and_profile.bash.""" + key_map = { + "RUN_METADATA": "run_metadata", + "RUN_STARTED_AT": "run_started_at", + "GIT_BRANCH": "git_branch", + "GIT_COMMIT": "git_commit", + "GIT_COMMIT_SHORT": "git_commit_short", + } + metadata = {} + for line in log_content.splitlines(): + match = re.match(r"^\[([A-Z_]+)\]\s*(.*)$", line) + if not match: + continue + key = key_map.get(match.group(1)) + if key: + metadata[key] = match.group(2).strip() + return metadata + +def load_run_metadata(log_content: str): + """Load run metadata from the training log and its referenced metadata file.""" + metadata = parse_metadata_lines(log_content) + metadata_file = metadata.get("run_metadata") + if metadata_file and os.path.exists(metadata_file): + try: + with open(metadata_file, 'r', encoding='utf-8') as f: + file_metadata = parse_metadata_lines(f.read()) + file_metadata.update(metadata) + metadata = file_metadata + except OSError as exc: + print(f"Failed to read run metadata file {metadata_file}: {exc}") + return metadata + +def 
get_run_date(run_metadata): + """Get benchmark run date from metadata, falling back to today's date.""" + started_at = run_metadata.get("run_started_at") + if started_at: + for fmt in ("%Y-%m-%d %H:%M:%S", "%Y%m%d_%H%M%S"): + try: + return datetime.strptime(started_at, fmt).date() + except ValueError: + pass + try: + return datetime.fromisoformat(started_at).date() + except ValueError: + pass + return datetime.now().date() + +def get_run_branch(run_metadata): + """Get benchmark branch from metadata, falling back to current git for old logs.""" + return run_metadata.get("git_branch") or get_git_branch() + +def get_run_commit_id(run_metadata): + """Get benchmark commit id from metadata, falling back to current git for old logs.""" + if run_metadata.get("git_commit_short"): + return run_metadata["git_commit_short"] + if run_metadata.get("git_commit"): + return run_metadata["git_commit"][:7] + return get_git_commit_id() + def parse_training_log(log_content): """Parse training log to extract avg latency and throughput from step >= 2 and peak mem usage during whole time""" pattern_with_peak = ( @@ -491,11 +551,13 @@ def get_model_data(model_name, sheet_title, tag, log_dir="logs", profile_log_dir avg_latency, avg_throughput, peak_used_max, peak_reserved_max = None, None, None, None cmd_args = None + run_metadata = {} # Read training log if os.path.exists(log_file_path): with open(log_file_path, 'r', encoding='utf-8') as f: content = f.read() + run_metadata = load_run_metadata(content) result = parse_training_log(content) if result: avg_latency, avg_throughput, peak_used_max, peak_reserved_max = result @@ -522,9 +584,9 @@ def get_model_data(model_name, sheet_title, tag, log_dir="logs", profile_log_dir combined_df = combined_df.astype(object) # Fill first row's first $META_COLS columns with info - combined_df.iloc[0, 0] = FeishuSheetHandler.convert_to_feishu_date(datetime.now().date()) - combined_df.iloc[0, 1] = get_git_branch() - combined_df.iloc[0, 2] = 
get_git_commit_id() + combined_df.iloc[0, 0] = FeishuSheetHandler.convert_to_feishu_date(get_run_date(run_metadata)) + combined_df.iloc[0, 1] = get_run_branch(run_metadata) + combined_df.iloc[0, 2] = get_run_commit_id(run_metadata) if avg_latency is not None: combined_df.iloc[0, 3] = avg_latency if avg_throughput is not None: @@ -540,6 +602,8 @@ def get_model_data(model_name, sheet_title, tag, log_dir="logs", profile_log_dir def main(): parser = argparse.ArgumentParser(description='Script to write training metrics to Feishu sheets') parser.add_argument('config_file', help='Path to JSON config file (e.g. token.json)') + parser.add_argument('--log-dir', default='logs', help='Training log directory. Default: logs') + parser.add_argument('--profile-log-dir', default='profile_logs', help='Profile log directory. Default: profile_logs') args = parser.parse_args() config = load_config(args.config_file) @@ -563,9 +627,9 @@ def main(): print(f"\n--- Processing model={model_name} tag={tag} ---") model_name = model_name.lower() - testcases = discover_testcases(model_name, tag) + testcases = discover_testcases(model_name, tag, log_dir=args.log_dir) if not testcases: - print(f"No local testcases found under logs/{tag}/ for model={model_name}, skipping") + print(f"No local testcases found under {args.log_dir}/{tag}/ for model={model_name}, skipping") continue print(f"Discovered {len(testcases)} local testcases: {testcases}") @@ -597,7 +661,13 @@ def main(): print(f"Processing testcase '{testcase}' -> sheet_id={sheet_id}") - cmd_args, sheet_data = get_model_data(model_name=model_name, sheet_title=testcase, tag=tag) + cmd_args, sheet_data = get_model_data( + model_name=model_name, + sheet_title=testcase, + tag=tag, + log_dir=args.log_dir, + profile_log_dir=args.profile_log_dir + ) if not sheet_data: print("No valid data generated, skipping") From 0e19707693d3a6dd0b38b6725da78f7eb59395ab Mon Sep 17 00:00:00 2001 From: chen Date: Fri, 12 Jun 2026 03:17:31 +0000 Subject: [PATCH 2/2] 
fix: resolve Feishu log dirs from run output dir --- scripts/run_models_and_profile.bash | 6 ++--- scripts/write_to_feishu_sheet.py | 34 ++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/scripts/run_models_and_profile.bash b/scripts/run_models_and_profile.bash index 110d2a69..1d351b4c 100755 --- a/scripts/run_models_and_profile.bash +++ b/scripts/run_models_and_profile.bash @@ -96,16 +96,14 @@ SAFE_GIT_BRANCH="${SAFE_GIT_BRANCH//[[:space:]]/_}" SAFE_GIT_BRANCH="$(printf '%s' "$SAFE_GIT_BRANCH" | tr -cd '[:alnum:]_.-')" : "${SAFE_GIT_BRANCH:=unknown}" -LOG_DIR_BASENAME="$(basename "$LOG_DIR")" -PROFILE_LOG_DIR_BASENAME="$(basename "$PROFILE_LOG_DIR")" LOG_DIR_PARENT="$(dirname "$LOG_DIR")" if [[ "$LOG_DIR_PARENT" == "." ]]; then RUN_OUTPUT_DIR="${RUN_DATE}/${SAFE_GIT_BRANCH}_${GIT_COMMIT_SHORT}" else RUN_OUTPUT_DIR="${LOG_DIR_PARENT}/${RUN_DATE}/${SAFE_GIT_BRANCH}_${GIT_COMMIT_SHORT}" fi -LOG_DIR="${RUN_OUTPUT_DIR}/${LOG_DIR_BASENAME}" -PROFILE_LOG_DIR="${RUN_OUTPUT_DIR}/${PROFILE_LOG_DIR_BASENAME}" +LOG_DIR="${RUN_OUTPUT_DIR}/logs" +PROFILE_LOG_DIR="${RUN_OUTPUT_DIR}/profile_logs" mkdir -p "$BUILD_DIR" "$LOG_DIR" "$PROFILE_LOG_DIR" export BUILD_DIR LOG_DIR PROFILE_LOG_DIR diff --git a/scripts/write_to_feishu_sheet.py b/scripts/write_to_feishu_sheet.py index e6735696..704e2136 100644 --- a/scripts/write_to_feishu_sheet.py +++ b/scripts/write_to_feishu_sheet.py @@ -544,6 +544,21 @@ def get_git_commit_id(): return "unknown" +def resolve_log_dirs(run_log_dir): + """Resolve training and profile log directories from a run output directory.""" + log_dir = os.path.join(run_log_dir, "logs") + profile_log_dir = os.path.join(run_log_dir, "profile_logs") + + if not os.path.isdir(log_dir): + print(f"Training log directory does not exist: {log_dir}") + return None, None + if not os.path.isdir(profile_log_dir): + print(f"Profile log directory does not exist: {profile_log_dir}") + return None, None + + return log_dir, profile_log_dir + + 
def get_model_data(model_name, sheet_title, tag, log_dir="logs", profile_log_dir="profile_logs"): """Construct 2D list for writing to Feishu""" log_file_path = os.path.join(log_dir, tag, f"{model_name}_{sheet_title}.log") @@ -602,10 +617,17 @@ def get_model_data(model_name, sheet_title, tag, log_dir="logs", profile_log_dir def main(): parser = argparse.ArgumentParser(description='Script to write training metrics to Feishu sheets') parser.add_argument('config_file', help='Path to JSON config file (e.g. token.json)') - parser.add_argument('--log-dir', default='logs', help='Training log directory. Default: logs') - parser.add_argument('--profile-log-dir', default='profile_logs', help='Profile log directory. Default: profile_logs') + parser.add_argument( + '--log-dir', + default='.', + help='Run output directory containing logs/ and profile_logs/. Default: current directory' + ) args = parser.parse_args() + log_dir, profile_log_dir = resolve_log_dirs(args.log_dir) + if not log_dir or not profile_log_dir: + return + config = load_config(args.config_file) if not config: print("Failed to load config file, exiting") @@ -627,9 +649,9 @@ def main(): print(f"\n--- Processing model={model_name} tag={tag} ---") model_name = model_name.lower() - testcases = discover_testcases(model_name, tag, log_dir=args.log_dir) + testcases = discover_testcases(model_name, tag, log_dir=log_dir) if not testcases: - print(f"No local testcases found under {args.log_dir}/{tag}/ for model={model_name}, skipping") + print(f"No local testcases found under {log_dir}/{tag}/ for model={model_name}, skipping") continue print(f"Discovered {len(testcases)} local testcases: {testcases}") @@ -665,8 +687,8 @@ def main(): model_name=model_name, sheet_title=testcase, tag=tag, - log_dir=args.log_dir, - profile_log_dir=args.profile_log_dir + log_dir=log_dir, + profile_log_dir=profile_log_dir ) if not sheet_data: