Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions scripts/compare_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ def collect_log_files(base_dir: Path):
duplicates = {}

for path in base_dir.rglob("*.log"):
if path.name == "run_metadata.log":
continue
if path.name.startswith("build") or path.name.endswith("_profile.log"):
continue

Expand Down
52 changes: 48 additions & 4 deletions scripts/run_models_and_profile.bash
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,7 @@ COMPARE_LOG_DIR="$(read_var COMPARE_LOG_DIR)"; : "${COMPARE_LOG_DIR:=}"
RUN_CTEST="$(read_var RUN_CTEST)"; : "${RUN_CTEST:=true}"
CTEST_CMD="$(read_var CTEST_CMD)"; : "${CTEST_CMD:=ctest --output-on-failure -LE cuda -j$(nproc) && ctest --output-on-failure -L cuda -j1}"

mkdir -p "$BUILD_DIR" "$LOG_DIR" "$PROFILE_LOG_DIR"

# export custom PATHs
export BUILD_DIR LOG_DIR PROFILE_LOG_DIR
# export custom variables from config first. LOG_DIR/PROFILE_LOG_DIR are normalized below.
while IFS="=" read -r k v; do
[[ -z "$k" || "$k" == "null" ]] && continue
export "$k"="$v"
Expand All @@ -86,6 +83,48 @@ done < <(jq -r '.variables | to_entries[] | "\(.key)=\(.value)"' "$CONFIG_FILE")
LAST_CMAKE_CMD=""
declare -A SELECTED_TAGS=()

RUN_STARTED_AT="$(date '+%Y-%m-%d %H:%M:%S')"
RUN_ID="$(date '+%Y%m%d_%H%M%S')"
RUN_DATE="$(date '+%Y%m%d')"
GIT_BRANCH="$(git rev-parse --abbrev-ref HEAD 2>/dev/null || true)"
: "${GIT_BRANCH:=unknown}"
GIT_COMMIT_FULL="$(git rev-parse HEAD 2>/dev/null || true)"
: "${GIT_COMMIT_FULL:=unknown}"
GIT_COMMIT_SHORT="${GIT_COMMIT_FULL:0:7}"
SAFE_GIT_BRANCH="${GIT_BRANCH//\//_}"
SAFE_GIT_BRANCH="${SAFE_GIT_BRANCH//[[:space:]]/_}"
SAFE_GIT_BRANCH="$(printf '%s' "$SAFE_GIT_BRANCH" | tr -cd '[:alnum:]_.-')"
: "${SAFE_GIT_BRANCH:=unknown}"

LOG_DIR_PARENT="$(dirname "$LOG_DIR")"
if [[ "$LOG_DIR_PARENT" == "." ]]; then
RUN_OUTPUT_DIR="${RUN_DATE}/${SAFE_GIT_BRANCH}_${GIT_COMMIT_SHORT}"
else
RUN_OUTPUT_DIR="${LOG_DIR_PARENT}/${RUN_DATE}/${SAFE_GIT_BRANCH}_${GIT_COMMIT_SHORT}"
fi
LOG_DIR="${RUN_OUTPUT_DIR}/logs"
PROFILE_LOG_DIR="${RUN_OUTPUT_DIR}/profile_logs"

mkdir -p "$BUILD_DIR" "$LOG_DIR" "$PROFILE_LOG_DIR"
export BUILD_DIR LOG_DIR PROFILE_LOG_DIR

RUN_METADATA_FILE="${LOG_DIR}/run_metadata.log"
: > "$RUN_METADATA_FILE"
RUN_METADATA_FILE="$(realpath "$RUN_METADATA_FILE")"
{
echo "[RUN_STARTED_AT] $RUN_STARTED_AT"
echo "[RUN_ID] $RUN_ID"
echo "[GIT_BRANCH] $GIT_BRANCH"
echo "[GIT_COMMIT] $GIT_COMMIT_FULL"
echo "[GIT_COMMIT_SHORT] $GIT_COMMIT_SHORT"
echo "[CONFIG_FILE] $CONFIG_FILE"
echo "[LOG_DIR] $(realpath "$LOG_DIR")"
echo "[PROFILE_LOG_DIR] $(realpath "$PROFILE_LOG_DIR")"
} > "$RUN_METADATA_FILE"
echo -e "\033[1;33mRun metadata:\033[0m $RUN_METADATA_FILE"
echo -e "\033[1;33mRun log dir:\033[0m $(realpath "$LOG_DIR")"
echo -e "\033[1;33mRun profile log dir:\033[0m $(realpath "$PROFILE_LOG_DIR")"

normalize_tag() {
local raw="$1"
raw="${raw#"${raw%%[![:space:]]*}"}"
Expand Down Expand Up @@ -157,6 +196,11 @@ run_and_log() {
fi

# Write the current run command to the log
echo "[RUN_METADATA] $RUN_METADATA_FILE" >> "$log_path"
echo "[RUN_STARTED_AT] $RUN_STARTED_AT" >> "$log_path"
echo "[GIT_BRANCH] $GIT_BRANCH" >> "$log_path"
echo "[GIT_COMMIT] $GIT_COMMIT_FULL" >> "$log_path"
echo "[GIT_COMMIT_SHORT] $GIT_COMMIT_SHORT" >> "$log_path"
echo "[COMMAND] $cmd" >> "$log_path"

# Run the command and append both stdout and stderr to the log file
Expand Down
104 changes: 98 additions & 6 deletions scripts/write_to_feishu_sheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,66 @@ def parse_command_args(log_content: str, start_flag="--dtype"):
return None
return None

def parse_metadata_lines(log_content: str):
"""Parse run metadata lines written by run_models_and_profile.bash."""
key_map = {
"RUN_METADATA": "run_metadata",
"RUN_STARTED_AT": "run_started_at",
"GIT_BRANCH": "git_branch",
"GIT_COMMIT": "git_commit",
"GIT_COMMIT_SHORT": "git_commit_short",
}
metadata = {}
for line in log_content.splitlines():
match = re.match(r"^\[([A-Z_]+)\]\s*(.*)$", line)
if not match:
continue
key = key_map.get(match.group(1))
if key:
metadata[key] = match.group(2).strip()
return metadata

def load_run_metadata(log_content: str):
"""Load run metadata from the training log and its referenced metadata file."""
metadata = parse_metadata_lines(log_content)
metadata_file = metadata.get("run_metadata")
if metadata_file and os.path.exists(metadata_file):
try:
with open(metadata_file, 'r', encoding='utf-8') as f:
file_metadata = parse_metadata_lines(f.read())
file_metadata.update(metadata)
metadata = file_metadata
except OSError as exc:
print(f"Failed to read run metadata file {metadata_file}: {exc}")
return metadata

def get_run_date(run_metadata):
"""Get benchmark run date from metadata, falling back to today's date."""
started_at = run_metadata.get("run_started_at")
if started_at:
for fmt in ("%Y-%m-%d %H:%M:%S", "%Y%m%d_%H%M%S"):
try:
return datetime.strptime(started_at, fmt).date()
except ValueError:
pass
try:
return datetime.fromisoformat(started_at).date()
except ValueError:
pass
return datetime.now().date()

def get_run_branch(run_metadata):
"""Get benchmark branch from metadata, falling back to current git for old logs."""
return run_metadata.get("git_branch") or get_git_branch()

def get_run_commit_id(run_metadata):
"""Get benchmark commit id from metadata, falling back to current git for old logs."""
if run_metadata.get("git_commit_short"):
return run_metadata["git_commit_short"]
if run_metadata.get("git_commit"):
return run_metadata["git_commit"][:7]
return get_git_commit_id()

def parse_training_log(log_content):
"""Parse training log to extract avg latency and throughput from step >= 2 and peak mem usage during whole time"""
pattern_with_peak = (
Expand Down Expand Up @@ -484,18 +544,35 @@ def get_git_commit_id():
return "unknown"


def resolve_log_dirs(run_log_dir):
"""Resolve training and profile log directories from a run output directory."""
log_dir = os.path.join(run_log_dir, "logs")
profile_log_dir = os.path.join(run_log_dir, "profile_logs")

if not os.path.isdir(log_dir):
print(f"Training log directory does not exist: {log_dir}")
return None, None
if not os.path.isdir(profile_log_dir):
print(f"Profile log directory does not exist: {profile_log_dir}")
return None, None

return log_dir, profile_log_dir


def get_model_data(model_name, sheet_title, tag, log_dir="logs", profile_log_dir="profile_logs"):
"""Construct 2D list for writing to Feishu"""
log_file_path = os.path.join(log_dir, tag, f"{model_name}_{sheet_title}.log")
profile_file_path = os.path.join(profile_log_dir, tag, f"{model_name}_{sheet_title}_profile_{model_name}.report.rank0")

avg_latency, avg_throughput, peak_used_max, peak_reserved_max = None, None, None, None
cmd_args = None
run_metadata = {}

# Read training log
if os.path.exists(log_file_path):
with open(log_file_path, 'r', encoding='utf-8') as f:
content = f.read()
run_metadata = load_run_metadata(content)
result = parse_training_log(content)
if result:
avg_latency, avg_throughput, peak_used_max, peak_reserved_max = result
Expand All @@ -522,9 +599,9 @@ def get_model_data(model_name, sheet_title, tag, log_dir="logs", profile_log_dir
combined_df = combined_df.astype(object)

# Fill first row's first $META_COLS columns with info
combined_df.iloc[0, 0] = FeishuSheetHandler.convert_to_feishu_date(datetime.now().date())
combined_df.iloc[0, 1] = get_git_branch()
combined_df.iloc[0, 2] = get_git_commit_id()
combined_df.iloc[0, 0] = FeishuSheetHandler.convert_to_feishu_date(get_run_date(run_metadata))
combined_df.iloc[0, 1] = get_run_branch(run_metadata)
combined_df.iloc[0, 2] = get_run_commit_id(run_metadata)
if avg_latency is not None:
combined_df.iloc[0, 3] = avg_latency
if avg_throughput is not None:
Expand All @@ -540,8 +617,17 @@ def get_model_data(model_name, sheet_title, tag, log_dir="logs", profile_log_dir
def main():
parser = argparse.ArgumentParser(description='Script to write training metrics to Feishu sheets')
parser.add_argument('config_file', help='Path to JSON config file (e.g. token.json)')
parser.add_argument(
'--log-dir',
default='.',
help='Run output directory containing logs/ and profile_logs/. Default: current directory'
)
args = parser.parse_args()

log_dir, profile_log_dir = resolve_log_dirs(args.log_dir)
if not log_dir or not profile_log_dir:
return

config = load_config(args.config_file)
if not config:
print("Failed to load config file, exiting")
Expand All @@ -563,9 +649,9 @@ def main():
print(f"\n--- Processing model={model_name} tag={tag} ---")
model_name = model_name.lower()

testcases = discover_testcases(model_name, tag)
testcases = discover_testcases(model_name, tag, log_dir=log_dir)
if not testcases:
print(f"No local testcases found under logs/{tag}/ for model={model_name}, skipping")
print(f"No local testcases found under {log_dir}/{tag}/ for model={model_name}, skipping")
continue
print(f"Discovered {len(testcases)} local testcases: {testcases}")

Expand Down Expand Up @@ -597,7 +683,13 @@ def main():

print(f"Processing testcase '{testcase}' -> sheet_id={sheet_id}")

cmd_args, sheet_data = get_model_data(model_name=model_name, sheet_title=testcase, tag=tag)
cmd_args, sheet_data = get_model_data(
model_name=model_name,
sheet_title=testcase,
tag=tag,
log_dir=log_dir,
profile_log_dir=profile_log_dir
)

if not sheet_data:
print("No valid data generated, skipping")
Expand Down
Loading