Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,39 +10,49 @@ settings:
global_rank: ${cuda_env:RANK}
world_size: ${cuda_env:WORLD_SIZE}
paths:
experiments_root_path: /raid/s3/opengptx/user/richard-rutmann/experiments/modalities/moe_fsdp2
experiment_folder_path: ${settings.paths.experiments_root_path}/${settings.experiment_id}
checkpoint_saving_path: /raid/s3/opengptx/user/richard-rutmann/experiments/modalities/moe_fsdp2/checkpoints
train_dataset_path: /raid/s3/opengptx/user/richard-rutmann/data/modalities/gpt2_tokenized/000_00000.pbin
checkpoint_saving_path: data/checkpoints
train_dataset_path: ./data/lorem_ipsum_long.pbin
test_dataset_path: ./data/lorem_ipsum.pbin
experiments_root_path: ${modalities_env:experiments_root_path}
intervals:
training_log_interval_in_steps: 1
checkpointing_interval_in_steps: 1001
evaluation_interval_in_steps: 1001
checkpointing_interval_in_steps: 32
evaluation_interval_in_steps: 32
consistency_enforcement:
enforce_tokens_per_step_consistency: true
enforce_tokens_per_step_consistency: false
enforce_last_step_logged: false
enforce_last_step_evaluated: false
enforce_last_step_checkpointed: false
step_profile:
gradient_accumulation_steps: 4
local_train_micro_batch_size: 2
sequence_length: 4096
gradient_accumulation_steps: 1
local_train_micro_batch_size: 1
sequence_length: 256
dp_degree:
instance_key: dp_degree
pass_type: BY_REFERENCE
training_target:
num_target_tokens:
component_key: number_conversion
variant_key: num_tokens_from_num_steps
variant_key: num_tokens_from_packed_mem_map_dataset_continuous
config:
dataset_path: ${settings.paths.train_dataset_path}
sequence_length: ${settings.step_profile.sequence_length}
dp_degree:
instance_key: dp_degree
pass_type: BY_REFERENCE
local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
num_target_steps:
component_key: number_conversion
variant_key: num_steps_from_num_tokens
config:
num_steps: ${settings.training_target.num_target_steps}
dp_degree:
instance_key: dp_degree
pass_type: BY_REFERENCE
local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
global_num_tokens: ${settings.training_target.num_target_tokens}
sequence_length: ${settings.step_profile.sequence_length}
gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
num_target_steps: 10
training_progress:
global_num_seen_tokens: 0
num_seen_steps: 0
Expand All @@ -62,17 +72,13 @@ train_dataset:
config:
raw_data_path: ${settings.paths.train_dataset_path}
sequence_length: ${settings.step_profile.sequence_length}
sample_key: ${settings.referencing_keys.sample_key}
sample_key: ${settings.referencing_keys.sample_key}

train_dataloader:
component_key: data_loader
variant_key: default
config:
# we set num_workers to 0 so that the the data is loaded in the main process
# this is required to track how often the collator has been called
# in the library tutorials. Otherwise the collator will be copied for each worker
# and the number of call is out of scope.
num_workers: 0
num_workers: 2
pin_memory: true
dataloader_tag: train
dataset:
Expand Down Expand Up @@ -101,7 +107,48 @@ train_dataloader:
instance_key: collate_fn
pass_type: BY_REFERENCE

eval_dataloaders: []
test_dataset:
component_key: dataset
variant_key: packed_mem_map_dataset_continuous
config:
raw_data_path: ${settings.paths.test_dataset_path}
sequence_length: ${settings.step_profile.sequence_length}
sample_key: ${settings.referencing_keys.sample_key}

test_dataloader:
component_key: data_loader
variant_key: default
config:
num_workers: 2
pin_memory: true
dataloader_tag: test
dataset:
instance_key: test_dataset
pass_type: BY_REFERENCE
batch_sampler:
component_key: batch_sampler
variant_key: default
config:
batch_size: ${settings.step_profile.local_train_micro_batch_size}
drop_last: true
sampler:
component_key: sampler
variant_key: distributed_sampler
config:
rank: ${settings.cuda_env.global_rank}
num_replicas: ${settings.cuda_env.world_size}
shuffle: false
drop_last: true
dataset:
instance_key: test_dataset
pass_type: BY_REFERENCE
collate_fn:
instance_key: collate_fn
pass_type: BY_REFERENCE

eval_dataloaders:
- instance_key: test_dataloader
pass_type: BY_REFERENCE

checkpoint_saving:
component_key: checkpoint_saving
Expand All @@ -111,12 +158,12 @@ checkpoint_saving:
component_key: checkpoint_saving_strategy
variant_key: save_k_most_recent_checkpoints_strategy
config:
k: -1 # -1 to save all checkpoints
k: -1
checkpoint_saving_execution:
component_key: checkpoint_saving_execution
variant_key: dcp
config:
checkpoint_path: ${settings.paths.experiment_folder_path}
checkpoint_path: ${settings.paths.checkpoint_saving_path}
global_rank: ${settings.cuda_env.global_rank}
experiment_id: ${settings.experiment_id}

Expand All @@ -136,15 +183,14 @@ device_mesh:
config:
device_type: cuda
data_parallel_replicate_degree: 1
# Keep FSDP sharding on dp_shard and reserve tp for expert parallel.
data_parallel_shard_degree: -1
tensor_parallel_degree: 4
world_size: ${settings.cuda_env.world_size}

dp_degree:
component_key: number_conversion
variant_key: parallel_degree
config: # get the parallel degree from the device mesh
config:
device_mesh:
instance_key: device_mesh
pass_type: BY_REFERENCE
Expand All @@ -154,7 +200,7 @@ app_state:
component_key: app_state
variant_key: raw
config:
model:
model:
instance_key: initialized_model
pass_type: BY_REFERENCE
optimizer:
Expand All @@ -180,13 +226,14 @@ initialized_model:
mean: 0.0
std: 0.02
num_layers: ${model_raw.config.num_layers}
multi_device_generator_policy: error

ep_model:
component_key: model
variant_key: ep_wrapped
config:
model:
instance_key: model_raw # Bypass torch.compile - MoE routing is incompatible
instance_key: model_raw
pass_type: BY_REFERENCE
device_mesh:
instance_key: device_mesh
Expand All @@ -196,7 +243,7 @@ ep_model:

ac_model:
component_key: model
variant_key: activation_checkpointed # using modalities fsdp2 ac. should do to job also for ep layers
variant_key: activation_checkpointed
config:
model:
instance_key: ep_model
Expand All @@ -222,34 +269,35 @@ fsdp_model:
reshard_after_forward: true
block_names: [TransformerBlock]

compiled_model:
component_key: model
variant_key: compiled
config:
model:
instance_key: model_raw
pass_type: BY_REFERENCE
block_names: [TransformerBlock]

model_raw:
component_key: model
variant_key: moe
config:
vocab_size: 50257 # to match a pretrained tokenizer, tochange
max_seq_len: 4096
d_model: 2048
d_ff: 6144
n_heads: 32
n_kv_heads: 8
num_layers: 8
config:
sample_key: ${settings.referencing_keys.sample_key}
prediction_key: ${loss_fn.config.prediction_key}
vocab_size: 50304
max_seq_len: ${settings.step_profile.sequence_length}
d_model: 128
n_heads: 8
n_kv_heads: 4
num_layers: 2
d_ff: 128
attn_dropout: 0.0
ffn_dropout: 0.0
tie_embeddings: false
norm_eps: 1e-06
norm_eps: 1e-6
rope_base: 1000000.0
moe_num_experts: 128
moe_d_ff: 768
moe_top_k: 8
moe_num_experts: 8
moe_top_k: 2
moe_d_ff: 128
moe_capacity_factor: 1.25
moe_min_capacity: 4
moe_overflow_policy: residual
moe_router_noise_std: 0.0
moe_router_temperature: 1.0
moe_router_dropout: 0.0
moe_aux_loss_coef: 0.001
moe_z_loss_coef: 0.0

lr_scheduler:
component_key: scheduler
Expand All @@ -262,7 +310,7 @@ lr_scheduler:
div_factor: 10
final_div_factor: 1
total_steps: ${settings.training_target.num_target_steps}
pct_start: 0.02
pct_start: 0.01
anneal_strategy: cos
last_epoch: ${settings.training_progress.last_step}

Expand All @@ -283,7 +331,7 @@ optimizer:
pass_type: BY_REFERENCE

gradient_clipper:
component_key: gradient_clipper
component_key: gradient_clipper
variant_key: ep
config:
wrapped_model:
Expand All @@ -309,9 +357,14 @@ progress_subscriber:

evaluation_subscriber:
component_key: results_subscriber
variant_key: to_disc
variant_key: wandb
config:
output_file_path: ${settings.paths.experiment_folder_path}/evaluation_results.jsonl
global_rank: ${settings.cuda_env.global_rank}
project: modalities_dcp_tests
mode: OFFLINE
experiment_id: ${settings.experiment_id}
directory: wandb_storage
config_file_path: ${settings.config_file_path}

mfu_calculator:
component_key: mfu_calculator
Expand All @@ -327,39 +380,3 @@ mfu_calculator:
device_mesh:
instance_key: device_mesh
pass_type: BY_REFERENCE

# profiler:
# component_key: steppable_profiler
# variant_key: combined
# config:
# profilers:
# - instance_key: kernel_profiler
# pass_type: BY_REFERENCE
# # - instance_key: memory_profiler
# # pass_type: BY_REFERENCE

kernel_profiler:
component_key: steppable_profiler
variant_key: kernel_tracing
config:
num_wait_steps: 1
num_warmup_steps: 1
num_active_steps: 3
profiler_activities: [CUDA]
profile_memory: true
record_shapes: true
with_stack: true
with_flops: true
with_modules: true
tracked_ranks: [0]
output_folder_path: ${settings.paths.experiment_folder_path}/profiling

memory_profiler:
component_key: steppable_profiler
variant_key: memory_tracing
config:
memory_snapshot_folder_path: ${settings.paths.experiment_folder_path}/profiling
num_wait_steps: 1
num_warmup_steps: 1
num_active_steps: 3
tracked_ranks: [0]
Loading
Loading