diff --git a/config_files/model_tests/base_210M.yaml b/config_files/model_tests/base_210M.yaml new file mode 100644 index 000000000..384d5fe29 --- /dev/null +++ b/config_files/model_tests/base_210M.yaml @@ -0,0 +1,2732 @@ +# ============================================================================= +# ~210M NON-EMBEDDING baseline (adapted from the 124M-class config) +# Architecture changes vs. original: n_layer 12->16, n_embd 768->1024, n_head 12->16, ffn_hidden 3072->4096 +# -> 205.6M non-embedding params, 308.6M total, head_dim=64, aspect ratio d/L=64 (canonical scaled-GPT-2) +# All widths kept a multiple of 128 and head_dim fixed at 64 (Flash fast path + qk_norm stays 64). +# See accompanying notes for LR/batch/token-budget rationale and ablation knobs. +# ============================================================================= +settings: + experiment_id: ${modalities_env:experiment_id} + config_file_path: ${modalities_env:config_file_path} + referencing_keys: + sample_key: input_ids + target_key: target_ids + prediction_key: logits + cuda_env: + local_rank: ${cuda_env:LOCAL_RANK} + global_rank: ${cuda_env:RANK} + world_size: ${cuda_env:WORLD_SIZE} + paths: + checkpoint_saving_path: /raid/s3/opengptx/mfrey/tests/checkpoints + experiments_root_path: /raid/s3/opengptx/mfrey/tests/checkpoints + intervals: + training_log_interval_in_steps: 1 + checkpointing_interval_in_steps: 25 + evaluation_interval_in_steps: 25 + consistency_enforcement: + enforce_tokens_per_step_consistency: false + enforce_last_step_logged: false + enforce_last_step_evaluated: false + enforce_last_step_checkpointed: false + step_profile: + gradient_accumulation_steps: 16 + local_train_micro_batch_size: 8 + sequence_length: 4096 + # 16 GPUs -> mbs=4, gas=2 + # 32 GPUs -> mbs=4, gas=1 + # 64 GPUs -> mbs=2, gas=1 + # 1 GPU -> mbs=2, gas=64 (every row = 524288 tokens/step at sequence_length 4096; the mbs=8, gas=16 set above gives that only at dp_degree=1) + dp_degree: + instance_key: dp_degree + pass_type: BY_REFERENCE + training_target: + # ~21B tokens (20,971,520,000; ~68x total params / ~102x non-embedding). 
+ num_target_tokens: 20971520000 # 40000 steps * 524288 tokens/step + num_target_steps: + component_key: number_conversion + variant_key: num_steps_from_num_tokens + config: + dp_degree: + instance_key: dp_degree + pass_type: BY_REFERENCE + local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size} + global_num_tokens: ${settings.training_target.num_target_tokens} + sequence_length: ${settings.step_profile.sequence_length} + gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps} + training_progress: + global_num_seen_tokens: 0 + num_seen_steps: 0 + num_seen_samples: 0 + last_step: -1 + +collate_fn: + component_key: collate_fn + variant_key: gpt_2_llm_collator + config: + sample_key: ${settings.referencing_keys.sample_key} + target_key: ${settings.referencing_keys.target_key} + +train_dataset: + component_key: dataset + variant_key: combined + config: + datasets: + # --- score4.5+: 128 shards --- + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00000.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00001.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00002.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00003.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00004.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00005.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00006.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00007.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00008.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00009.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00010.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00011.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00012.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00013.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00014.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00015.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00016.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00017.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00018.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00019.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00020.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00021.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00022.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00023.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00024.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00025.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00026.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00027.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00028.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00029.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00030.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00031.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00032.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00033.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00034.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00035.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00036.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00037.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00038.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00039.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00040.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00041.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00042.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00043.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00044.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00045.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00046.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00047.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00048.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00049.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00050.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00051.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00052.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00053.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00054.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00055.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00056.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00057.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00058.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00059.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00060.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00061.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00062.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00063.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00064.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00065.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00066.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00067.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00068.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00069.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00070.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00071.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00072.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00073.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00074.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00075.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00076.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00077.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00078.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00079.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00080.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00081.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00082.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00083.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00084.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00085.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00086.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00087.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00088.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00089.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00090.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00091.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00092.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00093.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00094.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00095.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00096.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00097.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00098.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00099.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00100.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00101.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00102.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00103.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00104.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00105.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00106.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00107.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00108.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00109.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00110.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00111.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00112.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00113.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00114.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00115.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00116.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00117.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00118.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00119.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00120.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00121.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00122.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00123.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00124.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00125.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00126.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.5+/part-00127.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + # --- score4.0-4.5: 128 shards --- + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00000.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - 
component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00001.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00002.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00003.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00004.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00005.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00006.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} 
+ - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00007.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00008.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00009.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00010.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00011.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00012.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: 
${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00013.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00014.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00015.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00016.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00017.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00018.pbin + sequence_length: ${settings.step_profile.sequence_length} + 
sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00019.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00020.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00021.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00022.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00023.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00024.pbin + sequence_length: 
${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00025.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00026.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00027.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00028.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00029.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00030.pbin + 
sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00031.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00032.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00033.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00034.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00035.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: 
/raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00036.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00037.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00038.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00039.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00040.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00041.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + 
raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00042.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00043.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00044.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00045.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00046.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00047.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + 
config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00048.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00049.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00050.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00051.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00052.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00053.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00054.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00055.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00056.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00057.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00058.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00059.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + 
variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00060.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00061.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00062.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00063.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00064.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00065.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: 
dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00066.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00067.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00068.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00069.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00070.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00071.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - 
component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00072.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00073.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00074.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00075.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00076.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00077.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} 
+ - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00078.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00079.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00080.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00081.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00082.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00083.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: 
${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00084.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00085.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00086.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00087.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00088.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00089.pbin + sequence_length: ${settings.step_profile.sequence_length} + 
sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00090.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00091.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00092.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00093.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00094.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00095.pbin + sequence_length: 
${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00096.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00097.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00098.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00099.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00100.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00101.pbin + 
sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00102.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00103.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00104.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00105.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00106.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: 
/raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00107.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00108.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00109.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00110.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00111.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00112.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + 
raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00113.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00114.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00115.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00116.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00117.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00118.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + 
config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00119.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00120.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00121.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00122.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00123.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00124.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00125.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00126.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00127.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + # --- score3.5-4.0: 128 shards --- + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00000.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00001.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00002.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} 
+ - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00003.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00004.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00005.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00006.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00007.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00008.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: 
${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00009.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00010.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00011.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00012.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00013.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00014.pbin + sequence_length: ${settings.step_profile.sequence_length} + 
sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00015.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00016.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00017.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00018.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00019.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00020.pbin + sequence_length: 
${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00021.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00022.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00023.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00024.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00025.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00026.pbin + 
sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00027.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00028.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00029.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00030.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00031.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: 
/raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00032.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00033.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00034.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00035.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00036.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00037.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + 
raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00038.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00039.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00040.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00041.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00042.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00043.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + 
config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00044.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00045.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00046.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00047.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00048.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00049.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00050.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00051.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00052.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00053.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00054.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00055.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + 
variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00056.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00057.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00058.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00059.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00060.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00061.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: 
dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00062.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00063.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00064.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00065.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00066.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00067.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - 
component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00068.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00069.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00070.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00071.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00072.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00073.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} 
+ - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00074.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00075.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00076.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00077.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00078.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00079.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: 
${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00080.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00081.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00082.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00083.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00084.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00085.pbin + sequence_length: ${settings.step_profile.sequence_length} + 
sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00086.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00087.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00088.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00089.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00090.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00091.pbin + sequence_length: 
${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00092.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00093.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00094.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00095.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00096.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00097.pbin + 
sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00098.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00099.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00100.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00101.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00102.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: 
/raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00103.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00104.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00105.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00106.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00107.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00108.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + 
raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00109.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00110.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00111.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00112.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00113.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00114.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + 
config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00115.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00116.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00117.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00118.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00119.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00120.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00121.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00122.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00123.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00124.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00125.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00126.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + 
variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score3.5-4.0/part-00127.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} +train_dataloader: + component_key: data_loader + variant_key: default + config: + num_workers: 2 + pin_memory: true + dataloader_tag: train + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + batch_sampler: + component_key: batch_sampler + variant_key: default + config: + batch_size: ${settings.step_profile.local_train_micro_batch_size} + drop_last: true + sampler: + component_key: sampler + variant_key: resumable_distributed_sampler + config: + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + rank: ${settings.cuda_env.global_rank} + num_replicas: ${settings.cuda_env.world_size} + shuffle: true + seed: 42 + drop_last: true + skip_num_global_samples: ${settings.training_progress.num_seen_samples} + collate_fn: + instance_key: collate_fn + pass_type: BY_REFERENCE +eval_dataset: + component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/git_repos/experimentation/data/eval_data/validation.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} +eval_dataloader: + component_key: data_loader + variant_key: default + config: + num_workers: 2 + pin_memory: true + dataloader_tag: eval + dataset: + instance_key: eval_dataset + pass_type: BY_REFERENCE + batch_sampler: + component_key: batch_sampler + variant_key: default + config: + batch_size: 2 + drop_last: true + sampler: + component_key: sampler + variant_key: distributed_sampler + config: + rank: ${settings.cuda_env.global_rank} + num_replicas: ${settings.cuda_env.world_size} + shuffle: false + drop_last: true + dataset: + instance_key: eval_dataset + 
pass_type: BY_REFERENCE + collate_fn: + instance_key: collate_fn + pass_type: BY_REFERENCE +eval_dataloaders: +- instance_key: eval_dataloader + pass_type: BY_REFERENCE + +checkpoint_saving: + component_key: checkpoint_saving + variant_key: default + config: + checkpoint_saving_strategy: + component_key: checkpoint_saving_strategy + variant_key: save_k_most_recent_checkpoints_strategy + config: + k: -1 + checkpoint_saving_execution: + component_key: checkpoint_saving_execution + variant_key: dcp + config: + checkpoint_path: ${settings.paths.experiments_root_path}/${settings.experiment_id} + global_rank: ${settings.cuda_env.global_rank} + experiment_id: ${settings.experiment_id} + +loss_fn: + component_key: loss + variant_key: clm_cross_entropy_loss + config: + target_key: ${settings.referencing_keys.target_key} + prediction_key: ${settings.referencing_keys.prediction_key} + +device_mesh: + component_key: device_mesh + variant_key: default + config: + device_type: cuda + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: -1 + world_size: ${settings.cuda_env.world_size} + +dp_degree: + component_key: number_conversion + variant_key: parallel_degree + config: + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + parallelism_methods: [dp_shard, dp_replicate] + +app_state: + component_key: app_state + variant_key: raw + config: + model: + instance_key: initialized_model + pass_type: BY_REFERENCE + optimizer: + instance_key: optimizer + pass_type: BY_REFERENCE + lr_scheduler: + instance_key: lr_scheduler + pass_type: BY_REFERENCE + +initialized_model: + component_key: model + variant_key: model_initialized + config: + model: + instance_key: fsdp_model + pass_type: BY_REFERENCE + model_initializer: + component_key: model_initialization + variant_key: composed + config: + model_type: gpt2 + weight_init_type: scaled + mean: 0.0 + std: 0.02 + num_layers: ${model_raw.config.n_layer} + multi_device_generator_policy: error + +fsdp_model: + 
component_key: model + variant_key: fsdp2_wrapped + config: + model: + instance_key: compiled_model + pass_type: BY_REFERENCE + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + mixed_precision_settings: + param_dtype: BF_16 + reduce_dtype: BF_16 + block_names: [GPT2Block] + +compiled_model: + component_key: model + variant_key: compiled + config: + model: + instance_key: model_raw + pass_type: BY_REFERENCE + block_names: [GPT2Block] + +model_raw: + component_key: model + variant_key: gpt2 + config: + use_meta_device: false + use_weight_tying: false + sample_key: ${settings.referencing_keys.sample_key} + poe_type: NOPE + sequence_length: ${settings.step_profile.sequence_length} + prediction_key: ${loss_fn.config.prediction_key} + vocab_size: 50304 + # ---- ~210M NON-EMBEDDING shape ---- + n_layer: 16 + n_head_q: 16 + n_head_kv: 16 + ffn_hidden: 4096 + n_embd: 1024 + # ---------------------------------------------- + dropout: 0.0 + bias: false + attention_config: + qkv_transforms: + - type_hint: RotaryTransform + config: + n_embd: ${model_raw.config.n_embd} + n_head: ${model_raw.config.n_head_q} + seq_length_dim: -2 + base_freq: 10000 + qk_norm_config: + norm_type: pytorch_rms_norm + config: + normalized_shape: 64 + eps: 1.0e-05 + attention_implementation: pytorch_flash + activation_type: swiglu + attention_norm_config: + norm_type: pytorch_rms_norm + config: + normalized_shape: 1024 + eps: 1.0e-05 + ffn_norm_config: + norm_type: pytorch_rms_norm + config: + normalized_shape: 1024 + eps: 1.0e-05 + lm_head_norm_config: + norm_type: pytorch_rms_norm + config: + normalized_shape: 1024 + eps: 1.0e-05 + +lr_scheduler: + component_key: scheduler + variant_key: onecycle_lr + config: + optimizer: + instance_key: optimizer + pass_type: BY_REFERENCE + max_lr: 0.003 + div_factor: 10 # initial LR = max_lr/10 = 3.0e-4 + final_div_factor: 1 # final LR = initial = max_lr/10 + total_steps: ${settings.training_target.num_target_steps} + pct_start: 0.01 + 
anneal_strategy: cos + last_epoch: ${settings.training_progress.last_step} + +optimizer: + component_key: optimizer + variant_key: adam_w + config: + lr: 0.0003 # overridden by OneCycle; set to max_lr/div_factor for consistency + betas: [0.9, 0.95] + eps: 1e-8 + weight_decay: 1e-1 # decoupled WD=0.1, standard (GPT-3/Llama/OLMo); also aids LR transfer across width + weight_decay_groups_excluded: [embedding, pytorch_rms_norm] + wrapped_model: + instance_key: initialized_model + pass_type: BY_REFERENCE + +gradient_clipper: + component_key: gradient_clipper + variant_key: fsdp2 + config: + wrapped_model: + instance_key: initialized_model + pass_type: BY_REFERENCE + norm_type: P2_NORM + max_norm: 1.0 + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + +progress_subscriber: + component_key: progress_subscriber + variant_key: rich + config: + global_rank: ${settings.cuda_env.global_rank} + num_seen_steps: ${settings.training_progress.num_seen_steps} + num_target_steps: ${settings.training_target.num_target_steps} + train_dataloader_tag: ${train_dataloader.config.dataloader_tag} + eval_dataloaders: + instance_key: eval_dataloaders + pass_type: BY_REFERENCE + +evaluation_subscriber: + component_key: results_subscriber + variant_key: wandb + config: + global_rank: ${settings.cuda_env.global_rank} + project: tests + mode: ONLINE + experiment_id: ${settings.experiment_id} # same id as the checkpoint directory, not the template's name + directory: wandb_storage + config_file_path: ${settings.config_file_path} + +mfu_calculator: + component_key: mfu_calculator + variant_key: gpt2 + config: + n_layer: ${model_raw.config.n_layer} + sequence_length: ${settings.step_profile.sequence_length} + n_embd: ${model_raw.config.n_embd} + world_size: ${settings.cuda_env.world_size} + wrapped_model: + instance_key: initialized_model + pass_type: BY_REFERENCE + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + +tokenizer: + component_key: tokenizer + variant_key: pretrained_hf_tokenizer + config: + pretrained_model_name_or_path: 
gpt2 + +model_converter: + component_key: model_converter + variant_key: default + config: + command_template: "CUDA_VISIBLE_DEVICES=7 python src/modalities/conversion/gpt2/convert_gpt2.py {modalities_config} {output_dir} --checkpoint_path {checkpoint_path} > {checkpoint_path}/conversion.log 2>&1" + checkpoint_dir: ${settings.paths.experiments_root_path}/${settings.experiment_id} + global_rank: ${settings.cuda_env.global_rank} + eval_interval: ${settings.intervals.checkpointing_interval_in_steps} + +downstream_evaluator: + component_key: downstream_evaluator + variant_key: default + config: + tokenizer: + instance_key: tokenizer + pass_type: BY_REFERENCE + tasks: + - "minerva_math_algebra:bpb::olmes" + - "minerva_math_counting_and_probability:bpb::olmes" + - "minerva_math_geometry:bpb::olmes" + - "minerva_math_intermediate_algebra:bpb::olmes" + - "minerva_math_number_theory:bpb::olmes" + - "minerva_math_prealgebra:bpb::olmes" + - "minerva_math_precalculus:bpb::olmes" + - "arc_challenge:rc::olmes:full" + - "arc_easy:rc::olmes:full" + - "hellaswag:rc::olmes:full" + - "winogrande:rc::olmes:full" + - "socialiqa:rc::olmes:full" + - "piqa:rc::olmes:full" + - "qasper_yesno:rc::olmes" + - "lambada" + - "arc_challenge:rc:bpb::olmes:full" + - "arc_easy:rc:bpb::olmes:full" + - "hellaswag:rc:bpb::olmes:full" + - "winogrande:rc:bpb::olmes:full" + - "socialiqa:rc:bpb::olmes:full" + - "piqa:rc:bpb::olmes:full" + - "qasper_yesno:rc:bpb::olmes" + - "lambada:bpb" + - "gsm8k::olmes" + eval_interval: ${settings.intervals.evaluation_interval_in_steps} + checkpoint_dir: ${settings.paths.experiments_root_path}/${settings.experiment_id} + global_rank: ${settings.cuda_env.global_rank} + olmes_command_template: ". 
/home/markus_frey/Github/olmes/.venv/bin/activate && CUDA_VISIBLE_DEVICES=7 olmes --model {hf_model_dir} --model-args '{{\"trust_remote_code\": true}}' --task {tasks} --limit 8 --output-dir {hf_model_dir}/olmes_eval_{step} > {hf_model_dir}/olmes_eval_{step}.log 2>&1" # GPU pin sits on olmes itself; an assignment before the '.' builtin does not reliably reach it \ No newline at end of file diff --git a/config_files/model_tests/base_toadapt.yaml b/config_files/model_tests/base_toadapt.yaml new file mode 100644 index 000000000..a7d659f63 --- /dev/null +++ b/config_files/model_tests/base_toadapt.yaml @@ -0,0 +1,419 @@ +settings: + experiment_id: ${modalities_env:experiment_id} + config_file_path: ${modalities_env:config_file_path} + referencing_keys: + sample_key: input_ids + target_key: target_ids + prediction_key: logits + cuda_env: + local_rank: ${cuda_env:LOCAL_RANK} + global_rank: ${cuda_env:RANK} + world_size: ${cuda_env:WORLD_SIZE} + paths: + checkpoint_saving_path: /raid/s3/opengptx/mfrey/tests/checkpoints + experiments_root_path: /raid/s3/opengptx/mfrey/tests/checkpoints + intervals: + training_log_interval_in_steps: 1 + checkpointing_interval_in_steps: 100 + evaluation_interval_in_steps: 100 + consistency_enforcement: + enforce_tokens_per_step_consistency: false + enforce_last_step_logged: false + enforce_last_step_evaluated: false + enforce_last_step_checkpointed: false + step_profile: + gradient_accumulation_steps: 2 + local_train_micro_batch_size: 4 + sequence_length: 4096 + dp_degree: + instance_key: dp_degree + pass_type: BY_REFERENCE + training_target: + num_target_tokens: 10000000 + num_target_steps: + component_key: number_conversion + variant_key: num_steps_from_num_tokens + config: + dp_degree: + instance_key: dp_degree + pass_type: BY_REFERENCE + local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size} + global_num_tokens: ${settings.training_target.num_target_tokens} + sequence_length: ${settings.step_profile.sequence_length} + gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps} + training_progress: + 
global_num_seen_tokens: 0 + num_seen_steps: 0 + num_seen_samples: 0 + last_step: -1 + +collate_fn: + component_key: collate_fn + variant_key: gpt_2_llm_collator + config: + sample_key: ${settings.referencing_keys.sample_key} + target_key: ${settings.referencing_keys.target_key} + +train_dataset: + component_key: dataset + variant_key: combined + config: + datasets: + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00000.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00001.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00002.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00003.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/data/smollm-corpus-fw-edu-dedup-jsonl-buckets-raw/tokenized/score4.0-4.5/part-00004.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: 
${settings.referencing_keys.sample_key} +train_dataloader: + component_key: data_loader + variant_key: default + config: + num_workers: 2 + pin_memory: true + dataloader_tag: train + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + batch_sampler: + component_key: batch_sampler + variant_key: default + config: + batch_size: ${settings.step_profile.local_train_micro_batch_size} + drop_last: true + sampler: + component_key: sampler + variant_key: resumable_distributed_sampler + config: + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + rank: ${settings.cuda_env.global_rank} + num_replicas: ${settings.cuda_env.world_size} + shuffle: true + seed: 42 + drop_last: true + skip_num_global_samples: ${settings.training_progress.num_seen_samples} + collate_fn: + instance_key: collate_fn + pass_type: BY_REFERENCE +eval_dataset: + component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/mehdi-ali/git_repos/experimentation/data/eval_data/validation.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} +eval_dataloader: + component_key: data_loader + variant_key: default + config: + num_workers: 2 + pin_memory: true + dataloader_tag: eval + dataset: + instance_key: eval_dataset + pass_type: BY_REFERENCE + batch_sampler: + component_key: batch_sampler + variant_key: default + config: + batch_size: 2 + drop_last: true + sampler: + component_key: sampler + variant_key: distributed_sampler + config: + rank: ${settings.cuda_env.global_rank} + num_replicas: ${settings.cuda_env.world_size} + shuffle: false + drop_last: true + dataset: + instance_key: eval_dataset + pass_type: BY_REFERENCE + collate_fn: + instance_key: collate_fn + pass_type: BY_REFERENCE +eval_dataloaders: +- instance_key: eval_dataloader + pass_type: BY_REFERENCE + +checkpoint_saving: + component_key: checkpoint_saving + variant_key: default + config: + 
checkpoint_saving_strategy: + component_key: checkpoint_saving_strategy + variant_key: save_k_most_recent_checkpoints_strategy + config: + k: -1 + checkpoint_saving_execution: + component_key: checkpoint_saving_execution + variant_key: dcp + config: + checkpoint_path: ${settings.paths.experiments_root_path}/${settings.experiment_id} + global_rank: ${settings.cuda_env.global_rank} + experiment_id: ${settings.experiment_id} + +loss_fn: + component_key: loss + variant_key: clm_cross_entropy_loss + config: + target_key: ${settings.referencing_keys.target_key} + prediction_key: ${settings.referencing_keys.prediction_key} + +device_mesh: + component_key: device_mesh + variant_key: default + config: + device_type: cuda + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: -1 + world_size: ${settings.cuda_env.world_size} + +dp_degree: + component_key: number_conversion + variant_key: parallel_degree + config: + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + parallelism_methods: [dp_shard, dp_replicate] + +app_state: + component_key: app_state + variant_key: raw + config: + model: + instance_key: initialized_model + pass_type: BY_REFERENCE + optimizer: + instance_key: optimizer + pass_type: BY_REFERENCE + lr_scheduler: + instance_key: lr_scheduler + pass_type: BY_REFERENCE + +initialized_model: + component_key: model + variant_key: model_initialized + config: + model: + instance_key: fsdp_model + pass_type: BY_REFERENCE + model_initializer: + component_key: model_initialization + variant_key: composed + config: + model_type: gpt2 + weight_init_type: scaled + mean: 0.0 + std: 0.02 + num_layers: ${model_raw.config.n_layer} + multi_device_generator_policy: error + +fsdp_model: + component_key: model + variant_key: fsdp2_wrapped + config: + model: + instance_key: compiled_model + pass_type: BY_REFERENCE + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + mixed_precision_settings: + param_dtype: BF_16 + reduce_dtype: BF_16 + 
block_names: [GPT2Block] + +compiled_model: + component_key: model + variant_key: compiled + config: + model: + instance_key: model_raw + pass_type: BY_REFERENCE + block_names: [GPT2Block] + +model_raw: + component_key: model + variant_key: gpt2 + config: + use_meta_device: false + use_weight_tying: false + sample_key: ${settings.referencing_keys.sample_key} + poe_type: NOPE + sequence_length: ${settings.step_profile.sequence_length} + prediction_key: ${loss_fn.config.prediction_key} + vocab_size: 50304 + n_layer: 12 + n_head_q: 12 + n_head_kv: 12 + ffn_hidden: 3072 + n_embd: 768 + dropout: 0.0 + bias: false + attention_config: + qkv_transforms: + - type_hint: RotaryTransform + config: + n_embd: ${model_raw.config.n_embd} + n_head: ${model_raw.config.n_head_q} + seq_length_dim: -2 + base_freq: 500000 + qk_norm_config: + norm_type: pytorch_rms_norm + config: + normalized_shape: 64 # n_embd / n_head_q + eps: 1.0e-05 + attention_implementation: pytorch_flash + activation_type: swiglu + attention_norm_config: + norm_type: pytorch_rms_norm + config: + normalized_shape: 768 + eps: 1.0e-05 + ffn_norm_config: + norm_type: pytorch_rms_norm + config: + normalized_shape: 768 + eps: 1.0e-05 + lm_head_norm_config: + norm_type: pytorch_rms_norm + config: + normalized_shape: 768 + eps: 1.0e-05 + +lr_scheduler: + component_key: scheduler + variant_key: onecycle_lr + config: + optimizer: + instance_key: optimizer + pass_type: BY_REFERENCE + max_lr: 0.003 + div_factor: 10 + final_div_factor: 1 + total_steps: ${settings.training_target.num_target_steps} + pct_start: 0.01 + anneal_strategy: cos + last_epoch: ${settings.training_progress.last_step} + +optimizer: + component_key: optimizer + variant_key: adam_w + config: + lr: 0.0001 + betas: [0.9, 0.95] + eps: 1e-8 + weight_decay: 1e-1 + weight_decay_groups_excluded: [embedding, pytorch_rms_norm] + wrapped_model: + instance_key: initialized_model + pass_type: BY_REFERENCE + +gradient_clipper: + component_key: gradient_clipper + 
variant_key: fsdp2 + config: + wrapped_model: + instance_key: initialized_model + pass_type: BY_REFERENCE + norm_type: P2_NORM + max_norm: 1.0 + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + +progress_subscriber: + component_key: progress_subscriber + variant_key: rich + config: + global_rank: ${settings.cuda_env.global_rank} + num_seen_steps: ${settings.training_progress.num_seen_steps} + num_target_steps: ${settings.training_target.num_target_steps} + train_dataloader_tag: ${train_dataloader.config.dataloader_tag} + eval_dataloaders: + instance_key: eval_dataloaders + pass_type: BY_REFERENCE + +evaluation_subscriber: + component_key: results_subscriber + variant_key: wandb + config: + global_rank: ${settings.cuda_env.global_rank} + project: tests + mode: ONLINE + experiment_id: base_toadapt + directory: wandb_storage + config_file_path: ${settings.config_file_path} + +mfu_calculator: + component_key: mfu_calculator + variant_key: gpt2 + config: + n_layer: ${model_raw.config.n_layer} + sequence_length: ${settings.step_profile.sequence_length} + n_embd: ${model_raw.config.n_embd} + world_size: ${settings.cuda_env.world_size} + wrapped_model: + instance_key: initialized_model + pass_type: BY_REFERENCE + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + +tokenizer: + component_key: tokenizer + variant_key: pretrained_hf_tokenizer + config: + pretrained_model_name_or_path: gpt2 + +model_converter: + component_key: model_converter + variant_key: default + config: + command_template: "CUDA_VISIBLE_DEVICES=7 python src/modalities/conversion/gpt2/convert_gpt2.py {modalities_config} {output_dir} --checkpoint_path {checkpoint_path} > {checkpoint_path}/conversion.log 2>&1" + checkpoint_dir: ${settings.paths.experiments_root_path}/${settings.experiment_id} + global_rank: ${settings.cuda_env.global_rank} + eval_interval: 100 + +downstream_evaluator: + component_key: downstream_evaluator + variant_key: default + config: + tokenizer: + 
instance_key: tokenizer + pass_type: BY_REFERENCE + tasks: + - "arc_challenge::olmes" + - "hellaswag::olmes" + eval_interval: 100 + checkpoint_dir: ${settings.paths.experiments_root_path}/${settings.experiment_id} + global_rank: ${settings.cuda_env.global_rank} + olmes_command_template: ". /home/markus_frey/Github/olmes/.venv/bin/activate && CUDA_VISIBLE_DEVICES=7 olmes --model {hf_model_dir} --model-args '{{\"trust_remote_code\": true}}' --task {tasks} --limit 128 --output-dir {hf_model_dir}/olmes_eval_{step} > {hf_model_dir}/olmes_eval_{step}.log 2>&1" # GPU pin sits on olmes itself; an assignment before the '.' builtin does not reliably reach it \ No newline at end of file diff --git a/config_files/model_tests/leonardo/base_210M.yaml b/config_files/model_tests/leonardo/base_210M.yaml new file mode 100644 index 000000000..a846e4255 --- /dev/null +++ b/config_files/model_tests/leonardo/base_210M.yaml @@ -0,0 +1,2728 @@ +# ============================================================================= +# ~210M NON-EMBEDDING baseline (adapted from your 124M-class config) +# Architecture changes vs. original: n_layer 12->16, n_embd 768->1024, n_head 12->16, ffn_hidden 3072->4096 +# -> 205.6M non-embedding params, 308.6M total, head_dim=64, aspect ratio d/L=64 (canonical scaled-GPT-2) +# All widths kept multiple of 128 and head_dim fixed at 64 (Flash fast path + qk_norm stays 64). +# See accompanying notes for LR/batch/token-budget rationale and ablation knobs. 
+# ============================================================================= +settings: + experiment_id: ${modalities_env:experiment_id} + config_file_path: ${modalities_env:config_file_path} + referencing_keys: + sample_key: input_ids + target_key: target_ids + prediction_key: logits + cuda_env: + local_rank: ${cuda_env:LOCAL_RANK} + global_rank: ${cuda_env:RANK} + world_size: ${cuda_env:WORLD_SIZE} + paths: + checkpoint_saving_path: /leonardo_scratch/large/userexternal/mfrey000/experiments_tests + experiments_root_path: /leonardo_scratch/large/userexternal/mfrey000/experiments_tests + intervals: + training_log_interval_in_steps: 1 + checkpointing_interval_in_steps: 10000 + evaluation_interval_in_steps: 10000 + consistency_enforcement: + enforce_tokens_per_step_consistency: true + enforce_last_step_logged: false + enforce_last_step_evaluated: false + enforce_last_step_checkpointed: false + step_profile: + gradient_accumulation_steps: 1 + local_train_micro_batch_size: 8 + sequence_length: 4096 + # 16 GPUs -> mbs=4, gas=2 + # 32 GPUs -> mbs=4, gas=1 + # 64 GPUs -> mbs=2, gas=1 + # 1 GPU -> mbs=2, gas=64 + dp_degree: + instance_key: dp_degree + pass_type: BY_REFERENCE + training_target: + # 20B tokens (~65x total params / ~97x non-embedding). 
+ num_target_tokens: 20971520000 # 40000 steps * 524288 tokens/step + num_target_steps: + component_key: number_conversion + variant_key: num_steps_from_num_tokens + config: + dp_degree: + instance_key: dp_degree + pass_type: BY_REFERENCE + local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size} + global_num_tokens: ${settings.training_target.num_target_tokens} + sequence_length: ${settings.step_profile.sequence_length} + gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps} + training_progress: + global_num_seen_tokens: 0 + num_seen_steps: 0 + num_seen_samples: 0 + last_step: -1 + +collate_fn: + component_key: collate_fn + variant_key: gpt_2_llm_collator + config: + sample_key: ${settings.referencing_keys.sample_key} + target_key: ${settings.referencing_keys.target_key} + +train_dataset: + component_key: dataset + variant_key: combined + config: + datasets: + # --- score4.5+: 128 shards --- + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00001.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00002.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00003.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00004.pbin + 
sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00005.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00006.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00007.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00008.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00009.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00010.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: 
/leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00011.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00012.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00013.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00014.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00015.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00016.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00017.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: 
+ raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00018.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00019.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00020.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00021.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00022.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00023.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00024.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00025.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00026.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00027.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00028.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00029.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00030.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00031.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: 
dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00032.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00033.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00034.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00035.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00036.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00037.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00038.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + 
- component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00039.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00040.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00041.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00042.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00043.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00044.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00045.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: 
${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00046.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00047.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00048.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00049.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00050.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00051.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00052.pbin + sequence_length: 
${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00053.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00054.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00055.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00056.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00057.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00058.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00059.pbin + 
sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00060.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00061.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00062.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00063.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00064.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00065.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: 
/leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00066.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00067.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00068.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00069.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00070.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00071.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00072.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: 
+ raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00073.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00074.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00075.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00076.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00077.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00078.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00079.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00080.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00081.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00082.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00083.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00084.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00085.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00086.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: 
dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00087.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00088.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00089.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00090.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00091.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00092.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00093.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + 
- component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00094.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00095.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00096.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00097.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00098.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00099.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00100.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: 
${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00101.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00102.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00103.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00104.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00105.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00106.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00107.pbin + sequence_length: 
${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00108.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00109.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00110.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00111.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00112.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00113.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00114.pbin + 
sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00115.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00116.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00117.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00118.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00119.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00120.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: 
/leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00121.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00122.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00123.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00124.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00125.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00126.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00127.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + # --- score4.0-4.5: 128 shards --- + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00000.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00001.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00002.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00003.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00004.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00005.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00006.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - 
component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00007.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00008.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00009.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00010.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00011.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00012.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00013.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: 
${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00014.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00015.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00016.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00017.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00018.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00019.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00020.pbin + sequence_length: 
${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00021.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00022.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00023.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00024.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00025.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00026.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: 
/leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00027.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00028.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00029.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00030.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00031.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00032.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00033.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00034.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00035.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00036.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00037.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00038.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00039.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00040.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - 
component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00041.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00042.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00043.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00044.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00045.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00046.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00047.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: 
${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00048.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00049.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00050.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00051.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00052.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00053.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00054.pbin + sequence_length: 
${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00055.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00056.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00057.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00058.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00059.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00060.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: 
/leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00061.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00062.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00063.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00064.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00065.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00066.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00067.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00068.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00069.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00070.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00071.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00072.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00073.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00074.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - 
component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00075.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00076.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00077.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00078.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00079.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00080.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00081.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: 
${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00082.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00083.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00084.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00085.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00086.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00087.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00088.pbin + sequence_length: 
${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00089.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00090.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00091.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00092.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00093.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00094.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: 
/leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00095.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00096.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00097.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00098.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00099.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00100.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00101.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00102.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00103.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00104.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00105.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00106.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00107.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00108.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - 
component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00109.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00110.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00111.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00112.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00113.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00114.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00115.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: 
${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00116.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00117.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00118.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00119.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00120.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00121.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00122.pbin + sequence_length: 
${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00123.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00124.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00125.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00126.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.0-4.5/part-00127.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + # --- score3.5-4.0: 128 shards --- + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00000.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: 
/leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00001.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00002.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00003.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00004.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00005.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00006.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00007.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00008.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00009.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00010.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00011.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00012.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00013.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00014.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - 
component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00015.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00016.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00017.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00018.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00019.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00020.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00021.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: 
${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00022.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00023.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00024.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00025.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00026.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00027.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00028.pbin + sequence_length: 
${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00029.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00030.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00031.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00032.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00033.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00034.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: 
/leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00035.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00036.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00037.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00038.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00039.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00040.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00041.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00042.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00043.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00044.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00045.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00046.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00047.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00048.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - 
component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00049.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00050.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00051.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00052.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00053.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00054.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00055.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: 
${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00056.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00057.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00058.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00059.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00060.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00061.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00062.pbin + sequence_length: 
${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00063.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00064.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00065.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00066.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00067.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00068.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: 
/leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00069.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00070.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00071.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00072.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00073.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00074.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00075.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00076.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00077.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00078.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00079.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00080.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00081.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00082.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - 
component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00083.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00084.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00085.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00086.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00087.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00088.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00089.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: 
${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00090.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00091.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00092.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00093.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00094.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00095.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00096.pbin + sequence_length: 
${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00097.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00098.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00099.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00100.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00101.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00102.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: 
/leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00103.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00104.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00105.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00106.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00107.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00108.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00109.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: 
packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00110.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00111.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00112.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00113.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00114.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00115.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00116.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - 
component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00117.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00118.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00119.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00120.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00121.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00122.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00123.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: 
${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00124.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00125.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00126.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + - component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score3.5-4.0/part-00127.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} +train_dataloader: + component_key: data_loader + variant_key: default + config: + num_workers: 2 + pin_memory: true + dataloader_tag: train + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + batch_sampler: + component_key: batch_sampler + variant_key: default + config: + batch_size: ${settings.step_profile.local_train_micro_batch_size} + drop_last: true + sampler: + component_key: sampler + variant_key: resumable_distributed_sampler + config: + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + rank: ${settings.cuda_env.global_rank} + num_replicas: ${settings.cuda_env.world_size} + shuffle: true + seed: 42 + drop_last: true + skip_num_global_samples: ${settings.training_progress.num_seen_samples} + collate_fn: + instance_key: collate_fn + pass_type: 
BY_REFERENCE +eval_dataset: + component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /leonardo_work/EUHPC_E05_119/mfrey/tokenized/score4.5+/part-00000.pbin + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} +eval_dataloader: + component_key: data_loader + variant_key: default + config: + num_workers: 2 + pin_memory: true + dataloader_tag: eval + dataset: + instance_key: eval_dataset + pass_type: BY_REFERENCE + batch_sampler: + component_key: batch_sampler + variant_key: default + config: + batch_size: 2 + drop_last: true + sampler: + component_key: sampler + variant_key: distributed_sampler + config: + rank: ${settings.cuda_env.global_rank} + num_replicas: ${settings.cuda_env.world_size} + shuffle: false + drop_last: true + dataset: + instance_key: eval_dataset + pass_type: BY_REFERENCE + collate_fn: + instance_key: collate_fn + pass_type: BY_REFERENCE +eval_dataloaders: +- instance_key: eval_dataloader + pass_type: BY_REFERENCE + +checkpoint_saving: + component_key: checkpoint_saving + variant_key: default + config: + checkpoint_saving_strategy: + component_key: checkpoint_saving_strategy + variant_key: save_k_most_recent_checkpoints_strategy + config: + k: -1 + checkpoint_saving_execution: + component_key: checkpoint_saving_execution + variant_key: dcp + config: + checkpoint_path: ${settings.paths.experiments_root_path}/${settings.experiment_id} + global_rank: ${settings.cuda_env.global_rank} + experiment_id: ${settings.experiment_id} + +loss_fn: + component_key: loss + variant_key: clm_cross_entropy_loss + config: + target_key: ${settings.referencing_keys.target_key} + prediction_key: ${settings.referencing_keys.prediction_key} + +device_mesh: + component_key: device_mesh + variant_key: default + config: + device_type: cuda + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: -1 + world_size: ${settings.cuda_env.world_size} + +dp_degree: + 
component_key: number_conversion + variant_key: parallel_degree + config: + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + parallelism_methods: [dp_shard, dp_replicate] + +app_state: + component_key: app_state + variant_key: raw + config: + model: + instance_key: initialized_model + pass_type: BY_REFERENCE + optimizer: + instance_key: optimizer + pass_type: BY_REFERENCE + lr_scheduler: + instance_key: lr_scheduler + pass_type: BY_REFERENCE + +initialized_model: + component_key: model + variant_key: model_initialized + config: + model: + instance_key: fsdp_model + pass_type: BY_REFERENCE + model_initializer: + component_key: model_initialization + variant_key: composed + config: + model_type: gpt2 + weight_init_type: scaled + mean: 0.0 + std: 0.02 + num_layers: ${model_raw.config.n_layer} + multi_device_generator_policy: error + +fsdp_model: + component_key: model + variant_key: fsdp2_wrapped + config: + model: + instance_key: compiled_model + pass_type: BY_REFERENCE + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + mixed_precision_settings: + param_dtype: BF_16 + reduce_dtype: BF_16 + block_names: [GPT2Block] + +compiled_model: + component_key: model + variant_key: compiled + config: + model: + instance_key: model_raw + pass_type: BY_REFERENCE + block_names: [GPT2Block] + +model_raw: + component_key: model + variant_key: gpt2 + config: + use_meta_device: false + use_weight_tying: false + sample_key: ${settings.referencing_keys.sample_key} + poe_type: NOPE + sequence_length: ${settings.step_profile.sequence_length} + prediction_key: ${loss_fn.config.prediction_key} + vocab_size: 50304 + # ---- ~210M NON-EMBEDDING shape ---- + n_layer: 16 + n_head_q: 16 + n_head_kv: 16 + ffn_hidden: 4096 + n_embd: 1024 + # ---------------------------------------------- + dropout: 0.0 + bias: false + attention_config: + qkv_transforms: + - type_hint: RotaryTransform + config: + n_embd: ${model_raw.config.n_embd} + n_head: 
${model_raw.config.n_head_q} + seq_length_dim: -2 + base_freq: 10000 + qk_norm_config: + norm_type: pytorch_rms_norm + config: + normalized_shape: 64 + eps: 1.0e-05 + attention_implementation: pytorch_flash + activation_type: swiglu + attention_norm_config: + norm_type: pytorch_rms_norm + config: + normalized_shape: 1024 + eps: 1.0e-05 + ffn_norm_config: + norm_type: pytorch_rms_norm + config: + normalized_shape: 1024 + eps: 1.0e-05 + lm_head_norm_config: + norm_type: pytorch_rms_norm + config: + normalized_shape: 1024 + eps: 1.0e-05 + +lr_scheduler: + component_key: scheduler + variant_key: onecycle_lr + config: + optimizer: + instance_key: optimizer + pass_type: BY_REFERENCE + max_lr: 0.003 + div_factor: 10 # initial LR = max_lr/10 = 3.0e-4 + final_div_factor: 1 # final LR = initial = max_lr/10 + total_steps: ${settings.training_target.num_target_steps} + pct_start: 0.01 + anneal_strategy: cos + last_epoch: ${settings.training_progress.last_step} + +optimizer: + component_key: optimizer + variant_key: adam_w + config: + lr: 0.0003 # overridden by OneCycle; set to max_lr/div_factor for consistency + betas: [0.9, 0.95] + eps: 1e-8 + weight_decay: 1e-1 # decoupled WD=0.1, standard (GPT-3/Llama/OLMo); also aids LR transfer across width + weight_decay_groups_excluded: [embedding, pytorch_rms_norm] + wrapped_model: + instance_key: initialized_model + pass_type: BY_REFERENCE + +gradient_clipper: + component_key: gradient_clipper + variant_key: fsdp2 + config: + wrapped_model: + instance_key: initialized_model + pass_type: BY_REFERENCE + norm_type: P2_NORM + max_norm: 1.0 + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + +progress_subscriber: + component_key: progress_subscriber + variant_key: rich + config: + global_rank: ${settings.cuda_env.global_rank} + num_seen_steps: ${settings.training_progress.num_seen_steps} + num_target_steps: ${settings.training_target.num_target_steps} + train_dataloader_tag: ${train_dataloader.config.dataloader_tag} + 
eval_dataloaders: + instance_key: eval_dataloaders + pass_type: BY_REFERENCE + +evaluation_subscriber: + component_key: results_subscriber + variant_key: wandb + config: + global_rank: ${settings.cuda_env.global_rank} + project: tests + mode: OFFLINE + experiment_id: base_toadapt + directory: ${settings.paths.experiments_root_path}/wandb + config_file_path: ${settings.config_file_path} + +mfu_calculator: + component_key: mfu_calculator + variant_key: gpt2 + config: + n_layer: ${model_raw.config.n_layer} + sequence_length: ${settings.step_profile.sequence_length} + n_embd: ${model_raw.config.n_embd} + world_size: ${settings.cuda_env.world_size} + wrapped_model: + instance_key: initialized_model + pass_type: BY_REFERENCE + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + +tokenizer: + component_key: tokenizer + variant_key: pretrained_hf_tokenizer + config: + pretrained_model_name_or_path: /leonardo_work/EUHPC_D21_101/mfrey/tokenizer/local_gpt2_tokenizer/ + padding: false + truncation: false + +model_converter: + component_key: model_converter + variant_key: default + config: + command_template: "python src/modalities/conversion/gpt2/convert_gpt2.py {modalities_config} {output_dir} --checkpoint_path {checkpoint_path} > {checkpoint_path}/conversion.log 2>&1" + checkpoint_dir: ${settings.paths.experiments_root_path}/${settings.experiment_id} + global_rank: ${settings.cuda_env.global_rank} + eval_interval: ${settings.intervals.checkpointing_interval_in_steps} + +downstream_evaluator: + component_key: downstream_evaluator + variant_key: default + config: + tokenizer: + instance_key: tokenizer + pass_type: BY_REFERENCE + tasks: + - "minerva_math_algebra:bpb::olmes" + - "minerva_math_counting_and_probability:bpb::olmes" + - "minerva_math_geometry:bpb::olmes" + - "minerva_math_intermediate_algebra:bpb::olmes" + - "minerva_math_number_theory:bpb::olmes" + - "minerva_math_prealgebra:bpb::olmes" + - "minerva_math_precalculus:bpb::olmes" + - 
"arc_challenge:rc::olmes:full" + - "arc_easy:rc::olmes:full" + - "hellaswag:rc::olmes:full" + - "winogrande:rc::olmes:full" + - "socialiqa:rc::olmes:full" + - "piqa:rc::olmes:full" + - "qasper_yesno:rc::olmes" + - "lambada" + - "arc_challenge:rc:bpb::olmes:full" + - "arc_easy:rc:bpb::olmes:full" + - "hellaswag:rc:bpb::olmes:full" + - "winogrande:rc:bpb::olmes:full" + - "socialiqa:rc:bpb::olmes:full" + - "piqa:rc:bpb::olmes:full" + - "qasper_yesno:rc:bpb::olmes" + - "lambada:bpb" + - "gsm8k::olmes" + eval_interval: ${settings.intervals.evaluation_interval_in_steps} + checkpoint_dir: ${settings.paths.experiments_root_path}/${settings.experiment_id} + global_rank: ${settings.cuda_env.global_rank} + olmes_command_template: "bash scripts/evaluation/run_olmes_sbatch.sh {hf_model_dir} '{tasks}' {step} 128 1" \ No newline at end of file diff --git a/docs/components/downstream_evaluation.md b/docs/components/downstream_evaluation.md new file mode 100644 index 000000000..8927ca16f --- /dev/null +++ b/docs/components/downstream_evaluation.md @@ -0,0 +1,158 @@ +# Downstream Evaluation Pipeline + +## Overview + +The downstream evaluation pipeline in Modalities is a decoupled, three-stage callback system that executes at configurable step intervals during the training loop. + +The order of execution inside `Trainer.train` is: +1. `checkpointing_callback`: Saves the PyTorch/FSDP checkpoint to disk. +2. `conversion_callback`: (Optional) Converts the PyTorch checkpoint to a Hugging Face (HF) checkpoint. +3. `downstream_evaluation_callback`: (Optional) Runs external evaluation tools (like OLMES) on the newly created HF checkpoint. + +By keeping conversion and evaluation decoupled, you can configure just the converter, just the evaluator (if HF checkpoints are generated elsewhere), or both. + +--- + +## 1. 
Conversion Callback (`ModelConverter`) + +**Location:** `src/modalities/conversion/model_converter.py` (Lines 10-67) + +The `ModelConverter` is a thin wrapper that executes a shell command template via a subprocess. + +### Behavior +- Triggered if `num_train_steps_done % eval_interval == 0`. +- Only executes on `global_rank == 0`. You can prefix the command with `CUDA_VISIBLE_DEVICES=X` to manually specify which GPU the evaluation script should run on. +- Reads `last_checkpoint_info.json` from the checkpoint directory to determine the latest checkpoint path. +- Checks if the `{checkpoint_path}/hf_checkpoint` directory already exists. If it does, conversion is skipped. +- If it does not exist, it formats the `command_template` and runs it using `subprocess.run(cmd, shell=True, check=True)`. + +### Placeholders +The `command_template` string can use the following placeholders: +- `{checkpoint_path}`: The path to the latest checkpoint directory (resolved at runtime). +- `{output_dir}`: Evaluates to `{checkpoint_path}/hf_checkpoint`. +- `{modalities_config}`: Path to the YAML config file found inside or next to the checkpoint directory. + +### YAML Configuration +```yaml +model_converter: + component_key: model_converter + variant_key: default + config: + command_template: "python src/modalities/conversion/gpt2/convert_gpt2.py {modalities_config} {output_dir} --checkpoint_path {checkpoint_path}" + checkpoint_dir: ${settings.paths.experiments_root_path}/${settings.experiment_id} + global_rank: ${settings.cuda_env.global_rank} + eval_interval: 1000 +``` + +--- + +## 2. Downstream Evaluation Callback (`DownstreamEvaluator`) + +**Location:** `src/modalities/evaluator.py` (Lines 210-335) + +The `DownstreamEvaluator` checks for the existence of an HF checkpoint, launches an evaluation script via a subprocess, tracks active processes, and syncs OLMES metrics to the active W&B run. + +### Behavior +- Triggered if `num_train_steps_done % eval_interval == 0`. 
+- Only executes on `global_rank == 0`. +- Reads `last_checkpoint_info.json` to find the latest checkpoint. +- Checks if `{checkpoint_path}/hf_checkpoint` exists. If it does NOT exist, evaluation is skipped with a warning (assuming conversion failed or was disabled). +- If the HF checkpoint exists, it formats the `olmes_command_template` and launches it asynchronously using `subprocess.Popen(cmd, shell=True)`. +- **Process Tracking**: Stores `(Popen, step, hf_model_dir)` tuples in `self.active_processes` (Lines 233, 258). +- **Graceful Exit**: `wait_for_evaluations()` (Lines 264-275) iterates over `active_processes`, calls `.wait()`, and syncs metrics after each evaluation completes. +- **W&B Metric Sync**: `_sync_metrics_to_wandb()` (Lines 277-315) parses `metrics-all.jsonl` from the OLMES output directory, extracts `primary_score` for each task alias, and logs them to the active `wandb.run` as `eval/{alias}` at the correct training step. Gracefully skips if W&B is disabled or not installed. + +### Placeholders +The `olmes_command_template` string can use the following placeholders: +- `{hf_model_dir}`: The path to the `{checkpoint_path}/hf_checkpoint` directory. +- `{tasks}`: A space-separated string of the tasks provided in the config (Line 248). +- `{step}`: The current `num_train_steps_done`. + +### HPC / SLURM Integration +For HPC environments (like Leonardo Booster), running OLMES directly from the trainer process can cause GPU Out-of-Memory (OOM) errors. You can decouple evaluation by creating a wrapper script (`scripts/evaluation/run_olmes_sbatch.sh`) that submits an independent SLURM job using `sbatch --wait`. Because `DownstreamEvaluator` uses `subprocess.Popen` asynchronously, the wrapper script will wait in the background on the training node without blocking the training loop! 
+ +> [!IMPORTANT] +> **Nested SLURM Job Environment Isolation (`--export=NONE`)** +> When submitting the nested evaluation job using `sbatch` from within a running SLURM training job, the nested job inherits the parent job's environment variables (such as CUDA variables, `RANK`, `WORLD_SIZE`, `MASTER_ADDR`, etc.) by default. This will cause the evaluation job to fail or behave incorrectly. +> +> To prevent environment leakage, you **must** include `#SBATCH --export=NONE` in the nested `sbatch` script header. This ensures the evaluation job starts with a clean, isolated environment. + +### YAML Configuration +```yaml +downstream_evaluator: + component_key: downstream_evaluator + variant_key: default + config: + tokenizer: + instance_key: tokenizer + pass_type: BY_REFERENCE + tasks: + - "arc_challenge::olmes" + - "hellaswag::olmes" + eval_interval: 100 + checkpoint_dir: ${settings.paths.experiments_root_path}/${settings.experiment_id} + global_rank: ${settings.cuda_env.global_rank} + olmes_command_template: "bash scripts/evaluation/run_olmes_sbatch.sh {hf_model_dir} '{tasks}' {step} 1024 1" +``` + +--- + +## System Integration Summary + +For context on how these components are wired into the system, the following files handle the integration: + +1. **`src/modalities/trainer.py`** + - `conversion_callback` was added to `train()` signature. + - Pre-loop and in-loop execution order was explicitly set to: `checkpointing_callback` -> `conversion_callback` -> `downstream_evaluation_callback`. + +2. **`src/modalities/gym.py`** + - Threads `conversion_callback` through `Gym.run()` and passes it down to `self.trainer.train()`. + +3. **`src/modalities/main.py` (Lines 227-249)** + - Resolves `components.model_converter.convert` and `components.downstream_evaluator.evaluate`. + - Passes them into `gym.run()`. 
+ - **Post-Training Wait** (Lines 244-249): At the very end of `run()`, explicitly calls `components.downstream_evaluator.wait_for_evaluations()` with prominent `print_rank_0` logging to ensure training does not exit until evaluations complete. + +4. **`src/modalities/config/config.py`** + - Defines Pydantic models `ModelConverterConfig` and `DownstreamEvaluatorConfig`. + +5. **`src/modalities/config/instantiation_models.py`** + - Adds `model_converter` and `downstream_evaluator` fields to `TrainingComponentsInstantiationModel`. + +6. **`src/modalities/registry/components.py`** + - Registers both classes to the `"default"` component registry. + +7. **`src/modalities/conversion/gpt2/convert_gpt2.py` (Lines 105-112)** + - Updated to support Hugging Face tokenizers (`pretrained_hf_tokenizer`) alongside SentencePiece. Detects tokenizer configs and saves `vocab.json` / `tokenizer.json` directly to the `hf_checkpoint` directory. + +8. **`tests/test_downstream_evaluator.py`** + - Contains comprehensive tests mocking the `subprocess` calls and verifying interval gating, rank gating, and directory existence logic. + +--- + +## 3. Precaching Datasets (Offline Environments) + +If your compute cluster nodes do not have internet access, you must precache the Hugging Face datasets that OLMES requires. We provide a generalized script `scripts/evaluation/precache_tasks.py` that you can run on a login node (or any environment with internet access). + +### Usage + +Activate your evaluation environment (virtualenv, conda, or your Singularity container shell) and set the `HF_DATASETS_CACHE` and `HF_HOME` variables to a location accessible by your compute nodes. + +```bash +# 1. Activate your python environment (e.g. venv where olmes is installed) +source /path/to/olmes/venv/bin/activate +export PYTHONPATH=/path/to/olmes/venv/lib/python3.12/site-packages:$PYTHONPATH + +# 2. 
def main():
    """Precache the requested OLMES tasks and the diagnostics datasets.

    Parses ``--tasks`` from the command line, expands every entry through
    ``resolve_task_suite``, downloads each resolved task that has an entry in
    ``TASK_CONFIGS`` and finally downloads the fixed diagnostics datasets
    (gsm8k and trivia_qa). Every step is best-effort: a failure is reported
    and the remaining items are still attempted.

    Returns:
        int: Number of items that could not be resolved, were missing from
        ``TASK_CONFIGS``, or failed to download (0 when everything succeeded).
        The value lets a caller turn it into an exit status if desired.
    """
    parser = argparse.ArgumentParser(description="Precache OLMES tasks and required HF datasets.")
    parser.add_argument(
        "--tasks",
        nargs="+",
        required=True,
        help="List of OLMES tasks to precache (e.g. arc_challenge:rc::olmes:full hellaswag:rc::olmes:full)",
    )
    args = parser.parse_args()

    # Report the directory the datasets library is expected to use:
    # HF_DATASETS_CACHE wins; otherwise datasets are stored below
    # "<HF_HOME>/datasets", with HF_HOME defaulting to ~/.cache/huggingface.
    datasets_cache = os.environ.get("HF_DATASETS_CACHE")
    if datasets_cache is None:
        hf_home = os.environ.get("HF_HOME", os.path.join("~", ".cache", "huggingface"))
        datasets_cache = os.path.join(hf_home, "datasets")
    datasets_cache = os.path.expanduser(datasets_cache)
    print(f"Hugging Face datasets cache directory: {datasets_cache}")

    # Collected as (item, reason) so the final summary can list everything
    # that still needs attention before going offline.
    problems = []

    # ---- Part 1: OLMES tasks ----
    print("\n--- Caching OLMES tasks ---")
    all_tasks = []
    for requested in args.tasks:
        try:
            all_tasks += resolve_task_suite(requested, {})
        except Exception as e:  # best-effort: keep resolving the other tasks
            print(f"!! could not resolve {requested}: {e}")
            problems.append((requested, f"could not resolve: {e}"))

    print(f"\nWill download {len(all_tasks)} tasks to {datasets_cache}")
    for task_name in all_tasks:
        if task_name not in TASK_CONFIGS:
            print(f"?? not in TASK_CONFIGS: {task_name}")
            problems.append((task_name, "not in TASK_CONFIGS"))
            continue
        try:
            # Deep copy so that loading a task cannot mutate the shared config table.
            cfg = copy.deepcopy(TASK_CONFIGS[task_name])
            task = load_task(cfg, ".")
            print(f"-> downloading {task_name}")
            task.download()
            print("   done")
        except Exception as e:  # best-effort: keep downloading the other tasks
            print(f"!! failed {task_name}: {e}")
            problems.append((task_name, f"download failed: {e}"))

    # ---- Part 2: pseudo-sources used by paloma_diagnostics.py ----
    print("\n--- Caching diagnostics pseudo-source datasets (gsm8k, trivia_qa) ---")
    specs = [
        ("gsm8k", "main", "test"),
        ("trivia_qa", "rc.nocontext", "validation"),
    ]
    for path, config, split in specs:
        try:
            print(f"-> downloading {path} ({config}, {split})")
            load_dataset(path, config, split=split)
            print("   done")
        except Exception as e:  # best-effort: keep downloading the other datasets
            print(f"!! failed {path}: {e}")
            problems.append((path, f"download failed: {e}"))

    # Make partial failure impossible to miss in a long log.
    if problems:
        print(f"\n!! {len(problems)} item(s) were NOT cached:")
        for item, reason in problems:
            print(f"   - {item}: {reason}")
    else:
        print("\nAll requested tasks and datasets were cached successfully.")
    return len(problems)
def __len__(self) -> int:
    """Return the batch size read from the first tensor-valued entry.

    ``self.predictions`` is searched first and ``self.targets`` only if the
    predictions hold no tensor. Non-tensor values (e.g. lists or dicts) are
    skipped. The size is the tensor's extent along ``self.batch_dim``.

    Raises:
        ValueError: If neither mapping contains a ``torch.Tensor`` value.
    """
    for field_name in ("predictions", "targets"):
        # Resolve the mapping lazily so targets are only touched when needed.
        store = getattr(self, field_name)
        first_tensor = next((entry for entry in store.values() if isinstance(entry, torch.Tensor)), None)
        if first_tensor is not None:
            return first_tensor.shape[self.batch_dim]
    raise ValueError("No tensor found in predictions or targets to determine batch length")
YAMLPrimitive = str | int | float | bool | None YAMLValue: TypeAlias = YAMLPrimitive | Path | list["YAMLValue"] | dict[str, "YAMLValue"] diff --git a/src/modalities/config/instantiation_models.py b/src/modalities/config/instantiation_models.py index fd7fd3b78..107394349 100644 --- a/src/modalities/config/instantiation_models.py +++ b/src/modalities/config/instantiation_models.py @@ -22,6 +22,8 @@ PydanticSteppableProfilerIFType, PydanticTextInferenceComponentType, PydanticTokenizerIFType, + PydanticDownstreamEvaluatorType, + PydanticModelConverterType, ) from modalities.config.utils import parse_torch_device from modalities.dataloader.dataset import Dataset @@ -192,6 +194,8 @@ def _check_last_step_checkpointed(self) -> "TrainingComponentsInstantiationModel mfu_calculator: PydanticMFUCalculatorABCType | None = None scheduled_pipeline: PydanticPipelineType | None = None device_mesh: PydanticDeviceMeshIFType | None = None + downstream_evaluator: Optional[PydanticDownstreamEvaluatorType] = None + model_converter: Optional[PydanticModelConverterType] = None model_raw: PydanticPytorchModuleType @model_validator(mode="after") diff --git a/src/modalities/config/pydantic_if_types.py b/src/modalities/config/pydantic_if_types.py index 90b7ca951..217c2f783 100644 --- a/src/modalities/config/pydantic_if_types.py +++ b/src/modalities/config/pydantic_if_types.py @@ -19,6 +19,8 @@ from modalities.checkpointing.stateful.app_state import AppState from modalities.dataloader.collate_fns.collate_if import CollateFnIF from modalities.dataloader.dataloader import LLMDataLoader +from modalities.conversion.model_converter import ModelConverter +from modalities.evaluator import DownstreamEvaluator from modalities.inference.text.inference_component import TextInferenceComponent from modalities.logging_broker.subscriber import MessageSubscriberIF from modalities.loss_functions import Loss @@ -98,3 +100,5 @@ def __get_pydantic_core_schema__( torch.utils.hooks.RemovableHandle, 
PydanticThirdPartyTypeIF(torch.utils.hooks.RemovableHandle) ] PydanticDebuggingType = Annotated[Debugging, PydanticThirdPartyTypeIF(Debugging)] +PydanticDownstreamEvaluatorType = Annotated[DownstreamEvaluator, PydanticThirdPartyTypeIF(DownstreamEvaluator)] +PydanticModelConverterType = Annotated[ModelConverter, PydanticThirdPartyTypeIF(ModelConverter)] diff --git a/src/modalities/conversion/gpt2/configuration_gpt2.py b/src/modalities/conversion/gpt2/configuration_gpt2.py index 7663cd227..d5ecbc8bc 100644 --- a/src/modalities/conversion/gpt2/configuration_gpt2.py +++ b/src/modalities/conversion/gpt2/configuration_gpt2.py @@ -181,10 +181,12 @@ def __init__( attention_dropout=0.0, mlp_bias=False, head_dim=None, + norm_type="layer_norm", + use_qk_norm=False, + qk_norm_dim=None, **kwargs, ): - if rms_norm_eps is not None: - raise ValueError("RMSNorm is not supported in GPT2 model.") + self.norm_type = norm_type self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -211,6 +213,8 @@ def __init__( self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads + self.use_qk_norm = use_qk_norm + self.qk_norm_dim = qk_norm_dim # Validate the correctness of rotary position embeddings parameters # BC: if there is a 'type' field, copy it it to 'rope_type'. 
if self.rope_scaling is not None and "type" in self.rope_scaling: diff --git a/src/modalities/conversion/gpt2/conversion_model.py b/src/modalities/conversion/gpt2/conversion_model.py index f44ff33e6..f5999faf4 100644 --- a/src/modalities/conversion/gpt2/conversion_model.py +++ b/src/modalities/conversion/gpt2/conversion_model.py @@ -23,7 +23,43 @@ def convert_model_checkpoint(modalities_config: dict) -> tuple[GPT2ForCausalLM, """ gpt2_config = convert_model_config(modalities_config) hf_model = GPT2ForCausalLM(gpt2_config).to(dtype=torch.bfloat16) - modalities_model = get_model_from_config(modalities_config, model_type=ModelTypeEnum.CHECKPOINTED_MODEL) + model_config = modalities_config["model_raw" if "model_raw" in modalities_config else "model"] + checkpoint_path = None + if "checkpointed_model" in modalities_config: + checkpoint_path = modalities_config["checkpointed_model"].get("config", {}).get("checkpoint_path") + + if checkpoint_path and not ("variant_key" in modalities_config.get("checkpointed_model", {})): + # Load state dict manually if variant_key is missing + if "model" not in modalities_config and "model_raw" in modalities_config: + modalities_config["model"] = modalities_config["model_raw"] + modalities_model = get_model_from_config(modalities_config, model_type=ModelTypeEnum.MODEL) + from pathlib import Path + if Path(checkpoint_path).is_dir(): + from torch.distributed.checkpoint.default_planner import _EmptyStateDictLoadPlanner + from torch.distributed.checkpoint.filesystem import FileSystemReader + from torch.distributed.checkpoint.state_dict_loader import _load_state_dict + sd = {} + planner = _EmptyStateDictLoadPlanner(keys=["app.model"], allow_partial_load=True) + _load_state_dict(sd, storage_reader=FileSystemReader(checkpoint_path), planner=planner, no_dist=True) + model_sd = sd.get("app", {}).get("model", sd) + else: + ckpt = torch.load(checkpoint_path, map_location="cpu", weights_only=False) + model_sd = ckpt + for key in ("model_state_dict", 
"state_dict", "model"): + if key in ckpt and isinstance(ckpt[key], dict): + model_sd = ckpt[key] + break + + out = {} + for k, v in model_sd.items(): + if k.startswith("module."): + k = k[len("module."):] + out[k] = v + missing, unexpected = modalities_model.load_state_dict(out, strict=False) + print("Missing keys:", missing) + print("Unexpected keys:", unexpected) + else: + modalities_model = get_model_from_config(modalities_config, model_type=ModelTypeEnum.CHECKPOINTED_MODEL) _copy_weights_model(hf_model, modalities_model) return hf_model, modalities_model @@ -43,6 +79,14 @@ def convert_model_config(modalities_config: dict) -> GPT2Config: _check_conversion_criteria(config) ffn_norm_key = "ffn_norm_config" + norm_type = config[ffn_norm_key].get("norm_type", "layer_norm") + + qk_norm_cfg = config.get("attention_config", {}).get("qk_norm_config") + use_qk_norm = qk_norm_cfg is not None + qk_norm_dim = None + if use_qk_norm: + qk_cfg = qk_norm_cfg.get("config", {}) + qk_norm_dim = qk_cfg.get("ndim", qk_cfg.get("normalized_shape")) return GPT2Config( vocab_size=config["vocab_size"], @@ -57,12 +101,15 @@ def convert_model_config(modalities_config: dict) -> GPT2Config: attention_bias=config["bias"], mlp_bias=config["bias"], hidden_act="silu", + norm_type=norm_type, layer_norm_eps=_get_layer_norm_value(config[ffn_norm_key]["config"], "eps"), layer_norm_elementwise_affine=_get_layer_norm_value(config[ffn_norm_key]["config"], "elementwise_affine"), layer_norm_bias=_get_layer_norm_value(config[ffn_norm_key]["config"], "bias"), max_position_embeddings=config["sequence_length"], rope_theta=config["attention_config"]["qkv_transforms"][0]["config"]["base_freq"], _attn_implementation=_map_attention_type(config), + use_qk_norm=use_qk_norm, + qk_norm_dim=qk_norm_dim, output_attentions=False, ) @@ -80,6 +127,7 @@ def check_converted_model(hf_model: GPT2ForCausalLM, modalities_model: GPT2LLM, input_ids = torch.randint(0, vocab_size, (1, modalities_model.sequence_length), 
def _map_attention_type(config: dict):
    """Translate the modalities attention implementation into the HF one.

    Args:
        config: Model config dict. The ``attention_implementation`` entry is
            optional and treated as ``"default"`` when absent.

    Returns:
        ``"sdpa"`` for ``"pytorch_flash"`` / ``"default"`` and ``"eager"`` for
        ``"manual"``.

    Raises:
        ValueError: For any other implementation name.
    """
    requested = config.get("attention_implementation", "default")
    # (HF implementation, modalities names that map onto it), checked in order.
    mapping = (
        ("sdpa", ("pytorch_flash", "default")),
        ("eager", ("manual",)),
    )
    for hf_impl, accepted_names in mapping:
        if requested in accepted_names:
            return hf_impl
    raise ValueError(f"Unknown attention_implementation: {requested}")
_copy_weights_base_modules(hf_layer.self_attn.q_norm, modalities_layer.attn.q_norm) + _copy_weights_base_modules(hf_layer.self_attn.k_norm, modalities_layer.attn.k_norm) -def _copy_weights_base_modules(m1: nn.Linear | nn.LayerNorm, m2: nn.Linear | nn.LayerNorm): +def _copy_weights_base_modules(m1: nn.Linear | nn.LayerNorm | nn.Module, m2: nn.Linear | nn.LayerNorm | nn.Module): assert m1.weight.shape == m2.weight.shape - assert (m1.bias is None and m2.bias is None) or m1.bias.shape == m2.bias.shape + m1_bias = getattr(m1, "bias", None) + m2_bias = getattr(m2, "bias", None) + assert (m1_bias is None and m2_bias is None) or m1_bias.shape == m2_bias.shape m1.weight.data.copy_(m2.weight.data) - if m1.bias is not None: - m1.bias.data.copy_(m2.bias.data) + if m1_bias is not None: + m1_bias.data.copy_(m2_bias.data) diff --git a/src/modalities/conversion/gpt2/convert_gpt2.py b/src/modalities/conversion/gpt2/convert_gpt2.py index d16f55ce9..649b91ac7 100644 --- a/src/modalities/conversion/gpt2/convert_gpt2.py +++ b/src/modalities/conversion/gpt2/convert_gpt2.py @@ -38,6 +38,7 @@ def convert_gpt2( num_testruns: int = 0, device_modalities: str = "cpu", device_hf: str = "cpu", + checkpoint_path: str | None = None, ) -> None: """Takes a modalities gpt2 model and converts it to a Huggingface transformers model. The provided config yaml file should contain the model_raw or model section with the model configuration. 
@@ -56,6 +57,11 @@ def convert_gpt2( modalities_config = load_app_config_dict( Path(modalities_config_path), experiment_id="-1", experiments_root_path=Path(tmpdir) ) + if checkpoint_path is not None: + if "checkpointed_model" not in modalities_config: + modalities_config["checkpointed_model"] = {"config": {}} + modalities_config["checkpointed_model"]["config"]["checkpoint_path"] = checkpoint_path + hf_model, modalities_model = convert_model_checkpoint(modalities_config) if num_testruns > 0: @@ -69,12 +75,20 @@ def convert_gpt2( sentence_piece_tokenizer_configs = { key: subconfig for key, subconfig in modalities_config.items() - if "component_key" in subconfig + if isinstance(subconfig, dict) and "component_key" in subconfig and subconfig["component_key"] == "tokenizer" and subconfig["variant_key"] == "pretrained_sp_tokenizer" } - if len(sentence_piece_tokenizer_configs) > 1: + hf_tokenizer_configs = { + key: subconfig + for key, subconfig in modalities_config.items() + if isinstance(subconfig, dict) and "component_key" in subconfig + and subconfig["component_key"] == "tokenizer" + and subconfig["variant_key"] == "pretrained_hf_tokenizer" + } + + if len(sentence_piece_tokenizer_configs) + len(hf_tokenizer_configs) > 1: raise ValueError( "Multiple tokenizer configs found. Please specify only one tokenizer config in the modalities config file." 
) @@ -88,6 +102,14 @@ def convert_gpt2( hf_model.config.bos_token_id = bos_token_id hf_model.config.eos_token_id = eos_token_id hf_model.config.pad_token_id = pad_token_id + elif len(hf_tokenizer_configs) == 1: + from transformers import AutoTokenizer + tokenizer_name = modalities_config["tokenizer"]["config"]["pretrained_model_name_or_path"] + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + tokenizer.save_pretrained(output_dir) + hf_model.config.bos_token_id = tokenizer.bos_token_id + hf_model.config.eos_token_id = tokenizer.eos_token_id + hf_model.config.pad_token_id = tokenizer.pad_token_id else: logger.warning("No tokenizer specified in the config. Skipping tokenizer conversion.") hf_model.config.auto_map = { @@ -110,6 +132,7 @@ def convert_gpt2( parser.add_argument("--num_testruns", type=int, default=0, help="Number of test runs to perform.") parser.add_argument("--device_modalities", type=str, default="cpu", help="Device for the modalities model.") parser.add_argument("--device_hf", type=str, default="cpu", help="Device for the Hugging Face model.") + parser.add_argument("--checkpoint_path", type=str, default=None, help="Path to the model checkpoint. 
Overrides config.") args = parser.parse_args() @@ -119,4 +142,5 @@ def convert_gpt2( args.num_testruns, args.device_modalities, args.device_hf, + args.checkpoint_path, ) diff --git a/src/modalities/conversion/gpt2/modeling_gpt2.py b/src/modalities/conversion/gpt2/modeling_gpt2.py index f6aa77ab1..98d4141d7 100644 --- a/src/modalities/conversion/gpt2/modeling_gpt2.py +++ b/src/modalities/conversion/gpt2/modeling_gpt2.py @@ -88,6 +88,23 @@ def forward(self, x, position_ids): return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) +class GPT2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + GPT2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] @@ -203,6 +220,17 @@ def __init__(self, config: GPT2Config, layer_idx: int): self.o_proj = nn.Linear( config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias ) + if getattr(config, "use_qk_norm", False): + qk_norm_dim = getattr(config, "qk_norm_dim", None) or self.head_dim + if getattr(config, "norm_type", "layer_norm") == "pytorch_rms_norm": + self.q_norm = nn.RMSNorm(qk_norm_dim, eps=config.layer_norm_eps) + self.k_norm = nn.RMSNorm(qk_norm_dim, eps=config.layer_norm_eps) + else: + self.q_norm = GPT2RMSNorm(qk_norm_dim, eps=config.layer_norm_eps) + self.k_norm = GPT2RMSNorm(qk_norm_dim, eps=config.layer_norm_eps) + else: + self.q_norm = None + self.k_norm = None def forward( self, @@ -223,6 +251,10 @@ def forward( cos, sin = position_embeddings query_states, 
key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + if getattr(self, "q_norm", None) is not None: + query_states = self.q_norm(query_states) + key_states = self.k_norm(key_states) + if past_key_value is not None: # sin and cos are specific to RoPE models; cache_position needed for the static cache cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} @@ -256,18 +288,25 @@ def __init__(self, config: GPT2Config, layer_idx: int): self.self_attn = LlamaAttention(config=config, layer_idx=layer_idx) self.mlp = LlamaMLP(config) - self.input_layernorm = nn.LayerNorm( - config.hidden_size, - eps=config.layer_norm_eps, - elementwise_affine=config.layer_norm_elementwise_affine, - bias=config.layer_norm_bias, - ) - self.post_attention_layernorm = nn.LayerNorm( - config.hidden_size, - eps=config.layer_norm_eps, - elementwise_affine=config.layer_norm_elementwise_affine, - bias=config.layer_norm_bias, - ) + if getattr(config, "norm_type", "layer_norm") == "pytorch_rms_norm": + self.input_layernorm = nn.RMSNorm(config.hidden_size, eps=config.layer_norm_eps) + self.post_attention_layernorm = nn.RMSNorm(config.hidden_size, eps=config.layer_norm_eps) + elif getattr(config, "norm_type", "layer_norm") == "rms_norm": + self.input_layernorm = GPT2RMSNorm(config.hidden_size, eps=config.layer_norm_eps) + self.post_attention_layernorm = GPT2RMSNorm(config.hidden_size, eps=config.layer_norm_eps) + else: + self.input_layernorm = nn.LayerNorm( + config.hidden_size, + eps=config.layer_norm_eps, + elementwise_affine=config.layer_norm_elementwise_affine, + bias=config.layer_norm_bias, + ) + self.post_attention_layernorm = nn.LayerNorm( + config.hidden_size, + eps=config.layer_norm_eps, + elementwise_affine=config.layer_norm_elementwise_affine, + bias=config.layer_norm_bias, + ) def forward( self, @@ -333,19 +372,23 @@ def __init__(self, config: GPT2Config): self.layers = nn.ModuleList( [GPT2DecoderLayer(config, layer_idx) for layer_idx in 
range(config.num_hidden_layers)] ) - self.norm = nn.LayerNorm( - config.hidden_size, - eps=config.layer_norm_eps, - elementwise_affine=config.layer_norm_elementwise_affine, - bias=config.layer_norm_bias, - ) + if getattr(config, "norm_type", "layer_norm") == "pytorch_rms_norm": + self.norm = nn.RMSNorm(config.hidden_size, eps=config.layer_norm_eps) + elif getattr(config, "norm_type", "layer_norm") == "rms_norm": + self.norm = GPT2RMSNorm(config.hidden_size, eps=config.layer_norm_eps) + else: + self.norm = nn.LayerNorm( + config.hidden_size, + eps=config.layer_norm_eps, + elementwise_affine=config.layer_norm_elementwise_affine, + bias=config.layer_norm_bias, + ) self.rotary_emb = LlamaRotaryEmbedding(config=config) self.gradient_checkpointing = False # Initialize weights and apply final processing self.post_init() - @check_model_inputs @auto_docstring def forward( self, diff --git a/src/modalities/conversion/model_converter.py b/src/modalities/conversion/model_converter.py new file mode 100644 index 000000000..377d22292 --- /dev/null +++ b/src/modalities/conversion/model_converter.py @@ -0,0 +1,95 @@ +import json +import logging +import subprocess +from pathlib import Path + +logger = logging.getLogger(__name__) + + +class ModelConverter: + """Converts a Modalities checkpoint to HF format by running a user-provided command template. 
+ + The command_template is a shell command string with placeholders: + {checkpoint_path} - resolved from last_checkpoint_info.json + {output_dir} - /hf_checkpoint + {modalities_config} - path to the modalities YAML config (if available) + + Example command_templates: + "python convert_gpt2.py {modalities_config} {output_dir}" + "python convert_adaptive_gpt.py {checkpoint_path} {output_dir} --modalities_config {modalities_config}" + """ + + def __init__( + self, + command_template: str, + checkpoint_dir: Path, + global_rank: int, + eval_interval: int, + ) -> None: + self.command_template = command_template + self.checkpoint_dir = Path(checkpoint_dir) + self.global_rank = global_rank + self.eval_interval = eval_interval + + def convert(self, num_train_steps_done: int) -> None: + """Run the conversion command if the current step matches the eval interval. + + Args: + num_train_steps_done: Number of training steps completed so far. + """ + if num_train_steps_done == 0 or num_train_steps_done % self.eval_interval != 0: + return + if self.global_rank != 0: + return + + checkpoint_path = self._get_latest_checkpoint_path() + if checkpoint_path is None: + logger.warning("No checkpoint info found, skipping conversion.") + return + + output_dir = checkpoint_path / "hf_checkpoint" + if output_dir.exists(): + logger.info(f"HF checkpoint already exists at {output_dir}, skipping conversion.") + return + + cmd = self.command_template.format( + checkpoint_path=str(checkpoint_path), + output_dir=str(output_dir), + modalities_config=str(self._find_config_in_checkpoint(checkpoint_path)), + ) + + logger.info(f"Running model conversion: {cmd}") + try: + subprocess.run(cmd, shell=True, check=True) + logger.info(f"Conversion complete. 
HF checkpoint saved to {output_dir}") + except subprocess.CalledProcessError as e: + logger.error(f"Model conversion failed with return code {e.returncode}: {e}") + + def _get_latest_checkpoint_path(self) -> Path | None: + """Read last_checkpoint_info.json to find the latest checkpoint path.""" + info_file = self.checkpoint_dir / "last_checkpoint_info.json" + if not info_file.exists(): + return None + + with open(info_file, "r", encoding="utf-8") as f: + info = json.load(f) + + # DCP checkpoints use "checkpoint_folder_path", FSDP1 uses "model_checkpoint_path" + checkpoint_path_str = info.get("checkpoint_folder_path") or info.get("model_checkpoint_path") + if checkpoint_path_str is None: + return None + + path = Path(checkpoint_path_str) + # For FSDP1, model_checkpoint_path points to the .bin file; we want the parent directory + if path.is_file(): + path = path.parent + return path + + @staticmethod + def _find_config_in_checkpoint(checkpoint_path: Path) -> Path | None: + """Look for a YAML config file inside or next to the checkpoint directory.""" + for search_dir in [checkpoint_path, checkpoint_path.parent]: + for f in search_dir.iterdir(): + if f.suffix in (".yaml", ".yml") and not f.name.endswith(".resolved"): + return f + return Path("") diff --git a/src/modalities/evaluator.py b/src/modalities/evaluator.py index fb9bdc0d3..4fc7ee327 100644 --- a/src/modalities/evaluator.py +++ b/src/modalities/evaluator.py @@ -1,8 +1,14 @@ +import json +import logging +import subprocess +from pathlib import Path from typing import Callable import torch import torch.distributed as dist import torch.nn as nn + +from modalities.tokenization.tokenizer_wrapper import TokenizerWrapper from torch.distributed.device_mesh import DeviceMesh from modalities.batch import DatasetBatch, EvaluationResultBatch, InferenceResultBatch, ResultItem @@ -15,6 +21,8 @@ from modalities.running_env.fsdp.reducer import Reducer from modalities.util import TimeRecorder +logger = 
logging.getLogger(__name__) + class Evaluator: """Evaluator class which is responsible for evaluating the model on a set of datasets""" @@ -197,3 +205,135 @@ def _publish_evaluation_result( evaluation_result_publisher.publish_message( payload=evaluation_result, message_type=MessageTypes.EVALUATION_RESULT ) + + +class DownstreamEvaluator: + """Evaluator that runs OLMES on HF checkpoints produced by the conversion callback. + + Checks if an ``hf_checkpoint`` folder exists inside the latest checkpoint directory + (as written by ``ModelConverter``). If it does, the configured OLMES command template + is executed via subprocess. + """ + + def __init__( + self, + tokenizer: TokenizerWrapper, + tasks: list[str], + eval_interval: int, + checkpoint_dir: Path, + global_rank: int, + olmes_command_template: str, + ) -> None: + self.tokenizer = tokenizer + self.tasks = tasks + self.eval_interval = eval_interval + self.checkpoint_dir = Path(checkpoint_dir) + self.global_rank = global_rank + self.olmes_command_template = olmes_command_template + self.active_processes: list[tuple[subprocess.Popen, int, Path]] = [] + + def evaluate(self, num_train_steps_done: int) -> None: + if num_train_steps_done == 0 or num_train_steps_done % self.eval_interval != 0: + return + if self.global_rank != 0: + return + + hf_model_dir = self._find_hf_checkpoint() + if hf_model_dir is None: + logger.warning( + f"No hf_checkpoint found in {self.checkpoint_dir} at step {num_train_steps_done}, " + "skipping downstream evaluation." 
+ ) + return + + tasks_str = " ".join(self.tasks) + cmd = self.olmes_command_template.format( + hf_model_dir=str(hf_model_dir), + tasks=tasks_str, + step=num_train_steps_done, + ) + + logger.info(f"Running downstream evaluation: {cmd}") + try: + p = subprocess.Popen(cmd, shell=True) + self.active_processes.append((p, num_train_steps_done, hf_model_dir)) + logger.info(f"Downstream evaluation launched for step {num_train_steps_done}.") + except Exception as e: + logger.error(f"Failed to launch downstream evaluation: {e}") + + def wait_for_evaluations(self) -> None: + if not hasattr(self, "active_processes") or not self.active_processes: + return + + logger.info(f"Waiting for {len(self.active_processes)} downstream evaluations to finish...") + for p, step, hf_model_dir in self.active_processes: + p.wait() + if p.returncode == 0: + self._sync_metrics_to_wandb(step, hf_model_dir) + else: + logger.warning(f"Downstream evaluation for step {step} exited with code {p.returncode}, skipping W&B sync.") + logger.info("All downstream evaluations finished.") + self.active_processes = [] + + def _sync_metrics_to_wandb(self, step: int, hf_model_dir: Path) -> None: + """Parse OLMES metrics-all.jsonl and log primary scores to the active W&B run.""" + metrics_file = hf_model_dir / f"olmes_eval_{step}" / "metrics-all.jsonl" + if not metrics_file.exists(): + logger.warning(f"No metrics file found at {metrics_file}, skipping W&B sync for step {step}.") + return + + metrics_dict = {} + try: + with open(metrics_file, "r", encoding="utf-8") as f: + for line in f: + obj = json.loads(line) + alias = ( + obj.get("task_config", {}).get("metadata", {}).get("alias") + or obj.get("task_name") + ) + score = obj.get("metrics", {}).get("primary_score") + if alias and score is not None: + metrics_dict[f"downstream/{alias}"] = score + except Exception as e: + logger.error(f"Failed to parse metrics file {metrics_file}: {e}") + return + + if not metrics_dict: + logger.warning(f"No metrics extracted from 
{metrics_file} for step {step}.") + return + + try: + import wandb + + if wandb.run is not None: + # Define a custom step metric so downstream/* metrics are decoupled from + # the global training step counter (which is already past these steps). + wandb.run.define_metric("downstream_step", hidden=True) + wandb.run.define_metric("downstream/*", step_metric="downstream_step") + metrics_dict["downstream_step"] = step + wandb.run.log(metrics_dict) + logger.info(f"Synced {len(metrics_dict)} OLMES metrics to W&B at step {step}: {metrics_dict}") + else: + logger.info(f"W&B not active, skipping metric sync for step {step}.") + except ImportError: + logger.info(f"wandb not installed, skipping metric sync for step {step}.") + + def _find_hf_checkpoint(self) -> Path | None: + """Read last_checkpoint_info.json and check for hf_checkpoint subfolder.""" + info_file = self.checkpoint_dir / "last_checkpoint_info.json" + if not info_file.exists(): + return None + + with open(info_file, "r", encoding="utf-8") as f: + info = json.load(f) + + checkpoint_path_str = info.get("checkpoint_folder_path") or info.get("model_checkpoint_path") + if checkpoint_path_str is None: + return None + + checkpoint_path = Path(checkpoint_path_str) + if checkpoint_path.is_file(): + checkpoint_path = checkpoint_path.parent + + hf_dir = checkpoint_path / "hf_checkpoint" + return hf_dir if hf_dir.exists() else None diff --git a/src/modalities/gym.py b/src/modalities/gym.py index 010c4ca60..d1835c6c8 100644 --- a/src/modalities/gym.py +++ b/src/modalities/gym.py @@ -1,6 +1,6 @@ from datetime import datetime from functools import partial -from typing import Callable +from typing import Callable, Optional import torch.nn as nn @@ -42,6 +42,8 @@ def run( evaluation_data_loaders: list[LLMDataLoader], checkpoint_saving: CheckpointSaving, scheduled_pipeline: Pipeline | None = None, + conversion_callback: Optional[Callable[[int], None]] = None, + downstream_evaluation_callback: Optional[Callable[[int], None]] = 
None, ): """Runs the model training, including evaluation and checkpointing. @@ -55,6 +57,8 @@ def run( checkpoint_saving (CheckpointSaving): Routine for saving checkpoints. scheduled_pipeline (Pipeline | None, optional): In case of pipeline parallelism, this is used to operate the model. Defaults to None. + conversion_callback (Optional[Callable[[int], None]]): A callback function for checkpoint-to-HF conversion. + downstream_evaluation_callback (Optional[Callable[[int], None]]): A callback function for downstream evaluation. """ evaluation_callback: Callable[[int], None] = partial( self._run_evaluation, @@ -80,6 +84,8 @@ def run( checkpointing_callback=checkpointing_callback, training_log_interval_in_steps=training_log_interval_in_steps, scheduled_pipeline=scheduled_pipeline, + conversion_callback=conversion_callback, + downstream_evaluation_callback=downstream_evaluation_callback, ) print_rank_0(f"Training done at {datetime.now()}.") diff --git a/src/modalities/logging_broker/subscriber_impl/results_subscriber.py b/src/modalities/logging_broker/subscriber_impl/results_subscriber.py index d924a2e78..3f707c287 100644 --- a/src/modalities/logging_broker/subscriber_impl/results_subscriber.py +++ b/src/modalities/logging_broker/subscriber_impl/results_subscriber.py @@ -93,12 +93,18 @@ def consume_message(self, message: Message[EvaluationResultBatch]): """Consumes a message from a message broker.""" eval_result = message.payload + def _wandb_key(tag: str, key: str) -> str: + """Route layer-specific metrics into a top-level layers/ panel.""" + if "layer" in key: + return f"layers/{key}" + return f"{tag}/{key}" + losses = { - f"{eval_result.dataloader_tag} {loss_key}": loss_values.value + _wandb_key(eval_result.dataloader_tag, loss_key): loss_values.value for loss_key, loss_values in eval_result.losses.items() } metrics = { - f"{eval_result.dataloader_tag} {metric_key}": metric_values.value + _wandb_key(eval_result.dataloader_tag, metric_key): metric_values.value for 
metric_key, metric_values in eval_result.metrics.items() } # TODO step is not semantically correct here. Need to check if we can rename step to num_samples @@ -109,7 +115,7 @@ def consume_message(self, message: Message[EvaluationResultBatch]): data=metrics, step=eval_result.num_train_steps_done ) # (eval_result.train_local_sample_id + 1) * self.num_ranks) throughput_metrics = { - f"{eval_result.dataloader_tag} {metric_key}": metric_values.value + _wandb_key(eval_result.dataloader_tag, metric_key): metric_values.value for metric_key, metric_values in eval_result.throughput_metrics.items() } diff --git a/src/modalities/loss_functions.py b/src/modalities/loss_functions.py index e3be6100d..b4cd1367f 100644 --- a/src/modalities/loss_functions.py +++ b/src/modalities/loss_functions.py @@ -31,17 +31,29 @@ def __init__(self, target_key: str, prediction_key: str, tag: str = "CLMCrossEnt self.prediction_key = prediction_key # Mean over the tokens in the local-batch (batch per rank) self.loss_fun = CrossEntropyLoss(reduction="mean") + self._last_ce_loss = torch.tensor(0.0) + self._last_metrics = None @overload def __call__(self, forward_batch: InferenceResultBatch) -> torch.Tensor: ... @overload - def __call__(self, outputs: torch.Tensor, targets: torch.Tensor) -> torch.Tensor: + def __call__(self, outputs: torch.Tensor | dict, targets: torch.Tensor) -> torch.Tensor: ... 
def __call__(self, *args, **kwargs) -> torch.Tensor: - labels, lm_logits = self._parse_arguments(args, kwargs) + labels, outputs = self._parse_arguments(args, kwargs) + + if isinstance(outputs, dict): + if "logits" in outputs: + lm_logits = outputs["logits"] + else: + lm_logits = outputs[self.prediction_key] + metrics = outputs.get("metrics", None) + else: + lm_logits = outputs + metrics = None # move labels to correct device to enable model parallelism labels = labels.to(lm_logits.device) @@ -49,42 +61,56 @@ def __call__(self, *args, **kwargs) -> torch.Tensor: shift_labels = labels.contiguous().long() # Flatten the tokens. We compute here, the loss per token. loss = self.loss_fun(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + self._last_ce_loss = loss.detach() + self._last_metrics = _detach_metrics(metrics) return loss + def get_metrics(self) -> dict: + return { + "ce_loss": self._last_ce_loss, + "metrics": self._last_metrics, + } + def _parse_arguments( self, - args: list[torch.Tensor] | list[InferenceResultBatch], - kwargs: dict[str, torch.Tensor] | dict[str, InferenceResultBatch], - ) -> tuple[torch.Tensor, torch.Tensor]: + args: list[torch.Tensor | dict] | list[InferenceResultBatch], + kwargs: dict[str, torch.Tensor | dict] | dict[str, InferenceResultBatch], + ) -> tuple[torch.Tensor, torch.Tensor | dict]: if len(args) == 1 and isinstance(args[0], InferenceResultBatch): forward_batch = args[0] labels = forward_batch.get_targets(self.target_key) - lm_logits = forward_batch.get_predictions(self.prediction_key) + outputs = forward_batch.predictions elif "forward_batch" in kwargs and isinstance(kwargs["forward_batch"], InferenceResultBatch): forward_batch = kwargs["forward_batch"] labels = forward_batch.get_targets(self.target_key) - lm_logits = forward_batch.get_predictions(self.prediction_key) - elif len(args) == 2 and all(isinstance(arg, torch.Tensor) for arg in args): - lm_logits, labels = args - elif ( - "outputs" in kwargs - and 
"targets" in kwargs - and isinstance(kwargs["outputs"], torch.Tensor) - and isinstance(kwargs["targets"], torch.Tensor) - ): - lm_logits = kwargs["outputs"] + outputs = forward_batch.predictions + elif len(args) == 2: + outputs, labels = args + elif "outputs" in kwargs and "targets" in kwargs: + outputs = kwargs["outputs"] labels = kwargs["targets"] - elif ( - len(args) == 1 - and "targets" in kwargs - and isinstance(args[0], torch.Tensor) - and isinstance(kwargs["targets"], torch.Tensor) - ): - lm_logits = args[0] + elif len(args) == 1 and "targets" in kwargs: + outputs = args[0] labels = kwargs["targets"] else: raise TypeError("Invalid arguments for CLMCrossEntropyLoss.__call__") - return labels, lm_logits + return labels, outputs + + +def _detach_metrics(metrics: dict | None) -> dict | None: + """Recursively detach all tensors in a nested dict.""" + if metrics is None: + return None + result = {} + for key, value in metrics.items(): + if isinstance(value, torch.Tensor): + result[key] = value.detach() + elif isinstance(value, dict): + result[key] = _detach_metrics(value) + else: + result[key] = value + return result def nce_loss( diff --git a/src/modalities/main.py b/src/modalities/main.py index 49ac97b91..9398b4069 100644 --- a/src/modalities/main.py +++ b/src/modalities/main.py @@ -220,6 +220,14 @@ def run(self, components: TrainingComponentsInstantiationModel): print_rank_0(report) + conversion_callback = None + if components.model_converter is not None: + conversion_callback = components.model_converter.convert + + downstream_evaluation_callback = None + if components.downstream_evaluator is not None: + downstream_evaluation_callback = components.downstream_evaluator.evaluate + gym.run( train_data_loader=components.train_dataloader, evaluation_data_loaders=components.eval_dataloaders, @@ -229,8 +237,17 @@ def run(self, components: TrainingComponentsInstantiationModel): evaluation_interval_in_steps=components.settings.intervals.evaluation_interval_in_steps, 
training_log_interval_in_steps=components.settings.intervals.training_log_interval_in_steps, scheduled_pipeline=components.scheduled_pipeline, + conversion_callback=conversion_callback, + downstream_evaluation_callback=downstream_evaluation_callback, ) + if components.downstream_evaluator is not None: + print_rank_0("\n" + "="*80) + print_rank_0("Training loop complete! Waiting for background evaluations to finish...") + print_rank_0("="*80 + "\n") + components.downstream_evaluator.wait_for_evaluations() + print_rank_0("All background evaluations completed successfully!") + def get_logging_publishers( self, progress_subscriber: MessageSubscriberIF[ProgressUpdate], diff --git a/src/modalities/models/gpt2/gpt2_model.py b/src/modalities/models/gpt2/gpt2_model.py index f43e6e87b..70e8d3545 100644 --- a/src/modalities/models/gpt2/gpt2_model.py +++ b/src/modalities/models/gpt2/gpt2_model.py @@ -1122,7 +1122,7 @@ def __init__( ) # https://paperswithcode.com/method/weight-tying @overload - def forward(self, inputs: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + def forward(self, inputs: dict[str, torch.Tensor]) -> dict[str, torch.Tensor | dict]: """ Forward pass of the GPT2LLM module. @@ -1131,8 +1131,8 @@ def forward(self, inputs: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: - sample_key (str): Key for the input tensor containing token ids. Returns: - dict[str, torch.Tensor]: A dictionary containing output tensors. - - prediction_key (str): Key for the output tensor containing logits. + dict[str, torch.Tensor | dict]: A dictionary containing output tensors and metrics. + - prediction_key (str): Key for the output containing logits and metrics dict. """ ... @@ -1149,7 +1149,7 @@ def forward(self, inputs: torch.Tensor) -> torch.Tensor: """ ... 
- def forward(self, inputs: dict[str, torch.Tensor] | torch.Tensor) -> dict[str, torch.Tensor] | torch.Tensor: + def forward(self, inputs: dict[str, torch.Tensor] | torch.Tensor) -> dict[str, torch.Tensor | dict] | torch.Tensor: """ Forward pass of the GPT2LLM module. @@ -1157,14 +1157,19 @@ def forward(self, inputs: dict[str, torch.Tensor] | torch.Tensor) -> dict[str, t inputs (dict[str, torch.Tensor] | torch.Tensor): Input data. Returns: - dict[str, torch.Tensor] | torch.Tensor: Model output. + dict[str, torch.Tensor | dict] | torch.Tensor: Model output. """ if isinstance(inputs, dict): - return {self.prediction_key: self.forward_impl(inputs[self.sample_key])} + logits, metrics = self.forward_impl(inputs[self.sample_key]) + return { + self.prediction_key: logits, + "metrics": metrics, + } else: - return self.forward_impl(inputs) + logits, _ = self.forward_impl(inputs) + return logits - def forward_impl(self, inputs: torch.Tensor) -> torch.Tensor: + def forward_impl(self, inputs: torch.Tensor) -> tuple[torch.Tensor, dict]: """ Forward pass implementation of the GPT2LLM module. @@ -1172,7 +1177,7 @@ def forward_impl(self, inputs: torch.Tensor) -> torch.Tensor: inputs (torch.Tensor): A tensor containing input token ids. Returns: - torch.Tensor: A tensor containing output logits. + tuple[torch.Tensor, dict]: A tuple containing output logits and custom metrics. """ device = inputs.device seq_len = inputs.size(1) @@ -1192,11 +1197,31 @@ def forward_impl(self, inputs: torch.Tensor) -> torch.Tensor: # TODO: use drop out also without absolute position embedding? 
h = self.transformer.drop(h) if hasattr(self.transformer, "drop") else h + layer_norms = [] for layer_idx in self.transformer.h: h = self.transformer.h[layer_idx](h) + layer_norms.append(h.detach().norm(dim=-1).mean()) + h = self.transformer.lm_head_norm(h) if hasattr(self.transformer, "lm_head_norm") else h - h = self.transformer.lm_head(h) if hasattr(self.transformer, "lm_head") else h - return h + logits = self.transformer.lm_head(h) if hasattr(self.transformer, "lm_head") else h + + with torch.no_grad(): + log_p = torch.log_softmax(logits, dim=-1) + p = torch.exp(log_p) + entropy = -torch.sum(p * log_p, dim=-1).mean() + layer_norms_tensor = torch.stack(layer_norms) + + metrics = { + "scalars": { + "logits_entropy": entropy + }, + "per_layer_scalars": { + "layer_activation_norm": layer_norms_tensor + } + } + + return logits, metrics + def manual_scaled_dot_product_attention( diff --git a/src/modalities/registry/components.py b/src/modalities/registry/components.py index 26df9b432..3cab47143 100644 --- a/src/modalities/registry/components.py +++ b/src/modalities/registry/components.py @@ -33,7 +33,9 @@ DCPCheckpointSavingConfig, DebuggingEnrichedModelConfig, DistributedSamplerConfig, + DownstreamEvaluatorConfig, DummyLRSchedulerConfig, + ModelConverterConfig, DummyProgressSubscriberConfig, DummyResultSubscriberConfig, EvaluationResultToDiscSubscriberConfig, @@ -82,6 +84,8 @@ ProgressSubscriberFactory, ResultsSubscriberFactory, ) +from modalities.evaluator import DownstreamEvaluator +from modalities.conversion.model_converter import ModelConverter from modalities.loss_functions import CLMCrossEntropyLoss from modalities.models.coca.coca_model import CoCa, CoCaConfig from modalities.models.coca.collator import CoCaCollateFnConfig, CoCaCollatorFn @@ -528,4 +532,6 @@ class ComponentEntity: maybe_model_list(HookRegistration.register_print_forward_hooks), PrintForwardHookConfig, ), + ComponentEntity("downstream_evaluator", "default", DownstreamEvaluator, 
DownstreamEvaluatorConfig), + ComponentEntity("model_converter", "default", ModelConverter, ModelConverterConfig), ] diff --git a/src/modalities/trainer.py b/src/modalities/trainer.py index 4ad54b226..1256707c4 100644 --- a/src/modalities/trainer.py +++ b/src/modalities/trainer.py @@ -51,6 +51,10 @@ class ThroughputAggregationKeys(Enum): FORWARD_BACKWARD_TIME = "FORWARD_BACKWARD_TIME" + +from modalities.training.logging import MetricsAccumulator, format_metrics + + class Trainer: def __init__( self, @@ -150,7 +154,7 @@ def _train_batch( operate the model. Defaults to None. Returns: - tuple[bool, int, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + tuple[bool, int, torch.Tensor, Optional[torch.Tensor]]: A tuple containing the following: - step_performed (bool): Indicates whether a training step was performed. - num_train_steps_done (int): The number of training steps done. @@ -207,6 +211,8 @@ def train( evaluation_callback: Callable[[int], None], checkpointing_callback: Callable[[TrainingProgress], None], scheduled_pipeline: Pipeline | None = None, + conversion_callback: Optional[Callable[[int], None]] = None, + downstream_evaluation_callback: Optional[Callable[[int], None]] = None, ): """ Trains the model. @@ -220,6 +226,8 @@ def train( checkpointing_callback (Callable[[TrainingProgress], None]): A callback function for checkpointing. scheduled_pipeline (Pipeline | None, optional): In case of pipeline parallelism, this is used to operate the model. Defaults to None. + conversion_callback (Optional[Callable[[int], None]]): A callback function for checkpoint-to-HF conversion. + downstream_evaluation_callback (Optional[Callable[[int], None]]): A callback function for downstream evaluation. 
Returns: None @@ -234,6 +242,7 @@ def train( local_num_seen_samples = 0 cumulated_losses = torch.zeros(3).cuda() + metrics_accum = MetricsAccumulator() # throughput device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -257,6 +266,10 @@ def train( num_target_tokens=self.num_target_tokens, ) checkpointing_callback(training_progress=training_progress) + if conversion_callback is not None: + conversion_callback(num_train_steps_done=self.num_seen_train_steps) + if downstream_evaluation_callback is not None: + downstream_evaluation_callback(num_train_steps_done=self.num_seen_train_steps) num_steps_todo = self.num_target_steps - self.num_seen_train_steps num_batches_todo = num_steps_todo * self.gradient_acc_steps @@ -291,6 +304,9 @@ def train( # it has less samples than the batch size cumulated_losses[-1] += 1 # number of local batches + if hasattr(loss_fun, "get_metrics"): + metrics_accum.accumulate(loss_fun.get_metrics()) + # gradient norm is already synced across all ranks if gradient_norm_score is not None: gradient_norm_scores.append(gradient_norm_score.item()) @@ -336,17 +352,52 @@ def train( reduced_losses[0], reduced_losses[1], ) + + adaptive_losses = {} + adaptive_metrics = {} + if metrics_accum.count > 0: + ( + sync_tensor, scalar_names, per_layer_names, per_layer_sizes, + hist_names, hist_shapes, + ) = metrics_accum.build_sync_tensor(device) + + reduce_scale = dist.get_world_size() / self.pp_degree + synced_tensor = Reducer.reduce( + tensor=sync_tensor, + operation=dist.ReduceOp.SUM, + post_processing_fun=lambda t: t / reduce_scale, + ) + + ( + synced_loss, synced_scalars, synced_per_layer, synced_hists, + ) = MetricsAccumulator.unpack_synced_tensor( + synced_tensor, scalar_names, per_layer_names, per_layer_sizes, + hist_names, hist_shapes, + ) + + adaptive_losses, adaptive_metrics = format_metrics( + loss=synced_loss, + scalars=synced_scalars, + per_layer_scalars=synced_per_layer, + per_layer_vectors=metrics_accum.last_per_layer_vectors, + 
class MetricsAccumulator:
    """Accumulate per-batch loss metrics and pack their means into one flat tensor.

    Each call to :meth:`accumulate` adds one batch. :meth:`build_sync_tensor`
    turns the running sums into per-batch means laid out as a single 1-D
    float32 tensor (so one collective call can reduce everything), and
    :meth:`unpack_synced_tensor` splits such a tensor back into named parts.

    All stored tensors are detached from autograd: the accumulator is a
    logging utility and must not keep computation graphs alive between batches.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all running sums, the cached vectors, and the batch counter."""
        self.loss_sum: float = 0.0
        self.scalar_sums: dict[str, float] = {}
        self.per_layer_scalar_sums: dict[str, torch.Tensor] = {}
        # Vectors are not summed; only the most recent batch's value is kept.
        self.last_per_layer_vectors: dict[str, torch.Tensor] = {}
        self.per_layer_hist_sums: dict[str, torch.Tensor] = {}
        self.count: int = 0

    @staticmethod
    def _add_to_sums(sums: dict[str, torch.Tensor], tensors: dict[str, torch.Tensor]) -> None:
        """Add each tensor (detached, as float32) onto its running sum in ``sums``."""
        for name, tensor in tensors.items():
            # detach() keeps the sum out of the autograd graph of the batch.
            value = tensor.detach().float()
            if name not in sums:
                sums[name] = torch.zeros_like(value)
            sums[name] += value

    def accumulate(self, loss_metrics: dict):
        """Add the metrics of one batch.

        Args:
            loss_metrics: Dict with the batch loss under ``"ce_loss"`` (preferred)
                or ``"loss"``, and optionally a ``"metrics"`` dict that may hold
                the groups ``"scalars"``, ``"per_layer_scalars"``,
                ``"per_layer_vectors"`` and ``"per_layer_histograms"``, each
                mapping a metric name to a tensor.

        The batch counter is incremented even when no loss key is present; such
        a batch contributes 0 to the loss sum.
        """
        if "ce_loss" in loss_metrics:
            self.loss_sum += loss_metrics["ce_loss"].item()
        elif "loss" in loss_metrics:
            self.loss_sum += loss_metrics["loss"].item()
        self.count += 1

        bag = loss_metrics.get("metrics")
        if bag is None:
            return

        for name, tensor in bag.get("scalars", {}).items():
            self.scalar_sums[name] = self.scalar_sums.get(name, 0.0) + tensor.item()

        self._add_to_sums(self.per_layer_scalar_sums, bag.get("per_layer_scalars", {}))

        for name, tensor in bag.get("per_layer_vectors", {}).items():
            self.last_per_layer_vectors[name] = tensor.detach()

        self._add_to_sums(self.per_layer_hist_sums, bag.get("per_layer_histograms", {}))

    def build_sync_tensor(
        self, device: torch.device
    ) -> tuple[
        torch.Tensor,
        list[str],
        list[str],
        dict[str, int],
        list[str],
        dict[str, tuple],
    ]:
        """Pack the per-batch means into one flat float32 tensor on ``device``.

        Layout: ``[loss, scalars (sorted by name), per-layer scalars (sorted by
        name, flattened), histograms (sorted by name, flattened)]``.

        Returns:
            ``(tensor, scalar_names, per_layer_names, per_layer_sizes,
            hist_names, hist_shapes)`` -- the packed tensor plus the metadata
            :meth:`unpack_synced_tensor` needs to split it again. With no
            accumulated batch, a single zero and empty metadata are returned.
        """
        if self.count == 0:
            return torch.zeros(1, device=device), [], [], {}, [], {}

        n = self.count
        scalar_names = sorted(self.scalar_sums)
        header = [self.loss_sum / n]
        header.extend(self.scalar_sums[name] / n for name in scalar_names)
        pieces = [torch.tensor(header, device=device, dtype=torch.float32)]

        per_layer_names = sorted(self.per_layer_scalar_sums)
        per_layer_sizes: dict[str, int] = {}
        for name in per_layer_names:
            mean = self.per_layer_scalar_sums[name] / n
            per_layer_sizes[name] = mean.numel()
            # Flatten so 0-d and N-d entries concatenate and match numel().
            pieces.append(mean.to(device).flatten())

        hist_names = sorted(self.per_layer_hist_sums)
        hist_shapes: dict[str, tuple] = {}
        for name in hist_names:
            mean = self.per_layer_hist_sums[name] / n
            hist_shapes[name] = tuple(mean.shape)
            pieces.append(mean.to(device).flatten())

        combined = torch.cat(pieces)
        return combined, scalar_names, per_layer_names, per_layer_sizes, hist_names, hist_shapes

    @staticmethod
    def unpack_synced_tensor(
        synced: torch.Tensor,
        scalar_names: list[str],
        per_layer_names: list[str],
        per_layer_sizes: dict[str, int],
        hist_names: Optional[list[str]] = None,
        hist_shapes: Optional[dict[str, tuple]] = None,
    ) -> tuple[
        torch.Tensor,
        dict[str, torch.Tensor],
        dict[str, torch.Tensor],
        dict[str, torch.Tensor],
    ]:
        """Split a tensor laid out by :meth:`build_sync_tensor` into named parts.

        Args:
            synced: Flat tensor in the layout produced by ``build_sync_tensor``.
            scalar_names: Names of the scalar entries, in packed order.
            per_layer_names: Names of the per-layer scalar entries, in packed order.
            per_layer_sizes: Number of elements of each per-layer scalar entry.
            hist_names: Names of the histogram entries, in packed order.
            hist_shapes: Original shape of each histogram entry.

        Returns:
            ``(loss, scalars, per_layer_scalars, per_layer_histograms)``. Per-layer
            scalars come back 1-D; histograms are reshaped to their recorded shape.
        """
        hist_names = hist_names or []
        hist_shapes = hist_shapes or {}

        loss = synced[0]
        idx = 1

        scalars = {}
        for name in scalar_names:
            scalars[name] = synced[idx]
            idx += 1

        per_layer_scalars = {}
        for name in per_layer_names:
            size = per_layer_sizes[name]
            per_layer_scalars[name] = synced[idx : idx + size]
            idx += size

        per_layer_histograms = {}
        for name in hist_names:
            shape = hist_shapes[name]
            size = 1
            for dim in shape:
                size *= dim
            per_layer_histograms[name] = synced[idx : idx + size].reshape(shape)
            idx += size

        return loss, scalars, per_layer_scalars, per_layer_histograms
def format_metrics(
    loss: torch.Tensor,
    scalars: dict[str, torch.Tensor],
    per_layer_scalars: dict[str, torch.Tensor],
    per_layer_vectors: dict[str, torch.Tensor],
    summary_only: bool = False,
    per_layer_histograms: Optional[dict[str, torch.Tensor]] = None,
) -> tuple[dict[str, ResultItem], dict[str, ResultItem]]:
    """Turn averaged metrics into named ``ResultItem`` entries for logging.

    Args:
        loss: Averaged loss, reported as ``loss/ce_avg``.
        scalars: Global scalars, reported as ``adaptive/<name>``.
        per_layer_scalars: One value per layer; reported as ``summary/<name>``
            (mean) and, unless ``summary_only``, ``layer_<i>/<name>``.
        per_layer_vectors: Tensors unpacked as ``(n_layers, n_loops)``; reported
            as ``summary/<name>`` and ``loop_<j>/<name>``, plus per-layer
            ``layer_<i>/avg_<name>`` and ``layer_<i>/<name>_<j>`` unless
            ``summary_only``. Empty tensors are skipped.
        summary_only: When true, omit all per-layer entries.
        per_layer_histograms: Tensors unpacked as ``(n_layers, n_bins)``;
            reported as ``hist/<name>/bin_<b>`` (mean over layers) and, unless
            ``summary_only``, ``hist/<name>/layer_<i>/bin_<b>``. Empty tensors
            are skipped.

    Returns:
        ``(losses, metrics)`` dictionaries of ``ResultItem`` values.
    """
    detailed = not summary_only
    histograms = per_layer_histograms or {}

    losses = {"loss/ce_avg": ResultItem(loss, decimal_places=2)}

    metrics: dict[str, ResultItem] = {}
    for key, value in scalars.items():
        metrics[f"adaptive/{key}"] = ResultItem(value, 4)

    for key, layer_values in per_layer_scalars.items():
        metrics[f"summary/{key}"] = ResultItem(layer_values.mean(), 4)
        if detailed:
            for layer, value in enumerate(layer_values):
                metrics[f"layer_{layer}/{key}"] = ResultItem(value, 4)

    for key, raw in per_layer_vectors.items():
        if raw.numel() == 0:
            continue
        table = raw.float().cpu()
        layer_count, loop_count = table.shape

        metrics[f"summary/{key}"] = ResultItem(table.mean(), 4)

        if detailed:
            for layer in range(layer_count):
                row = table[layer]
                metrics[f"layer_{layer}/avg_{key}"] = ResultItem(row.mean(), 4)
                for loop in range(loop_count):
                    metrics[f"layer_{layer}/{key}_{loop}"] = ResultItem(table[layer, loop], 4)

        for loop in range(loop_count):
            metrics[f"loop_{loop}/{key}"] = ResultItem(table[:, loop].mean(), 4)

    for key, raw in histograms.items():
        if raw.numel() == 0:
            continue
        table = raw.float().cpu()
        layer_count, bin_count = table.shape

        # Per-bin mean across layers is always reported.
        for bin_idx in range(bin_count):
            metrics[f"hist/{key}/bin_{bin_idx}"] = ResultItem(table[:, bin_idx].mean(), 4)

        if detailed:
            for layer in range(layer_count):
                for bin_idx in range(bin_count):
                    metrics[f"hist/{key}/layer_{layer}/bin_{bin_idx}"] = ResultItem(table[layer, bin_idx], 4)

    return losses, metrics
def test_metrics_accumulator_accumulation_and_sync():
    """Two accumulated batches are averaged, packed and unpacked consistently."""
    first_batch = {
        "ce_loss": torch.tensor(2.0),
        "metrics": {
            "scalars": {"p_weight": torch.tensor(0.5)},
            "per_layer_scalars": {"cost": torch.tensor([1.0, 2.0])},
            "per_layer_vectors": {"vec": torch.tensor([[0.1, 0.2], [0.3, 0.4]])},
            "per_layer_histograms": {"hist": torch.tensor([[0.5, 0.5], [0.8, 0.2]])},
        },
    }
    second_batch = {
        "ce_loss": torch.tensor(4.0),
        "metrics": {
            "scalars": {"p_weight": torch.tensor(1.5)},
            "per_layer_scalars": {"cost": torch.tensor([3.0, 4.0])},
            # Vectors are last-batch-only in trainer design
            "per_layer_vectors": {"vec": torch.tensor([[1.1, 1.2], [1.3, 1.4]])},
            "per_layer_histograms": {"hist": torch.tensor([[0.3, 0.7], [0.6, 0.4]])},
        },
    }

    accum = MetricsAccumulator()
    accum.accumulate(first_batch)
    accum.accumulate(second_batch)
    assert accum.count == 2

    # Pack the means, then split them again with the returned metadata.
    packed = accum.build_sync_tensor(torch.device("cpu"))
    loss, scalars, pl_scalars, pl_hist = MetricsAccumulator.unpack_synced_tensor(*packed)

    # Expected values are the element-wise means of the two batches:
    #   ce_loss = 3.0, p_weight = 1.0, cost = [2.0, 3.0],
    #   hist = [[0.4, 0.6], [0.7, 0.3]]
    expected_hist = torch.tensor([[0.4, 0.6], [0.7, 0.3]])
    expected_last_vec = torch.tensor([[1.1, 1.2], [1.3, 1.4]])

    assert torch.allclose(loss, torch.tensor(3.0))
    assert torch.allclose(scalars["p_weight"], torch.tensor(1.0))
    assert torch.allclose(pl_scalars["cost"], torch.tensor([2.0, 3.0]))
    assert torch.allclose(pl_hist["hist"], expected_hist)
    assert torch.allclose(accum.last_per_layer_vectors["vec"], expected_last_vec)
def test_format_metrics():
    """format_metrics emits summary keys always and per-layer keys on request."""
    inputs = {
        "loss": torch.tensor(3.0),
        "scalars": {"p_weight": torch.tensor(1.0)},
        "per_layer_scalars": {"cost": torch.tensor([2.0, 3.0])},
        "per_layer_vectors": {"vec": torch.tensor([[1.1, 1.2], [1.3, 1.4]])},
        "per_layer_histograms": {"hist": torch.tensor([[0.4, 0.6], [0.7, 0.3]])},
    }

    def value_of(items, key):
        return items[key].value.item()

    # Test summary_only = False
    losses, metrics = format_metrics(summary_only=False, **inputs)

    assert value_of(losses, "loss/ce_avg") == pytest.approx(3.0)

    expected_detailed = {
        "adaptive/p_weight": 1.0,
        "summary/cost": 2.5,
        "layer_0/cost": 2.0,
        "layer_1/cost": 3.0,
        # Vectors
        "summary/vec": 1.25,
        "layer_0/vec_0": 1.1,
        "layer_0/vec_1": 1.2,
        "layer_1/vec_0": 1.3,
        "layer_1/vec_1": 1.4,
        # Histograms
        "hist/hist/bin_0": 0.55,
        "hist/hist/bin_1": 0.45,
        "hist/hist/layer_0/bin_0": 0.4,
        "hist/hist/layer_1/bin_1": 0.3,
    }
    for key, expected in expected_detailed.items():
        assert value_of(metrics, key) == pytest.approx(expected)

    # Test summary_only = True
    losses, metrics = format_metrics(summary_only=True, **inputs)

    for absent_key in ("layer_0/cost", "layer_0/vec_0", "hist/hist/layer_0/bin_0"):
        assert absent_key not in metrics

    expected_summary = {
        "summary/cost": 2.5,
        "summary/vec": 1.25,
        "hist/hist/bin_0": 0.55,
    }
    for key, expected in expected_summary.items():
        assert value_of(metrics, key) == pytest.approx(expected)
# ---------- helpers ----------

class MockTokenizer(TokenizerWrapper):
    """Inert tokenizer stand-in: every method returns an empty or zero value."""

    def tokenize(self, text: str) -> list[int]:
        return []

    def decode(self, input_ids: list[int]) -> str:
        return ""

    @property
    def vocab_size(self) -> int:
        return 0

    def get_token_id(self, token: str) -> int:
        return 0

    def is_special_token_id(self, token_id: int) -> bool:
        return False


def _build_converter(checkpoint_dir=Path("/tmp/fake"), global_rank=0, eval_interval=5):
    """Create a ModelConverter that uses the echo command template of these tests."""
    return ModelConverter(
        command_template="echo {checkpoint_path} {output_dir}",
        checkpoint_dir=checkpoint_dir,
        global_rank=global_rank,
        eval_interval=eval_interval,
    )


def _build_evaluator(
    checkpoint_dir=Path("/tmp/fake"),
    global_rank=0,
    eval_interval=5,
    tasks=("arc_challenge::olmes",),
    olmes_command_template="echo {hf_model_dir} {tasks} {step}",
):
    """Create a DownstreamEvaluator backed by MockTokenizer."""
    return DownstreamEvaluator(
        tokenizer=MockTokenizer(),
        tasks=list(tasks),
        eval_interval=eval_interval,
        checkpoint_dir=checkpoint_dir,
        global_rank=global_rank,
        olmes_command_template=olmes_command_template,
    )


def _make_checkpoint(checkpoint_dir: Path, step: int, with_hf_checkpoint: bool) -> Path:
    """Create ``step_<step>`` under checkpoint_dir and point last_checkpoint_info.json at it."""
    ckpt_path = checkpoint_dir / f"step_{step}"
    ckpt_path.mkdir()
    if with_hf_checkpoint:
        (ckpt_path / "hf_checkpoint").mkdir()
    info_file = checkpoint_dir / "last_checkpoint_info.json"
    info_file.write_text(json.dumps({"checkpoint_folder_path": str(ckpt_path)}))
    return ckpt_path


# ---------- ModelConverter tests ----------

def test_model_converter_skips_non_matching_step():
    converter = _build_converter()
    with patch("subprocess.run") as run_mock:
        converter.convert(num_train_steps_done=3)
    run_mock.assert_not_called()


def test_model_converter_skips_step_zero():
    converter = _build_converter()
    with patch("subprocess.run") as run_mock:
        converter.convert(num_train_steps_done=0)
    run_mock.assert_not_called()


def test_model_converter_skips_non_rank_zero():
    converter = _build_converter(global_rank=1)
    with patch("subprocess.run") as run_mock:
        converter.convert(num_train_steps_done=5)
    run_mock.assert_not_called()


def test_model_converter_runs_command_on_matching_step():
    with tempfile.TemporaryDirectory() as tmpdir:
        root = Path(tmpdir)
        ckpt_path = _make_checkpoint(root, step=5, with_hf_checkpoint=False)
        converter = _build_converter(checkpoint_dir=root)

        with patch("subprocess.run") as run_mock:
            converter.convert(num_train_steps_done=5)

        run_mock.assert_called_once()
        cmd = run_mock.call_args[0][0]
        assert str(ckpt_path) in cmd
        assert "hf_checkpoint" in cmd


def test_model_converter_skips_if_hf_checkpoint_exists():
    with tempfile.TemporaryDirectory() as tmpdir:
        root = Path(tmpdir)
        _make_checkpoint(root, step=5, with_hf_checkpoint=True)
        converter = _build_converter(checkpoint_dir=root)

        with patch("subprocess.run") as run_mock:
            converter.convert(num_train_steps_done=5)

        run_mock.assert_not_called()


# ---------- DownstreamEvaluator tests ----------

def test_downstream_evaluator_skips_non_matching_step():
    evaluator = _build_evaluator()
    with patch("subprocess.Popen") as popen_mock:
        evaluator.evaluate(num_train_steps_done=3)
    popen_mock.assert_not_called()


def test_downstream_evaluator_skips_non_rank_zero():
    evaluator = _build_evaluator(global_rank=1)
    with patch("subprocess.Popen") as popen_mock:
        evaluator.evaluate(num_train_steps_done=5)
    popen_mock.assert_not_called()


def test_downstream_evaluator_runs_when_hf_checkpoint_exists():
    with tempfile.TemporaryDirectory() as tmpdir:
        root = Path(tmpdir)
        ckpt_path = _make_checkpoint(root, step=10, with_hf_checkpoint=True)
        hf_dir = ckpt_path / "hf_checkpoint"

        evaluator = _build_evaluator(
            checkpoint_dir=root,
            eval_interval=10,
            tasks=("arc_challenge::olmes", "hellaswag::olmes"),
            olmes_command_template="olmes --model {hf_model_dir} --tasks {tasks} --step {step}",
        )

        with patch("subprocess.Popen") as popen_mock:
            evaluator.evaluate(num_train_steps_done=10)

        popen_mock.assert_called_once()
        cmd = popen_mock.call_args[0][0]
        assert str(hf_dir) in cmd
        assert "arc_challenge::olmes,hellaswag::olmes" in cmd
        assert "10" in cmd


def test_downstream_evaluator_skips_when_no_hf_checkpoint():
    with tempfile.TemporaryDirectory() as tmpdir:
        root = Path(tmpdir)
        # No hf_checkpoint folder
        _make_checkpoint(root, step=10, with_hf_checkpoint=False)
        evaluator = _build_evaluator(checkpoint_dir=root, eval_interval=10)

        with patch("subprocess.Popen") as popen_mock:
            evaluator.evaluate(num_train_steps_done=10)

        popen_mock.assert_not_called()


# ---------- Factory instantiation tests ----------

def test_downstream_evaluator_factory_instantiation():
    from modalities.config.config import DownstreamEvaluatorConfig
    from modalities.config.pydantic_if_types import PydanticDownstreamEvaluatorType
    from modalities.registry.components import COMPONENTS

    class TrainingModel(BaseModel):
        downstream_eval: PydanticDownstreamEvaluatorType

    tokenizer_mock = MockTokenizer()
    evaluator_config = {
        "tokenizer": tokenizer_mock,
        "tasks": ["task_a"],
        "eval_interval": 10,
        "checkpoint_dir": "/tmp/test_checkpoints",
        "global_rank": 0,
        "olmes_command_template": "echo {hf_model_dir}",
    }
    config_dict = {
        "downstream_eval": {
            "component_key": "downstream_evaluator",
            "variant_key": "default",
            "config": evaluator_config,
        }
    }

    component_factory = ComponentFactory(registry=Registry(COMPONENTS))
    components = component_factory.build_components(
        config_dict=config_dict,
        components_model_type=TrainingModel,
    )

    built = components.downstream_eval
    assert isinstance(built, DownstreamEvaluator)
    assert built.tokenizer == tokenizer_mock
    assert built.tasks == ["task_a"]
    assert built.eval_interval == 10


def test_model_converter_factory_instantiation():
    from modalities.config.config import ModelConverterConfig
    from modalities.config.pydantic_if_types import PydanticModelConverterType
    from modalities.registry.components import COMPONENTS

    class ConverterModel(BaseModel):
        converter: PydanticModelConverterType

    converter_config = {
        "command_template": "echo {checkpoint_path} {output_dir}",
        "checkpoint_dir": "/tmp/test_checkpoints",
        "global_rank": 0,
        "eval_interval": 100,
    }
    config_dict = {
        "converter": {
            "component_key": "model_converter",
            "variant_key": "default",
            "config": converter_config,
        }
    }

    component_factory = ComponentFactory(registry=Registry(COMPONENTS))
    components = component_factory.build_components(
        config_dict=config_dict,
        components_model_type=ConverterModel,
    )

    built = components.converter
    assert isinstance(built, ModelConverter)
    assert built.eval_interval == 100