# Top-level run settings: dataset location plus evaluation controls.
# Dataset location, given as an S3 URI.
dataset: s3://scale-ml/users/niklas/swe-agent/train/qwen-2000imitation-student/with_pr/masked/961dagger-2000imitation
debug: false
eval_at_step_zero: null
eval_case_report: false
eval_case_report_only_splits: []
eval_mode: false
# NOTE(review): fractional value paired with `eval_strategy: steps`;
# presumably a fraction of the total training steps rather than an absolute
# step count — confirm against the trainer.
eval_steps: 0.2
eval_strategy: steps
# Experimental feature toggles.
experimental:
  # Also set (to the same value) under model_squad.lm.
  activation_checkpointing: true
  enable_context_parallel: false
  mask_input_ids_by_flag: true
  pad_to_max_length: false
  # pipeline_parallel_size is 1 here.
  # NOTE(review): the remaining pipeline_parallel_* / pp_* keys presumably
  # have no effect at size 1 — TODO confirm.
  pipeline_parallel_reshard: false
  pipeline_parallel_schedule: gpipe
  pipeline_parallel_size: 1
  pp_last_stage_offset: 0
  pp_share_train_eval_schedule: true
  # Also set (to the same value) under model_squad.lm.
  torch_compile: false
# Optimizer, schedule, and batching hyperparameters.
hyperparams:
  adam_beta1: 0.9
  adam_beta2: 0.999
  # Keep the explicit `1.0e-08` spelling (mantissa with a dot): some
  # YAML 1.1 loaders read dot-less forms such as `1e-8` as strings.
  adam_epsilon: 1.0e-08
  # NOTE(review): null here while `weight_decay` below is 0.01; which of the
  # two the optimizer actually uses cannot be told from this file — confirm.
  adam_weight_decay: null
  constant_pack: false
  eval_num_rollouts_per_prompt: 1
  gradient_accumulation_steps: 1
  # Same float-spelling caveat as adam_epsilon.
  learning_rate: 5.0e-05
  loss_form: null
  lr_scheduler_kwargs: null
  lr_scheduler_type: cosine
  mask_instruct: true
  max_grad_norm: 1.0
  max_length: 32768
  num_rollouts_per_prompt: 1
  num_train_epochs: 3
  # NOTE(review): -1 looks like a "not set" sentinel, presumably deferring to
  # num_train_epochs — confirm against the trainer.
  num_train_steps: -1
  online: false
  optimizer: adam
  per_device_eval_batch_size: 1
  per_device_micro_batch_size: 1
  per_device_train_batch_size: 1
  sleep_level: 2
  warmup_ratio: 0.05
  weight_decay: 0.01
# Local output directory and logging settings.
local_output_path: /mnt/nvme
logging_rollouts: 0
logging_steps: 1
# Model definitions; this file defines a single entry, `lm`.
#
# WARNING: the `parallel_state.world_mesh` subtree uses Python-specific tags
# (`!!python/object`, `!!python/object/apply`, `!!python/tuple`) and embeds a
# pickled payload. Safe YAML loaders reject these tags, so this file can only
# be loaded with a loader that constructs arbitrary Python objects. Load it
# only from trusted sources.
model_squad:
  lm:
    activation_checkpointing: true
    # Initial weights, given as an S3 URI.
    model_path: s3://scale-ml/users/niklas/models/smith-claude-expert/2000imitation-lr5e-5-batch16/checkpoints/checkpoint-375/
    # NOTE(review): this looks like a serialized runtime object captured at
    # dump time rather than hand-written configuration — confirm whether it
    # is read back or rebuilt on startup.
    parallel_state:
      cp_mesh: null
      cp_size: 1
      device_type: cuda
      dp_size: 16
      pp_size: 1
      # One-dimensional mesh named `dp` listing ranks 0-15.
      world_mesh: !!python/object:torch.distributed.device_mesh.DeviceMesh
        _coordinate_on_dim:
        - 0
        _dim_group_infos:
        - !!python/tuple
          - ptd:0
          - - 0
            - 1
            - 2
            - 3
            - 4
            - 5
            - 6
            - 7
            - 8
            - 9
            - 10
            - 11
            - 12
            - 13
            - 14
            - 15
          - '0'
        _flatten_mesh_list: !!python/tuple
        - 0
        - 1
        - 2
        - 3
        - 4
        - 5
        - 6
        - 7
        - 8
        - 9
        - 10
        - 11
        - 12
        - 13
        - 14
        - 15
        # NOTE(review): _hash and _thread_id look like process-specific state
        # from the dumping process — presumably not meaningful after reload.
        _hash: -8305722318908533129
        _thread_id: null
        device_type: cuda
        # Tensor rebuilt by calling torch._utils._rebuild_tensor_v2 with the
        # positional arguments listed below.
        mesh: !!python/object/apply:torch._utils._rebuild_tensor_v2
        # Argument 1: storage, reconstructed from the base64 bytes via
        # torch.storage._load_from_bytes.
        # NOTE(review): the payload appears to hold 16 int32 values 0..15,
        # consistent with the size (16,) below — confirm before hand-editing.
        - !!python/object/apply:torch.storage._load_from_bytes
          - !!binary |
            gAKKCmz8nEb5IGqoUBkugAJN6QMugAJ9cQAoWBAAAABwcm90b2NvbF92ZXJzaW9ucQFN6QNYDQAA
            AGxpdHRsZV9lbmRpYW5xAohYCgAAAHR5cGVfc2l6ZXNxA31xBChYBQAAAHNob3J0cQVLAlgDAAAA
            aW50cQZLBFgEAAAAbG9uZ3EHSwR1dS6AAihYBwAAAHN0b3JhZ2VxAGN0b3JjaApJbnRTdG9yYWdl
            CnEBWA8AAAAxMDI1MzIxMjY3NjY4MDBxAlgDAAAAY3B1cQNLEE50cQRRLoACXXEAWA8AAAAxMDI1
            MzIxMjY3NjY4MDBxAWEuEAAAAAAAAAAAAAAAAQAAAAIAAAADAAAABAAAAAUAAAAGAAAABwAAAAgA
            AAAJAAAACgAAAAsAAAAMAAAADQAAAA4AAAAPAAAA
        # Remaining arguments, presumably storage offset, size, stride,
        # requires_grad, and backward hooks — verify against the installed
        # torch version.
        - 0
        - !!python/tuple
          - 16
        - !!python/tuple
          - 1
        - false
        - !!python/object/apply:collections.OrderedDict
          - []
        mesh_dim_names: !!python/tuple
        - dp
      world_size: 16
    # torch_compile, use_fsdp2, and use_scale_llama are also set elsewhere in
    # this file with the same values.
    # NOTE(review): which copy takes precedence is not visible here.
    torch_compile: false
    use_fsdp2: true
    use_scale_llama: false
# Names a class and the module path it lives in; presumably imported
# dynamically by the trainer — confirm.
processing_interface:
  class_name: BaseProcessingInterface
  module_path: trainers.processing_interface
# Resume, checkpoint-saving, and task settings.
remote_object: {}
resume: false
# Output location, given as an S3 URI.
s3_output_path: s3://scale-ml/users/niklas/models/qwen-2000imitation-student/with_pr/masked/961dagger-2000imitation
save_at_step_zero: null
save_final_model: true
save_hf: true
save_lr_scheduler: true
save_optimizer: true
save_s3_async: true
# NOTE(review): `save_steps` is fractional (0.2) while `save_strategy` is
# `epoch` (the eval counterpart uses `steps`); whether save_steps is ignored
# under the epoch strategy is not visible here — confirm this is intended.
save_steps: 0.2
save_strategy: epoch
task: sft
use_device_mesh: true
# use_fsdp2 and use_scale_llama also appear under model_squad.lm with the
# same values.
use_fsdp2: true
use_scale_llama: false
# Weights & Biases run identification.
wandb:
  entity: gen-ai
  name: qwen2.5-961dagger-2000imitation-with_pr-masked-lr5e-5-batch16
  project: agent-rlxf
wandb_host: https://scaleai.wandb.io/
# Name of the API-key entry, not the secret value itself.
# NOTE(review): presumably looked up under wandb_secretsmanager_location —
# confirm.
wandb_key_name: NIKLAS_WANDB_API_KEY
wandb_secretsmanager_location: team/GENAIML/secret-store-key