subhankarg's picture
Upload folder using huggingface_hub
0558aa4 verified
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional
import nemo_run as run
from nemo.collections import llm
def local_executor_torchrun(devices: int = 2) -> run.LocalExecutor:
    """Build a local executor that launches its tasks through ``torchrun``.

    Args:
        devices: Number of tasks to start on the local node; forwarded to the
            executor as ``ntasks_per_node``. Defaults to 2.

    Returns:
        A ``run.LocalExecutor`` using the ``torchrun`` launcher and the
        environment variables defined below.
    """
    env_vars = {
        # "1" switches HuggingFace Transformers to offline mode: only files
        # already present in the local cache are used, nothing is downloaded.
        "TRANSFORMERS_OFFLINE": "1",
        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",  # Disable caching NCCL communication buffer memory
        "NCCL_NVLS_ENABLE": "0",  # Disable NVLink SHARP to save memory
    }
    return run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)
def slurm_executor(
    user: str,
    host: str,
    remote_job_dir: str,
    account: str,
    partition: str,
    nodes: int,
    devices: int,
    time: str = "01:00:00",
    custom_mounts: Optional[list[str]] = None,
    custom_env_vars: Optional[dict[str, str]] = None,
    container_image: str = "nvcr.io/nvidia/nemo:dev",
    retries: int = 0,
) -> run.SlurmExecutor:
    """Build a Slurm executor that submits jobs over an SSH tunnel.

    Args:
        user: Login name passed to ``run.SSHTunnel``.
        host: Host passed to ``run.SSHTunnel``.
        remote_job_dir: Directory passed to ``run.SSHTunnel`` as ``job_dir``.
        account: Slurm account forwarded to ``run.SlurmExecutor``.
        partition: Slurm partition forwarded to ``run.SlurmExecutor``.
        nodes: Number of nodes to request.
        devices: Used for both ``ntasks_per_node`` and ``gpus_per_node``.
        time: Value assigned to ``executor.time`` (presumably a Slurm
            ``HH:MM:SS`` wall-time limit -- confirm against NeMo-Run docs).
        custom_mounts: Optional container mounts; copied into the executor's
            ``container_mounts`` list.
        custom_env_vars: Optional environment variables merged over the
            defaults below; on a key collision the custom value wins.
        container_image: Container image assigned to the executor.
        retries: Value assigned to ``executor.retries``.

    Returns:
        The configured ``run.SlurmExecutor``.

    Raises:
        RuntimeError: If any of ``user``, ``host``, ``remote_job_dir``,
            ``account``, ``partition``, ``nodes`` or ``devices`` is falsy
            (empty string, ``0`` or ``None``).
    """
    if not (user and host and remote_job_dir and account and partition and nodes and devices):
        raise RuntimeError(
            "Please set user, host, remote_job_dir, account, partition, nodes and devices args for using this function."
        )

    # Work on a fresh list so the caller's list is never shared with the executor.
    mounts = list(custom_mounts) if custom_mounts else []

    env_vars = {
        # "1" switches HuggingFace Transformers to offline mode: only files
        # already present in the local cache are used, nothing is downloaded.
        "TRANSFORMERS_OFFLINE": "1",
        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",  # Disable caching NCCL communication buffer memory
        "NCCL_NVLS_ENABLE": "0",  # Disable NVLink SHARP to save memory
    }
    if custom_env_vars:
        # Caller-supplied values take precedence over the defaults above.
        env_vars |= custom_env_vars

    executor = run.SlurmExecutor(
        account=account,
        partition=partition,
        tunnel=run.SSHTunnel(
            user=user,
            host=host,
            job_dir=remote_job_dir,
        ),
        nodes=nodes,
        ntasks_per_node=devices,
        gpus_per_node=devices,
        mem="0",
        exclusive=True,
        # NOTE(review): gres is fixed at 8 GPUs regardless of `devices` --
        # confirm this matches the nodes of the target cluster.
        gres="gpu:8",
        packager=run.GitArchivePackager(subpath="examples/llm/run"),
    )

    # The remaining settings are applied as attributes after construction.
    executor.container_image = container_image
    executor.container_mounts = mounts
    executor.env_vars = env_vars
    executor.retries = retries
    executor.time = time

    return executor
def my_slurm_executor():
    """Return a Slurm executor built from the cluster settings filled in below.

    The string fields ship as empty placeholders; ``slurm_executor`` raises
    ``RuntimeError`` for as long as any of them is left empty.
    """
    # TODO: Set your custom parameters for the Slurm Executor.
    cluster_settings = {
        "user": "",
        "host": "",
        "remote_job_dir": "",
        "account": "",
        "partition": "",
        "nodes": 1,
        "devices": 2,
    }
    return slurm_executor(**cluster_settings)
if __name__ == "__main__":
    # Re-expose the pretrain entrypoint with the custom local executor as its default.
    #
    # Run a recipe, e.g. llama3_8b:
    #   python default_executor.py --factory llama3_8b
    # Run with overrides:
    #   python default_executor.py --factory llama3_8b trainer.max_steps=2000
    # Run with your custom Slurm executor:
    #   python default_executor.py --executor my_slurm_executor --factory llama3_8b
    run.cli.main(llm.pretrain, default_executor=local_executor_torchrun)