{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Trigger a run from a notebook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.\n",
    "#\n",
    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
    "# you may not use this file except in compliance with the License.\n",
    "# You may obtain a copy of the License at\n",
    "#\n",
    "# http://www.apache.org/licenses/LICENSE-2.0\n",
    "#\n",
    "# Unless required by applicable law or agreed to in writing, software\n",
    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
    "# See the License for the specific language governing permissions and\n",
    "# limitations under the License.\n",
    "\n",
    "import nemo_run as run\n",
    "from nemo.collections import llm\n",
    "from nemo.collections.llm.recipes import llama3_8b\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pretrain = llama3_8b.pretrain_recipe(num_nodes=1, num_gpus_per_node=8)\n",
    "\n",
    "pretrain"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}