# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
from dataclasses import dataclass
from functools import partial

import fiddle as fdl
import nemo_run as run

from nemo.collections import llm
from nemo.collections.llm.gpt.model.llama import Llama3Config, LlamaModel
from nemo.collections.llm.tools.auto_configurator import AutoConfigurator, generate_configs, get_results


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_type", type=str, choices=["llama", "bert", "t5"], help="Model type to run")
    parser.add_argument("--run_number", type=int, help="1-based index of the generated config to run")
    parser.add_argument("--log_dir", type=str, help="Path where training logs are saved")
    parser.add_argument("--get_results", action="store_true", help="Collect and save Auto Configurator results")
    parser.add_argument("--extra_metrics", action="store_true", help="Attach extra monitoring callbacks")
    return parser.parse_args()


@dataclass
class Llama3Config145M(Llama3Config):
    num_layers: int = 12
    hidden_size: int = 768
    num_attention_heads: int = 16
    num_query_groups: int = 8
    ffn_hidden_size: int = 2688
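
# Rough size check (an aside; inferred from the fields above plus the 32k vocab
# passed to AutoConfigurator below): 12 layers at roughly 8M parameters each
# (grouped-query attention plus a gated MLP with ffn_hidden_size=2688), together
# with untied 32000 x 768 input and output embeddings, lands near 145M
# parameters, consistent with the class name.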


def llama3_145m(num_nodes=1, num_gpus_per_node=1):
    # Set up the Llama3 145M recipe: start from the stock llama3_8b pretraining
    # recipe, then swap in the small model config defined above.
    recipe = partial(llm.llama3_8b.pretrain_recipe, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node)()

    recipe = run.Partial(
        llm.pretrain,
        model=run.Config(LlamaModel, config=run.Config(Llama3Config145M)),
        trainer=recipe.trainer,
        data=recipe.data,
        log=recipe.log,
        optim=recipe.optim,
        resume=None,
    )

    return recipe
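
# A recipe returned here can also be smoke-tested without the Auto Configurator
# (an aside, not part of the original flow); run.Partial objects are built the
# same way this script builds them further below:
#
#     recipe = llama3_145m(num_nodes=1, num_gpus_per_node=1)
#     pretrain = fdl.build(recipe)  # plain callable wrapping llm.pretrain
#     pretrain()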


def train_config(args):
    # This example generates 3 configs.
    # Run the script 3 times, passing --run_number 1, 2, and 3 in turn.
    # After all configurations have been trained, rerun the script with the --get_results flag.
    # This example exercises the Auto Configurator on a single GPU only.

    # Get the Auto Configurator runner
    calculate_model_size = False
    if args.model_type == "llama":
        recipe = partial(llama3_145m)()
        recipe.data.seq_length = recipe.model.config.seq_length = 2048
    elif args.model_type == "bert":
        recipe = partial(llm.bert_110m.pretrain_recipe, num_nodes=1, num_gpus_per_node=1)()
    elif args.model_type == "t5":
        recipe = partial(llm.t5_220m.pretrain_recipe, num_nodes=1, num_gpus_per_node=1)()
        # Set to False if you don't want the Auto Configurator to calculate the model size
        calculate_model_size = True
    else:
        raise ValueError(f"Unsupported model type for this script: {args.model_type}")

    recipe.data.global_batch_size = 16
    runner = AutoConfigurator(
        recipe=recipe,
        gpu_memory_gb=40,
        tensor_parallel_sizes=[1],
        pipeline_parallel_sizes=[1],
        micro_batch_sizes=[1, 2, 4],
        max_training_days=1,
        max_steps_per_run=10,
        num_tokens_in_b=10,
        vocab_size=32000,
        path_to_logs=args.log_dir,
        calculate_model_size=calculate_model_size,
    )
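    # The search space is the Cartesian product of the lists above:
    # 1 tensor-parallel size x 1 pipeline-parallel size x 3 micro batch sizes
    # = 3 candidate configs, which is why 3 training runs are expected before --get_results.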

    base_cfg, configs = generate_configs(runner)
    if not args.get_results:
        # Get the generated configs
        partials = list(configs.values())
        names = list(configs.keys())

        # Run pre-training on the config selected by --run_number (1-based)
        pretrain_cfg = partials[args.run_number - 1]

        if args.extra_metrics:
            from nemo.lightning.pytorch.callbacks import (
                MemoryMonitor,
                OptimizerMonitor,
                RuntimeEstimator,
                SpeedMonitor,
            )

            # Add callbacks
            pretrain_cfg.trainer.callbacks.append(run.Config(SpeedMonitor, window_size=5))
            pretrain_cfg.trainer.callbacks.append(run.Config(RuntimeEstimator))
            pretrain_cfg.trainer.callbacks.append(run.Config(OptimizerMonitor))
            pretrain_cfg.trainer.callbacks.append(run.Config(MemoryMonitor))

        pretrain = fdl.build(pretrain_cfg)
        pretrain()
    else:
        # Get the Auto Configurator results
        get_results(base_cfg, runner, args.log_dir, log_file_prefix="nemo_error")
        print(f"The results were successfully saved to {args.log_dir}.")


def main():
    args = get_args()
    train_config(args)


if __name__ == '__main__':
    main()
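
# Example end-to-end workflow (script name and paths are placeholders):
#
#     python auto_config.py --model_type llama --run_number 1 --log_dir ./autoconf_logs
#     python auto_config.py --model_type llama --run_number 2 --log_dir ./autoconf_logs
#     python auto_config.py --model_type llama --run_number 3 --log_dir ./autoconf_logs
#     python auto_config.py --model_type llama --get_results --log_dir ./autoconf_logs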