# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
from dataclasses import dataclass
from functools import partial

import fiddle as fdl
import nemo_run as run

from nemo.collections import llm
from nemo.collections.llm.gpt.model.llama import Llama3Config, LlamaModel
from nemo.collections.llm.tools.auto_configurator import AutoConfigurator, generate_configs, get_results


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_type", type=str, choices=["llama", "bert", "t5"], help="Model type to run")
    parser.add_argument("--run_number", type=int, help="Number of the generated config to run (1-based)")
    parser.add_argument("--log_dir", type=str, help="Path where to save training logs")
    parser.add_argument("--get_results", action="store_true", help="Collect Auto Configurator results after all runs finish")
    parser.add_argument("--extra_metrics", action="store_true", help="Attach extra monitoring callbacks to the trainer")

    return parser.parse_args()


@dataclass
class Llama3Config145M(Llama3Config):
    # A scaled-down Llama3 configuration (~145M parameters) for quick experiments
    num_layers: int = 12
    hidden_size: int = 768
    num_attention_heads: int = 16
    num_query_groups: int = 8
    ffn_hidden_size: int = 2688


@run.cli.factory(target=llm.pretrain, name="llama3_145m")
def llama3_145m(num_nodes=1, num_gpus_per_node=1):
    # Start from the Llama3 8B pre-training recipe and swap in the 145M model config
    recipe = partial(llm.llama3_8b.pretrain_recipe, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node)()
    recipe = run.Partial(
        llm.pretrain,
        model=run.Config(LlamaModel, config=run.Config(Llama3Config145M)),
        trainer=recipe.trainer,
        data=recipe.data,
        log=recipe.log,
        optim=recipe.optim,
        resume=None,
    )

    return recipe


def train_config(args):
    # This example generates 3 configs.
    # Run this script 3 times, setting the --run_number flag to 1, 2, and 3.
    # After all configurations have been trained, run the script once more with the --get_results flag.
    # This script demonstrates Auto Configurator on a single GPU only.
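    #
    # Example end-to-end workflow (illustrative; the script filename and log
    # path below are placeholders, not part of the original example):
    #
    #   python auto_configurator_example.py --model_type llama --run_number 1 --log_dir ./ac_logs
    #   python auto_configurator_example.py --model_type llama --run_number 2 --log_dir ./ac_logs
    #   python auto_configurator_example.py --model_type llama --run_number 3 --log_dir ./ac_logs
    #   python auto_configurator_example.py --model_type llama --get_results --log_dir ./ac_logs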

    # Get the Auto Configurator runner
    calculate_model_size = False
    if args.model_type == "llama":
        recipe = partial(llama3_145m)()
        recipe.data.seq_length = recipe.model.config.seq_length = 2048
    elif args.model_type == "bert":
        recipe = partial(llm.bert_110m.pretrain_recipe, num_nodes=1, num_gpus_per_node=1)()
    elif args.model_type == "t5":
        recipe = partial(llm.t5_220m.pretrain_recipe, num_nodes=1, num_gpus_per_node=1)()
        # Set to False if you don't want Auto Configurator to calculate the model size
        calculate_model_size = True
    else:
        raise ValueError(f"Unsupported model type for this script: {args.model_type}")

    recipe.data.global_batch_size = 16

    runner = AutoConfigurator(
        recipe=recipe,
        gpu_memory_gb=40,
        tensor_parallel_sizes=[1],
        pipeline_parallel_sizes=[1],
        micro_batch_sizes=[1, 2, 4],
        max_training_days=1,
        max_steps_per_run=10,
        num_tokens_in_b=10,
        vocab_size=32000,
        path_to_logs=args.log_dir,
        calculate_model_size=calculate_model_size,
    )

    base_cfg, configs = generate_configs(runner)
    if not args.get_results:
        # Get the generated configs
        partials = list(configs.values())
        names = list(configs.keys())  # config names, parallel to partials

        # Run pre-training for the selected config
        pretrain_cfg = partials[args.run_number - 1]

        if args.extra_metrics:
            from nemo.lightning.pytorch.callbacks import (
                MemoryMonitor,
                OptimizerMonitor,
                RuntimeEstimator,
                SpeedMonitor,
            )

            # Add monitoring callbacks to the trainer
            pretrain_cfg.trainer.callbacks.append(run.Config(SpeedMonitor, window_size=5))
            pretrain_cfg.trainer.callbacks.append(run.Config(RuntimeEstimator))
            pretrain_cfg.trainer.callbacks.append(run.Config(OptimizerMonitor))
            pretrain_cfg.trainer.callbacks.append(run.Config(MemoryMonitor))

        pretrain = fdl.build(pretrain_cfg)
        pretrain()
    else:
        # Get Auto Configurator results
        get_results(base_cfg, runner, args.log_dir, log_file_prefix="nemo_error")
        print(f"The results were successfully saved to {args.log_dir}.")


def main():
    args = get_args()
    train_config(args)


if __name__ == '__main__':
    main()
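
# Note on the candidate configs: with tensor_parallel_sizes=[1] and
# pipeline_parallel_sizes=[1] fixed above, the 3 generated configs differ only
# in micro batch size (1, 2, and 4). A rough sketch of running all of them in
# a single process instead of re-invoking the script with --run_number
# (illustrative only; separate invocations are preferred so each run starts
# from a clean training state):
#
#     for name, cfg in configs.items():
#         pretrain = fdl.build(cfg)
#         pretrain()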