import gc
from pathlib import Path

import pandas as pd
from lightgbm import LGBMRegressor
from numerapi import NumerAPI

from utils import (
    save_model,
    load_model,
    neutralize,
    get_biggest_change_features,
    get_time_series_cross_val_splits,
    validation_metrics,
    load_model_config,
    save_model_config,
    save_prediction,
    TARGET_COL,
)
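# the helpers above come from the utils.py that accompanies this script in
# Numerai's example-scripts repo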

EXAMPLE_PREDS_COL = "example_preds"
ERA_COL = "era"

# params we'll use to train all of our models.
# Ideal params would be more like 20000, 0.001, 6, 2**6, 0.1, but this is slow enough as it is
model_params = {"n_estimators": 2000,
                "learning_rate": 0.01,
                "max_depth": 5,
                "num_leaves": 2 ** 5,
                "colsample_bytree": 0.1}

# the amount of downsampling we'll use to speed up cross validation and the full train.
# a value of 1 means no downsampling
# a value of 10 means use every 10th row
downsample_cross_val = 20
downsample_full_train = 2

# if model_selection_loop=True, get OOS performance on training_data
# and use that to select the best model
# if model_selection_loop=False, just predict on tournament data using existing models and model config
model_selection_loop = True
model_config_name = "advanced_example_model"

napi = NumerAPI()
current_round = napi.get_current_round()
| Path("./v4").mkdir(parents=False, exist_ok=True) | |
| napi.download_dataset("v4/train.parquet") | |
| napi.download_dataset("v4/features.json") | |
| print("Entering model selection loop. This may take awhile.") | |
| if model_selection_loop: | |
| model_config = {} | |
| print('reading training_data') | |
| training_data = pd.read_parquet('v4/train.parquet') | |
| # keep track of some prediction columns | |
| ensemble_cols = set() | |
| pred_cols = set() | |
| # pick some targets to use | |
| possible_targets = [c for c in training_data.columns if c.startswith("target_")] | |
| # randomly pick a handful of targets | |
| # this can be vastly improved | |
| targets = ["target", "target_nomi_v4_60", "target_jerome_v4_20"] | |
| # all the possible features to train on | |
| feature_cols = [c for c in training_data if c.startswith("feature_")] | |
| """ do cross val to get out of sample training preds""" | |
| cv = 3 | |
| train_test_zip = get_time_series_cross_val_splits(training_data, cv=cv, embargo=12) | |
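    # the embargo leaves a gap of 12 eras between each train and test split:
    # targets are computed over multi-week windows, so nearby eras overlap in
    # time and would otherwise leak information across the split boundary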

    # get out of sample training preds via embargoed time series cross validation
    # optionally downsample training data to speed up this section
    print("entering time series cross validation loop")
    for split, train_test_split in enumerate(train_test_zip):
        gc.collect()
        print(f"doing split {split+1} out of {cv}")
        train_split, test_split = train_test_split
        train_split_index = training_data[ERA_COL].isin(train_split)
        test_split_index = training_data[ERA_COL].isin(test_split)
        downsampled_train_split_index = train_split_index[train_split_index].index[::downsample_cross_val]

        # getting the per era correlation of each feature vs the primary target across the training split
        print("getting feature correlations over time and identifying riskiest features")
        all_feature_corrs_split = training_data.loc[downsampled_train_split_index, :].groupby(ERA_COL).apply(
            lambda d: d[feature_cols].corrwith(d[TARGET_COL]))

        # find the riskiest features by comparing their correlation vs the target in half 1 and half 2 of the training split
        # there are probably more clever ways to do this
        riskiest_features_split = get_biggest_change_features(all_feature_corrs_split, 50)
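        # "riskiest" = the 50 features whose per-era correlation with the target
        # moved the most between the two halves; predictions that lean on such
        # unstable features are what we neutralize below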
| print(f"entering model training loop for split {split+1}") | |
| for target in targets: | |
| model_name = f"model_{target}" | |
| print(f"model: {model_name}") | |
| # train a model on the training split (and save it for future use) | |
| split_model_name = f"model_{target}_split{split+1}cv{cv}downsample{downsample_cross_val}" | |
| split_model = load_model(split_model_name) | |
| if not split_model: | |
| print(f"training model: {model_name}") | |
| split_model = LGBMRegressor(**model_params) | |
| split_model.fit(training_data.loc[downsampled_train_split_index, feature_cols], | |
| training_data.loc[downsampled_train_split_index, | |
| [target]]) | |
| save_model(split_model, split_model_name) | |
| # now we can predict on the test part of the split | |
| model_expected_features = split_model.booster_.feature_name() | |
| if set(model_expected_features) != set(feature_cols): | |
| print(f"New features are available! Might want to retrain model {split_model_name}.") | |
| print(f"predicting {model_name}") | |
| training_data.loc[test_split_index, f"preds_{model_name}"] = \ | |
| split_model.predict(training_data.loc[test_split_index, model_expected_features]) | |
| # do neutralization | |
| print("doing neutralization to riskiest features") | |
            training_data.loc[test_split_index, f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
                df=training_data.loc[test_split_index, :],
                columns=[f"preds_{model_name}"],
                neutralizers=riskiest_features_split,
                proportion=1.0,
                normalize=True,
                era_col=ERA_COL)[f"preds_{model_name}"]

            # remember that we made all of these different pred columns
            pred_cols.add(f"preds_{model_name}")
            pred_cols.add(f"preds_{model_name}_neutral_riskiest_50")
| print("creating ensembles") | |
| # ranking per era for all of our pred cols so we can combine safely on the same scales | |
| training_data[list(pred_cols)] = training_data.groupby(ERA_COL).apply( | |
| lambda d: d[list(pred_cols)].rank(pct=True)) | |
| # do ensembles | |
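    # each ensemble is an equal-weighted sum of the (already per-era ranked)
    # member predictions, re-ranked so the result is again uniform on (0, 1]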
| training_data["ensemble_neutral_riskiest_50"] = sum( | |
| [training_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank( | |
| pct=True) | |
| training_data["ensemble_not_neutral"] = sum( | |
| [training_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True) | |
| training_data["ensemble_all"] = sum([training_data[pred_col] for pred_col in pred_cols]).rank(pct=True) | |
| ensemble_cols.add("ensemble_neutral_riskiest_50") | |
| ensemble_cols.add("ensemble_not_neutral") | |
| ensemble_cols.add("ensemble_all") | |
| """ Now get some stats and pick our favorite model""" | |
| print("gathering validation metrics for out of sample training results") | |
| all_model_cols = list(pred_cols) + list(ensemble_cols) | |
| # use example_col preds_model_target as an estimates since no example preds provided for training | |
| # fast_mode=True so that we skip some of the stats that are slower to calculate | |
| training_stats = validation_metrics(training_data, all_model_cols, example_col="preds_model_target", | |
| fast_mode=True, target_col=TARGET_COL) | |
| print(training_stats[["mean", "sharpe"]].sort_values(by="sharpe", ascending=False).to_markdown()) | |
| # pick the model that has the highest correlation sharpe | |
| best_pred_col = training_stats.sort_values(by="sharpe", ascending=False).head(1).index[0] | |
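    # "sharpe" here is the mean of the per-era correlations divided by their
    # standard deviation (see validation_metrics in utils.py), so this selects
    # for consistency across eras rather than just a high average correlation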
| print(f"selecting model {best_pred_col} as our highest sharpe model in validation") | |
| """ Now do a full train""" | |
| print("entering full training section") | |
| # getting the per era correlation of each feature vs the target across all of training data | |
| print("getting feature correlations with target and identifying riskiest features") | |
| all_feature_corrs = training_data.groupby(ERA_COL).apply( | |
| lambda d: d[feature_cols].corrwith(d[TARGET_COL])) | |
| # find the riskiest features by comparing their correlation vs the target in half 1 and half 2 of training data | |
| riskiest_features = get_biggest_change_features(all_feature_corrs, 50) | |

    for target in targets:
        gc.collect()
        model_name = f"model_{target}_downsample{downsample_full_train}"
        model = load_model(model_name)
        if not model:
            print(f"training {model_name}")
            model = LGBMRegressor(**model_params)
            # train on all of train, predict on val, predict on tournament
            model.fit(training_data.iloc[::downsample_full_train].loc[:, feature_cols],
                      training_data.iloc[::downsample_full_train][target])
            save_model(model, model_name)
        gc.collect()

    model_config["feature_cols"] = feature_cols
    model_config["targets"] = targets
    model_config["best_pred_col"] = best_pred_col
    model_config["riskiest_features"] = riskiest_features
    print(f"saving model config for {model_config_name}")
    save_model_config(model_config, model_config_name)
else:
    # load model config from a previous model selection loop
    print(f"loading model config for {model_config_name}")
    model_config = load_model_config(model_config_name)
    feature_cols = model_config["feature_cols"]
    targets = model_config["targets"]
    best_pred_col = model_config["best_pred_col"]
    riskiest_features = model_config["riskiest_features"]
| """ Things that we always do even if we've already trained """ | |
| gc.collect() | |
| print("reading tournament_data") | |
| live_data = pd.read_parquet('v4/live.parquet') | |
| print("reading validation_data") | |
| validation_data = pd.read_parquet('v4/validation.parquet') | |
| print("reading example_predictions") | |
| example_preds = pd.read_parquet('v4/live_example_preds.parquet') | |
| print("reading example_validaton_predictions") | |
| validation_example_preds = pd.read_parquet('v4/validation_example_preds.parquet') | |
| # set the example predictions | |
| validation_data[EXAMPLE_PREDS_COL] = validation_example_preds["prediction"] | |
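# the assignment above relies on pandas index alignment: both validation frames
# should share the same id index, so each example pred lands on the right row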

# check for nans and fill nans
print("checking for nans in the live data")
if live_data.loc[:, feature_cols].isna().sum().sum():
    cols_w_nan = live_data.loc[:, feature_cols].isna().sum()
    total_rows = len(live_data)
    print(f"Number of nans per column this week: {cols_w_nan[cols_w_nan > 0]}")
    print(f"out of {total_rows} total rows")
    print("filling nans with 0.5")
    live_data.loc[:, feature_cols] = live_data.loc[:, feature_cols].fillna(0.5)
else:
    print("No nans in the features this week!")

pred_cols = set()
ensemble_cols = set()
for target in targets:
    gc.collect()
    model_name = f"model_{target}_downsample{downsample_full_train}"
    print(f"loading {model_name}")
    model = load_model(model_name)
    if not model:
        raise ValueError(f"{model_name} is not trained yet!")

    model_expected_features = model.booster_.feature_name()
    if set(model_expected_features) != set(feature_cols):
        print(f"New features are available! Might want to retrain model {model_name}.")
    print(f"predicting live and validation for {model_name}")
    validation_data.loc[:, f"preds_{model_name}"] = model.predict(validation_data.loc[:, model_expected_features])
    live_data.loc[:, f"preds_{model_name}"] = model.predict(live_data.loc[:, model_expected_features])

    # do different neutralizations
    # neutralize our predictions to the riskiest features only
    print("neutralizing to riskiest_50 for validation and live")
    validation_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=validation_data,
                                                                            columns=[f"preds_{model_name}"],
                                                                            neutralizers=riskiest_features,
                                                                            proportion=1.0,
                                                                            normalize=True,
                                                                            era_col=ERA_COL)[f"preds_{model_name}"]
    live_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=live_data,
                                                                      columns=[f"preds_{model_name}"],
                                                                      neutralizers=riskiest_features,
                                                                      proportion=1.0,
                                                                      normalize=True,
                                                                      era_col=ERA_COL)[f"preds_{model_name}"]

    pred_cols.add(f"preds_{model_name}")
    pred_cols.add(f"preds_{model_name}_neutral_riskiest_50")

# rank per era for each prediction column so that we can combine safely
validation_data[list(pred_cols)] = validation_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))
live_data[list(pred_cols)] = live_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))
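# live_data typically contains only a single era, so its per-era rank is
# effectively one global rank; the groupby keeps the code uniform with validation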

# make ensembles for validation and live
print("creating ensembles for validation and live")
validation_data["ensemble_neutral_riskiest_50"] = sum(
    [validation_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(pct=True)
live_data["ensemble_neutral_riskiest_50"] = sum(
    [live_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(pct=True)
ensemble_cols.add("ensemble_neutral_riskiest_50")

validation_data["ensemble_not_neutral"] = sum(
    [validation_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
live_data["ensemble_not_neutral"] = sum(
    [live_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
ensemble_cols.add("ensemble_not_neutral")

validation_data["ensemble_all"] = sum([validation_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
live_data["ensemble_all"] = sum([live_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
ensemble_cols.add("ensemble_all")

gc.collect()
print("getting final validation stats")
# get our final validation stats for our chosen model
validation_stats = validation_metrics(validation_data, list(pred_cols) + list(ensemble_cols), example_col=EXAMPLE_PREDS_COL,
                                      fast_mode=False, target_col=TARGET_COL)
print(validation_stats.to_markdown())

# rename best model to prediction and rank from 0 to 1 to meet diagnostic/submission file requirements
validation_data["prediction"] = validation_data[best_pred_col].rank(pct=True)
live_data["prediction"] = live_data[best_pred_col].rank(pct=True)
save_prediction(validation_data["prediction"], f"validation_predictions_{current_round}")
save_prediction(live_data["prediction"], f"live_data_{current_round}")