Spaces:
Runtime error
Runtime error
add python files from official repo
Browse files- example-scripts +0 -1
- example_model_advanced.py +296 -0
- utils.py +312 -0
example-scripts
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
Subproject commit 838bfd1788feaf40362d6bedb3e4683832a9dbb1
|
|
|
|
|
|
example_model_advanced.py
ADDED
|
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from lightgbm import LGBMRegressor
|
| 3 |
+
import gc
|
| 4 |
+
from numerapi import NumerAPI
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from utils import (
|
| 7 |
+
save_model,
|
| 8 |
+
load_model,
|
| 9 |
+
neutralize,
|
| 10 |
+
get_biggest_change_features,
|
| 11 |
+
get_time_series_cross_val_splits,
|
| 12 |
+
validation_metrics,
|
| 13 |
+
load_model_config,
|
| 14 |
+
save_model_config,
|
| 15 |
+
save_prediction,
|
| 16 |
+
TARGET_COL,
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Column names used throughout the script.
EXAMPLE_PREDS_COL = "example_preds"
ERA_COL = "era"

# params we'll use to train all of our models.
# Ideal params would be more like 20000, 0.001, 6, 2**6, 0.1, but this is slow enough as it is
model_params = {
    "n_estimators": 2000,
    "learning_rate": 0.01,
    "max_depth": 5,
    "num_leaves": 2 ** 5,
    "colsample_bytree": 0.1,
}

# the amount of downsampling we'll use to speed up cross validation and full train.
# a value of 1 means no downsampling
# a value of 10 means use every 10th row
downsample_cross_val = 20
downsample_full_train = 2

# if model_selection_loop=True get OOS performance for training_data
# and use that to select best model
# if model_selection_loop=False, just predict on tournament data using existing models and model config
model_selection_loop = True
model_config_name = "advanced_example_model"

napi = NumerAPI()

current_round = napi.get_current_round()

Path("./v4").mkdir(parents=False, exist_ok=True)

# Download every file that is read further down in this script.  Previously
# only the training data and the feature metadata were fetched, so the later
# pd.read_parquet calls for the live/validation files failed on a fresh
# checkout.
# NOTE(review): the live file is stored under a fixed name; if numerapi skips
# files that already exist locally, a stale copy from an earlier round has to
# be removed by hand - confirm against the numerapi documentation.
napi.download_dataset("v4/train.parquet")
napi.download_dataset("v4/validation.parquet")
napi.download_dataset("v4/live.parquet")
napi.download_dataset("v4/live_example_preds.parquet")
napi.download_dataset("v4/validation_example_preds.parquet")
napi.download_dataset("v4/features.json")
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# Model selection: cross-validate a handful of targets, pick the prediction
# column with the best out-of-sample sharpe, then train the final models on
# (downsampled) full training data and persist the resulting configuration.
# When model_selection_loop is False the previously saved configuration is
# loaded instead.
if model_selection_loop:
    # Only announce the loop when it actually runs.
    print("Entering model selection loop. This may take awhile.")
    model_config = {}
    print('reading training_data')
    training_data = pd.read_parquet('v4/train.parquet')

    # keep track of some prediction columns
    ensemble_cols = set()
    pred_cols = set()

    # pick some targets to use
    possible_targets = [c for c in training_data.columns if c.startswith("target_")]
    # randomly pick a handful of targets
    # this can be vastly improved
    targets = ["target", "target_nomi_v4_60", "target_jerome_v4_20"]

    # all the possible features to train on
    feature_cols = [c for c in training_data if c.startswith("feature_")]

    # do cross val to get out of sample training preds
    cv = 3
    train_test_zip = get_time_series_cross_val_splits(training_data, cv=cv, embargo=12)
    # get out of sample training preds via embargoed time series cross validation
    # optionally downsample training data to speed up this section.
    print("entering time series cross validation loop")
    for split, train_test_split in enumerate(train_test_zip):
        gc.collect()
        print(f"doing split {split+1} out of {cv}")
        train_split, test_split = train_test_split
        train_split_index = training_data[ERA_COL].isin(train_split)
        test_split_index = training_data[ERA_COL].isin(test_split)
        # keep every downsample_cross_val-th row label of the training split
        downsampled_train_split_index = train_split_index[train_split_index].index[::downsample_cross_val]

        # getting the per era correlation of each feature vs the primary target across the training split
        print("getting feature correlations over time and identifying riskiest features")
        all_feature_corrs_split = training_data.loc[downsampled_train_split_index, :].groupby(ERA_COL).apply(
            lambda d: d[feature_cols].corrwith(d[TARGET_COL]))
        # find the riskiest features by comparing their correlation vs the target in half 1 and half 2 of training data
        # there are probably more clever ways to do this
        riskiest_features_split = get_biggest_change_features(all_feature_corrs_split, 50)

        print(f"entering model training loop for split {split+1}")
        for target in targets:
            model_name = f"model_{target}"
            print(f"model: {model_name}")

            # train a model on the training split (and save it for future use)
            split_model_name = f"model_{target}_split{split+1}cv{cv}downsample{downsample_cross_val}"
            split_model = load_model(split_model_name)
            if not split_model:
                print(f"training model: {model_name}")
                split_model = LGBMRegressor(**model_params)
                # pass y as a 1-d Series (not a one-column frame), matching
                # the full-train call further down
                split_model.fit(training_data.loc[downsampled_train_split_index, feature_cols],
                                training_data.loc[downsampled_train_split_index, target])
                save_model(split_model, split_model_name)
            # now we can predict on the test part of the split
            model_expected_features = split_model.booster_.feature_name()
            if set(model_expected_features) != set(feature_cols):
                print(f"New features are available! Might want to retrain model {split_model_name}.")
            print(f"predicting {model_name}")
            training_data.loc[test_split_index, f"preds_{model_name}"] = \
                split_model.predict(training_data.loc[test_split_index, model_expected_features])

            # do neutralization
            print("doing neutralization to riskiest features")
            training_data.loc[test_split_index, f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
                df=training_data.loc[test_split_index, :],
                columns=[f"preds_{model_name}"],
                neutralizers=riskiest_features_split,
                proportion=1.0,
                normalize=True,
                era_col=ERA_COL)[f"preds_{model_name}"]

            # remember that we made all of these different pred columns
            pred_cols.add(f"preds_{model_name}")
            pred_cols.add(f"preds_{model_name}_neutral_riskiest_50")

    print("creating ensembles")
    # ranking per era for all of our pred cols so we can combine safely on the same scales.
    # GroupBy.rank keeps the original row index on every pandas version,
    # whereas groupby(...).apply(...) may prepend the era to the index and
    # break the column assignment.
    ranked_cols = list(pred_cols)
    training_data[ranked_cols] = training_data.groupby(ERA_COL)[ranked_cols].rank(pct=True)
    # do ensembles
    training_data["ensemble_neutral_riskiest_50"] = sum(
        [training_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
        pct=True)
    training_data["ensemble_not_neutral"] = sum(
        [training_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
    training_data["ensemble_all"] = sum([training_data[pred_col] for pred_col in pred_cols]).rank(pct=True)

    ensemble_cols.add("ensemble_neutral_riskiest_50")
    ensemble_cols.add("ensemble_not_neutral")
    ensemble_cols.add("ensemble_all")

    # Now get some stats and pick our favorite model
    print("gathering validation metrics for out of sample training results")
    all_model_cols = list(pred_cols) + list(ensemble_cols)
    # use example_col preds_model_target as an estimates since no example preds provided for training
    # fast_mode=True so that we skip some of the stats that are slower to calculate
    training_stats = validation_metrics(training_data, all_model_cols, example_col="preds_model_target",
                                        fast_mode=True, target_col=TARGET_COL)
    print(training_stats[["mean", "sharpe"]].sort_values(by="sharpe", ascending=False).to_markdown())

    # pick the model that has the highest correlation sharpe
    best_pred_col = training_stats.sort_values(by="sharpe", ascending=False).head(1).index[0]
    print(f"selecting model {best_pred_col} as our highest sharpe model in validation")

    # Now do a full train
    print("entering full training section")
    # getting the per era correlation of each feature vs the target across all of training data
    print("getting feature correlations with target and identifying riskiest features")
    all_feature_corrs = training_data.groupby(ERA_COL).apply(
        lambda d: d[feature_cols].corrwith(d[TARGET_COL]))
    # find the riskiest features by comparing their correlation vs the target in half 1 and half 2 of training data
    riskiest_features = get_biggest_change_features(all_feature_corrs, 50)

    for target in targets:
        gc.collect()
        model_name = f"model_{target}_downsample{downsample_full_train}"
        model = load_model(model_name)
        if not model:
            print(f"training {model_name}")
            model = LGBMRegressor(**model_params)
            # train on all of train, predict on val, predict on tournament
            model.fit(training_data.iloc[::downsample_full_train].loc[:, feature_cols],
                      training_data.iloc[::downsample_full_train][target])
            save_model(model, model_name)
        gc.collect()

    model_config["feature_cols"] = feature_cols
    model_config["targets"] = targets
    model_config["best_pred_col"] = best_pred_col
    model_config["riskiest_features"] = riskiest_features
    print(f"saving model config for {model_config_name}")
    save_model_config(model_config, model_config_name)
else:
    # load model config from previous model selection loop
    print(f"loading model config for {model_config_name}")
    model_config = load_model_config(model_config_name)
    if not model_config:
        # load_model_config returns False when the file is missing; fail
        # with a clear message instead of a TypeError on the lookups below.
        raise ValueError(
            f"no saved model config named {model_config_name}; "
            "run once with model_selection_loop=True")
    feature_cols = model_config["feature_cols"]
    targets = model_config["targets"]
    best_pred_col = model_config["best_pred_col"]
    riskiest_features = model_config["riskiest_features"]
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
""" Things that we always do even if we've already trained """
|
| 197 |
+
# Prediction section: runs on every invocation.  Loads the live and
# validation data, predicts with every full-train model, neutralizes, ranks
# per era, builds ensembles, reports validation stats and writes the
# prediction files for the selected column.
gc.collect()

print("reading tournament_data")
live_data = pd.read_parquet('v4/live.parquet')
print("reading validation_data")
validation_data = pd.read_parquet('v4/validation.parquet')
print("reading example_predictions")
example_preds = pd.read_parquet('v4/live_example_preds.parquet')
print("reading example_validaton_predictions")
validation_example_preds = pd.read_parquet('v4/validation_example_preds.parquet')
# set the example predictions
validation_data[EXAMPLE_PREDS_COL] = validation_example_preds["prediction"]

# check for nans and fill nans
print("checking for nans in the tournament data")
if live_data.loc[:, feature_cols].isna().sum().sum():
    cols_w_nan = live_data.loc[:, feature_cols].isna().sum()
    total_rows = len(live_data)
    print(f"Number of nans per column this week: {cols_w_nan[cols_w_nan > 0]}")
    print(f"out of {total_rows} total rows")
    print("filling nans with 0.5")
    live_data.loc[:, feature_cols] = live_data.loc[:, feature_cols].fillna(0.5)
else:
    print("No nans in the features this week!")

pred_cols = set()
ensemble_cols = set()
for target in targets:
    gc.collect()
    # The saved full-train model carries the downsample factor in its file
    # name, but the prediction columns must use the same names as in the
    # model selection loop ("preds_model_<target>[...]"), otherwise the
    # best_pred_col chosen there cannot be found in validation_data/live_data.
    model_name = f"model_{target}_downsample{downsample_full_train}"
    raw_pred_col = f"preds_model_{target}"
    neutral_pred_col = f"{raw_pred_col}_neutral_riskiest_50"
    print(f"loading {model_name}")
    model = load_model(model_name)
    if not model:
        raise ValueError(f"{model_name} is not trained yet!")

    model_expected_features = model.booster_.feature_name()
    if set(model_expected_features) != set(feature_cols):
        print(f"New features are available! Might want to retrain model {model_name}.")
    print(f"predicting tournament and validation for {model_name}")
    validation_data.loc[:, raw_pred_col] = model.predict(validation_data.loc[:, model_expected_features])
    live_data.loc[:, raw_pred_col] = model.predict(live_data.loc[:, model_expected_features])

    # do different neutralizations
    # neutralize our predictions to the riskiest features only
    print("neutralizing to riskiest_50 for validation and tournament")
    validation_data[neutral_pred_col] = neutralize(df=validation_data,
                                                   columns=[raw_pred_col],
                                                   neutralizers=riskiest_features,
                                                   proportion=1.0,
                                                   normalize=True,
                                                   era_col=ERA_COL)[raw_pred_col]
    live_data[neutral_pred_col] = neutralize(df=live_data,
                                             columns=[raw_pred_col],
                                             neutralizers=riskiest_features,
                                             proportion=1.0,
                                             normalize=True,
                                             era_col=ERA_COL)[raw_pred_col]

    pred_cols.add(raw_pred_col)
    pred_cols.add(neutral_pred_col)

# rank per era for each prediction column so that we can combine safely.
# GroupBy.rank keeps the original row index on every pandas version, whereas
# groupby(...).apply(...) may prepend the era to the index and break the
# column assignment.
ranked_cols = list(pred_cols)
validation_data[ranked_cols] = validation_data.groupby(ERA_COL)[ranked_cols].rank(pct=True)
live_data[ranked_cols] = live_data.groupby(ERA_COL)[ranked_cols].rank(pct=True)
# make ensembles for val and tournament
print('creating ensembles for tournament and validation')
validation_data["ensemble_neutral_riskiest_50"] = sum(
    [validation_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
    pct=True)
live_data["ensemble_neutral_riskiest_50"] = sum(
    [live_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
    pct=True)
ensemble_cols.add("ensemble_neutral_riskiest_50")

validation_data["ensemble_not_neutral"] = sum(
    [validation_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
live_data["ensemble_not_neutral"] = sum(
    [live_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
ensemble_cols.add("ensemble_not_neutral")

validation_data["ensemble_all"] = sum([validation_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
live_data["ensemble_all"] = sum([live_data[pred_col] for pred_col in pred_cols]).rank(pct=True)

ensemble_cols.add("ensemble_all")

gc.collect()
print("getting final validation stats")
# get our final validation stats for our chosen model
validation_stats = validation_metrics(validation_data, list(pred_cols)+list(ensemble_cols), example_col=EXAMPLE_PREDS_COL,
                                      fast_mode=False, target_col=TARGET_COL)
print(validation_stats.to_markdown())

# rename best model to prediction and rank from 0 to 1 to meet diagnostic/submission file requirements
validation_data["prediction"] = validation_data[best_pred_col].rank(pct=True)
live_data["prediction"] = live_data[best_pred_col].rank(pct=True)
save_prediction(validation_data["prediction"], f"validation_predictions_{current_round}")
save_prediction(live_data["prediction"], f"live_data_{current_round}")
|
utils.py
ADDED
|
@@ -0,0 +1,312 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import scipy
|
| 4 |
+
from halo import Halo
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
import json
|
| 7 |
+
from scipy.stats import skew
|
| 8 |
+
|
| 9 |
+
# Column that identifies the era each row belongs to.
ERA_COL = "era"
# Default target column (used as the default of validation_metrics).
TARGET_COL = "target_nomi_v4_20"
DATA_TYPE_COL = "data_type"
EXAMPLE_PREDS_COL = "example_preds"

# Module-level terminal spinner instance.
spinner = Halo(text='', spinner='dots')

# Folders (relative to the working directory) that the save_*/load_* helpers
# below write to and read from.
MODEL_FOLDER = "models"
MODEL_CONFIGS_FOLDER = "model_configs"
PREDICTION_FILES_FOLDER = "prediction_files"
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def save_prediction(df, name):
    """Write *df* as ``<PREDICTION_FILES_FOLDER>/<name>.csv`` (index included).

    The output folder is created when missing.  A failure to create it is no
    longer swallowed: the original ``except Exception: pass`` only postponed
    the error to the ``to_csv`` call and hid its real cause.

    Args:
        df: pandas object exposing ``to_csv`` (Series or DataFrame).
        name: file name without extension.
    """
    # exist_ok=True already covers the "folder is there" case.
    Path(PREDICTION_FILES_FOLDER).mkdir(exist_ok=True, parents=True)
    df.to_csv(f"{PREDICTION_FILES_FOLDER}/{name}.csv", index=True)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def save_model(model, name):
    """Pickle *model* to ``<MODEL_FOLDER>/<name>.pkl``.

    The model folder is created when missing.  A failure to create it is no
    longer swallowed: the original ``except Exception: pass`` only postponed
    the error to the pickling call and hid its real cause.

    Args:
        model: any picklable object.
        name: file name without extension.
    """
    # exist_ok=True already covers the "folder is there" case.
    Path(MODEL_FOLDER).mkdir(exist_ok=True, parents=True)
    pd.to_pickle(model, f"{MODEL_FOLDER}/{name}.pkl")
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def load_model(name):
    """Return the object pickled at ``<MODEL_FOLDER>/<name>.pkl``.

    Returns ``False`` when no such file exists, so callers can test the
    result with ``if not model``.

    NOTE: unpickling executes code stored in the file; only load model files
    that were written by this project.
    """
    model_file = f"{MODEL_FOLDER}/{name}.pkl"
    if not Path(model_file).is_file():
        return False
    return pd.read_pickle(model_file)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def save_model_config(model_config, model_name):
    """Serialize *model_config* as JSON to ``<MODEL_CONFIGS_FOLDER>/<model_name>.json``.

    The config folder is created when missing.  A failure to create it is no
    longer swallowed: the original ``except Exception: pass`` only postponed
    the error to the ``open`` call and hid its real cause.

    Args:
        model_config: JSON-serializable object (the script passes a dict).
        model_name: file name without extension.
    """
    # exist_ok=True already covers the "folder is there" case.
    Path(MODEL_CONFIGS_FOLDER).mkdir(exist_ok=True, parents=True)
    with open(f"{MODEL_CONFIGS_FOLDER}/{model_name}.json", 'w') as fp:
        json.dump(model_config, fp)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def load_model_config(model_name):
    """Return the JSON content of ``<MODEL_CONFIGS_FOLDER>/<model_name>.json``.

    Returns ``False`` when the file does not exist.
    """
    config_file = f"{MODEL_CONFIGS_FOLDER}/{model_name}.json"
    if not Path(config_file).is_file():
        return False
    with open(config_file, 'r') as fp:
        return json.load(fp)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def get_biggest_change_features(corrs, n):
    """Return the *n* columns whose mean changed most between index halves.

    The index of *corrs* is sorted and cut in two; the first half holds
    ``len // 2`` labels, the second half the rest.  For every column the
    mean over each half is taken, and the columns are ordered by the
    absolute difference of the two means, largest first.

    Args:
        corrs: DataFrame with one row per index label and one column per
            candidate (the script passes per-era feature correlations).
        n: number of column names to return.

    Returns:
        list of column names, largest absolute change first.
    """
    ordered_labels = corrs.index.sort_values()
    midpoint = len(ordered_labels) // 2

    first_half_mean = corrs.loc[ordered_labels[:midpoint], :].mean()
    second_half_mean = corrs.loc[ordered_labels[midpoint:], :].mean()

    drift = (second_half_mean - first_half_mean).abs()
    return drift.sort_values(ascending=False).head(n).index.tolist()
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def get_time_series_cross_val_splits(data, cv=3, embargo=12):
    """Build embargoed train/test era splits for time-series cross validation.

    The unique eras of ``data[ERA_COL]`` (in order of appearance) are cut
    into *cv* consecutive test blocks of equal size; leftover eras are
    appended to the last block.  For every test block the matching train
    split holds all eras that lie outside the block's numeric range and are
    more than *embargo* eras away from both ends of that range.

    Args:
        data: DataFrame with an ``ERA_COL`` column whose values convert
            with ``int()``.
        cv: number of splits.
        embargo: minimum era distance between train and test eras.

    Returns:
        A ``zip`` iterator of ``(train_eras, test_eras)`` pairs.  It can be
        consumed only once.
    """
    eras = data[ERA_COL].unique()
    fold_size = len(eras) // cv

    test_splits = []
    for fold in range(cv):
        test_splits.append(eras[fold * fold_size:(fold + 1) * fold_size])

    # fix the last test split to have all the last eras, in case the number of eras wasn't divisible by cv
    leftover = len(eras) % cv
    if leftover != 0:
        test_splits[-1] = np.append(test_splits[-1], eras[-leftover:])

    train_splits = []
    for test_eras in test_splits:
        newest = int(np.max(test_eras))
        oldest = int(np.min(test_eras))
        kept = []
        for era in eras:
            era_num = int(era)
            # skip everything inside the test range
            if oldest <= era_num <= newest:
                continue
            # embargo the train split so we have no leakage.
            # one era is length 5, so we need to embargo by target_length/5 eras.
            # To be consistent for all targets, let's embargo everything by 60/5 == 12 eras.
            if abs(era_num - newest) > embargo and abs(era_num - oldest) > embargo:
                kept.append(era)
        train_splits.append(kept)

    # convenient way to iterate over train and test splits
    return zip(train_splits, test_splits)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def neutralize(df,
               columns,
               neutralizers=None,
               proportion=1.0,
               normalize=True,
               era_col="era"):
    """Remove the linear exposure of *columns* to *neutralizers*, era by era.

    For every era the selected columns are optionally gaussianized (ordinal
    rank -> normal quantile), the least-squares projection onto the
    neutralizer columns is subtracted (scaled by *proportion*), and the
    result is divided by its population standard deviation, taken over the
    whole era block (all columns together).

    Each era's result is written back to the row positions it came from, so
    the output lines up with ``df.index`` even when the rows of one era are
    not contiguous.  The previous implementation concatenated the per-era
    blocks in era order and mislabelled rows in that case.  An empty frame
    now yields an empty result instead of raising.

    Args:
        df: DataFrame holding *columns*, *neutralizers* and *era_col*.
        columns: list of column names to neutralize.
        neutralizers: list of column names to neutralize against; ``None``
            means no neutralizers.
        proportion: fraction of the projection that is removed.
        normalize: gaussianize the columns per era before neutralizing.
        era_col: name of the era column.

    Returns:
        float64 DataFrame with the same index as *df* and one column per
        entry of *columns*.  Rows whose era value is NaN stay NaN.
    """
    if neutralizers is None:
        neutralizers = []

    # One output slot per input row; filled era by era through a row mask.
    result = np.full((len(df), len(columns)), np.nan, dtype=np.float64)
    for u in df[era_col].unique():
        era_mask = (df[era_col] == u).to_numpy()
        df_era = df[era_mask]
        scores = df_era[columns].values.astype(np.float64)
        if normalize:
            scores2 = []
            for x in scores.T:
                # ordinal ranks mapped into (0, 1), then to normal quantiles
                x = (scipy.stats.rankdata(x, method='ordinal') - .5) / len(x)
                x = scipy.stats.norm.ppf(x)
                scores2.append(x)
            scores = np.array(scores2).T
        exposures = df_era[neutralizers].values

        # subtract the projection of the scores onto the neutralizers
        scores -= proportion * exposures.dot(
            np.linalg.pinv(exposures.astype(np.float32), rcond=1e-6).dot(scores.astype(np.float32)))

        scores /= scores.std(ddof=0)

        result[era_mask] = scores

    return pd.DataFrame(result,
                        columns=columns,
                        index=df.index)
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def neutralize_series(series, by, proportion=1.0):
    """Subtract from *series* its least-squares fit on *by* plus a constant.

    Args:
        series: pandas Series to neutralize.
        by: pandas Series of the same length to neutralize against.
        proportion: fraction of the fitted component that is removed.

    Returns:
        Series with the index of *series* holding the residual values.
    """
    target_vec = series.values.reshape(-1, 1)
    by_vec = by.values.reshape(-1, 1)

    # this column makes series neutral to a constant so that it's centered and for sure gets corr 0 with exposures
    const_vec = np.array([np.mean(series)] * len(by_vec)).reshape(-1, 1)
    design = np.hstack((by_vec, const_vec))

    coefficients = np.linalg.lstsq(design, target_vec, rcond=None)[0]
    fitted = proportion * design.dot(coefficients)
    residual = target_vec - fitted
    return pd.Series(residual.ravel(), index=series.index)
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def unif(df):
    """Map the values of a Series to evenly spaced points in (0, 1) by rank.

    Ties are broken by order of appearance (``method="first"``); the value
    with rank ``r`` (1-based) becomes ``(r - 0.5) / len(df)``.
    """
    ordinal_rank = df.rank(method="first")
    scaled = (ordinal_rank - 0.5) / len(df)
    return pd.Series(scaled, index=df.index)
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def get_feature_neutral_mean(df, prediction_col, target_col, features_for_neutralization=None):
    """Return the mean per-era correlation of feature-neutralized predictions with the target.

    The predictions are neutralized against the given features (all columns
    starting with ``"feature"`` when none are given), rank-uniformized per
    era and correlated with *target_col*; the per-era correlations are then
    averaged.

    Unlike the previous implementation this does not add a ``"neutral_sub"``
    column to the caller's DataFrame; the intermediate values live in a
    small scratch frame.

    Args:
        df: DataFrame with an ``"era"`` column, *prediction_col*,
            *target_col* and the neutralization features.
        prediction_col: name of the prediction column.
        target_col: name of the target column.
        features_for_neutralization: list of feature column names or None.

    Returns:
        The mean of the per-era correlations.
    """
    if features_for_neutralization is None:
        features_for_neutralization = [c for c in df.columns if c.startswith("feature")]
    neutral_preds = neutralize(df, [prediction_col],
                               features_for_neutralization)[prediction_col]
    # Positional copy of just the three columns needed, so the input frame
    # stays untouched and duplicate index labels cannot cause misalignment.
    scratch = pd.DataFrame({"era": df["era"].values,
                            "neutral_sub": neutral_preds.values,
                            "target": df[target_col].values},
                           index=df.index)
    scores = scratch.groupby("era").apply(
        lambda x: (unif(x["neutral_sub"]).corr(x["target"]))).mean()
    return np.mean(scores)
|
| 169 |
+
|
| 170 |
+
def get_feature_neutral_mean_tb_era(df, prediction_col, target_col, tb, features_for_neutralization=None):
    """Correlate the *tb* lowest and *tb* highest feature-neutral predictions with the target.

    Works on a re-indexed copy of *df* (the script passes the rows of one
    era).  The predictions are neutralized against the given features (all
    columns starting with ``"feature"`` when none are given), the rows with
    the *tb* smallest and *tb* largest neutralized values are selected, and
    the correlation between their rank-uniformized values and *target_col*
    is returned.
    """
    if features_for_neutralization is None:
        features_for_neutralization = [c for c in df.columns if c.startswith("feature")]

    # Reset index due to use of argsort later (positions must equal labels)
    work = df.reset_index(drop=True).copy()
    neutral_frame = neutralize(work, [prediction_col], features_for_neutralization)
    work.loc[:, "neutral_sub"] = neutral_frame[prediction_col]

    order = work.loc[:, 'neutral_sub'].argsort()
    extremes = pd.concat([order.iloc[:tb], order.iloc[-tb:]])
    picked = work.loc[extremes]
    return unif(picked['neutral_sub']).corr(picked[target_col])
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def fast_score_by_date(df, columns, target, tb=None, era_col="era"):
    """Compute per-era Pearson correlations between prediction columns and a target.

    Args:
        df: DataFrame holding *columns*, *target* and *era_col*.
        columns: list of prediction column names.
        target: name of the target column.
        tb: when given, only the *tb* lowest and *tb* highest predictions of
            each column (per era) enter the correlation.
        era_col: name of the era column.

    Returns:
        DataFrame indexed by era (order of first appearance) with one
        column per entry of *columns*.
    """
    eras = df[era_col].unique()
    rows = []
    for era in eras:
        era_frame = df[df[era_col] == era]
        preds = np.float64(era_frame[columns].values.T)
        truth = np.float64(era_frame[target].values.T)

        if tb is not None:
            # per column: positions of the tb smallest and tb largest predictions
            order = np.argsort(preds, axis=1)
            picks = np.concatenate([order[:, :tb], order[:, -tb:]], axis=1)
            era_scores = np.array(
                [np.corrcoef(truth[idx], col_pred[idx])[0, 1] for idx, col_pred in zip(picks, preds)])
        else:
            # first row of the correlation matrix: target vs. every prediction column
            era_scores = np.corrcoef(truth, preds)[0, 1:]

        rows.append(era_scores)

    return pd.DataFrame(np.array(rows), columns=columns, index=df[era_col].unique())
|
| 203 |
+
|
| 204 |
+
def exposure_dissimilarity_per_era(df, prediction_col, example_col, feature_cols=None):
    """Return ``1 - (u . e) / (e . e)`` for the feature exposures of two prediction columns.

    ``u`` holds the correlation of every feature with *prediction_col* and
    ``e`` the correlation of every feature with *example_col*.  When
    *feature_cols* is None, all columns starting with ``"feature"`` are used.
    """
    if feature_cols is None:
        feature_cols = [name for name in df.columns if name.startswith("feature")]
    features = df.loc[:, feature_cols]
    user_exposure = features.corrwith(df[prediction_col])
    example_exposure = features.corrwith(df[example_col])
    overlap = np.dot(user_exposure, example_exposure) / np.dot(example_exposure, example_exposure)
    return 1 - overlap
|
| 210 |
+
|
| 211 |
+
def validation_metrics(validation_data, pred_cols, example_col, fast_mode=False,
                       target_col=TARGET_COL, features_for_neutralization=None):
    """Compute a table of diagnostics for each prediction column.

    Always computed per column: mean, std and sharpe of the per-era
    correlation with the target, max drawdown and "apy".  With
    ``fast_mode=False`` the slower statistics are added as well: max feature
    exposure, feature neutral mean, TB200 feature neutral mean, TB200
    mean/std/sharpe, MMC mean, corr+MMC sharpe, correlation with the example
    predictions and exposure dissimilarity.

    Args:
        validation_data: DataFrame with ``ERA_COL``, *target_col*, the
            prediction columns, *example_col* and ``feature_*`` columns.
        pred_cols: iterable of prediction column names to evaluate.
        example_col: column holding the example predictions (only read
            when ``fast_mode`` is False).
        fast_mode: skip the statistics that are slower to calculate.
        target_col: name of the target column.
        features_for_neutralization: feature names forwarded to the feature
            neutral mean helpers (None lets them pick all feature columns).

    Returns:
        DataFrame with one row per prediction column and one column per
        statistic.
    """
    validation_stats = pd.DataFrame()
    feature_cols = [c for c in validation_data if c.startswith("feature_")]
    for pred_col in pred_cols:
        # Check the per-era correlations on the validation set (out of sample)
        validation_correlations = validation_data.groupby(ERA_COL).apply(
            lambda d: unif(d[pred_col]).corr(d[target_col]))

        mean = validation_correlations.mean()
        std = validation_correlations.std(ddof=0)
        sharpe = mean / std

        validation_stats.loc["mean", pred_col] = mean
        validation_stats.loc["std", pred_col] = std
        validation_stats.loc["sharpe", pred_col] = sharpe

        # Running peak of the compounded per-era scores.  A cumulative max
        # replaces the former rolling window of an arbitrary 9000 rows, so
        # the peak is correct for any number of eras.
        daily_value = (validation_correlations + 1).cumprod()
        rolling_max = daily_value.cummax()
        max_drawdown = -((rolling_max - daily_value) / rolling_max).max()
        validation_stats.loc["max_drawdown", pred_col] = max_drawdown

        payout_scores = validation_correlations.clip(-0.25, 0.25)
        payout_daily_value = (payout_scores + 1).cumprod()

        apy = (
            (
                (payout_daily_value.dropna().iloc[-1])
                ** (1 / len(payout_scores))
            )
            ** 49  # 52 weeks of compounding minus 3 for stake compounding lag
            - 1
        ) * 100

        validation_stats.loc["apy", pred_col] = apy

        if not fast_mode:
            # Check the feature exposure of your validation predictions
            max_per_era = validation_data.groupby(ERA_COL).apply(
                lambda d: d[feature_cols].corrwith(d[pred_col]).abs().max())
            max_feature_exposure = max_per_era.mean()
            validation_stats.loc["max_feature_exposure", pred_col] = max_feature_exposure

            # Check feature neutral mean
            feature_neutral_mean = get_feature_neutral_mean(validation_data, pred_col,
                                                            target_col, features_for_neutralization)
            validation_stats.loc["feature_neutral_mean", pred_col] = feature_neutral_mean

            # Check TB200 feature neutral mean
            tb200_feature_neutral_mean_era = validation_data.groupby(ERA_COL).apply(
                lambda df: get_feature_neutral_mean_tb_era(df, pred_col,
                                                           target_col, 200,
                                                           features_for_neutralization))
            validation_stats.loc["tb200_feature_neutral_mean", pred_col] = tb200_feature_neutral_mean_era.mean()

            # Check top and bottom 200 metrics (TB200)
            tb200_validation_correlations = fast_score_by_date(
                validation_data,
                [pred_col],
                target_col,
                tb=200,
                era_col=ERA_COL
            )

            tb200_mean = tb200_validation_correlations.mean()[pred_col]
            tb200_std = tb200_validation_correlations.std(ddof=0)[pred_col]
            tb200_sharpe = tb200_mean / tb200_std

            validation_stats.loc["tb200_mean", pred_col] = tb200_mean
            validation_stats.loc["tb200_std", pred_col] = tb200_std
            validation_stats.loc["tb200_sharpe", pred_col] = tb200_sharpe

            # MMC over validation
            mmc_scores = []
            corr_scores = []
            for _, x in validation_data.groupby(ERA_COL):
                series = neutralize_series(unif(x[pred_col]), (x[example_col]))
                mmc_scores.append(np.cov(series, x[target_col])[0, 1] / (0.29 ** 2))
                corr_scores.append(unif(x[pred_col]).corr(x[target_col]))

            val_mmc_mean = np.mean(mmc_scores)
            corr_plus_mmcs = [c + m for c, m in zip(corr_scores, mmc_scores)]
            corr_plus_mmc_sharpe = np.mean(corr_plus_mmcs) / np.std(corr_plus_mmcs)

            validation_stats.loc["mmc_mean", pred_col] = val_mmc_mean
            validation_stats.loc["corr_plus_mmc_sharpe", pred_col] = corr_plus_mmc_sharpe

            # Check correlation with example predictions
            per_era_corrs = validation_data.groupby(ERA_COL).apply(
                lambda d: unif(d[pred_col]).corr(unif(d[example_col])))
            corr_with_example_preds = per_era_corrs.mean()
            validation_stats.loc["corr_with_example_preds", pred_col] = corr_with_example_preds

            # Check exposure dissimilarity per era
            tdf = validation_data.groupby(ERA_COL).apply(
                lambda df: exposure_dissimilarity_per_era(df, pred_col,
                                                          example_col, feature_cols))
            validation_stats.loc["exposure_dissimilarity_mean", pred_col] = tdf.mean()

    # .transpose so that stats are columns and the model_name is the row
    return validation_stats.transpose()
|