Examples¶
This section demonstrates the core functionalities of the Lab class for managing machine learning experiments.
1. Initializing Lab¶
The Lab class is the central orchestrator. It requires a data source, an evaluation metric, a cross-validation strategy, and comparison criteria.
from empml.lab import Lab, ComparisonCriteria
from empml.data import ParquetDownloader
from empml.cv import KFold
from empml.metrics import RMSE
# Initialize the Lab environment
lab = Lab(
train_downloader=ParquetDownloader('./data/train.parquet'),
metric=RMSE(),
cv_generator=KFold(n_splits=5, random_state=7),
target='target_column',
comparison_criteria=ComparisonCriteria(
n_folds_threshold=2, # Experiment considered better if it improves the metric in more than 2 folds
pct_threshold=0.01 # and improves metric by at least 1%
),
minimize=True, # True for errors (RMSE), False for scores (Accuracy)
row_id='row_id' # Unique identifier for each row
)
2. Running a Pipeline Experiment¶
Create a pipeline, typically consisting of transformers and a model wrapper, then run it.
from lightgbm import LGBMRegressor
from empml.pipeline import Pipeline
from empml.wrappers import SKlearnWrapper
from empml.transformers import Log1pFeatures
# Define a pipeline with feature engineering and model
pipeline = Pipeline([
('log_features', Log1pFeatures(features=['feature1', 'feature2'])),
('model', SKlearnWrapper(
estimator=LGBMRegressor(verbose=-1),
features=['feature1', 'feature2', 'feature3'],
target='target_column'
))
], name='LGBM_Experiment', description='LGBM with Log1p transformation')
# Run the experiment
lab.run_experiment(pipeline)
3. Running Base Experiments¶
Run a suite of default baseline models (Linear Regression, KNN, Random Forest, etc.) to establish performance benchmarks.
# Run a suite of baseline models
lab.run_base_experiments(
features=['feature1', 'feature2', 'feature3'],
problem_type='regression' # or 'classification'
)
4. Setting the Best Experiment¶
Mark a specific experiment as the current "best" (via `lab._set_best_experiment(experiment_id)`, as shown in the next section) so that future experiments can be compared against it.
5. Comparing Against Best Experiment¶
Run a new experiment and automatically compare it against the set baseline.
# Define another pipeline with different parameters
pipeline_v2 = Pipeline([
('model', SKlearnWrapper(
estimator=LGBMRegressor(n_estimators=200, verbose=-1),
features=['feature1', 'feature2', 'feature3'],
target='target_column'
))
], name='LGBM_v2')
# Run and compare against the best experiment (ID 1)
lab.run_experiment(pipeline_v2, compare_against=1)
# Alternatively, use auto_mode to automatically compare against the current best experiment
# and update the best_experiment attribute if the new pipeline performs better according to the comparison criteria
lab._set_best_experiment(1)
lab.run_experiment(pipeline_v2, auto_mode=True)
6. Hyperparameter Optimization (HPO)¶
Perform grid or random search over a managed search space.
# Define hyperparameter search space
params_list = {
'n_estimators': [100, 200, 500],
'learning_rate': [0.01, 0.05, 0.1],
'num_leaves': [31, 64]
}
# Launch optimization
best_result = lab.hpo(
features=['feature1', 'feature2', 'feature3'],
params_list=params_list,
estimator=LGBMRegressor,
search_type='random',
num_samples=10,
store_preds=False
)
7. Permutation Feature Importance¶
Analyze which features contribute most to the model's performance.
# Retrieve the best pipeline
pipeline = lab.retrieve_pipeline(experiment_id=1)
# Calculate feature importance
pfi: pl.DataFrame = lab.permutation_feature_importance(
pipeline=pipeline,
features=['feature1', 'feature2', 'feature3'],
n_iters=5
)
pfi
8. Retrieving Predictions & Error Analysis¶
Load out-of-fold predictions to analyze where the model fails.
import polars as pl
# Retrieve predictions from specific experiments
preds = lab.retrieve_predictions(
experiment_ids=[1], # list of experiment IDs
extra_features=['date'] # Optional: add extra columns from training data
)
9. Multi-Metric Lab¶
Evaluate models on multiple metrics simultaneously. The Lab tracks all metrics and requires improvement on all of them for a model to be considered better.
from empml.lab import Lab, ComparisonCriteria
from empml.data import ParquetDownloader
from empml.cv import KFold
from empml.metrics import RMSE, MAE
# Initialize Lab with multiple metrics
lab = Lab(
train_downloader=ParquetDownloader('./data/train.parquet'),
metric=[RMSE(), MAE()],
cv_generator=KFold(n_splits=5, random_state=7),
target='target_column',
comparison_criteria=ComparisonCriteria(
n_folds_threshold=2,
pct_threshold=0.01
),
minimize=[True, True], # both RMSE and MAE should be minimized
name='multi_metric_lab'
)
# Run experiments as usual - results will contain suffixed columns
# (cv_mean_score_1 for RMSE, cv_mean_score_2 for MAE)
lab.run_experiment(pipeline, auto_mode=True)
# View best score for each metric
lab.show_best_score(metric_idx=0) # best by RMSE
lab.show_best_score(metric_idx=1) # best by MAE
# HPO with multi-metric: optimize based on a specific metric or all
best_result = lab.hpo(
features=['feature1', 'feature2', 'feature3'],
params_list={'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1]},
estimator=LGBMRegressor,
primary_metric_idx='all' # or 0 for RMSE only, 1 for MAE only
)
10. Loading Data from SQL Databases¶
EmpiricML provides built-in downloaders for SQL databases and cloud data warehouses.
from empml.data import PostgreSQLDownloader, SQLiteDownloader, SQLDownloader
# PostgreSQL
pg_downloader = PostgreSQLDownloader(
query='SELECT * FROM my_table',
host='localhost',
user='my_user',
password='my_password',
database='my_database',
port=5432
)
# SQLite
sqlite_downloader = SQLiteDownloader(
query='SELECT * FROM my_table',
path='/path/to/database.db'
)
# Generic SQL with connection URI
sql_downloader = SQLDownloader(
query='SELECT * FROM my_table',
connection_uri='postgresql://user:password@host:5432/database'
)
# Use any downloader as train or test data source
lab = Lab(
train_downloader=pg_downloader,
metric=RMSE(),
cv_generator=KFold(n_splits=5),
target='target_column',
comparison_criteria=ComparisonCriteria(n_folds_threshold=1, pct_threshold=0.01),
minimize=True
)
11. Recursive Feature Selection¶
Automatically identify and remove features that hurt model performance using permutation-based importance.
from lightgbm import LGBMRegressor
# Recursively remove features with negative importance
selected_features = lab.recursive_permutation_feature_selection(
estimator=LGBMRegressor(verbose=-1),
features=['feature1', 'feature2', 'feature3', 'feature4', 'feature5'],
n_iters=5,
verbose=True
)
# selected_features contains only the features that contribute positively
print(f"Selected features: {selected_features}")
12. Using Transformers in Pipelines¶
EmpiricML provides a rich set of transformers for feature engineering.
Feature Interactions¶
from empml.transformers import InteractionFeatures
# Create pairwise multiplication features
interactions = InteractionFeatures(
feature_pairs=[('feature1', 'feature2'), ('feature2', 'feature3')],
separator='_x_'
)
# Creates columns: feature1_x_feature2, feature2_x_feature3
Frequency Encoding¶
from empml.transformers import FrequencyEncoder
# Encode categories by their frequency
freq_encoder = FrequencyEncoder(
features=['category_col'],
normalize=True, # proportion instead of raw count
replace_original=False # keep original column
)
Robust Scaling¶
from empml.transformers import RobustScaler
# Scale features using median/IQR (outlier-resistant)
scaler = RobustScaler(features=['feature1', 'feature2'])
Quantile Binning¶
from empml.transformers import QuantileBinning
# Discretize into quantile-based bins
binner = QuantileBinning(
features=['continuous_feature'],
num_bins=5,
labels=['very_low', 'low', 'medium', 'high', 'very_high']
)
Clustering and Dimensionality Reduction¶
from empml.transformers import KMeansCluster, PCATransformer
# Add cluster labels as a new feature
kmeans = KMeansCluster(
features=['feature1', 'feature2', 'feature3'],
num_clusters=5,
new_feature='cluster_label'
)
# Reduce dimensions with PCA
pca = PCATransformer(
features=['feature1', 'feature2', 'feature3', 'feature4'],
n_components=2,
prefix='pc_'
)
Combining Transformers in a Pipeline¶
from empml.pipeline import Pipeline
from empml.transformers import StandardScaler, InteractionFeatures, PCATransformer
from empml.wrappers import SKlearnWrapper
from lightgbm import LGBMRegressor
# Build a feature engineering + model pipeline
pipeline = Pipeline([
('interactions', InteractionFeatures(
feature_pairs=[('f1', 'f2'), ('f2', 'f3')]
)),
('scaler', StandardScaler(features=['f1', 'f2', 'f3'])),
('pca', PCATransformer(features=['f1', 'f2', 'f3'], n_components=2)),
('model', SKlearnWrapper(
estimator=LGBMRegressor(verbose=-1),
features=['f1', 'f2', 'f3', 'f1_x_f2', 'f2_x_f3', 'pc_0', 'pc_1'],
target='target'
))
], name='Advanced Pipeline')
lab.run_experiment(pipeline)