Examples¶

This section demonstrates the core functionality of the Lab class, which manages machine learning experiments.

1. Initializing Lab¶

The Lab class is the central orchestrator. It requires a data source, an evaluation metric, a cross-validation strategy, and comparison criteria.

from empml.lab import Lab, ComparisonCriteria
from empml.data import ParquetDownloader
from empml.cv import KFold
from empml.metrics import RMSE

# Build each component first, then hand them to the Lab.
train_source = ParquetDownloader('./data/train.parquet')
error_metric = RMSE()
folds = KFold(n_splits=5, random_state=7)

# An experiment is considered better if it improves >2 folds
# and improves the metric by at least 1%.
criteria = ComparisonCriteria(n_folds_threshold=2, pct_threshold=0.01)

# Initialize the Lab environment
lab = Lab(
    train_downloader=train_source,
    metric=error_metric,
    cv_generator=folds,
    target='target_column',
    comparison_criteria=criteria,
    # True for errors (RMSE), False for scores (Accuracy)
    minimize=True,
    # Unique identifier for each row
    row_id='row_id',
)

2. Running a Pipeline Experiment¶

Create a pipeline, typically one or more transformers followed by a model wrapper, and then run it.

from lightgbm import LGBMRegressor
from empml.pipeline import Pipeline
from empml.wrappers import SKlearnWrapper
from empml.transformers import Log1pFeatures

# Feature engineering step applied to feature1 and feature2
log_step = Log1pFeatures(features=['feature1', 'feature2'])

# Model step: a LightGBM regressor wrapped for use inside the pipeline
model_step = SKlearnWrapper(
    estimator=LGBMRegressor(verbose=-1),
    features=['feature1', 'feature2', 'feature3'],
    target='target_column',
)

# Assemble the named steps into a pipeline
pipeline = Pipeline(
    [('log_features', log_step), ('model', model_step)],
    name='LGBM_Experiment',
    description='LGBM with Log1p transformation',
)

# Run the experiment
lab.run_experiment(pipeline)

3. Running Base Experiments¶

Run a suite of default baseline models (Linear Regression, KNN, Random Forest, etc.) to establish performance benchmarks.

# Establish benchmarks by running the suite of default baseline models
baseline_features = ['feature1', 'feature2', 'feature3']
lab.run_base_experiments(
    features=baseline_features,
    problem_type='regression',  # or 'classification'
)

4. Setting the Best Experiment¶

Mark a specific experiment as the current "best" to compare future experiments against.

# Manually mark one experiment (here ID 1) as the current best
# NOTE(review): the leading underscore conventionally marks a non-public
# method — confirm whether a public setter exists.
best_id = 1
lab._set_best_experiment(experiment_id=best_id)

5. Comparing Against Best Experiment¶

Run a new experiment and automatically compare it against the experiment currently set as best.

# A second pipeline: model only, with different LightGBM parameters
model_v2 = SKlearnWrapper(
    estimator=LGBMRegressor(n_estimators=200, verbose=-1),
    features=['feature1', 'feature2', 'feature3'],
    target='target_column',
)
pipeline_v2 = Pipeline([('model', model_v2)], name='LGBM_v2')

# Option A: run and compare explicitly against the best experiment (ID 1)
lab.run_experiment(pipeline_v2, compare_against=1)

# Option B: use auto_mode to compare against the current best experiment
# automatically; the best_experiment attribute is updated if the new pipeline
# performs better according to the comparison criteria
lab._set_best_experiment(experiment_id=1)
lab.run_experiment(pipeline_v2, auto_mode=True)

6. Hyperparameter Optimization (HPO)¶

Perform grid or random search over a managed search space.

# Hyperparameter search space: candidate values per parameter
params_list = dict(
    n_estimators=[100, 200, 500],
    learning_rate=[0.01, 0.05, 0.1],
    num_leaves=[31, 64],
)

# Launch optimization (random search, num_samples=10)
hpo_features = ['feature1', 'feature2', 'feature3']
best_result = lab.hpo(
    features=hpo_features,
    params_list=params_list,
    estimator=LGBMRegressor,  # the class itself is passed, not an instance
    search_type='random',
    num_samples=10,
    store_preds=False,
)

7. Permutation Feature Importance¶

Analyze which features contribute most to the model's performance.

# polars is needed here because the annotation below is evaluated when the
# statement runs; without this import the snippet raises NameError on `pl`.
import polars as pl

# Retrieve the best pipeline (experiment ID 1)
pipeline = lab.retrieve_pipeline(experiment_id=1)

# Calculate feature importance
pfi: pl.DataFrame = lab.permutation_feature_importance(
    pipeline=pipeline,
    features=['feature1', 'feature2', 'feature3'],
    n_iters=5,
)

# Bare trailing expression: displays the result when run in a notebook cell
pfi

8. Retrieving Predictions & Error Analysis¶

Load out-of-fold predictions to analyze where the model fails.

import polars as pl

# Which experiments to load predictions for
experiment_ids = [1]
# Optional: extra columns from the training data to attach to the predictions
extra_columns = ['date']

# Retrieve predictions from the selected experiments
preds = lab.retrieve_predictions(
    experiment_ids=experiment_ids,
    extra_features=extra_columns,
)

9. Multi-Metric Lab¶

Evaluate models on multiple metrics simultaneously. The Lab tracks all metrics and requires improvement on all of them for a model to be considered better.

from empml.lab import Lab, ComparisonCriteria
from empml.data import ParquetDownloader
from empml.cv import KFold
from empml.metrics import RMSE, MAE

# Components of the multi-metric Lab
train_source = ParquetDownloader('./data/train.parquet')
metrics = [RMSE(), MAE()]
folds = KFold(n_splits=5, random_state=7)
criteria = ComparisonCriteria(n_folds_threshold=2, pct_threshold=0.01)

# Initialize Lab with multiple metrics
lab = Lab(
    train_downloader=train_source,
    metric=metrics,
    cv_generator=folds,
    target='target_column',
    comparison_criteria=criteria,
    # one flag per metric: both RMSE and MAE should be minimized
    minimize=[True, True],
    name='multi_metric_lab',
)

# Experiments run as usual; the results contain suffixed columns
# (cv_mean_score_1 for RMSE, cv_mean_score_2 for MAE)
lab.run_experiment(pipeline, auto_mode=True)

# View the best score for each metric
lab.show_best_score(metric_idx=0)  # best by RMSE
lab.show_best_score(metric_idx=1)  # best by MAE

# HPO with multiple metrics: optimize on one specific metric or on all of them
search_space = {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1]}
best_result = lab.hpo(
    features=['feature1', 'feature2', 'feature3'],
    params_list=search_space,
    estimator=LGBMRegressor,
    primary_metric_idx='all',  # or 0 for RMSE only, 1 for MAE only
)

10. Loading Data from SQL Databases¶

EmpiricML provides built-in downloaders for SQL databases and cloud data warehouses.

from empml.data import PostgreSQLDownloader, SQLiteDownloader, SQLDownloader

# The same query is issued through each downloader below
table_query = 'SELECT * FROM my_table'

# PostgreSQL: connection details passed as individual arguments
pg_downloader = PostgreSQLDownloader(
    query=table_query,
    host='localhost',
    user='my_user',
    password='my_password',
    database='my_database',
    port=5432,
)

# SQLite: only the path to the database file is needed
sqlite_downloader = SQLiteDownloader(query=table_query, path='/path/to/database.db')

# Generic SQL: everything encoded in a single connection URI
sql_downloader = SQLDownloader(
    query=table_query,
    connection_uri='postgresql://user:password@host:5432/database',
)

# Any downloader can be used as the train or test data source
error_metric = RMSE()
folds = KFold(n_splits=5)
criteria = ComparisonCriteria(n_folds_threshold=1, pct_threshold=0.01)
lab = Lab(
    train_downloader=pg_downloader,
    metric=error_metric,
    cv_generator=folds,
    target='target_column',
    comparison_criteria=criteria,
    minimize=True,
)

11. Recursive Feature Selection¶

Automatically identify and remove features that hurt model performance using permutation-based importance.

from lightgbm import LGBMRegressor

# Candidate features: feature1 .. feature5
candidate_features = [f'feature{i}' for i in range(1, 6)]

# Recursively remove features with negative importance
selected_features = lab.recursive_permutation_feature_selection(
    estimator=LGBMRegressor(verbose=-1),
    features=candidate_features,
    n_iters=5,
    verbose=True,
)

# Only the features that contribute positively remain in selected_features
print(f"Selected features: {selected_features}")

12. Using Transformers in Pipelines¶

EmpiricML provides a rich set of transformers for feature engineering.

Feature Interactions¶

from empml.transformers import InteractionFeatures

# Pairs of columns to multiply together
pairs = [('feature1', 'feature2'), ('feature2', 'feature3')]

# Create pairwise multiplication features
interactions = InteractionFeatures(feature_pairs=pairs, separator='_x_')
# Resulting columns: feature1_x_feature2, feature2_x_feature3

Frequency Encoding¶

from empml.transformers import FrequencyEncoder

# Encode categories by their frequency:
#   normalize=True          -> proportion instead of raw count
#   replace_original=False  -> keep the original column
categorical_columns = ['category_col']
freq_encoder = FrequencyEncoder(
    features=categorical_columns,
    normalize=True,
    replace_original=False,
)

Robust Scaling¶

from empml.transformers import RobustScaler

# Outlier-resistant scaling based on median/IQR
scaled_columns = ['feature1', 'feature2']
scaler = RobustScaler(features=scaled_columns)

Quantile Binning¶

from empml.transformers import QuantileBinning

# One label per bin (five bins, five labels)
bin_labels = ['very_low', 'low', 'medium', 'high', 'very_high']

# Discretize the feature into quantile-based bins
binner = QuantileBinning(
    features=['continuous_feature'],
    num_bins=5,
    labels=bin_labels,
)

Clustering and Dimensionality Reduction¶

from empml.transformers import KMeansCluster, PCATransformer

# Add cluster labels as a new feature named 'cluster_label'
cluster_inputs = ['feature1', 'feature2', 'feature3']
kmeans = KMeansCluster(
    features=cluster_inputs,
    num_clusters=5,
    new_feature='cluster_label',
)

# Reduce four input features to two components with PCA
pca_inputs = ['feature1', 'feature2', 'feature3', 'feature4']
pca = PCATransformer(features=pca_inputs, n_components=2, prefix='pc_')

Combining Transformers in a Pipeline¶

from empml.pipeline import Pipeline
from empml.transformers import StandardScaler, InteractionFeatures, PCATransformer
from empml.wrappers import SKlearnWrapper
from lightgbm import LGBMRegressor

# Build a feature engineering + model pipeline.
# Column names follow the other examples on this page (feature1..feature3 and
# 'target_column'), so the pipeline matches the Lab it is run on.
pipeline = Pipeline([
    ('interactions', InteractionFeatures(
        feature_pairs=[('feature1', 'feature2'), ('feature2', 'feature3')],
        # Set explicitly so the generated column names used by the model step
        # do not depend on the transformer's default
        separator='_x_'
    )),
    ('scaler', StandardScaler(features=['feature1', 'feature2', 'feature3'])),
    ('pca', PCATransformer(
        features=['feature1', 'feature2', 'feature3'],
        n_components=2,
        # Set explicitly for the same reason as the separator above
        prefix='pc_'
    )),
    ('model', SKlearnWrapper(
        estimator=LGBMRegressor(verbose=-1),
        # Original features + interaction columns + PCA components.
        # NOTE(review): 'pc_0'/'pc_1' assumes components are numbered from 0 — confirm
        features=[
            'feature1', 'feature2', 'feature3',
            'feature1_x_feature2', 'feature2_x_feature3',
            'pc_0', 'pc_1'
        ],
        target='target_column'
    ))
], name='Advanced Pipeline')

lab.run_experiment(pipeline)