Model Building with BoFire

This notebook shows how to set up, serialize, and analyze surrogate models trained with BoFire. It is still work in progress.

Imports

from pydantic import TypeAdapter

import bofire.surrogates.api as surrogates
from bofire.benchmarks.multi import CrossCoupling
from bofire.benchmarks.single import Himmelblau
from bofire.data_models.domain.api import Outputs
from bofire.data_models.enum import CategoricalEncodingEnum
from bofire.data_models.surrogates.api import (
    AnySurrogate,
    EmpiricalSurrogate,
    MixedSingleTaskGPSurrogate,
    RandomForestSurrogate,
    RegressionMLPEnsemble,
    SingleTaskGPSurrogate,
)

Problem Setup

For didactic purposes, we sample data from the Himmelblau benchmark function, f(x_1, x_2) = (x_1^2 + x_2 - 11)^2 + (x_1 + x_2^2 - 7)^2 on [-6, 6]^2, and use it to train a SingleTaskGP.

benchmark = Himmelblau()
samples = benchmark.domain.inputs.sample(n=50)
experiments = benchmark.f(samples, return_complete=True)

experiments.head(10)
x_1 x_2 y valid_y
0 -3.971659 -5.350046 311.901427 1
1 5.117836 1.145212 267.238115 1
2 -1.322156 -5.054546 501.419285 1
3 0.999569 3.605967 89.930657 1
4 5.441070 -2.289502 279.767243 1
5 -1.916543 0.831947 109.824780 1
6 1.750353 -0.932619 97.840374 1
7 1.947252 -5.695781 916.679727 1
8 5.596591 3.599441 705.689191 1
9 -5.088315 -0.560774 343.977551 1

Model Fitting

input_features = benchmark.domain.inputs
output_features = benchmark.domain.outputs
input_features.model_dump_json()
'{"type":"Inputs","features":[{"type":"ContinuousInput","key":"x_1","unit":null,"bounds":[-6.0,6.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false},{"type":"ContinuousInput","key":"x_2","unit":null,"bounds":[-6.0,6.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false}]}'
output_features.model_dump_json()
'{"type":"Outputs","features":[{"type":"ContinuousOutput","key":"y","unit":null,"objective":{"type":"MinimizeObjective","w":1.0,"bounds":[0.0,1.0]}}]}'

Single Task GP

Generate the JSON spec

# we set up the data model, here a Single Task GP
surrogate_data = SingleTaskGPSurrogate(inputs=input_features, outputs=output_features)

# we generate the json spec
jspec = surrogate_data.model_dump_json()

jspec
'{"hyperconfig":{"type":"SingleTaskGPHyperconfig","hyperstrategy":"FractionalFactorialStrategy","inputs":{"type":"Inputs","features":[{"type":"CategoricalInput","key":"kernel","categories":["rbf","matern_1.5","matern_2.5"],"allowed":[true,true,true]},{"type":"CategoricalInput","key":"prior","categories":["mbo","threesix","hvarfner"],"allowed":[true,true,true]},{"type":"CategoricalInput","key":"scalekernel","categories":["True","False"],"allowed":[true,true]},{"type":"CategoricalInput","key":"ard","categories":["True","False"],"allowed":[true,true]}]},"n_iterations":null,"target_metric":"MAE","lengthscale_constraint":null,"outputscale_constraint":null},"engineered_features":{"type":"EngineeredFeatures","features":[]},"type":"SingleTaskGPSurrogate","inputs":{"type":"Inputs","features":[{"type":"ContinuousInput","key":"x_1","unit":null,"bounds":[-6.0,6.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false},{"type":"ContinuousInput","key":"x_2","unit":null,"bounds":[-6.0,6.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false}]},"outputs":{"type":"Outputs","features":[{"type":"ContinuousOutput","key":"y","unit":null,"objective":{"type":"MinimizeObjective","w":1.0,"bounds":[0.0,1.0]}}]},"input_preprocessing_specs":{},"dump":null,"categorical_encodings":{},"scaler":"NORMALIZE","output_scaler":"STANDARDIZE","kernel":{"type":"RBFKernel","features":null,"ard":true,"lengthscale_prior":{"type":"DimensionalityScaledLogNormalPrior","loc":1.4142135623730951,"loc_scaling":0.5,"scale":1.7320508075688772,"scale_scaling":0.0},"lengthscale_constraint":null},"noise_prior":{"type":"LogNormalPrior","loc":-4.0,"scale":1.0}}'

Load it from the spec

surrogate_data = TypeAdapter(AnySurrogate).validate_json(jspec)

Map it

surrogate = surrogates.map(surrogate_data)

Fit it. This part is not fully finished yet: in the future, a hyperfit method will be called here, which will perform hyperparameter optimization and also return the cross-validation results. For now, just call fit.

surrogate.fit(experiments=experiments)
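
Until hyperfit is available, cross-validation can serve as a quick quality check. A minimal sketch, assuming the cross_validate method of BoFire's trainable surrogates, which returns per-fold train results, test results, and hook results:

# optional: 5-fold cross-validation as a quick quality check
cv_train, cv_test, _ = surrogate.cross_validate(experiments, folds=5)
# aggregated per-fold metrics such as MAE and R2 on the test folds
cv_test.get_metrics()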

Dump it.

# dump it
dump = surrogate.dumps()
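
Both the spec and the dump are plain strings, so they can be written to disk and restored in a fresh session. A minimal sketch (the file names are arbitrary):

# persist the spec and the fitted-model dump
with open("surrogate_spec.json", "w") as f:
    f.write(jspec)
with open("surrogate_dump.txt", "w") as f:
    f.write(dump)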

Make predictions.

# predict with it
df_predictions = surrogate.predict(experiments)
# transform to spec
predictions = surrogate.to_predictions(predictions=df_predictions)
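
predict returns a DataFrame with one prediction column and one uncertainty column per output; for the single output y these should be y_pred and y_sd (a sketch, assuming BoFire's usual column naming):

# point predictions and predictive standard deviations
df_predictions[["y_pred", "y_sd"]].head()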

Load the surrogate again from the spec and the dump, then make predictions and check that they match the ones from before.

surrogate_data = TypeAdapter(AnySurrogate).validate_json(jspec)
surrogate = surrogates.map(surrogate_data)
surrogate.loads(dump)

# predict with it
df_predictions2 = surrogate.predict(experiments)
# transform to spec
predictions2 = surrogate.to_predictions(predictions=df_predictions2)

# check for equality
predictions == predictions2
True

Random Forest

Generate the JSON spec

# we set up the data model, here a Random Forest
surrogate_data = RandomForestSurrogate(
    inputs=input_features,
    outputs=output_features,
    random_state=42,
)

# we generate the json spec
jspec = surrogate_data.model_dump_json()

jspec
'{"hyperconfig":null,"engineered_features":{"type":"EngineeredFeatures","features":[]},"type":"RandomForestSurrogate","inputs":{"type":"Inputs","features":[{"type":"ContinuousInput","key":"x_1","unit":null,"bounds":[-6.0,6.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false},{"type":"ContinuousInput","key":"x_2","unit":null,"bounds":[-6.0,6.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false}]},"outputs":{"type":"Outputs","features":[{"type":"ContinuousOutput","key":"y","unit":null,"objective":{"type":"MinimizeObjective","w":1.0,"bounds":[0.0,1.0]}}]},"input_preprocessing_specs":{},"dump":null,"categorical_encodings":{},"scaler":"NORMALIZE","output_scaler":"STANDARDIZE","n_estimators":100,"criterion":"squared_error","max_depth":null,"min_samples_split":2,"min_samples_leaf":1,"min_weight_fraction_leaf":0.0,"max_features":1.0,"max_leaf_nodes":null,"min_impurity_decrease":0.0,"bootstrap":true,"oob_score":false,"random_state":42,"ccp_alpha":0.0,"max_samples":null}'
# Load it from the spec
surrogate_data = TypeAdapter(AnySurrogate).validate_json(jspec)
# Map it
surrogate = surrogates.map(surrogate_data)
# Fit it
surrogate.fit(experiments=experiments)
# dump it
dump = surrogate.dumps()
# predict with it
df_predictions = surrogate.predict(experiments)
# transform to spec
predictions = surrogate.to_predictions(predictions=df_predictions)
/opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/botorch/models/ensemble.py:82: RuntimeWarning:

Could not update `train_inputs` with transformed inputs since _RandomForest does not have a `train_inputs` attribute. Make sure that the `input_transform` is applied to both the train inputs and test inputs.

(the same RuntimeWarning is raised several more times from botorch and torch; it is harmless here)
surrogate_data = TypeAdapter(AnySurrogate).validate_json(jspec)
surrogate = surrogates.map(surrogate_data)
surrogate.loads(dump)

# predict with it
df_predictions2 = surrogate.predict(experiments)
# transform to spec
predictions2 = surrogate.to_predictions(predictions=df_predictions2)

# check for equality
predictions == predictions2
(the same `train_inputs` RuntimeWarnings as above appear again)
True

MLP Ensemble

Generate the JSON spec

# we set up the data model, here an MLP ensemble
surrogate_data = RegressionMLPEnsemble(
    inputs=input_features,
    outputs=output_features,
    n_estimators=2,
)

# we generate the json spec
jspec = surrogate_data.model_dump_json()

jspec
'{"hyperconfig":null,"engineered_features":{"type":"EngineeredFeatures","features":[]},"type":"RegressionMLPEnsemble","inputs":{"type":"Inputs","features":[{"type":"ContinuousInput","key":"x_1","unit":null,"bounds":[-6.0,6.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false},{"type":"ContinuousInput","key":"x_2","unit":null,"bounds":[-6.0,6.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false}]},"outputs":{"type":"Outputs","features":[{"type":"ContinuousOutput","key":"y","unit":null,"objective":{"type":"MinimizeObjective","w":1.0,"bounds":[0.0,1.0]}}]},"input_preprocessing_specs":{},"dump":null,"categorical_encodings":{},"scaler":"IDENTITY","output_scaler":"IDENTITY","n_estimators":2,"hidden_layer_sizes":[100],"activation":"relu","dropout":0.0,"batch_size":10,"n_epochs":200,"lr":0.0001,"weight_decay":0.0,"subsample_fraction":1.0,"shuffle":true,"final_activation":"identity"}'
# Load it from the spec
surrogate_data = TypeAdapter(AnySurrogate).validate_json(jspec)
# Map it
surrogate = surrogates.map(surrogate_data)
# Fit it
surrogate.fit(experiments=experiments)
# dump it
dump = surrogate.dumps()
# predict with it
df_predictions = surrogate.predict(experiments)
# transform to spec
predictions = surrogate.to_predictions(predictions=df_predictions)
surrogate_data = TypeAdapter(AnySurrogate).validate_json(jspec)
surrogate = surrogates.map(surrogate_data)
surrogate.loads(dump)

# predict with it
df_predictions2 = surrogate.predict(experiments)
# transform to spec
predictions2 = surrogate.to_predictions(predictions=df_predictions2)

# check for equality
predictions == predictions2
True

Empirical Surrogate

The empirical surrogate is special: by default it has no fit method, and (de)serializing it requires cloudpickle. Empirical models can implement a fit, but then they also have to inherit from Trainable. The example below uses the default variant without any fit functionality.

from botorch.models.deterministic import DeterministicModel
from torch import Tensor


class HimmelblauModel(DeterministicModel):
    def __init__(self):
        super().__init__()
        self._num_outputs = 1

    def forward(self, X: Tensor) -> Tensor:
        return (
            (X[..., 0] ** 2 + X[..., 1] - 11.0) ** 2
            + (X[..., 0] + X[..., 1] ** 2 - 7.0) ** 2
        ).unsqueeze(-1)
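
As a quick sanity check (not part of the original flow), the deterministic model reproduces one of the known global minima of Himmelblau's function at (3, 2):

import torch

model = HimmelblauModel()
# f(3, 2) = 0 is one of the four global minima of Himmelblau's function
model(torch.tensor([[3.0, 2.0]]))  # tensor([[0.]])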

surrogate_data = EmpiricalSurrogate(
    inputs=input_features,
    outputs=output_features,
)

# we generate the json spec
jspec = surrogate_data.model_dump_json()

jspec
'{"type":"EmpiricalSurrogate","inputs":{"type":"Inputs","features":[{"type":"ContinuousInput","key":"x_1","unit":null,"bounds":[-6.0,6.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false},{"type":"ContinuousInput","key":"x_2","unit":null,"bounds":[-6.0,6.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false}]},"outputs":{"type":"Outputs","features":[{"type":"ContinuousOutput","key":"y","unit":null,"objective":{"type":"MinimizeObjective","w":1.0,"bounds":[0.0,1.0]}}]},"input_preprocessing_specs":{},"dump":null,"categorical_encodings":{}}'
# Load it from the spec
surrogate_data = TypeAdapter(AnySurrogate).validate_json(jspec)
# Map it
surrogate = surrogates.map(surrogate_data)
# attach the actual model to it
surrogate.model = HimmelblauModel()
# dump it
dump = surrogate.dumps()
# predict with it
df_predictions = surrogate.predict(experiments)
# transform to spec
predictions = surrogate.to_predictions(predictions=df_predictions)
surrogate_data = TypeAdapter(AnySurrogate).validate_json(jspec)
surrogate = surrogates.map(surrogate_data)
surrogate.loads(dump)

# predict with it
df_predictions2 = surrogate.predict(experiments)
# transform to spec
predictions2 = surrogate.to_predictions(predictions=df_predictions2)

# check for equality
predictions == predictions2
True

Mixed GP

Generate data for a mixed problem.

benchmark = CrossCoupling()
samples = benchmark.domain.inputs.sample(n=50)
experiments = benchmark.f(samples, return_complete=True)

experiments.head(10)
base_eq t_res temperature base catalyst yield cost valid_cost valid_yield
0 1.237318 174.085695 45.903435 BTMG tBuXPhos 0.853496 0.302285 1 1
1 1.381075 571.704217 60.349985 TMG tBuBrettPhos 0.143075 0.278321 1 1
2 1.437364 1332.871140 69.652016 BTMG tBuXPhos 0.936670 0.311023 1 1
3 1.185575 683.002135 47.330111 BTMG tBuXPhos 0.873881 0.300025 1 1
4 1.606041 1240.813114 37.372715 DBU tBuXPhos 0.210268 0.249993 1 1
5 1.907710 630.816136 35.735909 DBU tBuBrettPhos 0.387710 0.280354 1 1
6 2.452970 577.157637 43.619563 BTMG AlPhos 1.007552 0.526157 1 1
7 1.383725 1336.089780 50.740549 BTMG tBuXPhos 0.906308 0.308680 1 1
8 2.396409 487.287544 64.252560 DBU AlPhos 0.923412 0.421628 1 1
9 1.883643 1286.205877 32.061843 BTMG tBuXPhos 0.882746 0.330517 1 1
# we set up the data model, here a mixed single-task GP
surrogate_data = MixedSingleTaskGPSurrogate(
    inputs=benchmark.domain.inputs,
    outputs=Outputs(features=[benchmark.domain.outputs.features[0]]),
    categorical_encodings={"catalyst": CategoricalEncodingEnum.ORDINAL},
)

# we generate the json spec
jspec = surrogate_data.model_dump_json()

jspec
'{"hyperconfig":{"type":"MixedSingleTaskGPHyperconfig","hyperstrategy":"FractionalFactorialStrategy","inputs":{"type":"Inputs","features":[{"type":"CategoricalInput","key":"continuous_kernel","categories":["rbf","matern_1.5","matern_2.5"],"allowed":[true,true,true]},{"type":"CategoricalInput","key":"prior","categories":["mbo","threesix","hvarfner"],"allowed":[true,true,true]},{"type":"CategoricalInput","key":"ard","categories":["True","False"],"allowed":[true,true]}]},"n_iterations":null,"target_metric":"MAE"},"engineered_features":{"type":"EngineeredFeatures","features":[]},"type":"MixedSingleTaskGPSurrogate","inputs":{"type":"Inputs","features":[{"type":"CategoricalDescriptorInput","key":"catalyst","categories":["tBuXPhos","tBuBrettPhos","AlPhos"],"allowed":[true,true,true],"descriptors":["area_cat","M2_cat"],"values":[[460.7543,67.2057],[518.8408,89.8738],[819.933,129.0808]]},{"type":"CategoricalDescriptorInput","key":"base","categories":["TEA","TMG","BTMG","DBU"],"allowed":[true,true,true,true],"descriptors":["area","M2"],"values":[[162.2992,25.8165],[165.5447,81.4847],[227.3523,30.554],[192.4693,59.8367]]},{"type":"ContinuousInput","key":"base_eq","unit":null,"bounds":[1.0,2.5],"local_relative_bounds":null,"stepsize":null,"allow_zero":false},{"type":"ContinuousInput","key":"temperature","unit":null,"bounds":[30.0,100.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false},{"type":"ContinuousInput","key":"t_res","unit":null,"bounds":[60.0,1800.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false}]},"outputs":{"type":"Outputs","features":[{"type":"ContinuousOutput","key":"yield","unit":null,"objective":{"type":"MaximizeObjective","w":1.0,"bounds":[0.0,1.0]}}]},"input_preprocessing_specs":{"base":"ORDINAL","catalyst":"ORDINAL"},"dump":null,"categorical_encodings":{"catalyst":"ORDINAL","base":"DESCRIPTOR"},"scaler":"NORMALIZE","output_scaler":"STANDARDIZE","continuous_kernel":{"type":"RBFKernel","features":["t_res","temperature","base","base_eq"],"ard":true,"lengthscale_prior":{"type":"DimensionalityScaledLogNormalPrior","loc":1.4142135623730951,"loc_scaling":0.5,"scale":1.7320508075688772,"scale_scaling":0.0},"lengthscale_constraint":{"type":"GreaterThan","lower_bound":0.025}},"categorical_kernel":{"type":"HammingDistanceKernel","features":["catalyst"],"ard":true,"lengthscale_prior":null,"lengthscale_constraint":{"type":"GreaterThan","lower_bound":1e-6}},"noise_prior":{"type":"LogNormalPrior","loc":-4.0,"scale":1.0}}'
# Load it from the spec
surrogate_data = TypeAdapter(AnySurrogate).validate_json(jspec)
# Map it
surrogate = surrogates.map(surrogate_data)
# Fit it
surrogate.fit(experiments=experiments)
# dump it
dump = surrogate.dumps()
# predict with it
df_predictions = surrogate.predict(experiments)
# transform to spec
predictions = surrogate.to_predictions(predictions=df_predictions)
surrogate_data = TypeAdapter(AnySurrogate).validate_json(jspec)
surrogate = surrogates.map(surrogate_data)
surrogate.loads(dump)

# predict with it
df_predictions2 = surrogate.predict(experiments)
# transform to spec
predictions2 = surrogate.to_predictions(predictions=df_predictions2)

# check for equality
predictions == predictions2
True