from pydantic import TypeAdapter
import bofire.surrogates.api as surrogates
from bofire.benchmarks.multi import CrossCoupling
from bofire.benchmarks.single import Himmelblau
from bofire.data_models.domain.api import Outputs
from bofire.data_models.enum import CategoricalEncodingEnum
from bofire.data_models.surrogates.api import (
AnySurrogate,
EmpiricalSurrogate,
MixedSingleTaskGPSurrogate,
RandomForestSurrogate,
RegressionMLPEnsemble,
SingleTaskGPSurrogate,
)Model Building with BoFire
This notebook shows how to set up and analyze models trained with BoFire. It is still a work in progress.
Imports
Problem Setup
For didactic purposes, we sample data from a Himmelblau benchmark function and use them to train a SingleTaskGP.
# instantiate the Himmelblau benchmark
benchmark = Himmelblau()
# draw 50 samples from the input features of the benchmark's domain
samples = benchmark.domain.inputs.sample(n=50)
# evaluate the benchmark function on the samples to obtain the experiments
experiments = benchmark.f(samples, return_complete=True)
experiments.head(10)| x_1 | x_2 | y | valid_y | |
|---|---|---|---|---|
| 0 | -0.896581 | 4.599956 | 207.224798 | 1 |
| 1 | -5.897887 | 0.548125 | 750.799868 | 1 |
| 2 | -4.274336 | -1.807946 | 93.924244 | 1 |
| 3 | -2.002621 | 5.486409 | 447.387662 | 1 |
| 4 | -0.227918 | -5.297903 | 698.230703 | 1 |
| 5 | 3.520381 | 2.851489 | 39.651614 | 1 |
| 6 | -4.564471 | -4.542268 | 110.230228 | 1 |
| 7 | -4.615929 | -1.903287 | 134.513965 | 1 |
| 8 | -4.307464 | 0.646029 | 185.839013 | 1 |
| 9 | 4.551324 | -0.243335 | 95.413389 | 1 |
Model Fitting
input_features = benchmark.domain.inputs
output_features = benchmark.domain.outputsinput_features.model_dump_json()'{"type":"Inputs","features":[{"type":"ContinuousInput","key":"x_1","unit":null,"bounds":[-6.0,6.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false},{"type":"ContinuousInput","key":"x_2","unit":null,"bounds":[-6.0,6.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false}]}'
output_features.model_dump_json()'{"type":"Outputs","features":[{"type":"ContinuousOutput","key":"y","unit":null,"objective":{"type":"MinimizeObjective","w":1.0,"bounds":[0.0,1.0]}}]}'
Single Task GP
Generate the json spec
# we set up the data model, here a Single Task GP
surrogate_data = SingleTaskGPSurrogate(inputs=input_features, outputs=output_features)
# we generate the JSON spec
jspec = surrogate_data.model_dump_json()
jspec'{"hyperconfig":{"type":"SingleTaskGPHyperconfig","hyperstrategy":"FractionalFactorialStrategy","inputs":{"type":"Inputs","features":[{"type":"CategoricalInput","key":"kernel","categories":["rbf","matern_1.5","matern_2.5"],"allowed":[true,true,true]},{"type":"CategoricalInput","key":"prior","categories":["mbo","threesix","hvarfner"],"allowed":[true,true,true]},{"type":"CategoricalInput","key":"scalekernel","categories":["True","False"],"allowed":[true,true]},{"type":"CategoricalInput","key":"ard","categories":["True","False"],"allowed":[true,true]}]},"n_iterations":null,"target_metric":"MAE","lengthscale_constraint":null,"outputscale_constraint":null},"engineered_features":{"type":"EngineeredFeatures","features":[]},"type":"SingleTaskGPSurrogate","inputs":{"type":"Inputs","features":[{"type":"ContinuousInput","key":"x_1","unit":null,"bounds":[-6.0,6.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false},{"type":"ContinuousInput","key":"x_2","unit":null,"bounds":[-6.0,6.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false}]},"outputs":{"type":"Outputs","features":[{"type":"ContinuousOutput","key":"y","unit":null,"objective":{"type":"MinimizeObjective","w":1.0,"bounds":[0.0,1.0]}}]},"input_preprocessing_specs":{},"dump":null,"categorical_encodings":{},"scaler":{"type":"Normalize","features":[]},"output_scaler":"STANDARDIZE","kernel":{"type":"RBFKernel","features":null,"ard":true,"lengthscale_prior":{"type":"DimensionalityScaledLogNormalPrior","loc":1.4142135623730951,"loc_scaling":0.5,"scale":1.7320508075688772,"scale_scaling":0.0},"lengthscale_constraint":null},"noise_prior":{"type":"LogNormalPrior","loc":-4.0,"scale":1.0}}'
Load it from the spec
surrogate_data = TypeAdapter(AnySurrogate).validate_json(jspec)Map it
surrogate = surrogates.map(surrogate_data)Fit it. This step is not yet finished: in the future we will call hyperfit here, which will return the CV results etc. For now, ignore this and just call fit.
surrogate.fit(experiments=experiments)/opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/bofire/surrogates/botorch.py:181: UserWarning:
The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:213.)
Dump it.
# dump it
dump = surrogate.dumps()Make predictions.
# predict with it
df_predictions = surrogate.predict(experiments)
# transform to spec
predictions = surrogate.to_predictions(predictions=df_predictions)Load again from spec and dump and make predictions.
# Load the data model again from the JSON spec, map it to a surrogate and
# load the previously created dump into it.
surrogate_data = TypeAdapter(AnySurrogate).validate_json(jspec)
surrogate = surrogates.map(surrogate_data)
surrogate.loads(dump)
# predict with it
df_predictions2 = surrogate.predict(experiments)
# transform to spec
predictions2 = surrogate.to_predictions(predictions=df_predictions2)
# check for equality
predictions == predictions2
# Output: True
Random Forest
Generate the json spec
# we set up the data model, here a Random Forest with a fixed random state
surrogate_data = RandomForestSurrogate(
inputs=input_features,
outputs=output_features,
random_state=42,
)
# we generate the JSON spec
jspec = surrogate_data.model_dump_json()
jspec'{"hyperconfig":null,"engineered_features":{"type":"EngineeredFeatures","features":[]},"type":"RandomForestSurrogate","inputs":{"type":"Inputs","features":[{"type":"ContinuousInput","key":"x_1","unit":null,"bounds":[-6.0,6.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false},{"type":"ContinuousInput","key":"x_2","unit":null,"bounds":[-6.0,6.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false}]},"outputs":{"type":"Outputs","features":[{"type":"ContinuousOutput","key":"y","unit":null,"objective":{"type":"MinimizeObjective","w":1.0,"bounds":[0.0,1.0]}}]},"input_preprocessing_specs":{},"dump":null,"categorical_encodings":{},"scaler":{"type":"Normalize","features":[]},"output_scaler":"STANDARDIZE","n_estimators":100,"criterion":"squared_error","max_depth":null,"min_samples_split":2,"min_samples_leaf":1,"min_weight_fraction_leaf":0.0,"max_features":1.0,"max_leaf_nodes":null,"min_impurity_decrease":0.0,"bootstrap":true,"oob_score":false,"random_state":42,"ccp_alpha":0.0,"max_samples":null}'
# Load the data model from the JSON spec
surrogate_data = TypeAdapter(AnySurrogate).validate_json(jspec)
# Map the data model to the actual surrogate
surrogate = surrogates.map(surrogate_data)
# Fit it on the experiments
surrogate.fit(experiments=experiments)
# Dump the surrogate so that it can be loaded again later
dump = surrogate.dumps()
# Predict with it
df_predictions = surrogate.predict(experiments)
# Transform the predictions to spec
predictions = surrogate.to_predictions(predictions=df_predictions)/opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/botorch/models/ensemble.py:82: RuntimeWarning:
Could not update `train_inputs` with transformed inputs since _RandomForest does not have a `train_inputs` attribute. Make sure that the `input_transform` is applied to both the train inputs and test inputs.
/opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/torch/nn/modules/module.py:2916: RuntimeWarning:
Could not update `train_inputs` with transformed inputs since _RandomForest does not have a `train_inputs` attribute. Make sure that the `input_transform` is applied to both the train inputs and test inputs.
/opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/botorch/models/ensemble.py:82: RuntimeWarning:
Could not update `train_inputs` with transformed inputs since _RandomForest does not have a `train_inputs` attribute. Make sure that the `input_transform` is applied to both the train inputs and test inputs.
/opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/torch/nn/modules/module.py:2916: RuntimeWarning:
Could not update `train_inputs` with transformed inputs since _RandomForest does not have a `train_inputs` attribute. Make sure that the `input_transform` is applied to both the train inputs and test inputs.
# Load the data model again from the JSON spec, map it to a surrogate and
# load the previously created dump into it.
surrogate_data = TypeAdapter(AnySurrogate).validate_json(jspec)
surrogate = surrogates.map(surrogate_data)
surrogate.loads(dump)
# predict with it
df_predictions2 = surrogate.predict(experiments)
# transform to spec
predictions2 = surrogate.to_predictions(predictions=df_predictions2)
# check for equality
predictions == predictions2/opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/botorch/models/ensemble.py:82: RuntimeWarning:
Could not update `train_inputs` with transformed inputs since _RandomForest does not have a `train_inputs` attribute. Make sure that the `input_transform` is applied to both the train inputs and test inputs.
/opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/torch/nn/modules/module.py:2916: RuntimeWarning:
Could not update `train_inputs` with transformed inputs since _RandomForest does not have a `train_inputs` attribute. Make sure that the `input_transform` is applied to both the train inputs and test inputs.
/opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/botorch/models/ensemble.py:82: RuntimeWarning:
Could not update `train_inputs` with transformed inputs since _RandomForest does not have a `train_inputs` attribute. Make sure that the `input_transform` is applied to both the train inputs and test inputs.
/opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/torch/nn/modules/module.py:2916: RuntimeWarning:
Could not update `train_inputs` with transformed inputs since _RandomForest does not have a `train_inputs` attribute. Make sure that the `input_transform` is applied to both the train inputs and test inputs.
True
MLP Ensemble
Generate the json spec
# we set up the data model, here an MLP ensemble with two estimators
surrogate_data = RegressionMLPEnsemble(
inputs=input_features,
outputs=output_features,
n_estimators=2,
)
# we generate the JSON spec
jspec = surrogate_data.model_dump_json()
jspec'{"hyperconfig":null,"engineered_features":{"type":"EngineeredFeatures","features":[]},"type":"RegressionMLPEnsemble","inputs":{"type":"Inputs","features":[{"type":"ContinuousInput","key":"x_1","unit":null,"bounds":[-6.0,6.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false},{"type":"ContinuousInput","key":"x_2","unit":null,"bounds":[-6.0,6.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false}]},"outputs":{"type":"Outputs","features":[{"type":"ContinuousOutput","key":"y","unit":null,"objective":{"type":"MinimizeObjective","w":1.0,"bounds":[0.0,1.0]}}]},"input_preprocessing_specs":{},"dump":null,"categorical_encodings":{},"scaler":null,"output_scaler":"IDENTITY","n_estimators":2,"hidden_layer_sizes":[100],"activation":"relu","dropout":0.0,"batch_size":10,"n_epochs":200,"lr":0.0001,"weight_decay":0.0,"subsample_fraction":1.0,"shuffle":true,"final_activation":"identity"}'
# Load it from the spec
surrogate_data = TypeAdapter(AnySurrogate).validate_json(jspec)
# Map it
surrogate = surrogates.map(surrogate_data)
# Fit it
surrogate.fit(experiments=experiments)
# dump it
dump = surrogate.dumps()
# predict with it
df_predictions = surrogate.predict(experiments)
# transform to spec
predictions = surrogate.to_predictions(predictions=df_predictions)

# Load the data model again from the JSON spec, map it to a surrogate and
# load the previously created dump into it.
surrogate_data = TypeAdapter(AnySurrogate).validate_json(jspec)
surrogate = surrogates.map(surrogate_data)
surrogate.loads(dump)
# predict with it
df_predictions2 = surrogate.predict(experiments)
# transform to spec
predictions2 = surrogate.to_predictions(predictions=df_predictions2)
# check for equality
predictions == predictions2
# Output: True
Empirical Surrogate
The empirical model is special: by default it has no fit, and you need cloudpickle. There can be empirical models that implement a fit, but for this they also have to inherit from Trainable. The current example is the default one without any fit functionality.
from botorch.models.deterministic import DeterministicModel
from torch import Tensor
class HimmelblauModel(DeterministicModel):
    """Deterministic model that evaluates the Himmelblau function.

    The value is ``(x0**2 + x1 - 11)**2 + (x0 + x1**2 - 7)**2``, where ``x0``
    and ``x1`` are taken from indices 0 and 1 of the last input dimension.
    """

    def __init__(self):
        super().__init__()
        # the model has a single output
        self._num_outputs = 1

    def forward(self, X: Tensor) -> Tensor:
        """Evaluate the Himmelblau function on ``X``.

        Args:
            X: Tensor whose last dimension is indexed at positions 0 and 1.

        Returns:
            The function values with a trailing singleton dimension appended.
        """
        return (
            (X[..., 0] ** 2 + X[..., 1] - 11.0) ** 2
            + (X[..., 0] + X[..., 1] ** 2 - 7.0) ** 2
        ).unsqueeze(-1)


# we set up the data model, here an empirical surrogate
surrogate_data = EmpiricalSurrogate(
    inputs=input_features,
    outputs=output_features,
)
# we generate the JSON spec
jspec = surrogate_data.model_dump_json()
jspec'{"type":"EmpiricalSurrogate","inputs":{"type":"Inputs","features":[{"type":"ContinuousInput","key":"x_1","unit":null,"bounds":[-6.0,6.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false},{"type":"ContinuousInput","key":"x_2","unit":null,"bounds":[-6.0,6.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false}]},"outputs":{"type":"Outputs","features":[{"type":"ContinuousOutput","key":"y","unit":null,"objective":{"type":"MinimizeObjective","w":1.0,"bounds":[0.0,1.0]}}]},"input_preprocessing_specs":{},"dump":null,"categorical_encodings":{}}'
# Load it from the spec
surrogate_data = TypeAdapter(AnySurrogate).validate_json(jspec)
# Map it
surrogate = surrogates.map(surrogate_data)
# attach the actual model to it (no fit is called for this surrogate)
surrogate.model = HimmelblauModel()
# dump it
dump = surrogate.dumps()
# predict with it
df_predictions = surrogate.predict(experiments)
# transform to spec
predictions = surrogate.to_predictions(predictions=df_predictions)

# Load the data model again from the JSON spec, map it to a surrogate and
# load the previously created dump into it.
surrogate_data = TypeAdapter(AnySurrogate).validate_json(jspec)
surrogate = surrogates.map(surrogate_data)
surrogate.loads(dump)
# predict with it
df_predictions2 = surrogate.predict(experiments)
# transform to spec
predictions2 = surrogate.to_predictions(predictions=df_predictions2)
# check for equality
predictions == predictions2
# Output: True
Mixed GP
Generate data for a mixed problem.
# instantiate the CrossCoupling benchmark
benchmark = CrossCoupling()
# draw 50 samples from the input features of the benchmark's domain
samples = benchmark.domain.inputs.sample(n=50)
# evaluate the benchmark function on the samples to obtain the experiments
experiments = benchmark.f(samples, return_complete=True)
experiments.head(10)| base_eq | t_res | temperature | base | catalyst | yield | cost | valid_cost | valid_yield | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.010750 | 648.489731 | 65.673108 | DBU | AlPhos | 0.897967 | 0.421207 | 1 | 1 |
| 1 | 2.370391 | 1257.594951 | 43.114574 | DBU | tBuXPhos | 0.345410 | 0.250828 | 1 | 1 |
| 2 | 2.168319 | 746.509995 | 80.135612 | TEA | tBuBrettPhos | 0.048166 | 0.279060 | 1 | 1 |
| 3 | 2.057409 | 1692.915895 | 82.970650 | DBU | tBuXPhos | 1.103953 | 0.250486 | 1 | 1 |
| 4 | 1.666127 | 605.646729 | 41.158891 | TMG | tBuBrettPhos | 0.045550 | 0.278331 | 1 | 1 |
| 5 | 2.394261 | 1598.818912 | 50.759191 | DBU | AlPhos | 0.884705 | 0.421626 | 1 | 1 |
| 6 | 1.404624 | 831.862561 | 56.338940 | BTMG | AlPhos | 0.968719 | 0.480365 | 1 | 1 |
| 7 | 1.145203 | 463.327576 | 86.214229 | TMG | tBuXPhos | 0.330253 | 0.248281 | 1 | 1 |
| 8 | 1.663057 | 1360.198430 | 54.162292 | TMG | AlPhos | 0.229147 | 0.419072 | 1 | 1 |
| 9 | 1.771235 | 896.557303 | 98.890117 | DBU | tBuBrettPhos | 1.051289 | 0.280205 | 1 | 1 |
# we set up the data model, here a Mixed Single Task GP; only the first output
# feature of the benchmark is modelled and "catalyst" is ordinal-encoded
surrogate_data = MixedSingleTaskGPSurrogate(
inputs=benchmark.domain.inputs,
outputs=Outputs(features=[benchmark.domain.outputs.features[0]]),
categorical_encodings={"catalyst": CategoricalEncodingEnum.ORDINAL},
)
# we generate the JSON spec
jspec = surrogate_data.model_dump_json()
jspec'{"hyperconfig":{"type":"MixedSingleTaskGPHyperconfig","hyperstrategy":"FractionalFactorialStrategy","inputs":{"type":"Inputs","features":[{"type":"CategoricalInput","key":"continuous_kernel","categories":["rbf","matern_1.5","matern_2.5"],"allowed":[true,true,true]},{"type":"CategoricalInput","key":"prior","categories":["mbo","threesix","hvarfner"],"allowed":[true,true,true]},{"type":"CategoricalInput","key":"ard","categories":["True","False"],"allowed":[true,true]}]},"n_iterations":null,"target_metric":"MAE"},"engineered_features":{"type":"EngineeredFeatures","features":[]},"type":"MixedSingleTaskGPSurrogate","inputs":{"type":"Inputs","features":[{"type":"CategoricalDescriptorInput","key":"catalyst","categories":["tBuXPhos","tBuBrettPhos","AlPhos"],"allowed":[true,true,true],"descriptors":["area_cat","M2_cat"],"values":[[460.7543,67.2057],[518.8408,89.8738],[819.933,129.0808]]},{"type":"CategoricalDescriptorInput","key":"base","categories":["TEA","TMG","BTMG","DBU"],"allowed":[true,true,true,true],"descriptors":["area","M2"],"values":[[162.2992,25.8165],[165.5447,81.4847],[227.3523,30.554],[192.4693,59.8367]]},{"type":"ContinuousInput","key":"base_eq","unit":null,"bounds":[1.0,2.5],"local_relative_bounds":null,"stepsize":null,"allow_zero":false},{"type":"ContinuousInput","key":"temperature","unit":null,"bounds":[30.0,100.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false},{"type":"ContinuousInput","key":"t_res","unit":null,"bounds":[60.0,1800.0],"local_relative_bounds":null,"stepsize":null,"allow_zero":false}]},"outputs":{"type":"Outputs","features":[{"type":"ContinuousOutput","key":"yield","unit":null,"objective":{"type":"MaximizeObjective","w":1.0,"bounds":[0.0,1.0]}}]},"input_preprocessing_specs":{"base":"ORDINAL","catalyst":"ORDINAL"},"dump":null,"categorical_encodings":{"catalyst":"ORDINAL","base":"DESCRIPTOR"},"scaler":{"type":"Normalize","features":[]},"output_scaler":"STANDARDIZE","continuous_kernel":{"type":"RBFKernel","features":["bas
e_eq","temperature","base","t_res"],"ard":true,"lengthscale_prior":{"type":"DimensionalityScaledLogNormalPrior","loc":1.4142135623730951,"loc_scaling":0.5,"scale":1.7320508075688772,"scale_scaling":0.0},"lengthscale_constraint":{"type":"GreaterThan","lower_bound":0.025}},"categorical_kernel":{"type":"HammingDistanceKernel","features":["catalyst"],"ard":true,"lengthscale_prior":null,"lengthscale_constraint":{"type":"GreaterThan","lower_bound":1e-6}},"noise_prior":{"type":"LogNormalPrior","loc":-4.0,"scale":1.0}}'
# Load it from the spec
surrogate_data = TypeAdapter(AnySurrogate).validate_json(jspec)
# Map it
surrogate = surrogates.map(surrogate_data)
# Fit it
surrogate.fit(experiments=experiments)
# dump it
dump = surrogate.dumps()
# predict with it
df_predictions = surrogate.predict(experiments)
# transform to spec
predictions = surrogate.to_predictions(predictions=df_predictions)

# Load the data model again from the JSON spec, map it to a surrogate and
# load the previously created dump into it.
surrogate_data = TypeAdapter(AnySurrogate).validate_json(jspec)
surrogate = surrogates.map(surrogate_data)
surrogate.loads(dump)
# predict with it
df_predictions2 = surrogate.predict(experiments)
# transform to spec
predictions2 = surrogate.to_predictions(predictions=df_predictions2)
# check for equality
predictions == predictions2
# Output: True