Bayesian Optimisation Over Molecules

An example notebook for Bayesian optimisation on a molecular dataset using a Tanimoto fingerprint kernel and the photoswitch dataset.

Paper: https://pubs.rsc.org/en/content/articlelanding/2022/sc/d2sc04306h

Code: https://github.com/Ryan-Rhys/The-Photoswitch-Dataset

This notebook is adapted from https://github.com/leojklarner/gauche/blob/main/notebooks/Bayesian%20Optimisation%20Over%20Molecules.ipynb

The notebook shows how new data are obtained from a discrete dataset via a lookup table; the dataset and the modelling approach are described in the code repository and the paper, respectively.
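Because the photoswitch molecules form a fixed, discrete set, "obtaining new data" means looking a proposed SMILES up in a table of measured values rather than running a new experiment. A minimal sketch of that idea (the column names and values below are purely illustrative; the actual lookup is handled by LookupTableBenchmark later in the notebook):

import pandas as pd

# Hypothetical toy lookup table: SMILES -> measured property.
lookup = pd.DataFrame(
    {"SMILES": ["c1ccccc1", "CCO"], "wavelength": [310.0, 205.0]}
)

# A strategy proposes candidates from the same discrete set of molecules ...
candidates = pd.DataFrame({"SMILES": ["CCO"]})

# ... and "evaluating" them is just a table lookup.
observed = candidates.merge(lookup, on="SMILES", how="left")
print(observed)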

Imports

import os
import warnings

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import bofire.strategies.api as strategies
from bofire.benchmarks.data.photoswitches import EXPERIMENTS
from bofire.benchmarks.LookupTableBenchmark import LookupTableBenchmark
from bofire.data_models.acquisition_functions.api import qLogEI
from bofire.data_models.domain.api import Domain, Inputs, Outputs
from bofire.data_models.features.api import CategoricalMolecularInput, ContinuousOutput
from bofire.data_models.molfeatures.api import FingerprintsFragments
from bofire.data_models.objectives.api import MaximizeObjective
from bofire.data_models.strategies.api import RandomStrategy, SoboStrategy
from bofire.data_models.surrogates.api import BotorchSurrogates, TanimotoGPSurrogate
from bofire.runners.api import run


warnings.filterwarnings("ignore")

SMOKE_TEST = os.environ.get("SMOKE_TEST")

Benchmark

Define the input and output feature keys and extract the corresponding columns from the raw data to build the lookup table.

benchmark = {
    "input": "SMILES",
    "output": "E isomer pi-pi* wavelength in nm",
}
# Load the raw photoswitch data and build a two-column lookup table
# (SMILES -> E isomer pi-pi* transition wavelength), dropping molecules
# without a measured wavelength.
df = pd.read_json(EXPERIMENTS)
main_file = pd.DataFrame(columns=[benchmark["input"], benchmark["output"]])
nans = df[benchmark["output"]].isnull().to_list()
nan_indices = [i for i, x in enumerate(nans) if x]
main_file[benchmark["input"]] = df[benchmark["input"]].drop(nan_indices).to_list()
main_file[benchmark["output"]] = (
    df[benchmark["output"]].dropna().to_numpy().reshape(-1, 1)
)
# The optimisation domain: a categorical molecular input over all unique
# SMILES in the lookup table and a continuous output to be maximised.
input_feature = CategoricalMolecularInput(
    key=benchmark["input"],
    categories=list(set(main_file[benchmark["input"]].to_list())),
)
objective = MaximizeObjective(w=1.0)
inputs = Inputs(features=[input_feature])
output_feature = ContinuousOutput(key=benchmark["output"], objective=objective)
outputs = Outputs(features=[output_feature])
domain = Domain(inputs=inputs, outputs=outputs)
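
A quick, optional look at the sizes involved (reusing the objects constructed above) helps confirm that the lookup table and the categorical input agree:

# Report how many measurements and unique molecules the lookup table contains
# and how many categories the molecular input was given.
print(
    f"{len(main_file)} measurements, "
    f"{main_file[benchmark['input']].nunique()} unique SMILES, "
    f"{len(input_feature.categories)} input categories"
)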

Random vs SOBO optimization

For molecules we use a Tanimoto GP surrogate, which uses a Tanimoto kernel over molecular fingerprints by default.
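For binary fingerprints the Tanimoto (Jaccard) kernel measures the ratio of shared "on" bits to the total number of "on" bits, k(x, x') = <x, x'> / (<x, x> + <x', x'> - <x, x'>). A minimal NumPy sketch of that similarity, independent of the kernel implementation actually used inside the surrogate:

import numpy as np


def tanimoto_similarity(x: np.ndarray, y: np.ndarray) -> float:
    """Tanimoto/Jaccard similarity between two binary fingerprint vectors."""
    dot = float(np.dot(x, y))
    return dot / (float(np.dot(x, x)) + float(np.dot(y, y)) - dot)


a = np.array([1, 0, 1, 1, 0])
b = np.array([1, 1, 1, 0, 0])
print(tanimoto_similarity(a, b))  # 2 shared bits out of 4 distinct -> 0.5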

def sample(domain):
    """Draw 20 random molecules from the domain as the initial design."""
    datamodel = RandomStrategy(domain=domain)
    sampler = strategies.map(data_model=datamodel)
    sampled = sampler.ask(20)
    return sampled


def best(domain: Domain, experiments: pd.DataFrame) -> float:
    """Metric for the runner: best observed output value so far."""
    return experiments[domain.outputs.get_keys()[0]].max()


n_iter = 20 if not SMOKE_TEST else 1  # number of independent repeats
bo_results_set = []
random_results_set = []
n_iterations = 49 if not SMOKE_TEST else 1  # optimisation iterations per repeat

for _ in range(n_iter):
    # Fresh benchmark and random initial design (20 molecules) for this repeat.
    Benchmark = LookupTableBenchmark(domain=domain, lookup_table=main_file)
    sampled = sample(Benchmark.domain)
    sampled_xy = Benchmark.f(sampled, return_complete=True)
    random_results = run(
        Benchmark,
        strategy_factory=lambda domain: strategies.map(RandomStrategy(domain=domain)),
        n_iterations=n_iterations,
        metric=best,
        initial_sampler=sampled_xy,
        n_runs=1,
        n_procs=1,
    )

    # Featurise the SMILES input with fingerprints and fragment descriptors;
    # the Tanimoto GP surrogate models the lookup-table objective on top of them.
    specs = {Benchmark.domain.inputs.get_keys()[0]: FingerprintsFragments(n_bits=2048)}
    surrogate = TanimotoGPSurrogate(
        inputs=Benchmark.domain.inputs,
        outputs=Benchmark.domain.outputs,
        categorical_encodings=specs,
    )

    # Bind the surrogate built in this repeat via a default argument so the
    # factory always uses the corresponding Tanimoto GP.
    def sobo_factory(domain: Domain, surrogate=surrogate):
        return strategies.map(
            SoboStrategy(
                domain=domain,
                acquisition_function=qLogEI(),
                surrogate_specs=BotorchSurrogates(surrogates=[surrogate]),
            ),
        )

    bo_results = run(
        Benchmark,
        strategy_factory=sobo_factory,
        n_iterations=n_iterations,
        metric=best,
        initial_sampler=sampled_xy,
        n_runs=1,
        n_procs=1,
    )
    # Prepend the best value of the initial random design so both traces
    # start from the same iteration-0 baseline.
    random_results_new = np.insert(
        random_results[0][1].to_numpy(),
        0,
        best(Benchmark.domain, sampled_xy),
    )
    bo_results_new = np.insert(
        bo_results[0][1].to_numpy(),
        0,
        best(Benchmark.domain, sampled_xy),
    )
    random_results_set.append(random_results_new)
    bo_results_set.append(bo_results_new)
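Each element of the list returned by run corresponds to one run; index [1], as used above, holds the per-iteration value of the metric callable, i.e. the best observed wavelength after each iteration. A small sketch, under the same indexing assumption, for inspecting the trace of the last BO repeat:

# Per-iteration best observed wavelength for the last BO repeat
# (index [0] selects the single run, [1] the metric trace used above).
best_trace = bo_results[0][1]
print(best_trace.tail())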

Performance

SOBO outperforms random search at selecting molecules with a high E isomer pi-pi* transition wavelength.

# Define a confidence interval function (95% CI of the mean) for plotting.
def ci(y):
    return 1.96 * y.std(axis=0) / np.sqrt(n_iter)


if not SMOKE_TEST:
    iters = np.arange(n_iterations + 1)
    y_rnd = np.asarray(random_results_set)
    y_ei = np.asarray(bo_results_set)

    y_rnd_mean = y_rnd.mean(axis=0)
    y_ei_mean = y_ei.mean(axis=0)

    # Shade the 95% confidence interval of the mean around each trace.
    lower_rnd = y_rnd_mean - ci(y_rnd)
    upper_rnd = y_rnd_mean + ci(y_rnd)
    lower_ei = y_ei_mean - ci(y_ei)
    upper_ei = y_ei_mean + ci(y_ei)

    plt.plot(iters, y_rnd_mean, label="Random")
    plt.fill_between(iters, lower_rnd, upper_rnd, alpha=0.2)
    plt.plot(iters, y_ei_mean, label="SOBO")
    plt.fill_between(iters, lower_ei, upper_ei, alpha=0.2)
    plt.xlabel("Number of Iterations")
    plt.ylabel("Best Objective Value")
    plt.legend(loc="lower right")
    plt.show()