Domain

categorical

CategoricalInput (Input)

Base class for all categorical input features.

Attributes:

Name Type Description
categories List[str]

Names of the categories.

allowed List[bool]

List of bools indicating if a category is allowed within the optimization.

Source code in bofire/data_models/features/categorical.py
class CategoricalInput(Input):
    """Base class for all categorical input features.

    Attributes:
        categories (List[str]): Names of the categories.
        allowed (List[bool]): List of bools indicating if a category is allowed within the optimization.
    """

    type: Literal["CategoricalInput"] = "CategoricalInput"
    # order_id: ClassVar[int] = 5
    order_id: ClassVar[int] = 7

    categories: CategoryVals
    allowed: Optional[Annotated[List[bool], Field(min_length=2)]] = Field(
        default=None, validate_default=True
    )

    @field_validator("allowed")
    @classmethod
    def generate_allowed(cls, allowed, info):
        """Generates the list of allowed categories if not provided."""
        if allowed is None and "categories" in info.data.keys():
            return [True for _ in range(len(info.data["categories"]))]
        return allowed

    @model_validator(mode="after")
    def validate_categories_fitting_allowed(self):
        if len(self.allowed) != len(self.categories):  # type: ignore
            raise ValueError("allowed must have same length as categories")
        if sum(self.allowed) == 0:  # type: ignore
            raise ValueError("no category is allowed")
        return self

    @staticmethod
    def valid_transform_types() -> List[CategoricalEncodingEnum]:
        return [
            CategoricalEncodingEnum.ONE_HOT,
            CategoricalEncodingEnum.DUMMY,
            CategoricalEncodingEnum.ORDINAL,
        ]

    def is_fixed(self) -> bool:
        """Returns True if there is only one allowed category.

        Returns:
            [bool]: True if there is only one allowed category
        """
        if self.allowed is None:
            return False
        return sum(self.allowed) == 1

    def fixed_value(
        self, transform_type: Optional[TTransform] = None
    ) -> Union[List[str], List[float], None]:
        """Returns the categories to which the feature is fixed, None if the feature is not fixed

        Returns:
            List[str]: List of categories or None
        """
        if self.is_fixed():
            val = self.get_allowed_categories()[0]
            if transform_type is None:
                return [val]
            elif transform_type == CategoricalEncodingEnum.ONE_HOT:
                return self.to_onehot_encoding(pd.Series([val])).values[0].tolist()
            elif transform_type == CategoricalEncodingEnum.DUMMY:
                return self.to_dummy_encoding(pd.Series([val])).values[0].tolist()
            elif transform_type == CategoricalEncodingEnum.ORDINAL:
                return self.to_ordinal_encoding(pd.Series([val])).tolist()
            else:
                raise ValueError(
                    f"Unkwon transform type {transform_type} for categorical input {self.key}"
                )
        else:
            return None

    def get_allowed_categories(self):
        """Returns the allowed categories.

        Returns:
            list of str: The allowed categories
        """
        if self.allowed is None:
            return []
        return [c for c, a in zip(self.categories, self.allowed) if a]

    def validate_experimental(
        self, values: pd.Series, strict: bool = False
    ) -> pd.Series:
        """Method to validate the experimental dataFrame

        Args:
            values (pd.Series): A dataFrame with experiments
            strict (bool, optional): Boolean to distinguish if the occurence of fixed features in the dataset should be considered or not. Defaults to False.

        Raises:
            ValueError: when an entry is not in the list of allowed categories
            ValueError: when there is no variation in a feature provided by the experimental data

        Returns:
            pd.Series: A dataFrame with experiments
        """
        values = values.map(str)
        if sum(values.isin(self.categories)) != len(values):
            raise ValueError(
                f"invalid values for `{self.key}`, allowed are: `{self.categories}`"
            )
        if strict:
            possible_categories = self.get_possible_categories(values)
            if len(possible_categories) != len(self.categories):
                raise ValueError(
                    f"Categories {list(set(self.categories)-set(possible_categories))} of feature {self.key} not used. Remove them."
                )
        return values

    def validate_candidental(self, values: pd.Series) -> pd.Series:
        """Method to validate the suggested candidates

        Args:
            values (pd.Series): A dataFrame with candidates

        Raises:
            ValueError: when not all values for a feature are one of the allowed categories

        Returns:
            pd.Series: The passed dataFrame with candidates
        """
        values = values.map(str)
        if sum(values.isin(self.get_allowed_categories())) != len(values):
            raise ValueError(
                f"not all values of input feature `{self.key}` are a valid allowed category from {self.get_allowed_categories()}"
            )
        return values

    def get_forbidden_categories(self):
        """Returns the non-allowed categories

        Returns:
            List[str]: List of the non-allowed categories
        """
        return list(set(self.categories) - set(self.get_allowed_categories()))

    def get_possible_categories(self, values: pd.Series) -> list:
        """Return the superset of categories that have been used in the experimental dataset and
        that can be used in the optimization

        Args:
            values (pd.Series): Series with the values for this feature

        Returns:
            list: list of possible categories
        """
        return sorted(set(list(set(values.tolist())) + self.get_allowed_categories()))

    def to_onehot_encoding(self, values: pd.Series) -> pd.DataFrame:
        """Converts values to a one-hot encoding.

        Args:
            values (pd.Series): Series to be transformed.

        Returns:
            pd.DataFrame: One-hot transformed data frame.
        """
        return pd.DataFrame(
            {get_encoded_name(self.key, c): values == c for c in self.categories},
            dtype=float,
            index=values.index,
        )

    def from_onehot_encoding(self, values: pd.DataFrame) -> pd.Series:
        """Converts values back from one-hot encoding.

        Args:
            values (pd.DataFrame): One-hot encoded values.

        Raises:
            ValueError: If one-hot columns not present in `values`.

        Returns:
            pd.Series: Series with categorical values.
        """
        cat_cols = [get_encoded_name(self.key, c) for c in self.categories]
        # we explicitly allow the dataframe to have more columns than needed,
        # to make the backtransform easier.
        if np.any([c not in values.columns for c in cat_cols]):
            raise ValueError(
                f"{self.key}: Column names don't match categorical levels: {values.columns}, {cat_cols}."
            )
        s = values[cat_cols].idxmax(1).str[(len(self.key) + 1) :]
        s.name = self.key
        return s

    def to_dummy_encoding(self, values: pd.Series) -> pd.DataFrame:
        """Converts values to a dummy-hot encoding, dropping the first categorical level.

        Args:
            values (pd.Series): Series to be transformed.

        Returns:
            pd.DataFrame: Dummy-hot transformed data frame.
        """
        return pd.DataFrame(
            {get_encoded_name(self.key, c): values == c for c in self.categories[1:]},
            dtype=float,
            index=values.index,
        )

    def from_dummy_encoding(self, values: pd.DataFrame) -> pd.Series:
        """Convert points back from dummy encoding.

        Args:
            values (pd.DataFrame): Dummy-hot encoded values.

        Raises:
            ValueError: If one-hot columns not present in `values`.

        Returns:
            pd.Series: Series with categorical values.
        """
        cat_cols = [get_encoded_name(self.key, c) for c in self.categories]
        # we explicitly allow the dataframe to have more columns than needed,
        # to make the backtransform easier.
        if np.any([c not in values.columns for c in cat_cols[1:]]):
            raise ValueError(
                f"{self.key}: Column names don't match categorical levels: {values.columns}, {cat_cols[1:]}."
            )
        values = values.copy()
        values[cat_cols[0]] = 1 - values[cat_cols[1:]].sum(axis=1)
        s = values[cat_cols].idxmax(1).str[(len(self.key) + 1) :]
        s.name = self.key
        return s

    def to_ordinal_encoding(self, values: pd.Series) -> pd.Series:
        """Converts values to an ordinal integer based encoding.

        Args:
            values (pd.Series): Series to be transformed.

        Returns:
            pd.Series: Ordinal encoded values.
        """
        enc = pd.Series(range(len(self.categories)), index=list(self.categories))
        s = enc[values]
        s.index = values.index
        s.name = self.key
        return s

    def from_ordinal_encoding(self, values: pd.Series) -> pd.Series:
        """Convertes values back from ordinal encoding.

        Args:
            values (pd.Series): Ordinal encoded series.

        Returns:
            pd.Series: Series with categorical values.
        """
        enc = np.array(self.categories)
        return pd.Series(enc[values], index=values.index, name=self.key)

    def sample(self, n: int, seed: Optional[int] = None) -> pd.Series:
        """Draw random samples from the feature.

        Args:
            n (int): number of samples.

        Returns:
            pd.Series: drawn samples.
        """
        return pd.Series(
            name=self.key,
            data=np.random.default_rng(seed=seed).choice(
                self.get_allowed_categories(), n
            ),
        )

    def get_bounds(
        self,
        transform_type: TTransform,
        values: Optional[pd.Series] = None,
        reference_value: Optional[str] = None,
    ) -> Tuple[List[float], List[float]]:
        assert isinstance(transform_type, CategoricalEncodingEnum)
        if transform_type == CategoricalEncodingEnum.ORDINAL:
            return [0], [len(self.categories) - 1]
        if transform_type == CategoricalEncodingEnum.ONE_HOT:
            # in the case that values are None, we return the bounds
            # based on the optimization bounds, else we return the true
            # bounds as this is for model fitting.
            if values is None:
                lower = [0.0 for _ in self.categories]
                upper = [
                    1.0 if self.allowed[i] is True else 0.0  # type: ignore
                    for i, _ in enumerate(self.categories)
                ]
            else:
                lower = [0.0 for _ in self.categories]
                upper = [1.0 for _ in self.categories]
            return lower, upper
        if transform_type == CategoricalEncodingEnum.DUMMY:
            lower = [0.0 for _ in range(len(self.categories) - 1)]
            upper = [1.0 for _ in range(len(self.categories) - 1)]
            return lower, upper
        if transform_type == CategoricalEncodingEnum.DESCRIPTOR:
            raise ValueError(
                f"Invalid descriptor transform for categorical {self.key}."
            )
        else:
            raise ValueError(
                f"Invalid transform_type {transform_type} provided for categorical {self.key}."
            )

    def __str__(self) -> str:
        """Returns the number of categories as str

        Returns:
            str: Number of categories
        """
        return f"{len(self.categories)} categories"

__str__(self) special

Returns the number of categories as str

Returns:

Type Description
str

Number of categories

Source code in bofire/data_models/features/categorical.py
def __str__(self) -> str:
    """Returns the number of categories as str

    Returns:
        str: Number of categories
    """
    return f"{len(self.categories)} categories"

fixed_value(self, transform_type=None)

Returns the categories to which the feature is fixed, or None if the feature is not fixed

Returns:

Type Description
List[str]

List of categories or None

Source code in bofire/data_models/features/categorical.py
def fixed_value(
    self, transform_type: Optional[TTransform] = None
) -> Union[List[str], List[float], None]:
    """Returns the categories to which the feature is fixed, None if the feature is not fixed

    Returns:
        List[str]: List of categories or None
    """
    if self.is_fixed():
        val = self.get_allowed_categories()[0]
        if transform_type is None:
            return [val]
        elif transform_type == CategoricalEncodingEnum.ONE_HOT:
            return self.to_onehot_encoding(pd.Series([val])).values[0].tolist()
        elif transform_type == CategoricalEncodingEnum.DUMMY:
            return self.to_dummy_encoding(pd.Series([val])).values[0].tolist()
        elif transform_type == CategoricalEncodingEnum.ORDINAL:
            return self.to_ordinal_encoding(pd.Series([val])).tolist()
        else:
            raise ValueError(
                f"Unkwon transform type {transform_type} for categorical input {self.key}"
            )
    else:
        return None
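
For illustration, a sketch (same import assumptions as above): a feature with exactly one allowed category is fixed, and fixed_value reports that category in the requested encoding. CategoricalEncodingEnum is assumed to live in bofire.data_models.enum:

from bofire.data_models.enum import CategoricalEncodingEnum
from bofire.data_models.features.categorical import CategoricalInput

feat = CategoricalInput(
    key="solvent",
    categories=["water", "ethanol", "acetone"],
    allowed=[False, True, False],
)
assert feat.is_fixed()  # only one category is allowed
assert feat.fixed_value() == ["ethanol"]
assert feat.fixed_value(CategoricalEncodingEnum.ONE_HOT) == [0.0, 1.0, 0.0]
assert feat.fixed_value(CategoricalEncodingEnum.ORDINAL) == [1]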

from_dummy_encoding(self, values)

Convert points back from dummy encoding.

Parameters:

Name Type Description Default
values pd.DataFrame

Dummy-hot encoded values.

required

Exceptions:

Type Description
ValueError

If one-hot columns not present in values.

Returns:

Type Description
pd.Series

Series with categorical values.

Source code in bofire/data_models/features/categorical.py
def from_dummy_encoding(self, values: pd.DataFrame) -> pd.Series:
    """Convert points back from dummy encoding.

    Args:
        values (pd.DataFrame): Dummy-hot encoded values.

    Raises:
        ValueError: If one-hot columns not present in `values`.

    Returns:
        pd.Series: Series with categorical values.
    """
    cat_cols = [get_encoded_name(self.key, c) for c in self.categories]
    # we explicitly allow the dataframe to have more columns than needed,
    # to make the backtransform easier.
    if np.any([c not in values.columns for c in cat_cols[1:]]):
        raise ValueError(
            f"{self.key}: Column names don't match categorical levels: {values.columns}, {cat_cols[1:]}."
        )
    values = values.copy()
    values[cat_cols[0]] = 1 - values[cat_cols[1:]].sum(axis=1)
    s = values[cat_cols].idxmax(1).str[(len(self.key) + 1) :]
    s.name = self.key
    return s

from_onehot_encoding(self, values)

Converts values back from one-hot encoding.

Parameters:

Name Type Description Default
values pd.DataFrame

One-hot encoded values.

required

Exceptions:

Type Description
ValueError

If one-hot columns not present in values.

Returns:

Type Description
pd.Series

Series with categorical values.

Source code in bofire/data_models/features/categorical.py
def from_onehot_encoding(self, values: pd.DataFrame) -> pd.Series:
    """Converts values back from one-hot encoding.

    Args:
        values (pd.DataFrame): One-hot encoded values.

    Raises:
        ValueError: If one-hot columns not present in `values`.

    Returns:
        pd.Series: Series with categorical values.
    """
    cat_cols = [get_encoded_name(self.key, c) for c in self.categories]
    # we explicitly allow the dataframe to have more columns than needed,
    # to make the backtransform easier.
    if np.any([c not in values.columns for c in cat_cols]):
        raise ValueError(
            f"{self.key}: Column names don't match categorical levels: {values.columns}, {cat_cols}."
        )
    s = values[cat_cols].idxmax(1).str[(len(self.key) + 1) :]
    s.name = self.key
    return s
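
A round-trip sketch (import assumptions as above). The encoded column names come from get_encoded_name, which, judging from the `len(self.key) + 1` slice in the decoder, joins the feature key and the category with an underscore:

import pandas as pd
from bofire.data_models.features.categorical import CategoricalInput

feat = CategoricalInput(key="solvent", categories=["water", "ethanol"])
encoded = feat.to_onehot_encoding(pd.Series(["ethanol", "water"]))
# encoded has float columns solvent_water and solvent_ethanol
decoded = feat.from_onehot_encoding(encoded)
assert decoded.tolist() == ["ethanol", "water"]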

from_ordinal_encoding(self, values)

Converts values back from ordinal encoding.

Parameters:

Name Type Description Default
values pd.Series

Ordinal encoded series.

required

Returns:

Type Description
pd.Series

Series with categorical values.

Source code in bofire/data_models/features/categorical.py
def from_ordinal_encoding(self, values: pd.Series) -> pd.Series:
    """Convertes values back from ordinal encoding.

    Args:
        values (pd.Series): Ordinal encoded series.

    Returns:
        pd.Series: Series with categorical values.
    """
    enc = np.array(self.categories)
    return pd.Series(enc[values], index=values.index, name=self.key)
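
The ordinal encoding maps each category to its position in `categories`; a quick round-trip sketch (same import assumptions):

import pandas as pd
from bofire.data_models.features.categorical import CategoricalInput

feat = CategoricalInput(key="solvent", categories=["water", "ethanol", "acetone"])
ordinal = feat.to_ordinal_encoding(pd.Series(["acetone", "water"]))
assert ordinal.tolist() == [2, 0]
assert feat.from_ordinal_encoding(ordinal).tolist() == ["acetone", "water"]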

generate_allowed(allowed, info) classmethod

Generates the list of allowed categories if not provided.

Source code in bofire/data_models/features/categorical.py
@field_validator("allowed")
@classmethod
def generate_allowed(cls, allowed, info):
    """Generates the list of allowed categories if not provided."""
    if allowed is None and "categories" in info.data.keys():
        return [True for _ in range(len(info.data["categories"]))]
    return allowed

get_allowed_categories(self)

Returns the allowed categories.

Returns:

Type Description
list of str

The allowed categories

Source code in bofire/data_models/features/categorical.py
def get_allowed_categories(self):
    """Returns the allowed categories.

    Returns:
        list of str: The allowed categories
    """
    if self.allowed is None:
        return []
    return [c for c, a in zip(self.categories, self.allowed) if a]

get_bounds(self, transform_type, values=None, reference_value=None)

Returns the bounds of an input feature depending on the requested transform type.

Parameters:

Name Type Description Default
transform_type TTransform

The requested transform type.

required
values Optional[pd.Series]

If values are provided the bounds are returned taking the most extreme values for the feature into account. Defaults to None.

None
reference_value Optional[str]

If a reference value is provided, then the local bounds based on a local search region are provided. Currently only supported for continuous inputs. For more details, see https://www.merl.com/publications/docs/TR2023-057.pdf.

None

Returns:

Type Description
Tuple[List[float], List[float]]

List of lower bound values, list of upper bound values.

Source code in bofire/data_models/features/categorical.py
def get_bounds(
    self,
    transform_type: TTransform,
    values: Optional[pd.Series] = None,
    reference_value: Optional[str] = None,
) -> Tuple[List[float], List[float]]:
    assert isinstance(transform_type, CategoricalEncodingEnum)
    if transform_type == CategoricalEncodingEnum.ORDINAL:
        return [0], [len(self.categories) - 1]
    if transform_type == CategoricalEncodingEnum.ONE_HOT:
        # in the case that values are None, we return the bounds
        # based on the optimization bounds, else we return the true
        # bounds as this is for model fitting.
        if values is None:
            lower = [0.0 for _ in self.categories]
            upper = [
                1.0 if self.allowed[i] is True else 0.0  # type: ignore
                for i, _ in enumerate(self.categories)
            ]
        else:
            lower = [0.0 for _ in self.categories]
            upper = [1.0 for _ in self.categories]
        return lower, upper
    if transform_type == CategoricalEncodingEnum.DUMMY:
        lower = [0.0 for _ in range(len(self.categories) - 1)]
        upper = [1.0 for _ in range(len(self.categories) - 1)]
        return lower, upper
    if transform_type == CategoricalEncodingEnum.DESCRIPTOR:
        raise ValueError(
            f"Invalid descriptor transform for categorical {self.key}."
        )
    else:
        raise ValueError(
            f"Invalid transform_type {transform_type} provided for categorical {self.key}."
        )
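
A sketch of the one-hot bounds logic (import assumptions as above): without `values`, the upper bound of each non-allowed category is clamped to 0; with experimental `values`, the full [0, 1] box is returned for model fitting:

import pandas as pd
from bofire.data_models.enum import CategoricalEncodingEnum
from bofire.data_models.features.categorical import CategoricalInput

feat = CategoricalInput(
    key="solvent",
    categories=["water", "ethanol", "acetone"],
    allowed=[True, True, False],
)
lower, upper = feat.get_bounds(CategoricalEncodingEnum.ONE_HOT)
assert upper == [1.0, 1.0, 0.0]  # acetone is not allowed

lower, upper = feat.get_bounds(
    CategoricalEncodingEnum.ONE_HOT, values=pd.Series(["acetone"])
)
assert upper == [1.0, 1.0, 1.0]  # true bounds for model fitting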

get_forbidden_categories(self)

Returns the non-allowed categories

Returns:

Type Description
List[str]

List of the non-allowed categories

Source code in bofire/data_models/features/categorical.py
def get_forbidden_categories(self):
    """Returns the non-allowed categories

    Returns:
        List[str]: List of the non-allowed categories
    """
    return list(set(self.categories) - set(self.get_allowed_categories()))

get_possible_categories(self, values)

Return the superset of categories that have been used in the experimental dataset and that can be used in the optimization

Parameters:

Name Type Description Default
values pd.Series

Series with the values for this feature

required

Returns:

Type Description
list

list of possible categories

Source code in bofire/data_models/features/categorical.py
def get_possible_categories(self, values: pd.Series) -> list:
    """Return the superset of categories that have been used in the experimental dataset and
    that can be used in the optimization

    Args:
        values (pd.Series): Series with the values for this feature

    Returns:
        list: list of possible categories
    """
    return sorted(set(list(set(values.tolist())) + self.get_allowed_categories()))

is_fixed(self)

Returns True if there is only one allowed category.

Returns:

Type Description
[bool]

True if there is only one allowed category

Source code in bofire/data_models/features/categorical.py
def is_fixed(self) -> bool:
    """Returns True if there is only one allowed category.

    Returns:
        [bool]: True if there is only one allowed category
    """
    if self.allowed is None:
        return False
    return sum(self.allowed) == 1

sample(self, n, seed=None)

Draw random samples from the feature.

Parameters:

Name Type Description Default
n int

number of samples.

required

Returns:

Type Description
pd.Series

drawn samples.

Source code in bofire/data_models/features/categorical.py
def sample(self, n: int, seed: Optional[int] = None) -> pd.Series:
    """Draw random samples from the feature.

    Args:
        n (int): number of samples.

    Returns:
        pd.Series: drawn samples.
    """
    return pd.Series(
        name=self.key,
        data=np.random.default_rng(seed=seed).choice(
            self.get_allowed_categories(), n
        ),
    )
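
Sampling only draws from the allowed categories; a short sketch (import assumptions as above):

from bofire.data_models.features.categorical import CategoricalInput

feat = CategoricalInput(
    key="solvent",
    categories=["water", "ethanol", "acetone"],
    allowed=[True, True, False],
)
samples = feat.sample(5, seed=42)
assert samples.isin(["water", "ethanol"]).all()  # the forbidden category is never drawn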

to_dummy_encoding(self, values)

Converts values to a dummy-hot encoding, dropping the first categorical level.

Parameters:

Name Type Description Default
values pd.Series

Series to be transformed.

required

Returns:

Type Description
pd.DataFrame

Dummy-hot transformed data frame.

Source code in bofire/data_models/features/categorical.py
def to_dummy_encoding(self, values: pd.Series) -> pd.DataFrame:
    """Converts values to a dummy-hot encoding, dropping the first categorical level.

    Args:
        values (pd.Series): Series to be transformed.

    Returns:
        pd.DataFrame: Dummy-hot transformed data frame.
    """
    return pd.DataFrame(
        {get_encoded_name(self.key, c): values == c for c in self.categories[1:]},
        dtype=float,
        index=values.index,
    )

to_onehot_encoding(self, values)

Converts values to a one-hot encoding.

Parameters:

Name Type Description Default
values pd.Series

Series to be transformed.

required

Returns:

Type Description
pd.DataFrame

One-hot transformed data frame.

Source code in bofire/data_models/features/categorical.py
def to_onehot_encoding(self, values: pd.Series) -> pd.DataFrame:
    """Converts values to a one-hot encoding.

    Args:
        values (pd.Series): Series to be transformed.

    Returns:
        pd.DataFrame: One-hot transformed data frame.
    """
    return pd.DataFrame(
        {get_encoded_name(self.key, c): values == c for c in self.categories},
        dtype=float,
        index=values.index,
    )

to_ordinal_encoding(self, values)

Converts values to an integer-based ordinal encoding.

Parameters:

Name Type Description Default
values pd.Series

Series to be transformed.

required

Returns:

Type Description
pd.Series

Ordinal encoded values.

Source code in bofire/data_models/features/categorical.py
def to_ordinal_encoding(self, values: pd.Series) -> pd.Series:
    """Converts values to an ordinal integer based encoding.

    Args:
        values (pd.Series): Series to be transformed.

    Returns:
        pd.Series: Ordinal encoded values.
    """
    enc = pd.Series(range(len(self.categories)), index=list(self.categories))
    s = enc[values]
    s.index = values.index
    s.name = self.key
    return s

validate_candidental(self, values)

Method to validate the suggested candidates

Parameters:

Name Type Description Default
values pd.Series

A series with candidate values

required

Exceptions:

Type Description
ValueError

when not all values for a feature are one of the allowed categories

Returns:

Type Description
pd.Series

The validated candidate values

Source code in bofire/data_models/features/categorical.py
def validate_candidental(self, values: pd.Series) -> pd.Series:
    """Method to validate the suggested candidates

    Args:
        values (pd.Series): A dataFrame with candidates

    Raises:
        ValueError: when not all values for a feature are one of the allowed categories

    Returns:
        pd.Series: The passed dataFrame with candidates
    """
    values = values.map(str)
    if sum(values.isin(self.get_allowed_categories())) != len(values):
        raise ValueError(
            f"not all values of input feature `{self.key}` are a valid allowed category from {self.get_allowed_categories()}"
        )
    return values

validate_experimental(self, values, strict=False)

Method to validate the experimental data

Parameters:

Name Type Description Default
values pd.Series

A series with experimental values

required
strict bool

Boolean to distinguish if the occurrence of fixed features in the dataset should be considered or not. Defaults to False.

False

Exceptions:

Type Description
ValueError

when an entry is not in the list of allowed categories

ValueError

when there is no variation in a feature provided by the experimental data

Returns:

Type Description
pd.Series

The validated experimental values

Source code in bofire/data_models/features/categorical.py
def validate_experimental(
    self, values: pd.Series, strict: bool = False
) -> pd.Series:
    """Method to validate the experimental dataFrame

    Args:
        values (pd.Series): A dataFrame with experiments
        strict (bool, optional): Boolean to distinguish if the occurence of fixed features in the dataset should be considered or not. Defaults to False.

    Raises:
        ValueError: when an entry is not in the list of allowed categories
        ValueError: when there is no variation in a feature provided by the experimental data

    Returns:
        pd.Series: A dataFrame with experiments
    """
    values = values.map(str)
    if sum(values.isin(self.categories)) != len(values):
        raise ValueError(
            f"invalid values for `{self.key}`, allowed are: `{self.categories}`"
        )
    if strict:
        possible_categories = self.get_possible_categories(values)
        if len(possible_categories) != len(self.categories):
            raise ValueError(
                f"Categories {list(set(self.categories)-set(possible_categories))} of feature {self.key} not used. Remove them."
            )
    return values
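
A sketch of the two validation paths (import assumptions as above): experimental data may contain any known category, candidates must use only allowed ones, and strict=True additionally requires every category to occur in the data:

import pandas as pd
from bofire.data_models.features.categorical import CategoricalInput

feat = CategoricalInput(
    key="solvent",
    categories=["water", "ethanol", "acetone"],
    allowed=[True, False, False],
)
feat.validate_experimental(pd.Series(["water", "ethanol"]))  # ok, both are known
feat.validate_candidental(pd.Series(["water"]))  # ok, water is allowed
try:
    feat.validate_candidental(pd.Series(["ethanol"]))  # ethanol is not allowed
except ValueError as err:
    print(err)
try:
    # strict: ethanol and acetone never occur in the data and are not allowed
    feat.validate_experimental(pd.Series(["water"]), strict=True)
except ValueError as err:
    print(err)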

CategoricalOutput (Output)

Source code in bofire/data_models/features/categorical.py
class CategoricalOutput(Output):
    type: Literal["CategoricalOutput"] = "CategoricalOutput"
    order_id: ClassVar[int] = 10

    categories: CategoryVals
    objective: AnyCategoricalObjective

    @model_validator(mode="after")
    def validate_objective_categories(self):
        """validates that objective categories match the output categories

        Raises:
            ValueError: when categories do not match objective categories

        Returns:
            self
        """
        if self.objective.categories != self.categories:  # type: ignore
            raise ValueError("categories must match to objective categories")
        return self

    def __call__(self, values: pd.Series) -> pd.Series:
        if self.objective is None:
            return pd.Series(
                data=[np.nan for _ in range(len(values))],
                index=values.index,
                name=values.name,
            )
        return self.objective(values)  # type: ignore

    def validate_experimental(self, values: pd.Series) -> pd.Series:
        values = values.map(str)
        if sum(values.isin(self.categories)) != len(values):
            raise ValueError(
                f"invalid values for `{self.key}`, allowed are: `{self.categories}`"
            )
        return values

    def __str__(self) -> str:
        return "CategoricalOutputFeature"

validate_experimental(self, values)

Method to validate the experimental Series

Parameters:

Name Type Description Default
values pd.Series

A series with values for the outcome

required

Returns:

Type Description
pd.Series

The validated experimental values

Source code in bofire/data_models/features/categorical.py
def validate_experimental(self, values: pd.Series) -> pd.Series:
    values = values.map(str)
    if sum(values.isin(self.categories)) != len(values):
        raise ValueError(
            f"invalid values for `{self.key}`, allowed are: `{self.categories}`"
        )
    return values

validate_objective_categories(self)

Validates that the objective categories match the output categories

Exceptions:

Type Description
ValueError

when categories do not match objective categories

Returns:

Type Description

self

Source code in bofire/data_models/features/categorical.py
@model_validator(mode="after")
def validate_objective_categories(self):
    """validates that objective categories match the output categories

    Raises:
        ValueError: when categories do not match objective categories

    Returns:
        self
    """
    if self.objective.categories != self.categories:  # type: ignore
        raise ValueError("categories must match to objective categories")
    return self

continuous

ContinuousInput (NumericalInput)

Base class for all continuous input features.

Attributes:

Name Type Description
bounds Tuple[float, float]

A tuple that stores the lower and upper bound of the feature.

stepsize float

Float indicating the allowed stepsize between lower and upper. Defaults to None.

local_relative_bounds Tuple[float, float]

A tuple that stores the lower and upper bounds relative to a reference value. Defaults to None.

Source code in bofire/data_models/features/continuous.py
class ContinuousInput(NumericalInput):
    """Base class for all continuous input features.

    Attributes:
        bounds (Tuple[float, float]): A tuple that stores the lower and upper bound of the feature.
        stepsize (float, optional): Float indicating the allowed stepsize between lower and upper. Defaults to None.
        local_relative_bounds (Tuple[float, float], optional): A tuple that stores the lower and upper bounds relative to a reference value.
            Defaults to None.
    """

    type: Literal["ContinuousInput"] = "ContinuousInput"
    order_id: ClassVar[int] = 1

    bounds: Tuple[float, float]
    local_relative_bounds: Optional[
        Tuple[Annotated[float, Field(gt=0)], Annotated[float, Field(gt=0)]]
    ] = None
    stepsize: Optional[float] = None

    @property
    def lower_bound(self) -> float:
        return self.bounds[0]

    @property
    def upper_bound(self) -> float:
        return self.bounds[1]

    @model_validator(mode="after")
    def validate_step_size(self):
        if self.stepsize is None:
            return self
        lower, upper = self.bounds
        if lower == upper and self.stepsize is not None:
            raise ValueError(
                "Stepsize cannot be provided for a fixed continuous input."
            )
        range = upper - lower
        if np.arange(lower, upper + self.stepsize, self.stepsize)[-1] != upper:
            raise ValueError(
                f"Stepsize of {self.stepsize} does not match the provided interval [{lower},{upper}]."
            )
        if range // self.stepsize == 1:
            raise ValueError("Stepsize is too big, only one value allowed.")
        return self

    def round(self, values: pd.Series) -> pd.Series:
        """Round values to the stepsize of the feature. If no stepsize is provided return the
        provided values.

        Args:
            values (pd.Series): The values that should be rounded.

        Returns:
            pd.Series: The rounded values
        """
        if self.stepsize is None:
            return values
        self.validate_candidental(values=values)
        allowed_values = np.arange(
            self.lower_bound, self.upper_bound + self.stepsize, self.stepsize
        )
        idx = abs(values.values.reshape([len(values), 1]) - allowed_values).argmin(  # type: ignore
            axis=1
        )
        return pd.Series(
            data=self.lower_bound + idx * self.stepsize, index=values.index
        )

    @field_validator("bounds")
    @classmethod
    def validate_lower_upper(cls, bounds):
        """Validates that the lower bound is lower than the upper bound

        Args:
            bounds (Tuple[float, float]): Tuple with the lower and upper bound

        Raises:
            ValueError: when the lower bound is higher than the upper bound

        Returns:
            Tuple[float, float]: The validated bounds
        """
        if bounds[0] > bounds[1]:
            raise ValueError(
                f"lower bound must be <= upper bound, got {bounds[0]} > {bounds[1]}"
            )
        return bounds

    def validate_candidental(self, values: pd.Series) -> pd.Series:
        """Method to validate the suggested candidates

        Args:
            values (pd.Series): A dataFrame with candidates

        Raises:
            ValueError: when non numerical values are passed
            ValueError: when values are larger than the upper bound of the feature
            ValueError: when values are lower than the lower bound of the feature

        Returns:
            pd.Series: The passed dataFrame with candidates
        """

        noise = 10e-6
        values = super().validate_candidental(values)
        if (values < self.lower_bound - noise).any():
            raise ValueError(
                f"not all values of input feature `{self.key}`are larger than lower bound `{self.lower_bound}` "
            )
        if (values > self.upper_bound + noise).any():
            raise ValueError(
                f"not all values of input feature `{self.key}`are smaller than upper bound `{self.upper_bound}` "
            )
        return values

    def sample(self, n: int, seed: Optional[int] = None) -> pd.Series:
        """Draw random samples from the feature.

        Args:
            n (int): number of samples.

        Returns:
            pd.Series: drawn samples.
        """
        return pd.Series(
            name=self.key,
            data=np.random.default_rng(seed=seed).uniform(
                self.lower_bound, self.upper_bound, n
            ),
        )

    def get_bounds(
        self,
        transform_type: Optional[TTransform] = None,
        values: Optional[pd.Series] = None,
        reference_value: Optional[float] = None,
    ) -> Tuple[List[float], List[float]]:
        assert transform_type is None
        if reference_value is not None and values is not None:
            raise ValueError("Only one can be used, `local_value` or `values`.")
        if values is None:
            if reference_value is None or self.is_fixed():
                return [self.lower_bound], [self.upper_bound]
            else:
                local_relative_bounds = self.local_relative_bounds or (
                    math.inf,
                    math.inf,
                )
                return [
                    max(
                        reference_value - local_relative_bounds[0],
                        self.lower_bound,
                    )
                ], [
                    min(
                        reference_value + local_relative_bounds[1],
                        self.upper_bound,
                    )
                ]
        lower = min(self.lower_bound, values.min())  # type: ignore
        upper = max(self.upper_bound, values.max())  # type: ignore
        return [lower], [upper]

    def __str__(self) -> str:
        """Method to return a string of lower and upper bound

        Returns:
            str: String of a list with lower and upper bound
        """
        return f"[{self.lower_bound},{self.upper_bound}]"

__str__(self) special

Method to return a string of lower and upper bound

Returns:

Type Description
str

String of a list with lower and upper bound

Source code in bofire/data_models/features/continuous.py
def __str__(self) -> str:
    """Method to return a string of lower and upper bound

    Returns:
        str: String of a list with lower and upper bound
    """
    return f"[{self.lower_bound},{self.upper_bound}]"

get_bounds(self, transform_type=None, values=None, reference_value=None)

Returns the bounds of an input feature depending on the requested transform type.

Parameters:

Name Type Description Default
transform_type Optional[TTransform]

The requested transform type. Defaults to None.

None
values Optional[pd.Series]

If values are provided the bounds are returned taking the most extreme values for the feature into account. Defaults to None.

None
reference_value Optional[float]

If a reference value is provided, then the local bounds based on a local search region are provided. Currently only supported for continuous inputs. For more details, see https://www.merl.com/publications/docs/TR2023-057.pdf.

None

Returns:

Type Description
Tuple[List[float], List[float]]

List of lower bound values, list of upper bound values.

Source code in bofire/data_models/features/continuous.py
def get_bounds(
    self,
    transform_type: Optional[TTransform] = None,
    values: Optional[pd.Series] = None,
    reference_value: Optional[float] = None,
) -> Tuple[List[float], List[float]]:
    assert transform_type is None
    if reference_value is not None and values is not None:
        raise ValueError("Only one can be used, `local_value` or `values`.")
    if values is None:
        if reference_value is None or self.is_fixed():
            return [self.lower_bound], [self.upper_bound]
        else:
            local_relative_bounds = self.local_relative_bounds or (
                math.inf,
                math.inf,
            )
            return [
                max(
                    reference_value - local_relative_bounds[0],
                    self.lower_bound,
                )
            ], [
                min(
                    reference_value + local_relative_bounds[1],
                    self.upper_bound,
                )
            ]
    lower = min(self.lower_bound, values.min())  # type: ignore
    upper = max(self.upper_bound, values.max())  # type: ignore
    return [lower], [upper]
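
A sketch of the local search region behaviour (import assumptions as above): with local_relative_bounds set, a reference_value yields a window around it that is clipped to the feature bounds:

from bofire.data_models.features.continuous import ContinuousInput

x = ContinuousInput(key="x", bounds=(0, 10), local_relative_bounds=(1.0, 2.0))
assert x.get_bounds() == ([0.0], [10.0])
assert x.get_bounds(reference_value=9.5) == ([8.5], [10.0])  # upper end clipped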

round(self, values)

Round values to the stepsize of the feature. If no stepsize is provided, the values are returned unchanged.

Parameters:

Name Type Description Default
values pd.Series

The values that should be rounded.

required

Returns:

Type Description
pd.Series

The rounded values

Source code in bofire/data_models/features/continuous.py
def round(self, values: pd.Series) -> pd.Series:
    """Round values to the stepsize of the feature. If no stepsize is provided return the
    provided values.

    Args:
        values (pd.Series): The values that should be rounded.

    Returns:
        pd.Series: The rounded values
    """
    if self.stepsize is None:
        return values
    self.validate_candidental(values=values)
    allowed_values = np.arange(
        self.lower_bound, self.upper_bound + self.stepsize, self.stepsize
    )
    idx = abs(values.values.reshape([len(values), 1]) - allowed_values).argmin(  # type: ignore
        axis=1
    )
    return pd.Series(
        data=self.lower_bound + idx * self.stepsize, index=values.index
    )
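
A rounding sketch (import assumptions as above): values are snapped to the nearest point of the stepsize grid spanned between the bounds:

import pandas as pd
from bofire.data_models.features.continuous import ContinuousInput

x = ContinuousInput(key="x", bounds=(0, 1), stepsize=0.25)
rounded = x.round(pd.Series([0.1, 0.3, 0.7]))
assert rounded.tolist() == [0.0, 0.25, 0.75]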

sample(self, n, seed=None)

Draw random samples from the feature.

Parameters:

Name Type Description Default
n int

number of samples.

required

Returns:

Type Description
pd.Series

drawn samples.

Source code in bofire/data_models/features/continuous.py
def sample(self, n: int, seed: Optional[int] = None) -> pd.Series:
    """Draw random samples from the feature.

    Args:
        n (int): number of samples.

    Returns:
        pd.Series: drawn samples.
    """
    return pd.Series(
        name=self.key,
        data=np.random.default_rng(seed=seed).uniform(
            self.lower_bound, self.upper_bound, n
        ),
    )

validate_candidental(self, values)

Method to validate the suggested candidates

Parameters:

Name Type Description Default
values pd.Series

A series with candidate values

required

Exceptions:

Type Description
ValueError

when non numerical values are passed

ValueError

when values are larger than the upper bound of the feature

ValueError

when values are lower than the lower bound of the feature

Returns:

Type Description
pd.Series

The validated candidate values

Source code in bofire/data_models/features/continuous.py
def validate_candidental(self, values: pd.Series) -> pd.Series:
    """Method to validate the suggested candidates

    Args:
        values (pd.Series): A dataFrame with candidates

    Raises:
        ValueError: when non numerical values are passed
        ValueError: when values are larger than the upper bound of the feature
        ValueError: when values are lower than the lower bound of the feature

    Returns:
        pd.Series: The passed dataFrame with candidates
    """

    noise = 10e-6
    values = super().validate_candidental(values)
    if (values < self.lower_bound - noise).any():
        raise ValueError(
            f"not all values of input feature `{self.key}`are larger than lower bound `{self.lower_bound}` "
        )
    if (values > self.upper_bound + noise).any():
        raise ValueError(
            f"not all values of input feature `{self.key}`are smaller than upper bound `{self.upper_bound}` "
        )
    return values

validate_lower_upper(bounds) classmethod

Validates that the lower bound is lower than the upper bound

Parameters:

Name Type Description Default
bounds Tuple[float, float]

Tuple with the lower and upper bound

required

Exceptions:

Type Description
ValueError

when the lower bound is higher than the upper bound

Returns:

Type Description
Tuple[float, float]

The validated bounds

Source code in bofire/data_models/features/continuous.py
@field_validator("bounds")
@classmethod
def validate_lower_upper(cls, bounds):
    """Validates that the lower bound is lower than the upper bound

    Args:
        bounds (Tuple[float, float]): Tuple with the lower and upper bound

    Raises:
        ValueError: when the lower bound is higher than the upper bound

    Returns:
        Tuple[float, float]: The validated bounds
    """
    if bounds[0] > bounds[1]:
        raise ValueError(
            f"lower bound must be <= upper bound, got {bounds[0]} > {bounds[1]}"
        )
    return bounds

ContinuousOutput (Output)

The base class for a continuous output feature

Attributes:

Name Type Description
objective objective

objective of the feature indicating in which direction it should be optimized. Defaults to MaximizeObjective.

Source code in bofire/data_models/features/continuous.py
class ContinuousOutput(Output):
    """The base class for a continuous output feature

    Attributes:
        objective (objective, optional): objective of the feature indicating in which direction it should be optimized. Defaults to `MaximizeObjective`.
    """

    type: Literal["ContinuousOutput"] = "ContinuousOutput"
    order_id: ClassVar[int] = 9
    unit: Optional[str] = None

    objective: Optional[AnyObjective] = Field(
        default_factory=lambda: MaximizeObjective(w=1.0)
    )

    def __call__(self, values: pd.Series) -> pd.Series:
        if self.objective is None:
            return pd.Series(
                data=[np.nan for _ in range(len(values))],
                index=values.index,
                name=values.name,
            )
        return self.objective(values)  # type: ignore

    def validate_experimental(self, values: pd.Series) -> pd.Series:
        try:
            values = pd.to_numeric(values, errors="raise").astype("float64")
        except ValueError:
            raise ValueError(
                f"not all values of input feature `{self.key}` are numerical"
            )
        return values

    def __str__(self) -> str:
        return "ContinuousOutputFeature"

validate_experimental(self, values)

Method to validate the experimental Series

Parameters:

Name Type Description Default
values pd.Series

A series with values for the outcome

required

Returns:

Type Description
pd.Series

The validated experimental values

Source code in bofire/data_models/features/continuous.py
def validate_experimental(self, values: pd.Series) -> pd.Series:
    try:
        values = pd.to_numeric(values, errors="raise").astype("float64")
    except ValueError:
        raise ValueError(
            f"not all values of input feature `{self.key}` are numerical"
        )
    return values

descriptor

CategoricalDescriptorInput (CategoricalInput)

Class for categorical input features with descriptors

Attributes:

Name Type Description
categories List[str]

Names of the categories.

allowed List[bool]

List of bools indicating if a category is allowed within the optimization.

descriptors List[str]

List of strings representing the names of the descriptors.

values List[List[float]]

List of lists representing the descriptor values.

Source code in bofire/data_models/features/descriptor.py
class CategoricalDescriptorInput(CategoricalInput):
    """Class for categorical input features with descriptors

    Attributes:
        categories (List[str]): Names of the categories.
        allowed (List[bool]): List of bools indicating if a category is allowed within the optimization.
        descriptors (List[str]): List of strings representing the names of the descriptors.
        values (List[List[float]]): List of lists representing the descriptor values.
    """

    type: Literal["CategoricalDescriptorInput"] = "CategoricalDescriptorInput"
    order_id: ClassVar[int] = 6

    descriptors: Descriptors
    values: Annotated[
        List[List[float]],
        Field(min_length=1),
    ]

    @field_validator("values")
    @classmethod
    def validate_values(cls, v, info):
        """validates the compatability of passed values for the descriptors and the defined categories

        Args:
            v (List[List[float]]): Nested list with descriptor values
            values (Dict): Dictionary with attributes

        Raises:
            ValueError: when values have different length than categories
            ValueError: when rows in values have different length than descriptors
            ValueError: when a descriptor shows no variance in the data

        Returns:
            List[List[float]]: Nested list with descriptor values
        """
        if len(v) != len(info.data["categories"]):
            raise ValueError("values must have same length as categories")
        for row in v:
            if len(row) != len(info.data["descriptors"]):
                raise ValueError("rows in values must have same length as descriptors")
        a = np.array(v)
        for i, d in enumerate(info.data["descriptors"]):
            if len(set(a[:, i])) == 1:
                raise ValueError(f"No variation for descriptor {d}.")
        return v

    @staticmethod
    def valid_transform_types() -> List[CategoricalEncodingEnum]:
        return [
            CategoricalEncodingEnum.ONE_HOT,
            CategoricalEncodingEnum.DUMMY,
            CategoricalEncodingEnum.ORDINAL,
            CategoricalEncodingEnum.DESCRIPTOR,
        ]

    def to_df(self):
        """tabular overview of the feature as DataFrame

        Returns:
            pd.DataFrame: tabular overview of the feature as DataFrame
        """
        data = dict(zip(self.categories, self.values))
        return pd.DataFrame.from_dict(data, orient="index", columns=self.descriptors)

    def fixed_value(
        self, transform_type: Optional[TTransform] = None
    ) -> Union[List[str], List[float], None]:
        """Returns the categories to which the feature is fixed, None if the feature is not fixed

        Returns:
            List[str]: List of categories or None
        """
        if transform_type != CategoricalEncodingEnum.DESCRIPTOR:
            return super().fixed_value(transform_type)
        else:
            val = self.get_allowed_categories()[0]
            return self.to_descriptor_encoding(pd.Series([val])).values[0].tolist()

    def get_bounds(
        self,
        transform_type: TTransform,
        values: Optional[pd.Series] = None,
        reference_value: Optional[str] = None,
    ) -> Tuple[List[float], List[float]]:
        if transform_type != CategoricalEncodingEnum.DESCRIPTOR:
            return super().get_bounds(transform_type, values)
        else:
            # in case that values is None, we return the optimization bounds
            # else we return the complete bounds
            if values is None:
                df = self.to_df().loc[self.get_allowed_categories()]
            else:
                df = self.to_df()
            lower = df.min().values.tolist()  # type: ignore
            upper = df.max().values.tolist()  # type: ignore
            return lower, upper

    def validate_experimental(
        self, values: pd.Series, strict: bool = False
    ) -> pd.Series:
        """Method to validate the experimental dataFrame

        Args:
            values (pd.Series): A dataFrame with experiments
            strict (bool, optional): Boolean to distinguish if the occurence of fixed features in the dataset should be considered or not. Defaults to False.

        Raises:
            ValueError: when an entry is not in the list of allowed categories
            ValueError: when there is no variation in a feature provided by the experimental data
            ValueError: when no variation is present or planed for a given descriptor

        Returns:
            pd.Series: A dataFrame with experiments
        """
        values = super().validate_experimental(values, strict)
        if strict:
            lower, upper = self.get_bounds(
                transform_type=CategoricalEncodingEnum.DESCRIPTOR, values=values
            )
            for i, desc in enumerate(self.descriptors):
                if lower[i] == upper[i]:
                    raise ValueError(
                        f"No variation present or planned for descriptor {desc} for feature {self.key}. Remove the descriptor."
                    )
        return values

    @classmethod
    def from_df(cls, key: str, df: pd.DataFrame):
        """Creates a feature from a dataframe

        Args:
            key (str): The name of the feature
            df (pd.DataFrame): Categories as rows and descriptors as columns

        Returns:
            CategoricalDescriptorInput: The feature created from the dataframe
        """
        return cls(
            key=key,
            categories=list(df.index),
            allowed=[True for _ in range(len(df))],
            descriptors=list(df.columns),
            values=df.values.tolist(),
        )

    def to_descriptor_encoding(self, values: pd.Series) -> pd.DataFrame:
        """Converts values to descriptor encoding.

        Args:
            values (pd.Series): Values to transform.

        Returns:
            pd.DataFrame: Descriptor encoded dataframe.
        """
        return pd.DataFrame(
            data=values.map(dict(zip(self.categories, self.values))).values.tolist(),  # type: ignore
            columns=[get_encoded_name(self.key, d) for d in self.descriptors],
            index=values.index,
        )

    def from_descriptor_encoding(self, values: pd.DataFrame) -> pd.Series:
        """Converts values back from descriptor encoding.

        Args:
            values (pd.DataFrame): Descriptor encoded dataframe.

        Raises:
            ValueError: If descriptor columns not found in the dataframe.

        Returns:
            pd.Series: Series with categorical values.
        """
        cat_cols = [get_encoded_name(self.key, d) for d in self.descriptors]
        # we explicitly allow the dataframe to have more columns than needed,
        # to make the backtransform easier.
        if np.any([c not in values.columns for c in cat_cols]):
            raise ValueError(
                f"{self.key}: Column names don't match categorical levels: {values.columns}, {cat_cols}."
            )
        s = pd.DataFrame(
            data=np.sqrt(
                np.sum(
                    (
                        values[cat_cols].to_numpy()[:, np.newaxis, :]
                        - self.to_df().iloc[self.allowed].to_numpy()
                    )
                    ** 2,
                    axis=2,
                )
            ),
            columns=self.get_allowed_categories(),
            index=values.index,
        ).idxmin(1)
        s.name = self.key
        return s
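
A usage sketch (import path per the source location above): each category carries a row of descriptor values, to_descriptor_encoding maps categories to those rows, and from_descriptor_encoding picks the nearest allowed category in descriptor space:

import pandas as pd
from bofire.data_models.features.descriptor import CategoricalDescriptorInput

catalyst = CategoricalDescriptorInput(
    key="catalyst",
    categories=["Pd", "Pt"],
    descriptors=["price", "activity"],
    values=[[1.0, 0.5], [3.0, 0.9]],
)
encoded = catalyst.to_descriptor_encoding(pd.Series(["Pt", "Pd"]))
# encoded has columns catalyst_price and catalyst_activity
assert catalyst.from_descriptor_encoding(encoded).tolist() == ["Pt", "Pd"]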

fixed_value(self, transform_type=None)

Returns the categories to which the feature is fixed, or None if the feature is not fixed

Returns:

Type Description
List[str]

List of categories or None

Source code in bofire/data_models/features/descriptor.py
def fixed_value(
    self, transform_type: Optional[TTransform] = None
) -> Union[List[str], List[float], None]:
    """Returns the categories to which the feature is fixed, None if the feature is not fixed

    Returns:
        List[str]: List of categories or None
    """
    if transform_type != CategoricalEncodingEnum.DESCRIPTOR:
        return super().fixed_value(transform_type)
    else:
        val = self.get_allowed_categories()[0]
        return self.to_descriptor_encoding(pd.Series([val])).values[0].tolist()

from_descriptor_encoding(self, values)

Converts values back from descriptor encoding.

Parameters:

Name Type Description Default
values pd.DataFrame

Descriptor encoded dataframe.

required

Exceptions:

Type Description
ValueError

If descriptor columns not found in the dataframe.

Returns:

Type Description
pd.Series

Series with categorical values.

Source code in bofire/data_models/features/descriptor.py
def from_descriptor_encoding(self, values: pd.DataFrame) -> pd.Series:
    """Converts values back from descriptor encoding.

    Args:
        values (pd.DataFrame): Descriptor encoded dataframe.

    Raises:
        ValueError: If descriptor columns not found in the dataframe.

    Returns:
        pd.Series: Series with categorical values.
    """
    cat_cols = [get_encoded_name(self.key, d) for d in self.descriptors]
    # we explicitly allow the dataframe to have more columns than needed,
    # to make the backtransform easier.
    if np.any([c not in values.columns for c in cat_cols]):
        raise ValueError(
            f"{self.key}: Column names don't match categorical levels: {values.columns}, {cat_cols}."
        )
    s = pd.DataFrame(
        data=np.sqrt(
            np.sum(
                (
                    values[cat_cols].to_numpy()[:, np.newaxis, :]
                    - self.to_df().iloc[self.allowed].to_numpy()
                )
                ** 2,
                axis=2,
            )
        ),
        columns=self.get_allowed_categories(),
        index=values.index,
    ).idxmin(1)
    s.name = self.key
    return s

from_df(key, df) classmethod

Creates a feature from a dataframe

Parameters:

Name Type Description Default
key str

The name of the feature

required
df pd.DataFrame

Categories as rows and descriptors as columns

required

Returns:

Type Description
CategoricalDescriptorInput

The feature created from the dataframe

Source code in bofire/data_models/features/descriptor.py
@classmethod
def from_df(cls, key: str, df: pd.DataFrame):
    """Creates a feature from a dataframe

    Args:
        key (str): The name of the feature
        df (pd.DataFrame): Categories as rows and descriptors as columns

    Returns:
        CategoricalDescriptorInput: The created feature.
    """
    return cls(
        key=key,
        categories=list(df.index),
        allowed=[True for _ in range(len(df))],
        descriptors=list(df.columns),
        values=df.values.tolist(),
    )
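
Example (a minimal sketch):

import pandas as pd
from bofire.data_models.features.descriptor import CategoricalDescriptorInput

df = pd.DataFrame(
    {"price": [1.0, 2.0], "activity": [10.0, 5.0]},
    index=["A", "B"],
)
feat = CategoricalDescriptorInput.from_df("catalyst", df)
# categories=["A", "B"], descriptors=["price", "activity"],
# with all categories allowed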

get_bounds(self, transform_type, values=None, reference_value=None)

Returns the bounds of an input feature depending on the requested transform type.

Parameters:

Name Type Description Default
transform_type TTransform

The requested transform type.

required
values Optional[pd.Series]

If values are provided the bounds are returned taking the most extreme values for the feature into account. Defaults to None.

None
reference_value Optional[str]

If a reference value is provided, the local bounds based on a local search region are returned. Currently only supported for continuous inputs. For more details, see https://www.merl.com/publications/docs/TR2023-057.pdf.

None

Returns:

Type Description
Tuple[List[float], List[float]]

List of lower bound values, list of upper bound values.

Source code in bofire/data_models/features/descriptor.py
def get_bounds(
    self,
    transform_type: TTransform,
    values: Optional[pd.Series] = None,
    reference_value: Optional[str] = None,
) -> Tuple[List[float], List[float]]:
    if transform_type != CategoricalEncodingEnum.DESCRIPTOR:
        return super().get_bounds(transform_type, values)
    else:
        # in case that values is None, we return the optimization bounds
        # else we return the complete bounds
        if values is None:
            df = self.to_df().loc[self.get_allowed_categories()]
        else:
            df = self.to_df()
        lower = df.min().values.tolist()  # type: ignore
        upper = df.max().values.tolist()  # type: ignore
        return lower, upper
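
Example (a minimal sketch; it assumes CategoricalEncodingEnum can be imported from bofire.data_models.enum):

from bofire.data_models.enum import CategoricalEncodingEnum
from bofire.data_models.features.descriptor import CategoricalDescriptorInput

feat = CategoricalDescriptorInput(
    key="catalyst",
    categories=["A", "B"],
    descriptors=["price", "activity"],
    values=[[1.0, 10.0], [2.0, 5.0]],
)
lower, upper = feat.get_bounds(transform_type=CategoricalEncodingEnum.DESCRIPTOR)
# lower == [1.0, 5.0], upper == [2.0, 10.0]: the per-descriptor minimum and
# maximum over the allowed categories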

to_descriptor_encoding(self, values)

Converts values to descriptor encoding.

Parameters:

Name Type Description Default
values pd.Series

Values to transform.

required

Returns:

Type Description
pd.DataFrame

Descriptor encoded dataframe.

Source code in bofire/data_models/features/descriptor.py
def to_descriptor_encoding(self, values: pd.Series) -> pd.DataFrame:
    """Converts values to descriptor encoding.

    Args:
        values (pd.Series): Values to transform.

    Returns:
        pd.DataFrame: Descriptor encoded dataframe.
    """
    return pd.DataFrame(
        data=values.map(dict(zip(self.categories, self.values))).values.tolist(),  # type: ignore
        columns=[get_encoded_name(self.key, d) for d in self.descriptors],
        index=values.index,
    )

to_df(self)

tabular overview of the feature as DataFrame

Returns:

Type Description
pd.DataFrame

tabular overview of the feature as DataFrame

Source code in bofire/data_models/features/descriptor.py
def to_df(self):
    """tabular overview of the feature as DataFrame

    Returns:
        pd.DataFrame: tabular overview of the feature as DataFrame
    """
    data = dict(zip(self.categories, self.values))
    return pd.DataFrame.from_dict(data, orient="index", columns=self.descriptors)

validate_experimental(self, values, strict=False)

Method to validate the experimental dataFrame

Parameters:

Name Type Description Default
values pd.Series

A dataFrame with experiments

required
strict bool

Boolean to distinguish if the occurrence of fixed features in the dataset should be considered or not. Defaults to False.

False

Exceptions:

Type Description
ValueError

when an entry is not in the list of allowed categories

ValueError

when there is no variation in a feature provided by the experimental data

ValueError

when no variation is present or planned for a given descriptor

Returns:

Type Description
pd.Series

A dataFrame with experiments

Source code in bofire/data_models/features/descriptor.py
def validate_experimental(
    self, values: pd.Series, strict: bool = False
) -> pd.Series:
    """Method to validate the experimental dataFrame

    Args:
        values (pd.Series): A dataFrame with experiments
        strict (bool, optional): Boolean to distinguish if the occurrence of fixed features in the dataset should be considered or not. Defaults to False.

    Raises:
        ValueError: when an entry is not in the list of allowed categories
        ValueError: when there is no variation in a feature provided by the experimental data
        ValueError: when no variation is present or planned for a given descriptor

    Returns:
        pd.Series: A dataFrame with experiments
    """
    values = super().validate_experimental(values, strict)
    if strict:
        lower, upper = self.get_bounds(
            transform_type=CategoricalEncodingEnum.DESCRIPTOR, values=values
        )
        for i, desc in enumerate(self.descriptors):
            if lower[i] == upper[i]:
                raise ValueError(
                    f"No variation present or planned for descriptor {desc} for feature {self.key}. Remove the descriptor."
                )
    return values

validate_values(v, info) classmethod

validates the compatibility of the passed values with the defined descriptors and categories

Parameters:

Name Type Description Default
v List[List[float]]

Nested list with descriptor values

required
info ValidationInfo

Validation info carrying the previously validated fields (categories, descriptors)

required

Exceptions:

Type Description
ValueError

when values have different length than categories

ValueError

when rows in values have different length than descriptors

ValueError

when a descriptor shows no variance in the data

Returns:

Type Description
List[List[float]]

Nested list with descriptor values

Source code in bofire/data_models/features/descriptor.py
@field_validator("values")
@classmethod
def validate_values(cls, v, info):
    """validates the compatability of passed values for the descriptors and the defined categories

    Args:
        v (List[List[float]]): Nested list with descriptor values
        info (ValidationInfo): Validation info carrying the previously validated fields

    Raises:
        ValueError: when values have different length than categories
        ValueError: when rows in values have different length than descriptors
        ValueError: when a descriptor shows no variance in the data

    Returns:
        List[List[float]]: Nested list with descriptor values
    """
    if len(v) != len(info.data["categories"]):
        raise ValueError("values must have same length as categories")
    for row in v:
        if len(row) != len(info.data["descriptors"]):
            raise ValueError("rows in values must have same length as descriptors")
    a = np.array(v)
    for i, d in enumerate(info.data["descriptors"]):
        if len(set(a[:, i])) == 1:
            raise ValueError(f"No variation for descriptor {d}.")
    return v
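
Example (a minimal sketch of a construction that fails this validator):

from bofire.data_models.features.descriptor import CategoricalDescriptorInput

CategoricalDescriptorInput(
    key="catalyst",
    categories=["A", "B"],
    descriptors=["price"],
    values=[[1.0], [1.0]],
)
# raises ValueError: No variation for descriptor price.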

ContinuousDescriptorInput (ContinuousInput)

Class for continuous input features with descriptors

Attributes:

Name Type Description
lower_bound float

Lower bound of the feature in the optimization.

upper_bound float

Upper bound of the feature in the optimization.

descriptors List[str]

Names of the descriptors.

values List[float]

Values of the descriptors.

Source code in bofire/data_models/features/descriptor.py
class ContinuousDescriptorInput(ContinuousInput):
    """Class for continuous input features with descriptors

    Attributes:
        lower_bound (float): Lower bound of the feature in the optimization.
        upper_bound (float): Upper bound of the feature in the optimization.
        descriptors (List[str]): Names of the descriptors.
        values (List[float]): Values of the descriptors.
    """

    type: Literal["ContinuousDescriptorInput"] = "ContinuousDescriptorInput"
    order_id: ClassVar[int] = 2

    descriptors: Descriptors
    values: DiscreteVals

    @model_validator(mode="after")
    def validate_list_lengths(self):
        """compares the length of the defined descriptors list with the provided values

        Args:
            values (Dict): Dictionary with all attribues

        Raises:
            ValueError: when the number of descriptors does not math the number of provided values

        Returns:
            Dict: Dict with the attributes
        """
        if len(self.descriptors) != len(self.values):
            raise ValueError(
                'must provide same number of descriptors and values, got {len(values["descriptors"])} != {len(values["values"])}'
            )
        return self

    def to_df(self) -> pd.DataFrame:
        """tabular overview of the feature as DataFrame

        Returns:
            pd.DataFrame: tabular overview of the feature as DataFrame
        """
        return pd.DataFrame(
            data=[self.values], index=[self.key], columns=self.descriptors
        )
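
Example (a minimal sketch; it assumes the ContinuousInput constructor accepts bounds=(lower, upper), as in recent BoFire releases):

from bofire.data_models.features.descriptor import ContinuousDescriptorInput

temp = ContinuousDescriptorInput(
    key="temperature",
    bounds=(20.0, 80.0),  # assumed ContinuousInput constructor argument
    descriptors=["boiling_point"],
    values=[100.0],
)
temp.to_df()  # one row ("temperature") with one column ("boiling_point")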

to_df(self)

tabular overview of the feature as DataFrame

Returns:

Type Description
pd.DataFrame

tabular overview of the feature as DataFrame

Source code in bofire/data_models/features/descriptor.py
def to_df(self) -> pd.DataFrame:
    """tabular overview of the feature as DataFrame

    Returns:
        pd.DataFrame: tabular overview of the feature as DataFrame
    """
    return pd.DataFrame(
        data=[self.values], index=[self.key], columns=self.descriptors
    )

validate_list_lengths(self)

Compares the length of the defined descriptors list with the provided values.

Exceptions:

Type Description
ValueError

when the number of descriptors does not match the number of provided values

Returns:

Type Description
ContinuousDescriptorInput

the validated feature

Source code in bofire/data_models/features/descriptor.py
@model_validator(mode="after")
def validate_list_lengths(self):
    """compares the length of the defined descriptors list with the provided values

    Args:
        values (Dict): Dictionary with all attribues

    Raises:
        ValueError: when the number of descriptors does not math the number of provided values

    Returns:
        Dict: Dict with the attributes
    """
    if len(self.descriptors) != len(self.values):
        raise ValueError(
            'must provide same number of descriptors and values, got {len(values["descriptors"])} != {len(values["values"])}'
        )
    return self

discrete

DiscreteInput (NumericalInput)

Feature with discretized ordinal values allowed in the optimization.

Attributes:

Name Type Description
key str

key of the feature.

values List[float]

the discretized allowed values during the optimization.

Source code in bofire/data_models/features/discrete.py
class DiscreteInput(NumericalInput):
    """Feature with discretized ordinal values allowed in the optimization.

    Attributes:
        key(str): key of the feature.
        values(List[float]): the discretized allowed values during the optimization.
    """

    type: Literal["DiscreteInput"] = "DiscreteInput"
    order_id: ClassVar[int] = 3

    values: DiscreteVals

    @field_validator("values")
    @classmethod
    def validate_values_unique(cls, values):
        """Validates that provided values are unique.

        Args:
            values (List[float]): List of values

        Raises:
            ValueError: when values are non-unique.
            ValueError: when values contains only one entry.
            ValueError: when values is empty.

        Returns:
            List[values]: Sorted list of values
        """
        if len(values) != len(set(values)):
            raise ValueError("Discrete values must be unique")
        if len(values) == 1:
            raise ValueError(
                "Fixed discrete inputs are not supported. Please use a fixed continuous input."
            )
        if len(values) == 0:
            raise ValueError("No values defined.")
        return sorted(values)

    @property
    def lower_bound(self) -> float:
        """Lower bound of the set of allowed values"""
        return min(self.values)

    @property
    def upper_bound(self) -> float:
        """Upper bound of the set of allowed values"""
        return max(self.values)

    def validate_candidental(self, values: pd.Series) -> pd.Series:
        """Method to validate the provided candidates.

        Args:
            values (pd.Series): suggested candidates for the feature

        Raises:
            ValueError: Raises error when one of the provided values is not contained in the list of allowed values.

        Returns:
            pd.Series: suggested candidates for the feature
        """
        values = super().validate_candidental(values)
        if not np.isin(values.to_numpy(), np.array(self.values)).all():
            raise ValueError(
                f"Not allowed values in candidates for feature {self.key}."
            )
        return values

    def sample(self, n: int, seed: Optional[int] = None) -> pd.Series:
        """Draw random samples from the feature.

        Args:
            n (int): number of samples.
            seed (int, optional): random seed. Defaults to None.

        Returns:
            pd.Series: drawn samples.
        """
        return pd.Series(
            name=self.key, data=np.random.default_rng(seed=seed).choice(self.values, n)
        )

    def from_continuous(self, values: pd.DataFrame) -> pd.Series:
        """Rounds continuous values to the closest discrete ones.

        Args:
            values (pd.DataFrame): Dataframe with continuous entries.

        Returns:
            pd.Series: Series with discrete values.
        """

        s = pd.DataFrame(
            data=np.abs(
                (values[self.key].to_numpy()[:, np.newaxis] - np.array(self.values))
            ),
            columns=self.values,
            index=values.index,
        ).idxmin(1)
        s.name = self.key
        return s

    def get_bounds(
        self,
        transform_type: Optional[TTransform] = None,
        values: Optional[pd.Series] = None,
        reference_value: Optional[float] = None,
    ) -> Tuple[List[float], List[float]]:
        assert transform_type is None
        if values is None:
            return [self.lower_bound], [self.upper_bound]  # type: ignore
        lower = min(self.lower_bound, values.min())  # type: ignore
        upper = max(self.upper_bound, values.max())  # type: ignore
        return [lower], [upper]  # type: ignore
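
Example (a minimal sketch):

import pandas as pd
from bofire.data_models.features.discrete import DiscreteInput

conc = DiscreteInput(key="conc", values=[0.1, 0.2, 0.5])
conc.sample(3, seed=42)  # draws three of the allowed values at random
conc.from_continuous(pd.DataFrame({"conc": [0.12, 0.4]}))
# pd.Series [0.1, 0.5]: each entry is rounded to the closest allowed value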

lower_bound: float property readonly

Lower bound of the set of allowed values

upper_bound: float property readonly

Upper bound of the set of allowed values

from_continuous(self, values)

Rounds continuous values to the closest discrete ones.

Parameters:

Name Type Description Default
values pd.DataFrame

Dataframe with continuous entries.

required

Returns:

Type Description
pd.Series

Series with discrete values.

Source code in bofire/data_models/features/discrete.py
def from_continuous(self, values: pd.DataFrame) -> pd.Series:
    """Rounds continuous values to the closest discrete ones.

    Args:
        values (pd.DataFrame): Dataframe with continuous entries.

    Returns:
        pd.Series: Series with discrete values.
    """

    s = pd.DataFrame(
        data=np.abs(
            (values[self.key].to_numpy()[:, np.newaxis] - np.array(self.values))
        ),
        columns=self.values,
        index=values.index,
    ).idxmin(1)
    s.name = self.key
    return s

get_bounds(self, transform_type=None, values=None, reference_value=None)

Returns the bounds of an input feature depending on the requested transform type.

Parameters:

Name Type Description Default
transform_type Optional[TTransform]

The requested transform type. Defaults to None.

None
values Optional[pd.Series]

If values are provided the bounds are returned taking the most extreme values for the feature into account. Defaults to None.

None
reference_value Optional[float]

If a reference value is provided, the local bounds based on a local search region are returned. Currently only supported for continuous inputs. For more details, see https://www.merl.com/publications/docs/TR2023-057.pdf.

None

Returns:

Type Description
Tuple[List[float], List[float]]

List of lower bound values, list of upper bound values.

Source code in bofire/data_models/features/discrete.py
def get_bounds(
    self,
    transform_type: Optional[TTransform] = None,
    values: Optional[pd.Series] = None,
    reference_value: Optional[float] = None,
) -> Tuple[List[float], List[float]]:
    assert transform_type is None
    if values is None:
        return [self.lower_bound], [self.upper_bound]  # type: ignore
    lower = min(self.lower_bound, values.min())  # type: ignore
    upper = max(self.upper_bound, values.max())  # type: ignore
    return [lower], [upper]  # type: ignore
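
Example (a minimal sketch):

import pandas as pd
from bofire.data_models.features.discrete import DiscreteInput

conc = DiscreteInput(key="conc", values=[0.1, 0.2, 0.5])
conc.get_bounds()  # ([0.1], [0.5])
conc.get_bounds(values=pd.Series([0.05, 0.7]))  # ([0.05], [0.7])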

sample(self, n, seed=None)

Draw random samples from the feature.

Parameters:

Name Type Description Default
n int

number of samples.

required
seed Optional[int]

random seed. Defaults to None.

None

Returns:

Type Description
pd.Series

drawn samples.

Source code in bofire/data_models/features/discrete.py
def sample(self, n: int, seed: Optional[int] = None) -> pd.Series:
    """Draw random samples from the feature.

    Args:
        n (int): number of samples.
        seed (int, optional): random seed. Defaults to None.

    Returns:
        pd.Series: drawn samples.
    """
    return pd.Series(
        name=self.key, data=np.random.default_rng(seed=seed).choice(self.values, n)
    )

validate_candidental(self, values)

Method to validate the provided candidates.

Parameters:

Name Type Description Default
values pd.Series

suggested candidates for the feature

required

Exceptions:

Type Description
ValueError

Raises error when one of the provided values is not contained in the list of allowed values.

Returns:

Type Description
pd.Series

suggested candidates for the feature

Source code in bofire/data_models/features/discrete.py
def validate_candidental(self, values: pd.Series) -> pd.Series:
    """Method to validate the provided candidates.

    Args:
        values (pd.Series): suggested candidates for the feature

    Raises:
        ValueError: Raises error when one of the provided values is not contained in the list of allowed values.

    Returns:
        pd.Series: suggested candidates for the feature
    """
    values = super().validate_candidental(values)
    if not np.isin(values.to_numpy(), np.array(self.values)).all():
        raise ValueError(
            f"Not allowed values in candidates for feature {self.key}."
        )
    return values

validate_values_unique(values) classmethod

Validates that provided values are unique.

Parameters:

Name Type Description Default
values List[float]

List of values

required

Exceptions:

Type Description
ValueError

when values are non-unique.

ValueError

when values contains only one entry.

ValueError

when values is empty.

Returns:

Type Description
List[values]

Sorted list of values

Source code in bofire/data_models/features/discrete.py
@field_validator("values")
@classmethod
def validate_values_unique(cls, values):
    """Validates that provided values are unique.

    Args:
        values (List[float]): List of values

    Raises:
        ValueError: when values are non-unique.
        ValueError: when values contains only one entry.
        ValueError: when values is empty.

    Returns:
        List[values]: Sorted list of values
    """
    if len(values) != len(set(values)):
        raise ValueError("Discrete values must be unique")
    if len(values) == 1:
        raise ValueError(
            "Fixed discrete inputs are not supported. Please use a fixed continuous input."
        )
    if len(values) == 0:
        raise ValueError("No values defined.")
    return sorted(values)

feature

Feature (BaseModel)

The base class for all features.

Source code in bofire/data_models/features/feature.py
class Feature(BaseModel):
    """The base class for all features."""

    type: str
    key: str
    order_id: ClassVar[int] = -1

    def __lt__(self, other) -> bool:
        """
        Method to compare two models to get them in the desired order.
        Return True if other is larger than self, else False. (see FEATURE_ORDER)

        Args:
            other: The other class to compare to self

        Returns:
            bool: True if the other class is larger than self, else False
        """
        order_self = self.order_id
        order_other = other.order_id
        if order_self == order_other:
            return self.key < other.key
        else:
            return order_self < order_other

__lt__(self, other) special

Method to compare two models to get them in the desired order. Return True if other is larger than self, else False. (see FEATURE_ORDER)

Parameters:

Name Type Description Default
other

The other class to compare to self

required

Returns:

Type Description
bool

True if the other class is larger than self, else False

Source code in bofire/data_models/features/feature.py
def __lt__(self, other) -> bool:
    """
    Method to compare two models to get them in the desired order.
    Return True if other is larger than self, else False. (see FEATURE_ORDER)

    Args:
        other: The other class to compare to self

    Returns:
        bool: True if the other class is larger than self, else False
    """
    order_self = self.order_id
    order_other = other.order_id
    if order_self == order_other:
        return self.key < other.key
    else:
        return order_self < order_other
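
Example (a minimal sketch of the resulting sort order):

from bofire.data_models.features.categorical import CategoricalInput
from bofire.data_models.features.discrete import DiscreteInput

feats = [
    CategoricalInput(key="cat", categories=["a", "b"]),
    DiscreteInput(key="x", values=[0.0, 1.0]),
]
sorted(feats)
# DiscreteInput (order_id=3) sorts before CategoricalInput (order_id=7);
# features with equal order_id are sorted by key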

Input (Feature)

Base class for all input features.

Source code in bofire/data_models/features/feature.py
class Input(Feature):
    """Base class for all input features."""

    @staticmethod
    @abstractmethod
    def valid_transform_types() -> List[Union[CategoricalEncodingEnum, AnyMolFeatures]]:
        pass

    @abstractmethod
    def is_fixed(self) -> bool:
        """Indicates if a variable is set to a fixed value.

        Returns:
            bool: True if fixed, else False.
        """
        pass

    @abstractmethod
    def fixed_value(
        self, transform_type: Optional[TTransform] = None
    ) -> Union[None, List[str], List[float]]:
        """Method to return the fixed value in case of a fixed feature.

        Returns:
            Union[None, List[str], List[float]]: None in case the feature is not fixed, else the fixed value.
        """
        pass

    @abstractmethod
    def validate_experimental(
        self, values: pd.Series, strict: bool = False
    ) -> pd.Series:
        """Abstract method to validate the experimental dataFrame

        Args:
            values (pd.Series): A dataFrame with experiments
            strict (bool, optional): Boolean to distinguish if the occurrence of fixed features in the dataset should be considered or not. Defaults to False.

        Returns:
            pd.Series: The passed dataFrame with experiments
        """
        pass

    @abstractmethod
    def validate_candidental(self, values: pd.Series) -> pd.Series:
        """Abstract method to validate the suggested candidates

        Args:
            values (pd.Series): A dataFrame with candidates

        Returns:
            pd.Series: The passed dataFrame with candidates
        """
        pass

    @abstractmethod
    def sample(self, n: int, seed: Optional[int] = None) -> pd.Series:
        """Sample a series of allowed values.

        Args:
            n (int): Number of samples

        Returns:
            pd.Series: Sampled values.
        """
        pass

    @abstractmethod
    def get_bounds(
        self,
        transform_type: Optional[TTransform] = None,
        values: Optional[pd.Series] = None,
        reference_value: Optional[Union[float, str]] = None,
    ) -> Tuple[List[float], List[float]]:
        """Returns the bounds of an input feature depending on the requested transform type.

        Args:
            transform_type (Optional[TTransform], optional): The requested transform type. Defaults to None.
            values (Optional[pd.Series], optional): If values are provided the bounds are returned taking
                the most extreme values for the feature into account. Defaults to None.
            reference_value (Optional[Union[float, str]], optional): If a reference value is provided, the local bounds based
                on a local search region are returned. Currently only supported for continuous inputs. For more
                details, see https://www.merl.com/publications/docs/TR2023-057.pdf.
        Returns:
            Tuple[List[float], List[float]]: List of lower bound values, list of upper bound values.
        """
        pass

fixed_value(self, transform_type=None)

Method to return the fixed value in case of a fixed feature.

Returns:

Type Description
Union[None, List[str], List[float]]

None in case the feature is not fixed, else the fixed value.

Source code in bofire/data_models/features/feature.py
@abstractmethod
def fixed_value(
    self, transform_type: Optional[TTransform] = None
) -> Union[None, List[str], List[float]]:
    """Method to return the fixed value in case of a fixed feature.

    Returns:
        Union[None, List[str], List[float]]: None in case the feature is not fixed, else the fixed value.
    """
    pass

get_bounds(self, transform_type=None, values=None, reference_value=None)

Returns the bounds of an input feature depending on the requested transform type.

Parameters:

Name Type Description Default
transform_type Optional[TTransform]

The requested transform type. Defaults to None.

None
values Optional[pd.Series]

If values are provided the bounds are returned taking the most extreme values for the feature into account. Defaults to None.

None
reference_value Optional[Union[float, str]]

If a reference value is provided, the local bounds based on a local search region are returned. Currently only supported for continuous inputs. For more details, see https://www.merl.com/publications/docs/TR2023-057.pdf.

None

Returns:

Type Description
Tuple[List[float], List[float]]

List of lower bound values, list of upper bound values.

Source code in bofire/data_models/features/feature.py
@abstractmethod
def get_bounds(
    self,
    transform_type: Optional[TTransform] = None,
    values: Optional[pd.Series] = None,
    reference_value: Optional[Union[float, str]] = None,
) -> Tuple[List[float], List[float]]:
    """Returns the bounds of an input feature depending on the requested transform type.

    Args:
        transform_type (Optional[TTransform], optional): The requested transform type. Defaults to None.
        values (Optional[pd.Series], optional): If values are provided the bounds are returned taking
            the most extreme values for the feature into account. Defaults to None.
        reference_value (Optional[Union[float, str]], optional): If a reference value is provided, the local bounds based
            on a local search region are returned. Currently only supported for continuous inputs. For more
            details, see https://www.merl.com/publications/docs/TR2023-057.pdf.
    Returns:
        Tuple[List[float], List[float]]: List of lower bound values, list of upper bound values.
    """
    pass

is_fixed(self)

Indicates if a variable is set to a fixed value.

Returns:

Type Description
bool

True if fixed, else False.

Source code in bofire/data_models/features/feature.py
@abstractmethod
def is_fixed(self) -> bool:
    """Indicates if a variable is set to a fixed value.

    Returns:
        bool: True if fixed, else False.
    """
    pass

sample(self, n, seed=None)

Sample a series of allowed values.

Parameters:

Name Type Description Default
n int

Number of samples

required

Returns:

Type Description
pd.Series

Sampled values.

Source code in bofire/data_models/features/feature.py
@abstractmethod
def sample(self, n: int, seed: Optional[int] = None) -> pd.Series:
    """Sample a series of allowed values.

    Args:
        n (int): Number of samples

    Returns:
        pd.Series: Sampled values.
    """
    pass

validate_candidental(self, values)

Abstract method to validate the suggested candidates

Parameters:

Name Type Description Default
values pd.Series

A dataFrame with candidates

required

Returns:

Type Description
pd.Series

The passed dataFrame with candidates

Source code in bofire/data_models/features/feature.py
@abstractmethod
def validate_candidental(self, values: pd.Series) -> pd.Series:
    """Abstract method to validate the suggested candidates

    Args:
        values (pd.Series): A dataFrame with candidates

    Returns:
        pd.Series: The passed dataFrame with candidates
    """
    pass

validate_experimental(self, values, strict=False)

Abstract method to validate the experimental dataFrame

Parameters:

Name Type Description Default
values pd.Series

A dataFrame with experiments

required
strict bool

Boolean to distinguish if the occurrence of fixed features in the dataset should be considered or not. Defaults to False.

False

Returns:

Type Description
pd.Series

The passed dataFrame with experiments

Source code in bofire/data_models/features/feature.py
@abstractmethod
def validate_experimental(
    self, values: pd.Series, strict: bool = False
) -> pd.Series:
    """Abstract method to validate the experimental dataFrame

    Args:
        values (pd.Series): A dataFrame with experiments
        strict (bool, optional): Boolean to distinguish if the occurrence of fixed features in the dataset should be considered or not. Defaults to False.

    Returns:
        pd.Series: The passed dataFrame with experiments
    """
    pass

Output (Feature)

Base class for all output features.

Attributes:

Name Type Description
key str

Key of the Feature.

Source code in bofire/data_models/features/feature.py
class Output(Feature):
    """Base class for all output features.

    Attributes:
        key(str): Key of the Feature.
    """

    @abstractmethod
    def __call__(self, values: pd.Series) -> pd.Series:
        pass

    @abstractmethod
    def validate_experimental(self, values: pd.Series) -> pd.Series:
        """Abstract method to validate the experimental Series

        Args:
            values (pd.Series): A dataFrame with values for the outcome

        Returns:
            pd.Series: The passed dataFrame with experiments
        """
        pass

validate_experimental(self, values)

Abstract method to validate the experimental Series

Parameters:

Name Type Description Default
values pd.Series

A dataFrame with values for the outcome

required

Returns:

Type Description
pd.Series

The passed dataFrame with experiments

Source code in bofire/data_models/features/feature.py
@abstractmethod
def validate_experimental(self, values: pd.Series) -> pd.Series:
    """Abstract method to validate the experimental Series

    Args:
        values (pd.Series): A dataFrame with values for the outcome

    Returns:
        pd.Series: The passed dataFrame with experiments
    """
    pass

get_encoded_name(feature_key, option_name)

Get the name of the encoded column. Option could be the category or the descriptor name.

Source code in bofire/data_models/features/feature.py
def get_encoded_name(feature_key: str, option_name: str) -> str:
    """Get the name of the encoded column. Option could be the category or the descriptor name."""
    return f"{feature_key}_{option_name}"

molecular

CategoricalMolecularInput (CategoricalInput, MolecularInput)

Source code in bofire/data_models/features/molecular.py
class CategoricalMolecularInput(CategoricalInput, MolecularInput):
    type: Literal["CategoricalMolecularInput"] = "CategoricalMolecularInput"
    # order_id: ClassVar[int] = 7
    order_id: ClassVar[int] = 5

    @field_validator("categories")
    @classmethod
    def validate_smiles(cls, categories: Sequence[str]):
        """validates that categories are valid smiles. Note that this check can only
        be executed when rdkit is available.

        Args:
            categories (List[str]): List of smiles

        Raises:
            ValueError: when string is not a smiles

        Returns:
            List[str]: List of the smiles
        """
        # check on rdkit availability:
        try:
            smiles2mol(categories[0])
        except NameError:
            warnings.warn("rdkit not installed, categories cannot be validated.")
            return categories

        for cat in categories:
            smiles2mol(cat)
        return categories

    @staticmethod
    def valid_transform_types() -> List[Union[AnyMolFeatures, CategoricalEncodingEnum]]:
        return CategoricalInput.valid_transform_types() + [
            Fingerprints,
            FingerprintsFragments,
            Fragments,
            MordredDescriptors,  # type: ignore
        ]

    def get_bounds(
        self,
        transform_type: Union[CategoricalEncodingEnum, AnyMolFeatures],
        values: Optional[pd.Series] = None,
        reference_value: Optional[str] = None,
    ) -> Tuple[List[float], List[float]]:
        if isinstance(transform_type, CategoricalEncodingEnum):
            # we are just using the standard categorical transformations
            return super().get_bounds(
                transform_type=transform_type,
                values=values,
                reference_value=reference_value,
            )
        else:
            # in case that values is None, we return the optimization bounds
            # else we return the complete bounds
            data = self.to_descriptor_encoding(
                transform_type=transform_type,
                values=(
                    pd.Series(self.get_allowed_categories())
                    if values is None
                    else pd.Series(self.categories)
                ),
            )
        lower = data.min(axis=0).values.tolist()
        upper = data.max(axis=0).values.tolist()
        return lower, upper

    def from_descriptor_encoding(
        self, transform_type: AnyMolFeatures, values: pd.DataFrame
    ) -> pd.Series:
        """Converts values back from descriptor encoding.

        Args:
            values (pd.DataFrame): Descriptor encoded dataframe.

        Raises:
            ValueError: If descriptor columns not found in the dataframe.

        Returns:
            pd.Series: Series with categorical values.
        """

        # This method is modified based on the categorical descriptor feature
        # TODO: move it to more central place
        cat_cols = [
            get_encoded_name(self.key, d) for d in transform_type.get_descriptor_names()
        ]
        # We explicitly allow the dataframe to have more columns than needed,
        # to keep the backtransform simple.
        if np.any([c not in values.columns for c in cat_cols]):
            raise ValueError(
                f"{self.key}: Column names don't match categorical levels: {values.columns}, {cat_cols}."
            )
        s = pd.DataFrame(
            data=np.sqrt(
                np.sum(
                    (
                        values[cat_cols].to_numpy()[:, np.newaxis, :]
                        - self.to_descriptor_encoding(
                            transform_type=transform_type,
                            values=pd.Series(self.get_allowed_categories()),
                        ).to_numpy()
                    )
                    ** 2,
                    axis=2,
                )
            ),
            columns=self.get_allowed_categories(),
            index=values.index,
        ).idxmin(1)
        s.name = self.key
        return s

from_descriptor_encoding(self, transform_type, values)

Converts values back from descriptor encoding.

Parameters:

Name Type Description Default
values pd.DataFrame

Descriptor encoded dataframe.

required

Exceptions:

Type Description
ValueError

If descriptor columns not found in the dataframe.

Returns:

Type Description
pd.Series

Series with categorical values.

Source code in bofire/data_models/features/molecular.py
def from_descriptor_encoding(
    self, transform_type: AnyMolFeatures, values: pd.DataFrame
) -> pd.Series:
    """Converts values back from descriptor encoding.

    Args:
        values (pd.DataFrame): Descriptor encoded dataframe.

    Raises:
        ValueError: If descriptor columns not found in the dataframe.

    Returns:
        pd.Series: Series with categorical values.
    """

    # This method is modified based on the categorical descriptor feature
    # TODO: move it to more central place
    cat_cols = [
        get_encoded_name(self.key, d) for d in transform_type.get_descriptor_names()
    ]
    # We explicitly allow the dataframe to have more columns than needed,
    # to keep the backtransform simple.
    if np.any([c not in values.columns for c in cat_cols]):
        raise ValueError(
            f"{self.key}: Column names don't match categorical levels: {values.columns}, {cat_cols}."
        )
    s = pd.DataFrame(
        data=np.sqrt(
            np.sum(
                (
                    values[cat_cols].to_numpy()[:, np.newaxis, :]
                    - self.to_descriptor_encoding(
                        transform_type=transform_type,
                        values=pd.Series(self.get_allowed_categories()),
                    ).to_numpy()
                )
                ** 2,
                axis=2,
            )
        ),
        columns=self.get_allowed_categories(),
        index=values.index,
    ).idxmin(1)
    s.name = self.key
    return s

get_bounds(self, transform_type, values=None, reference_value=None)

Calculates the lower and upper bounds for the feature based on the given transform type and values.

Parameters:

Name Type Description Default
transform_type Union[CategoricalEncodingEnum, AnyMolFeatures]

The type of transformation to apply to the data.

required
values Optional[pd.Series]

The actual data over which the lower and upper bounds are calculated.

None
reference_value Optional[str]

The reference value for the transformation. Not used here. Defaults to None.

None

Returns:

Type Description
Tuple[List[float], List[float]]

A tuple containing the lower and upper bounds of the transformed data.

Exceptions:

Type Description
NotImplementedError

Raised when values is None, as it is currently required for MolecularInput.

Source code in bofire/data_models/features/molecular.py
def get_bounds(
    self,
    transform_type: Union[CategoricalEncodingEnum, AnyMolFeatures],
    values: Optional[pd.Series] = None,
    reference_value: Optional[str] = None,
) -> Tuple[List[float], List[float]]:
    if isinstance(transform_type, CategoricalEncodingEnum):
        # we are just using the standard categorical transformations
        return super().get_bounds(
            transform_type=transform_type,
            values=values,
            reference_value=reference_value,
        )
    else:
        # in case that values is None, we return the optimization bounds
        # else we return the complete bounds
        data = self.to_descriptor_encoding(
            transform_type=transform_type,
            values=(
                pd.Series(self.get_allowed_categories())
                if values is None
                else pd.Series(self.categories)
            ),
        )
    lower = data.min(axis=0).values.tolist()
    upper = data.max(axis=0).values.tolist()
    return lower, upper

validate_smiles(categories) classmethod

validates that categories are valid smiles. Note that this check can only be executed when rdkit is available.

Parameters:

Name Type Description Default
categories List[str]

List of smiles

required

Exceptions:

Type Description
ValueError

when string is not a smiles

Returns:

Type Description
List[str]

List of the smiles

Source code in bofire/data_models/features/molecular.py
@field_validator("categories")
@classmethod
def validate_smiles(cls, categories: Sequence[str]):
    """validates that categories are valid smiles. Note that this check can only
    be executed when rdkit is available.

    Args:
        categories (List[str]): List of smiles

    Raises:
        ValueError: when string is not a smiles

    Returns:
        List[str]: List of the smiles
    """
    # check on rdkit availability:
    try:
        smiles2mol(categories[0])
    except NameError:
        warnings.warn("rdkit not installed, categories cannot be validated.")
        return categories

    for cat in categories:
        smiles2mol(cat)
    return categories

MolecularInput (Input)

Source code in bofire/data_models/features/molecular.py
class MolecularInput(Input):
    type: Literal["MolecularInput"] = "MolecularInput"
    # order_id: ClassVar[int] = 6
    order_id: ClassVar[int] = 4

    @staticmethod
    def valid_transform_types() -> List[AnyMolFeatures]:
        return [Fingerprints, FingerprintsFragments, Fragments, MordredDescriptors]  # type: ignore

    def validate_experimental(
        self, values: pd.Series, strict: bool = False
    ) -> pd.Series:
        values = values.map(str)
        for smi in values:
            smiles2mol(smi)

        return values

    def validate_candidental(self, values: pd.Series) -> pd.Series:
        values = values.map(str)
        for smi in values:
            smiles2mol(smi)
        return values

    def is_fixed(self) -> bool:
        return False

    def fixed_value(self, transform_type: Optional[AnyMolFeatures] = None) -> None:
        return None

    def sample(self, n: int, seed: Optional[int] = None) -> pd.Series:
        raise ValueError("Sampling not supported for `MolecularInput`")

    def get_bounds(
        self,
        transform_type: AnyMolFeatures,
        values: pd.Series,
        reference_value: Optional[str] = None,
    ) -> Tuple[List[float], List[float]]:
        """
        Calculates the lower and upper bounds for the feature based on the given transform type and values.

        Args:
            transform_type (AnyMolFeatures): The type of transformation to apply to the data.
            values (pd.Series): The actual data over which the lower and upper bounds are calculated.
            reference_value (Optional[str], optional): The reference value for the transformation. Not used here.
                Defaults to None.

        Returns:
            Tuple[List[float], List[float]]: A tuple containing the lower and upper bounds of the transformed data.

        Raises:
            NotImplementedError: Raised when `values` is None, as it is currently required for `MolecularInput`.
        """
        if values is None:
            raise NotImplementedError(
                "`values` is currently required for `MolecularInput`"
            )
        else:
            data = self.to_descriptor_encoding(transform_type, values)

        lower = data.min(axis=0).values.tolist()
        upper = data.max(axis=0).values.tolist()

        return lower, upper

    def to_descriptor_encoding(
        self, transform_type: AnyMolFeatures, values: pd.Series
    ) -> pd.DataFrame:
        """Converts values to descriptor encoding.

        Args:
            values (pd.Series): Values to transform.

        Returns:
            pd.DataFrame: Descriptor encoded dataframe.
        """
        descriptor_values = transform_type.get_descriptor_values(values)

        descriptor_values.columns = [
            get_encoded_name(self.key, d) for d in transform_type.get_descriptor_names()
        ]
        descriptor_values.index = values.index

        return descriptor_values
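
Example (a minimal sketch; it assumes an rdkit installation and that Fingerprints can be imported from bofire.data_models.molfeatures.api):

import pandas as pd
from bofire.data_models.features.molecular import MolecularInput
from bofire.data_models.molfeatures.api import Fingerprints  # assumed import path

feat = MolecularInput(key="solvent")
smiles = pd.Series(["CO", "CCO", "CCCO"])
encoded = feat.to_descriptor_encoding(Fingerprints(n_bits=32), smiles)
# one row per molecule; columns are named via get_encoded_name(key, descriptor),
# with the descriptor names supplied by the molfeatures object
lower, upper = feat.get_bounds(Fingerprints(n_bits=32), values=smiles)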

fixed_value(self, transform_type=None)

Method to return the fixed value in case of a fixed feature.

Returns:

Type Description
Union[None, List[str], List[float]]

None in case the feature is not fixed, else the fixed value.

Source code in bofire/data_models/features/molecular.py
def fixed_value(self, transform_type: Optional[AnyMolFeatures] = None) -> None:
    return None

get_bounds(self, transform_type, values, reference_value=None)

Calculates the lower and upper bounds for the feature based on the given transform type and values.

Parameters:

Name Type Description Default
transform_type AnyMolFeatures

The type of transformation to apply to the data.

required
values pd.Series

The actual data over which the lower and upper bounds are calculated.

required
reference_value Optional[str]

The reference value for the transformation. Not used here. Defaults to None.

None

Returns:

Type Description
Tuple[List[float], List[float]]

A tuple containing the lower and upper bounds of the transformed data.

Exceptions:

Type Description
NotImplementedError

Raised when values is None, as it is currently required for MolecularInput.

Source code in bofire/data_models/features/molecular.py
def get_bounds(
    self,
    transform_type: AnyMolFeatures,
    values: pd.Series,
    reference_value: Optional[str] = None,
) -> Tuple[List[float], List[float]]:
    """
    Calculates the lower and upper bounds for the feature based on the given transform type and values.

    Args:
        transform_type (AnyMolFeatures): The type of transformation to apply to the data.
        values (pd.Series): The actual data over which the lower and upper bounds are calculated.
        reference_value (Optional[str], optional): The reference value for the transformation. Not used here.
            Defaults to None.

    Returns:
        Tuple[List[float], List[float]]: A tuple containing the lower and upper bounds of the transformed data.

    Raises:
        NotImplementedError: Raised when `values` is None, as it is currently required for `MolecularInput`.
    """
    if values is None:
        raise NotImplementedError(
            "`values` is currently required for `MolecularInput`"
        )
    else:
        data = self.to_descriptor_encoding(transform_type, values)

    lower = data.min(axis=0).values.tolist()
    upper = data.max(axis=0).values.tolist()

    return lower, upper

is_fixed(self)

Indicates if a variable is set to a fixed value.

Returns:

Type Description
bool

True if fixed, else False.

Source code in bofire/data_models/features/molecular.py
def is_fixed(self) -> bool:
    return False

sample(self, n, seed=None)

Sample a series of allowed values.

Parameters:

Name Type Description Default
n int

Number of samples

required

Returns:

Type Description
pd.Series

Sampled values.

Source code in bofire/data_models/features/molecular.py
def sample(self, n: int, seed: Optional[int] = None) -> pd.Series:
    raise ValueError("Sampling not supported for `MolecularInput`")

to_descriptor_encoding(self, transform_type, values)

Converts values to descriptor encoding.

Parameters:

Name Type Description Default
values pd.Series

Values to transform.

required

Returns:

Type Description
pd.DataFrame

Descriptor encoded dataframe.

Source code in bofire/data_models/features/molecular.py
def to_descriptor_encoding(
    self, transform_type: AnyMolFeatures, values: pd.Series
) -> pd.DataFrame:
    """Converts values to descriptor encoding.

    Args:
        values (pd.Series): Values to transform.

    Returns:
        pd.DataFrame: Descriptor encoded dataframe.
    """
    descriptor_values = transform_type.get_descriptor_values(values)

    descriptor_values.columns = [
        get_encoded_name(self.key, d) for d in transform_type.get_descriptor_names()
    ]
    descriptor_values.index = values.index

    return descriptor_values

validate_candidental(self, values)

Abstract method to validate the suggested candidates

Parameters:

Name Type Description Default
values pd.Series

A dataFrame with candidates

required

Returns:

Type Description
pd.Series

The passed dataFrame with candidates

Source code in bofire/data_models/features/molecular.py
def validate_candidental(self, values: pd.Series) -> pd.Series:
    values = values.map(str)
    for smi in values:
        smiles2mol(smi)
    return values

validate_experimental(self, values, strict=False)

Abstract method to validate the experimental dataFrame

Parameters:

Name Type Description Default
values pd.Series

A dataFrame with experiments

required
strict bool

Boolean to distinguish if the occurrence of fixed features in the dataset should be considered or not. Defaults to False.

False

Returns:

Type Description
pd.Series

The passed dataFrame with experiments

Source code in bofire/data_models/features/molecular.py
def validate_experimental(
    self, values: pd.Series, strict: bool = False
) -> pd.Series:
    values = values.map(str)
    for smi in values:
        smiles2mol(smi)

    return values

numerical

NumericalInput (Input)

Abstract base class for all numerical (ordinal) input features.

Source code in bofire/data_models/features/numerical.py
class NumericalInput(Input):
    """Abstract base class for all numerical (ordinal) input features."""

    unit: Optional[str] = None

    @staticmethod
    def valid_transform_types() -> List:
        return []

    def to_unit_range(
        self, values: Union[pd.Series, np.ndarray], use_real_bounds: bool = False
    ) -> Union[pd.Series, np.ndarray]:
        """Convert to the unit range between 0 and 1.

        Args:
            values (pd.Series): values to be transformed
            use_real_bounds (bool, optional): if True, use the bounds from the actual values else the bounds from the feature.
                Defaults to False.

        Raises:
            ValueError: If lower_bound == upper_bound an error is raised

        Returns:
            pd.Series: transformed values.
        """
        if use_real_bounds:
            lower, upper = self.get_bounds(transform_type=None, values=values)
            lower = lower[0]
            upper = upper[0]
        else:
            lower, upper = self.lower_bound, self.upper_bound  # type: ignore
        if lower == upper:
            raise ValueError("Fixed feature cannot be transformed to unit range.")
        valrange = upper - lower
        return (values - lower) / valrange

    def from_unit_range(
        self, values: Union[pd.Series, np.ndarray]
    ) -> Union[pd.Series, np.ndarray]:
        """Convert from unit range.

        Args:
            values (pd.Series): values to transform from.

        Raises:
            ValueError: if the feature is fixed raise a value error.

        Returns:
            pd.Series: the back-transformed values.
        """
        if self.is_fixed():
            raise ValueError("Fixed feature cannot be transformed from unit range.")
        valrange = self.upper_bound - self.lower_bound  # type: ignore
        return (values * valrange) + self.lower_bound  # type: ignore

    def is_fixed(self):
        """Method to check if the feature is fixed

        Returns:
            Boolean: True when the feature is fixed, false otherwise.
        """
        return self.lower_bound == self.upper_bound  # type: ignore

    def fixed_value(
        self, transform_type: Optional[TTransform] = None
    ) -> Union[None, List[float]]:
        """Method to get the value to which the feature is fixed

        Returns:
            Float: Return the feature value or None if the feature is not fixed.
        """
        assert transform_type is None
        if self.is_fixed():
            return [self.lower_bound]  # type: ignore
        else:
            return None

    def validate_experimental(self, values: pd.Series, strict=False) -> pd.Series:
        """Method to validate the experimental dataFrame

        Args:
            values (pd.Series): A dataFrame with experiments
            strict (bool, optional): Boolean to distinguish if the occurrence of fixed features in the dataset should be considered or not.
                Defaults to False.

        Raises:
            ValueError: when a value is not numerical
            ValueError: when there is no variation in a feature provided by the experimental data

        Returns:
            pd.Series: A dataFrame with experiments
        """
        try:
            values = pd.to_numeric(values, errors="raise").astype("float64")
        except ValueError:
            raise ValueError(
                f"not all values of input feature `{self.key}` are numerical"
            )
        values = values.astype("float64")
        if strict:
            lower, upper = self.get_bounds(transform_type=None, values=values)
            if lower == upper:
                raise ValueError(
                    f"No variation present or planned for feature {self.key}. Remove it."
                )
        return values

    def validate_candidental(self, values: pd.Series) -> pd.Series:
        """Validate the suggested candidates for the feature.

        Args:
            values (pd.Series): suggested candidates for the feature

        Raises:
            ValueError: Error is raised when one of the values is not numerical.

        Returns:
            pd.Series: the original provided candidates
        """
        try:
            values = pd.to_numeric(values, errors="raise").astype("float64")
        except ValueError:
            raise ValueError(
                f"not all values of input feature `{self.key}` are numerical"
            )
        return values

fixed_value(self, transform_type=None)

Method to get the value to which the feature is fixed

Returns:

Type Description
Float

Return the feature value or None if the feature is not fixed.

Source code in bofire/data_models/features/numerical.py
def fixed_value(
    self, transform_type: Optional[TTransform] = None
) -> Union[None, List[float]]:
    """Method to get the value to which the feature is fixed

    Returns:
        Float: Return the feature value or None if the feature is not fixed.
    """
    assert transform_type is None
    if self.is_fixed():
        return [self.lower_bound]  # type: ignore
    else:
        return None

from_unit_range(self, values)

Convert from unit range.

Parameters:

Name Type Description Default
values pd.Series

values to transform from.

required

Exceptions:

Type Description
ValueError

if the feature is fixed raise a value error.

Returns:

Type Description
pd.Series

the back-transformed values.

Source code in bofire/data_models/features/numerical.py
def from_unit_range(
    self, values: Union[pd.Series, np.ndarray]
) -> Union[pd.Series, np.ndarray]:
    """Convert from unit range.

    Args:
        values (pd.Series): values to transform from.

    Raises:
        ValueError: if the feature is fixed raise a value error.

    Returns:
        pd.Series: the back-transformed values.
    """
    if self.is_fixed():
        raise ValueError("Fixed feature cannot be transformed from unit range.")
    valrange = self.upper_bound - self.lower_bound  # type: ignore
    return (values * valrange) + self.lower_bound  # type: ignore

is_fixed(self)

Method to check if the feature is fixed

Returns:

Type Description
Boolean

True when the feature is fixed, false otherwise.

Source code in bofire/data_models/features/numerical.py
def is_fixed(self):
    """Method to check if the feature is fixed

    Returns:
        Boolean: True when the feature is fixed, false otherwise.
    """
    return self.lower_bound == self.upper_bound  # type: ignore

to_unit_range(self, values, use_real_bounds=False)

Convert to the unit range between 0 and 1.

Parameters:

Name Type Description Default
values pd.Series

values to be transformed

required
use_real_bounds bool

if True, use the bounds from the actual values else the bounds from the feature. Defaults to False.

False

Exceptions:

Type Description
ValueError

If lower_bound == upper_bound an error is raised

Returns:

Type Description
pd.Series

transformed values.

Source code in bofire/data_models/features/numerical.py
def to_unit_range(
    self, values: Union[pd.Series, np.ndarray], use_real_bounds: bool = False
) -> Union[pd.Series, np.ndarray]:
    """Convert to the unit range between 0 and 1.

    Args:
        values (pd.Series): values to be transformed
        use_real_bounds (bool, optional): if True, use the bounds from the actual values else the bounds from the feature.
            Defaults to False.

    Raises:
        ValueError: If lower_bound == upper_bound an error is raised

    Returns:
        pd.Series: transformed values.
    """
    if use_real_bounds:
        lower, upper = self.get_bounds(transform_type=None, values=values)
        lower = lower[0]
        upper = upper[0]
    else:
        lower, upper = self.lower_bound, self.upper_bound  # type: ignore
    if lower == upper:
        raise ValueError("Fixed feature cannot be transformed to unit range.")
    valrange = upper - lower
    return (values - lower) / valrange
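
Example (a minimal sketch; a DiscreteInput stands in for any NumericalInput subclass, since its bounds derive from the allowed values):

import pandas as pd
from bofire.data_models.features.discrete import DiscreteInput

x = DiscreteInput(key="x", values=[0.0, 5.0, 10.0])
scaled = x.to_unit_range(pd.Series([0.0, 5.0, 10.0]))  # 0.0, 0.5, 1.0
x.from_unit_range(scaled)  # back to 0.0, 5.0, 10.0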

validate_candidental(self, values)

Validate the suggested candidates for the feature.

Parameters:

Name Type Description Default
values pd.Series

suggested candidates for the feature

required

Exceptions:

Type Description
ValueError

Error is raised when one of the values is not numerical.

Returns:

Type Description
pd.Series

the original provided candidates

Source code in bofire/data_models/features/numerical.py
def validate_candidental(self, values: pd.Series) -> pd.Series:
    """Validate the suggested candidates for the feature.

    Args:
        values (pd.Series): suggested candidates for the feature

    Raises:
        ValueError: Error is raised when one of the values is not numerical.

    Returns:
        pd.Series: the original provided candidates
    """
    try:
        values = pd.to_numeric(values, errors="raise").astype("float64")
    except ValueError:
        raise ValueError(
            f"not all values of input feature `{self.key}` are numerical"
        )
    return values

validate_experimental(self, values, strict=False)

Method to validate the experimental dataFrame

Parameters:

Name Type Description Default
values pd.Series

A dataFrame with experiments

required
strict bool

Boolean to distinguish if the occurrence of fixed features in the dataset should be considered or not. Defaults to False.

False

Exceptions:

Type Description
ValueError

when a value is not numerical

ValueError

when there is no variation in a feature provided by the experimental data

Returns:

Type Description
pd.Series

A dataFrame with experiments

Source code in bofire/data_models/features/numerical.py
def validate_experimental(self, values: pd.Series, strict=False) -> pd.Series:
    """Method to validate the experimental dataFrame

    Args:
        values (pd.Series): A dataFrame with experiments
        strict (bool, optional): Boolean to distinguish if the occurrence of fixed features in the dataset should be considered or not.
            Defaults to False.

    Raises:
        ValueError: when a value is not numerical
        ValueError: when there is no variation in a feature provided by the experimental data

    Returns:
        pd.Series: A dataFrame with experiments
    """
    try:
        values = pd.to_numeric(values, errors="raise").astype("float64")
    except ValueError:
        raise ValueError(
            f"not all values of input feature `{self.key}` are numerical"
        )
    values = values.astype("float64")
    if strict:
        lower, upper = self.get_bounds(transform_type=None, values=values)
        if lower == upper:
            raise ValueError(
                f"No variation present or planned for feature {self.key}. Remove it."
            )
    return values
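
Example (a minimal sketch of the numeric coercion):

import pandas as pd
from bofire.data_models.features.discrete import DiscreteInput

x = DiscreteInput(key="x", values=[0.0, 1.0])
x.validate_experimental(pd.Series(["0", "1.0"]))  # coerced to float64
x.validate_experimental(pd.Series(["a", "b"]))
# raises ValueError: not all values of input feature `x` are numerical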