Source code for pidDataUtils

#!/usr/bin/env python3

##########################################################################
# basf2 (Belle II Analysis Software Framework)                           #
# Author: The Belle II Collaboration                                     #
#                                                                        #
# See git log for contributors and copyright holders.                    #
# This file is licensed under LGPL-3.0, see LICENSE.md.                  #
##########################################################################


import numpy as np
import pandas as pd
import h5py
import uproot


def _make_const_lists():
    """Moving this code into a function to avoid a top-level ROOT import."""
    import ROOT.Belle2

    PARTICLES, PDG_CODES = [], []
    for i in range(len(ROOT.Belle2.Const.chargedStableSet)):
        particle = ROOT.Belle2.Const.chargedStableSet.at(i)
        name = (particle.__repr__()[7:-1]
                .replace("-", "")
                .replace("+", "")
                .replace("euteron", ""))
        PARTICLES.append(name)
        PDG_CODES.append(particle.getPDGCode())
    # PARTICLES = ["e", "mu", "pi", "K", "p", "d"]
    # PDG_CODES = [11, 13, 211, 321, 2212, 1000010020]

    DETECTORS = []
    for det in ROOT.Belle2.Const.PIDDetectors.set():
        DETECTORS.append(ROOT.Belle2.Const.parseDetectors(det))
    # DETECTORS = ["SVD", "CDC", "TOP", "ARICH", "ECL", "KLM"]

    return PARTICLES, PDG_CODES, DETECTORS


# PARTICLES, PDG_CODES, DETECTORS = _make_const_lists()
PARTICLES = ["e", "mu", "pi", "K", "p", "d"]
PDG_CODES = [11, 13, 211, 321, 2212, 1000010020]
DETECTORS = ["SVD", "CDC", "TOP", "ARICH", "ECL", "KLM"]

P_BINS = np.array([0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5])
THETA_BINS = np.radians(np.array([17, 28, 40, 60, 77, 96, 115, 133, 150]))


def _column(particle, detector):
    """Default column names for detector log-likelihoods.

    Args:
        particle (str): particle name
        detector (str): detector name

    Returns:
        str: Corresponding column name.
    """
    return f"{detector}_{particle}"


def root_column(particle, detector):
    """Column names for detector log-likelihoods found in our ROOT datafiles.

    Args:
        particle (str): particle name
        detector (str): detector name

    Returns:
        str: Corresponding column name.
    """
    pdg = PDG_CODES[PARTICLES.index(particle)]
    return f"pidLogLikelyhoodOf{pdg}From{detector}"


def read_root(root_filenames):
    """Reads one or several ROOT datafiles into a DataFrame.

    Args:
        root_filenames (list(str) or str): If only one filename, can be given
            as a string. If more than one, should be given as a list or tuple.

    Returns:
        pandas.DataFrame: DataFrame containing the data of the ROOT
            datafile(s).
    """
    return uproot.concatenate(root_filenames, library='pd')
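
# Usage sketch (hypothetical filenames): read one or more ntuples into a
# single DataFrame.
#
#     df = read_root(["ntuple1.root", "ntuple2.root"])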


def make_h5(df, tags, out_filename, pdg=None, column=root_column):
    """Make an HDF5 file in our 'slim' format from the given DataFrame.

    Args:
        df (pandas.DataFrame): The DataFrame containing the data.
        tags (list(str) or str): The particle tags used as a prefix for
            desired columns. e.g. for kaons in a D* decay, this is 'DST_D0_K'.
            One or more can be given.
        out_filename (str): Output filename for the h5 file that will be
            written.
        pdg (int or None): The PDG code for the particles being extracted. If
            None, uses the values found in the 'mcPDG' column of the
            DataFrame. Defaults to None.
        column: A function which, given the particle and detector names,
            returns the column name for the corresponding detector
            log-likelihood. Defaults to root_column, which assumes column
            names are of the format f'pidLogLikelyhoodOf{pdg}From{detector}'.
    """
    if isinstance(tags, str):
        tags = [tags]

    def _concat(arrs):
        return np.concatenate(arrs) if len(arrs) > 1 else arrs[0]

    def _get_all(col):
        return _concat([df[f"{tag}_{col}"].values for tag in tags])

    with h5py.File(out_filename, "w") as f:
        if pdg is not None:
            pdg_values = np.ones(len(df) * len(tags)) * pdg
        else:
            pdg_values = np.abs(_get_all("mcPDG"))
        f.create_dataset("pdg", data=pdg_values)
        f.create_dataset("p", data=_get_all("p"))
        f.create_dataset("theta", data=np.arccos(_get_all("cosTheta")))
        f.create_dataset("phi", data=_get_all("phi"))

        for det in DETECTORS:
            for p in PARTICLES:
                data = _get_all(column(p, det))
                data[np.isnan(data)] = 0
                f.create_dataset(f"{det}/{p}", data=data)
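
# Usage sketch (the 'DST_D0_K' tag and filenames are hypothetical): extract
# kaon candidates from a D* ntuple into a slim HDF5 file, tagging them with
# the kaon PDG code.
#
#     df = read_root("dstar.root")
#     make_h5(df, "DST_D0_K", "kaons.h5", pdg=321)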


def merge_h5s(filenames, out_filename, pdgs=None):
    """Merge several HDF5 files in our 'slim' format together.

    Args:
        filenames (list(str)): Filenames of HDF5 files to be merged.
        out_filename (str): Output filename.
        pdgs (list(int)): The PDG tags for the particle types, one per
            filename, to overwrite the 'pdg' columns in those files when
            merging. If None, simply uses the 'pdg' columns from the files.
            Defaults to None.
    """
    fs = [h5py.File(fname, "r") for fname in filenames]
    m = h5py.File(out_filename, "w")

    keys = ["p", "theta", "phi"]
    keys += [f"{d}/{p}" for d in DETECTORS for p in PARTICLES]
    for key in keys:
        m.create_dataset(key, data=np.concatenate([f[key][()] for f in fs]))

    if pdgs is not None:
        # replace 'pdg' data with the given PDG tags
        m.create_dataset(
            "pdg",
            data=np.concatenate(
                [np.ones_like(f["pdg"][()]) * pdg for f, pdg in zip(fs, pdgs)]
            ),
        )
    else:
        m.create_dataset("pdg", data=np.concatenate([f["pdg"][()] for f in fs]))

    for f in fs:
        f.close()
    m.close()
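
# Usage sketch (hypothetical filenames): merge per-species slim files,
# overwriting the 'pdg' dataset of each with its known PDG code.
#
#     merge_h5s(["kaons.h5", "pions.h5"], "merged.h5", pdgs=[321, 211])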


def split_h5(
    filename,
    output_dir,
    train_size=0.8,
    val_size=0.1,
    test_size=0.1,
    shuffle=True,
    random_state=None,
):
    """Split the data in an HDF5 'slim' format file into train, validation,
    and test sets, stored in .npz files for ease of weight training.

    Args:
        filename (str): Filename of HDF5 input file.
        output_dir (str): Name of output directory, in which the train,
            validation, and test sets will be written. Will be created if it
            does not already exist.
        train_size (float): Fraction of the dataset to use for training.
            Defaults to 0.8.
        val_size (float): Fraction of the dataset to use for validation.
            Defaults to 0.1.
        test_size (float): Fraction of the dataset to use for testing.
            Defaults to 0.1.
        shuffle (bool): Whether to shuffle the dataset before splitting.
            Defaults to True.
        random_state (int or None): Random state for the shuffling.
            Defaults to None.
    """
    from sklearn.model_selection import train_test_split
    from os.path import join
    from os import makedirs

    assert train_size > 0, f"train_size ({train_size}) must be positive"
    assert val_size >= 0, f"val_size ({val_size}) may not be negative"
    assert test_size >= 0, f"test_size ({test_size}) may not be negative"
    assert val_size + test_size != 0, "val_size and test_size cannot both be zero"

    if val_size == 0:
        val_size = test_size
        test_size = 0

    if train_size + val_size + test_size != 1:
        total = train_size + val_size + test_size
        train_size = train_size / total
        val_size = val_size / total
        test_size = test_size / total

    # read data
    with h5py.File(filename, "r") as f:
        data = np.stack(
            [f[det][p][()] for p in PARTICLES for det in DETECTORS], axis=-1
        )
        p_data = f["p"][()]
        theta_data = f["theta"][()]
        labels = np.abs(f["pdg"][()])
        for i, p in enumerate(PDG_CODES):
            labels[labels == p] = i
        mask = labels < 6

    X = data[mask]
    y = labels[mask]
    p = p_data[mask]
    t = theta_data[mask]

    makedirs(output_dir, exist_ok=True)
    kw = dict(shuffle=shuffle, random_state=random_state)

    # split once
    (X_0, X, y_0, y, p_0, p, t_0, t) = train_test_split(
        X, y, p, t, train_size=train_size, **kw
    )
    np.savez(join(output_dir, "train.npz"), X=X_0, y=y_0, p=p_0, theta=t_0)

    # split again if desired
    if test_size != 0:
        size = val_size / (1 - train_size)
        (X_1, X_2, y_1, y_2, p_1, p_2, t_1, t_2) = train_test_split(
            X, y, p, t, train_size=size, **kw
        )
        np.savez(join(output_dir, "val.npz"), X=X_1, y=y_1, p=p_1, theta=t_1)
        np.savez(join(output_dir, "test.npz"), X=X_2, y=y_2, p=p_2, theta=t_2)
    else:
        np.savez(join(output_dir, "val.npz"), X=X, y=y, p=p, theta=t)
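
# Usage sketch (hypothetical paths): write train.npz, val.npz, and test.npz
# with the default 80/10/10 split, using a fixed seed for reproducibility.
#
#     split_h5("merged.h5", "data/slim_dstar", random_state=42)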


def softmax(x):
    """Performs softmax calculation with corrections to help prevent overflow.

    Note:
        This is the calculation used to convert log-likelihoods to likelihood
        ratios. Implementation following
        https://stackoverflow.com/a/67112412/18837571

    Args:
        x (:func:`numpy.array`): Data to be softmaxed. Softmax is calculated
            over the last dimension.

    Returns:
        :func:`numpy.array`: Softmaxed data.
    """
    maxes = np.amax(x, axis=-1, keepdims=True)
    x_exp = np.exp(x - maxes)
    return x_exp / np.sum(x_exp, axis=-1, keepdims=True)
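
# Worked example (values chosen for illustration): for summed log-likelihoods
# [-5.0, -5.0, -2.0], the max-subtracted softmax gives likelihood ratios of
# approximately [0.0453, 0.0453, 0.9094], which sum to 1.
#
#     lrs = softmax(np.array([-5.0, -5.0, -2.0]))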


def make_labels(df):
    """Make a 'labels' column for the DataFrame. The 'labels' column contains
    the particle type labels for each event: 0 for electron, 1 for muon, and
    so on.

    Args:
        df (pandas.DataFrame): DataFrame that 'labels' column will be added
            to. Must not have NaN values in the 'pdg' column.
    """
    labels = np.abs(df["pdg"].values)
    if np.count_nonzero(~np.isfinite(labels)):
        print(
            'Warning: dataset contains NaN values in the "pdg" column. '
            'This means the "labels" column cannot be made, so most of the '
            "pidplots methods will fail."
        )
        labels = np.ones_like(labels) * np.nan
    else:
        for i, p in enumerate(PDG_CODES):
            labels[labels == p] = i
        labels[labels >= 6] = -1
    df["labels"] = labels


def make_bins(df, p_bins=P_BINS, theta_bins=THETA_BINS):
    """Make 'p_bin' and 'theta_bin' columns in the given DataFrame.

    Args:
        df (pandas.DataFrame): The DataFrame to add bins columns to.
        p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
            Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
        theta_bins (:func:`numpy.array`): The edges of the theta bins in
            radians. Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115,
            133, 150] degrees.
    """
    df["p_bin"] = np.digitize(df["p"].values, p_bins) - 1
    df["theta_bin"] = np.digitize(df["theta"].values, theta_bins) - 1
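
# Binning convention sketch: np.digitize returns 0 for values below the first
# edge, so subtracting 1 maps underflow to -1, the first bin to 0, and
# overflow to len(edges) - 1. With the default P_BINS (seven bins, 0 to 6):
#
#     np.digitize(np.array([0.3, 0.7, 5.0]), P_BINS) - 1   # -> [-1, 0, 7]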


def make_lrs(df, column=_column):
    """Makes likelihood ratio columns for each of the six particle types in
    the given DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        column: A function which given the particle and detector names
            returns the corresponding detector log-likelihood column name.
            Defaults to _column, which gives column names of the format
            f"{detector}_{particle}".
    """
    # hypothesis log-likelihoods
    h_logls = np.stack(
        [
            np.sum(df[[column(p, det) for det in DETECTORS]].values, -1)
            for p in PARTICLES
        ],
        -1,
    )

    # compute likelihood ratios
    lrs = softmax(h_logls)
    for i, p in enumerate(PARTICLES):
        df[f"lr_{p}"] = lrs[:, i]


def make_binary_lrs(df, column=_column):
    """Makes binary likelihood ratio columns for each of the five non-pion
    particle type hypotheses in the given DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        column: A function which given the particle and detector names
            returns the corresponding detector log-likelihood column name.
            Defaults to _column, which gives column names of the format
            f"{detector}_{particle}".
    """
    for h in PARTICLES:
        if h == "pi":
            continue

        h_logls = np.stack(
            [
                np.sum(df[[column(p, det) for det in DETECTORS]].values, -1)
                for p in [h, "pi"]
            ],
            -1,
        )
        lrs = softmax(h_logls)
        df[f"binary_lr_{h}"] = lrs[:, 0]


def make_pid(df):
    """Makes a 'pid' column in the given DataFrame. The 'pid' column is the
    predicted particle type. Requires likelihood ratio columns to exist.

    Args:
        df (pandas.DataFrame): DataFrame to which the 'pid' column will be
            added.
    """
    lrs = np.stack([df[f"lr_{p}"].values for p in PARTICLES], axis=-1)
    pids = np.argmax(lrs, axis=-1)
    df["pid"] = pids


def compute_det_lrs(d, det, column=_column):
    """Computes single-detector likelihood ratios from the given DataFrame.

    Args:
        d (pandas.DataFrame): DataFrame containing the detector
            log-likelihoods.
        det (str): The name of the detector for which the single-detector
            likelihood ratios will be calculated.
        column: A function which given the particle and detector names
            returns the corresponding detector log-likelihood column name.
            Defaults to _column, which gives column names of the format
            f"{detector}_{particle}".

    Returns:
        :func:`numpy.array`: The detector likelihood ratios.
    """
    h_logls = d[[column(p, det) for p in PARTICLES]].values
    lrs = softmax(h_logls)
    return lrs


def make_pid_det(df, column=_column):
    """Makes single-detector PID columns for each of the detectors in the
    given DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        column: A function which given the particle and detector names
            returns the corresponding detector log-likelihood column name.
            Defaults to _column, which gives column names of the format
            f"{detector}_{particle}".
    """
    for det in DETECTORS:
        mask = df[column("e", det)] == 0  # TODO: make more robust
        lrs = compute_det_lrs(df, det, column=column)
        pids = np.argmax(lrs, axis=-1)
        pids[mask] = -1
        df[f"pid_{det}"] = pids


def compute_abl_lrs(d, det, column=_column):
    """Computes ablation likelihood ratios from the given DataFrame.

    Args:
        d (pandas.DataFrame): DataFrame containing the detector
            log-likelihoods.
        det (str): The name of the detector to be omitted for the ablation.
        column: A function which given the particle and detector names
            returns the corresponding detector log-likelihood column name.
            Defaults to _column, which gives column names of the format
            f"{detector}_{particle}".

    Returns:
        :func:`numpy.array`: The ablation likelihood ratios.
    """

    def _cols(p):
        others = [det2 for det2 in DETECTORS if det2 != det]
        return [column(p, det2) for det2 in others]

    h_logls = np.stack([np.sum(d[_cols(p)].values, -1) for p in PARTICLES], -1)
    lrs = softmax(h_logls)
    return lrs


def make_pid_abl(df, column=_column):
    """Makes ablation PID columns for each of the detectors in the given
    DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        column: A function which given the particle and detector names
            returns the corresponding detector log-likelihood column name.
            Defaults to _column, which gives column names of the format
            f"{detector}_{particle}".
    """
    for det in DETECTORS:
        lrs = compute_abl_lrs(df, det, column=column)
        pids = np.argmax(lrs, axis=-1)
        df[f"pid_no_{det}"] = pids


def compute_contrib(d, corr=True):
    """Computes the detector contributions.

    Args:
        d (pandas.DataFrame): DataFrame containing the likelihood ratio data.
        corr (bool): Whether to compute contribution to the likelihood ratio
            of the _correct_ hypothesis (True) or the _chosen_ hypothesis
            (False). Defaults to True.

    Returns:
        dict[str, :func:`numpy.array`]: The contributions of each detector.
    """
    out = dict()
    for det in DETECTORS:
        reg_lrs = d[[f"lr_{p}" for p in PARTICLES]].values
        abl_lrs = compute_abl_lrs(d, det)
        idx = d["labels" if corr else "pid"].values.astype(int)
        reg_lr = reg_lrs[np.arange(len(idx)), idx]
        abl_lr = abl_lrs[np.arange(len(idx)), idx]
        ctrb = reg_lr - abl_lr
        out[det] = ctrb
    return out


def make_contrib(df, corr=True):
    """Makes columns for the detector contributions in the given DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        corr (bool): Whether to compute contribution to the likelihood ratio
            of the _correct_ hypothesis (True) or the _chosen_ hypothesis
            (False). Defaults to True.
    """
    ctrbs = compute_contrib(df, corr=corr)
    for det, ctrb in ctrbs.items():
        df[f"contrib_{det}"] = ctrb


def make_columns(
    df,
    p_bins=P_BINS,
    theta_bins=THETA_BINS,
    contrib_corr=True,
    column=_column,
):
    """Makes all the additional columns for a given DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the columns will be added.
        p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
            Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
        theta_bins (:func:`numpy.array`): The edges of the theta bins in
            radians. Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115,
            133, 150] degrees.
        contrib_corr (bool): Whether to compute contribution to the likelihood
            ratio of the _correct_ hypothesis (True) or the _chosen_
            hypothesis (False). Defaults to True.
        column: A function which given the particle and detector names
            returns the corresponding detector log-likelihood column name.
            Defaults to _column, which gives column names of the format
            f"{detector}_{particle}".
    """
    make_labels(df)
    make_bins(df, p_bins=p_bins, theta_bins=theta_bins)
    make_lrs(df, column=column)
    make_binary_lrs(df, column=column)
    make_pid(df)
    make_pid_det(df, column=column)
    make_pid_abl(df)
    make_contrib(df, corr=contrib_corr)


def apply_weights(df, weights, p_bins=P_BINS, theta_bins=THETA_BINS, column=_column):
    """Applies the given weights to the log-likelihood data in the DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame to which the weights are applied.
        weights (dict[tuple(int), :func:`numpy.array`] or :func:`numpy.array`):
            The calibration weight values. If a dict, keys should be a tuple
            of ints, and each value is the six-by-six array of weights for the
            bin. If a single np.array, should be a six-by-six array of weights
            to be applied globally.
        p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
            Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
        theta_bins (:func:`numpy.array`): The edges of the theta bins in
            radians. Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115,
            133, 150] degrees.
        column: A function which given the particle and detector names
            returns the corresponding detector log-likelihood column name.
            Defaults to _column, which gives column names of the format
            f"{detector}_{particle}".
    """
    if weights is None:
        return

    # per-bin weights
    if isinstance(weights, dict):
        for p in range(len(p_bins) - 1):
            p_lo, p_hi = p_bins[p], p_bins[p + 1]
            p_mask = (df["p"] >= p_lo) & (df["p"] <= p_hi)

            for theta in range(len(theta_bins) - 1):
                t_lo, t_hi = theta_bins[theta], theta_bins[theta + 1]
                t_mask = (df["theta"] >= t_lo) & (df["theta"] <= t_hi)

                for i, h in enumerate(PARTICLES):
                    for j, d in enumerate(DETECTORS):
                        df.loc[(p_mask & t_mask), column(h, d)] *= weights[p, theta][i, j]

    # global weights
    else:
        for i, h in enumerate(PARTICLES):
            for j, d in enumerate(DETECTORS):
                df[column(h, d)] *= weights[i, j]
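
# A minimal sketch of the global-weights path (hypothetical values): an
# all-ones six-by-six array leaves the log-likelihoods unchanged, while
# scaling row i reweights hypothesis PARTICLES[i] across every detector.
#
#     weights = np.ones((len(PARTICLES), len(DETECTORS)))
#     weights[3, :] = 1.1   # upweight kaon log-likelihoods in all detectors
#     apply_weights(df, weights)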


def cut_particles(df, allowed_particles, column=_column):
    """Cuts the log-likelihood data associated with given particle types.

    Args:
        df (pandas.DataFrame): DataFrame to which the cuts will be applied.
        allowed_particles (list(str)): List of allowed particle types. Any
            particle types not present will be cut, unless the list is empty
            (in which case no cuts are applied).
        column: A function which given the particle and detector names
            returns the corresponding detector log-likelihood column name.
            Defaults to _column, which gives column names of the format
            f"{detector}_{particle}".
    """
    if len(allowed_particles) == 0:
        return

    for p in PARTICLES:
        if p not in allowed_particles:
            for d in DETECTORS:
                df[column(p, d)] = -1e10


def read_h5(filename):
    """Read an HDF5 file in our 'slim' format into a DataFrame.

    Args:
        filename (str): Input filename.

    Returns:
        pandas.DataFrame: DataFrame containing data.
    """
    df = pd.DataFrame()
    with h5py.File(filename, "r") as f:
        for key in ["pdg", "p", "theta", "phi"]:
            df[key] = f[key][()]
        for key in [f"{d}/{p}" for d in DETECTORS for p in PARTICLES]:
            df_key = key.replace("/", "_")
            df[df_key] = f[key][()]
            df[df_key] = df[df_key].fillna(0)
    return df


def read_npz(filename):
    """Read an npz file in our training format into a DataFrame.

    Args:
        filename (str): Input filename.

    Returns:
        pandas.DataFrame: DataFrame containing data.
    """
    data = np.load(filename)
    df = pd.DataFrame(
        data=data["X"],
        columns=[f"{d}_{p}" for p in PARTICLES for d in DETECTORS],
    )
    df["labels"] = data["y"]
    df["p"] = data["p"]
    df["theta"] = data["theta"]

    df["pdg"] = df["labels"]
    for i, pdg in enumerate(PDG_CODES):
        df.loc[df["labels"] == i, "pdg"] = pdg

    return df
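
# Usage sketch (assumes split_h5 has already written the file below):
#
#     train_df = read_npz("data/slim_dstar/train.npz")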


def produce_analysis_df(
    df,
    compute_cols=True,
    drop_nans=True,
    drop_outside_bins=True,
    weights=None,
    allowed_particles=[],
    p_bins=P_BINS,
    theta_bins=THETA_BINS,
    column=None,
):
    """Prepares a DataFrame for PID analysis by applying weights, computing
    and adding additional columns, cutting NaNs, and more.

    Args:
        df (pandas.DataFrame): DataFrame to prepare for analysis.
        compute_cols (bool): Whether to compute and add additional columns.
            Defaults to True.
        drop_nans (bool): Whether to drop rows that contain NaNs. Defaults
            to True.
        drop_outside_bins (bool): Whether to drop rows for particles outside
            of the momentum and theta bins. Defaults to True.
        weights (:func:`numpy.array`): Calibration weights to be applied to
            the detector log-likelihoods. Defaults to None.
        allowed_particles (list(str)): If not empty, specifies the allowed
            particle types. Any not allowed particle types will be excluded
            from the PID calculations. If empty, all particle types are
            considered. Defaults to [].
        p_bins (:func:`numpy.array`): The edges of the momentum bins in GeV.
            Defaults to P_BINS, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5] GeV.
        theta_bins (:func:`numpy.array`): The edges of the theta bins in
            radians. Defaults to THETA_BINS, [17, 28, 40, 60, 77, 96, 115,
            133, 150] degrees.
        column: A function which given the particle and detector names
            returns the corresponding detector log-likelihood column name.
            Defaults to _column, which gives column names of the format
            f"{detector}_{particle}".

    Returns:
        pandas.DataFrame: The prepared DataFrame. (Not all modifications in
            this method are in-place.)
    """
    if column is not None:
        for p in PARTICLES:
            for d in DETECTORS:
                df[f"{d}_{p}"] = df[column(p, d)]

    apply_weights(df, weights, p_bins=p_bins, theta_bins=theta_bins)
    cut_particles(df, allowed_particles)

    if compute_cols:
        make_columns(
            df,
            p_bins=p_bins,
            theta_bins=theta_bins,
            contrib_corr=True,
        )
        if drop_outside_bins:
            df = df.loc[
                np.logical_and.reduce(
                    [
                        df["p_bin"].values >= 0,
                        df["p_bin"].values < len(p_bins) - 1,
                        df["theta_bin"].values >= 0,
                        df["theta_bin"].values < len(theta_bins) - 1,
                    ]
                )
            ]

    if drop_nans:
        df = df.dropna()
        df = df[df["labels"] >= 0]

    return df
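
# End-to-end sketch (hypothetical filename): load a slim HDF5 file and
# prepare it for analysis with no calibration weights applied.
#
#     df = read_h5("merged.h5")
#     df = produce_analysis_df(df)
#     print(df[["p_bin", "theta_bin", "labels", "pid"]].describe())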