Source code for sysvar.api

from __future__ import annotations

from typing import List, Iterable, Optional, Dict, Union, Any
from os import path
from warnings import warn
from pathlib import Path

import numpy as np
from pandas import DataFrame

import matplotlib.pyplot as plt

from sysvar.variations import Variator
from sysvar.corrections import create_correction_object
from sysvar.eigendecomposer import (
    EigenDecomposer,
    ExistingEigenVariationsSaver,
)
from sysvar.covariance_calculator import CovarianceCalculator
from sysvar.channel_template_handler import ChannelTemplateHandler

import logging

# Configure the root logger as a side effect of importing this module:
# records from INFO upwards are emitted, each prefixed with the level name,
# the calling function name and the line number.
# NOTE(review): this alters logging for the whole importing application;
# confirm that configuring the root logger from library code is intended.
logging.basicConfig(
    format="%(levelname)s : %(funcName)s: %(lineno)d :  %(message)s",
    level=logging.INFO,
)


def add_weights_to_dataframe(
    df: DataFrame,
    correction_source: str | Path | dict,
    MC_production: str | None = None,
    prefix: str | None = None,
    weightname: str | None = None,
    overwrite: bool = False,
    Nvar: int = 0,
):
    """
    Add correction weights (and optional toy variations) to a DataFrame in-place.

    A correction object is created from `correction_source` (and
    `MC_production`) via `create_correction_object`. Its central values are
    written into a new column of `df`: every row starts at 1.0 and each
    boolean query string produced by `correction.build_queries(prefix)` is
    evaluated with `df.eval()` to select the rows that receive the matching
    central value.

    If `Nvar > 0`, a `Variator` is built from the correction and one extra
    column per toy variation is filled in the same way.

    Parameters
    ----------
    df:
        The DataFrame to modify in-place. It must contain every column that
        the generated query strings refer to.
    correction_source:
        Defines how to construct the correction:

        - `Path`: treated as a path to a CSV/TSV correction table.
        - `str`: either a correction identifier (YAML-based, legacy) or a
          path-like string to a CSV.
        - `dict`: a fully specified custom correction configuration.

        The interpretation is delegated to `create_correction_object`.
    MC_production:
        MC production tag required for YAML-based corrections (legacy).
        Not used for CSV-based inputs.
    prefix:
        Optional prefix used when building the dependent-variable column
        names in the query strings (e.g. "trk" -> "trk_pt"). Passed through
        to `correction.build_queries(prefix)` and to the column-name builder.
    weightname:
        Base name of the weight column. The final column name is produced by
        `correction._build_column_name(prefix, weightname)`.
    overwrite:
        If True and the target weight column already exists, it is
        overwritten. If False and the column exists, a warning is logged and
        `df` is left untouched.
    Nvar:
        Number of toy-variation columns to add; must be non-negative.
        With 0 only the central weight column is written. With a positive
        value, columns named "{column_name}_var_{j}" for j in
        [0, Nvar - 1] are written as well.

    Returns
    -------
    None
        The DataFrame is modified in-place.

    Raises
    ------
    ValueError
        If `Nvar` is negative.
    Exception
        Anything raised by `create_correction_object`, `Variator`,
        `df.eval` or the correction internals is propagated unchanged.
    """
    # Validate before any correction object is built. Zero is a legal value
    # (central weights only), so the requirement is "non-negative".
    if Nvar < 0:
        raise ValueError(f"Nvar must be a non-negative integer, got {Nvar}")

    def _add_weights(df, correction, prefix, column_name, variator=None):
        # Rows that are matched by no query keep the neutral weight 1.0.
        df.loc[:, column_name] = 1.0
        if variator is not None:
            variation_columns = [
                f"{column_name}_var_{j}" for j in range(variator.Nvar)
            ]
            df.loc[:, variation_columns] = 1.0

        # NOTE: zip() stops at the shorter sequence, so a correction whose
        # queries and central values differ in length is truncated silently.
        for i, (v, q) in enumerate(
            zip(correction.central_values, correction.build_queries(prefix))
        ):
            mask = df.eval(q)
            df.loc[mask, column_name] = v
            if variator is not None:
                # presumably `variations` is laid out as (toy, bin), so
                # column i holds all toys of bin i -- TODO confirm.
                df.loc[mask, variation_columns] = variator.variations[:, i]

    correction = create_correction_object(
        correction_source=correction_source,
        MC_production=MC_production,
    )

    column_name = correction._build_column_name(prefix, weightname)

    # Early skip: do not construct the Variator and do not touch df.
    if column_name in df.columns and not overwrite:
        logging.warning(
            "%s exists but it will not be overwritten. Skipping. "
            "If you want to overwrite set overwrite=True.",
            column_name,
        )
        return

    # Only build the variator once it is certain that df will be written.
    variator = Variator(correction, Nvar) if Nvar > 0 else None

    if column_name in df.columns and overwrite:
        logging.info("%s exists and will be overwritten", column_name)
    else:
        logging.info("%s does not exist. Adding it to dataframe", column_name)

    _add_weights(df, correction, prefix, column_name, variator)
def eigendecompose(
    df: DataFrame,
    settings: dict[str, Any],
    systematic_source: str | Path | dict,
    title: str | None = None,
    cov_matrix_path: str | Path | None = None,
    criterion: str = "max_differences",
    prc: float = 1e-4,
    max_variations: int | None = None,
    save_variations: bool = False,
    save_channel_covariance_matrices: bool = False,
    verbose: bool = True,
    seed: int = 8311311,
):
    """
    Build an `EigenDecomposer`, run the decomposition and return it.

    Convenience wrapper that performs, in this order:

    1) construction of the `EigenDecomposer`,
    2) assignment of its `precision` and `max_variations` attributes,
    3) `vary_templates()`,
    4) `find_important_eigendimension_indices(criterion)`,
    5) optional persistence of the template variations and/or the
       per-channel covariance matrices.

    Parameters
    ----------
    df:
        Input DataFrame handed to the decomposer.
    settings:
        Configuration dictionary consumed by `EigenDecomposer` (channel
        definitions, output paths, variables to use, ...).
    systematic_source:
        Source of the underlying correction / systematic definition:
        an identifier string (YAML key; legacy), a `Path` or path-like
        string to a CSV file, or an in-memory configuration `dict`.
        Its interpretation is delegated to `EigenDecomposer`.
    title:
        Custom identifier for CSV-based corrections. When omitted it
        defaults to the CSV file stem (e.g. "track_eff" for
        ".../track_eff.csv"). The identifier has to match the name of the
        corresponding systematic in `settings`.
    cov_matrix_path:
        Optional path to an explicit covariance matrix to be used instead
        of one built from uncertainties. Forwarded to `EigenDecomposer`.
    criterion:
        Selection criterion passed to
        `EigenDecomposer.find_important_eigendimension_indices`.
    prc:
        Precision threshold; stored on the decomposer as `precision`.
    max_variations:
        Optional cap on the number of variations; stored on the decomposer
        as `max_variations`. `None` is stored as-is.
    save_variations:
        If True, `EigenDecomposer.save_template_variations()` is called
        (file I/O).
    save_channel_covariance_matrices:
        If True, `EigenDecomposer.save_channel_covariance_matrices()` is
        called (file I/O).
    verbose:
        Forwarded to `EigenDecomposer`.
    seed:
        Random seed forwarded to `EigenDecomposer`.

    Returns
    -------
    EigenDecomposer
        The decomposer instance after all steps above have run.
    """
    constructor_arguments = {
        "df": df,
        "settings": settings,
        "systematic_source": systematic_source,
        "title": title,
        "cov_matrix_path": cov_matrix_path,
        "verbose": verbose,
        "seed": seed,
    }
    decomposer = EigenDecomposer(**constructor_arguments)

    # Limits have to be in place before the templates are varied.
    decomposer.precision = prc
    decomposer.max_variations = max_variations

    decomposer.vary_templates()
    decomposer.find_important_eigendimension_indices(criterion)

    # Optional persistence steps; both write to disk.
    if save_variations:
        decomposer.save_template_variations()
    if save_channel_covariance_matrices:
        decomposer.save_channel_covariance_matrices()

    return decomposer
def save_nominal_templates(df: DataFrame, settings: Dict, data=None):
    """Save nominal templates for an MC dataset.

    A `ChannelTemplateHandler` is created from `df` and `settings` and asked
    to write the nominal templates (plus, optionally, observed data) to the
    output file configured in `settings`. Only ROOT (.root) output is
    currently supported, and the resulting file layout is compatible with
    cabinetry, so it can be used to build a pyhf model with systematic
    uncertainties.

    The output file is created or recreated, which overwrites any existing
    file at that location. Eigenvariation histograms stored earlier would be
    lost, so call this function before saving eigenvariations.

    Parameters:
        df (DataFrame): MC dataset from which the nominal templates are
            built.
        settings (Dict): Configuration dictionary holding the output
            filename and the channel, template, signal-extraction variable
            and binning definitions.
        data (optional): Experimental (observed) dataset to be histogrammed
            with the same channels, variables and binning. Forwarded to the
            handler as-is; with None no observed-data histograms are
            written.

    Returns:
        ChannelTemplateHandler: The handler that saved the templates; it can
        be kept for later use (e.g. to inspect the saved templates).
    """
    # A plain template handler is sufficient: no systematic effect involved.
    template_handler = ChannelTemplateHandler(df=df, settings=settings)
    template_handler.save_nominal_templates(data=data)
    return template_handler
def save_existing_eigenvariations(
    df: DataFrame, settings: Dict, syst_effect: str, verbose=True
):
    """Save eigenvariations that already exist in `df` for one systematic effect.

    Complements the nominal-template step: the nominal templates are expected
    to be present in the configured output file already. An
    `ExistingEigenVariationsSaver` reads the variations from `df` and writes
    the eigenvariation histograms into the same ROOT file structure expected
    by cabinetry/pyhf. For the histogram filling, the nominal weight of
    `syst_effect` is replaced by the variations found in `df`; how many
    variations are expected is taken from the systematics part of
    `settings`.

    Args:
        df (DataFrame): Dataset containing the variations.
        settings (Dict): Configuration used for saving the eigenvariations.
        syst_effect (str): Systematic effect whose variations are saved.
        verbose (bool, optional): Forwarded to the saver. Defaults to True.

    Returns:
        None
    """
    saver = ExistingEigenVariationsSaver(df, settings)
    # The effect is selected through an attribute, not a constructor argument.
    saver.syst_effect = syst_effect
    saver.save_existing_eigenvariations(verbose=verbose)
    return None
def calculate_covariance_matrix(
    df: DataFrame,
    settings: Dict,
    syst_effect: str | Dict,
    binning: Dict,
    channels: List,
    input_cov: np.ndarray | None = None,
    save_cov: bool = False,
):
    """
    Calculate the covariance matrix for a given dataset.

    A `CovarianceCalculator` is built from the arguments, its templates are
    varied, and the resulting covariance matrix is returned. Optionally the
    matrix is also saved by the calculator.

    Args:
        df (DataFrame): Input data from which the covariance matrix is
            calculated.
        settings (Dict): Configuration settings, same as for the
            `EigenDecomposer`.
        syst_effect (str | Dict): Systematic effect to consider. For
            systematics defined in YAML files the name is enough. For a
            custom systematic, a dictionary is expected, similar to the one
            used for a custom correction object in the eigendecomposition.
        binning (Dict): Binning information. Keys are variable names present
            in `df`, values are lists of bin edges.
        channels (List): Channels to consider for the covariance matrix.
        input_cov (np.ndarray | None, optional): Forwarded unchanged to
            `CovarianceCalculator`. Presumably an externally supplied
            covariance matrix used instead of a derived one; confirm against
            `CovarianceCalculator`. Defaults to None.
        save_cov (bool, optional): If True, `save_covariance()` is called on
            the calculator; the destination is read from `settings`.
            Defaults to False.

    Returns:
        The `cov` attribute of the `CovarianceCalculator` after the
        templates have been varied.
    """
    cc = CovarianceCalculator(df, settings, syst_effect, binning, channels, input_cov)
    cc.vary_templates()

    if save_cov:
        cc.save_covariance()

    return cc.cov
def plot_analysis_corr_matrix(
    eigendecomposer_obj: EigenDecomposer,
    save: bool = False,
    filename: Union[None, str] = None,
) -> tuple[plt.Figure, plt.Axes]:
    """
    Plot the correlation matrix of an eigendecomposition analysis.

    Thin wrapper around `eigendecomposer_obj.plot_corr_matrix`.

    Args:
        eigendecomposer_obj (EigenDecomposer): Decomposition object holding
            the correlation data.
        save (bool, optional): Forwarded to the plotting method; if True the
            plot is saved to file. Defaults to False.
        filename (str, optional): Output file name used when saving.
            Defaults to None.

    Returns:
        tuple[plt.Figure, plt.Axes]: The figure and axes produced by the
        decomposer.
    """
    figure, axes = eigendecomposer_obj.plot_corr_matrix(
        save=save,
        filename=filename,
    )
    return figure, axes
def plot_cov_diff(
    eigendecomposer_obj: EigenDecomposer,
    save: bool = False,
    filename: Union[None, str] = None,
) -> tuple[plt.Figure, plt.Axes]:
    """
    Plot the normalized covariance difference.

    Shows the difference between the original and the eigendecomposed
    covariance matrix for an initial truncation guess. Thin wrapper around
    `eigendecomposer_obj.plot_cov_diff`.

    Args:
        eigendecomposer_obj (EigenDecomposer): Decomposition object holding
            the covariance data.
        save (bool, optional): Forwarded to the plotting method; if True the
            plot is saved. Defaults to False.
        filename (str, optional): Output file name used when saving.
            Defaults to None.

    Returns:
        tuple[plt.Figure, plt.Axes]: The figure and axes produced by the
        decomposer.
    """
    figure, axes = eigendecomposer_obj.plot_cov_diff(
        save=save,
        filename=filename,
    )
    return figure, axes
def register_saving_info(eigendecomposer_obj: EigenDecomposer, saving_info: Dict):
    """
    Register saving information on the eigendecomposer object.

    Delegates to `eigendecomposer_obj.register_saving_info`.

    Args:
        eigendecomposer_obj (EigenDecomposer): Decomposition object to
            update.
        saving_info (Dict): Dictionary with the saving parameters; passed
            through unchanged.

    Returns:
        None
    """
    register = eigendecomposer_obj.register_saving_info
    register(saving_info)
    return None
def plot_up_and_down_variations(
    eigendecomposer_obj: EigenDecomposer,
    save: bool = False,
    filename: Union[None, str] = None,
) -> List[tuple[plt.Figure, plt.Axes]]:
    """
    Plot up/down variations for each template in the decomposition.

    For every entry of `eigendecomposer_obj.templates`, the decomposer's
    `saving_info` is registered on the template and the template's
    `plot_up_and_down_variations` is called with the template name as title.

    Args:
        eigendecomposer_obj (EigenDecomposer): Decomposition object holding
            the templates.
        save (bool, optional): Forwarded to each template plot; if True the
            plots are saved. Defaults to False.
        filename (str, optional): Output file name forwarded to each
            template plot. Defaults to None.

    Returns:
        List[tuple[plt.Figure, plt.Axes]]: One (figure, axes) pair per
        template, in the iteration order of the templates mapping.
    """

    def _plot_single(template_name, template):
        # saving_info is read per template, so an empty mapping never
        # touches it.
        template.register_saving_info(eigendecomposer_obj.saving_info)
        figure, axes = template.plot_up_and_down_variations(
            title=template_name,
            save=save,
            filename=filename,
        )
        return figure, axes

    return [
        _plot_single(template_name, template)
        for template_name, template in eigendecomposer_obj.templates.items()
    ]
def plot_templates_relative_variations_in_grid(
    eigendecomposer_obj: EigenDecomposer,
    save: bool = False,
    filename: Union[None, str] = None,
) -> List[tuple[plt.Figure, plt.Axes]]:
    """
    Plot relative template variations in a grid layout.

    For every entry of `eigendecomposer_obj.templates`, the decomposer's
    `saving_info` is registered on the template and the template's
    `plot_relative_variations_in_grid` is called with the template name as
    title.

    Args:
        eigendecomposer_obj (EigenDecomposer): Decomposition object holding
            the templates.
        save (bool, optional): Forwarded to each template plot; if True the
            plots are saved. Defaults to False.
        filename (str, optional): Output file name forwarded to each
            template plot. Defaults to None.

    Returns:
        List[tuple[plt.Figure, plt.Axes]]: One (figure, axes) pair per
        template, in the iteration order of the templates mapping.
    """

    def _plot_single(template_name, template):
        # saving_info is read per template, so an empty mapping never
        # touches it.
        template.register_saving_info(eigendecomposer_obj.saving_info)
        figure, axes = template.plot_relative_variations_in_grid(
            title=template_name,
            save=save,
            filename=filename,
        )
        return figure, axes

    return [
        _plot_single(template_name, template)
        for template_name, template in eigendecomposer_obj.templates.items()
    ]
def plot_correction_cov_and_corr(
    eigendecomposer_obj: EigenDecomposer,
    save: bool = False,
    filename: Union[None, str] = None,
) -> tuple[plt.Figure, plt.Axes]:
    """
    Plot correction covariance and correlation matrices.

    The decomposer's `saving_info` is registered on its `variator`, whose
    `plot_cov_and_corr` method then produces the plot.

    Args:
        eigendecomposer_obj (EigenDecomposer): Decomposition object holding
            the correction data.
        save (bool, optional): Forwarded to the plotting method; if True the
            plot is saved. Defaults to False.
        filename (str, optional): Output file name used when saving.
            Defaults to None.

    Returns:
        tuple[plt.Figure, plt.Axes]: The figure and axes produced by the
        variator.
    """
    variator = eigendecomposer_obj.variator
    variator.register_saving_info(eigendecomposer_obj.saving_info)

    plot_options = {"save": save, "filename": filename}
    figure, axes = variator.plot_cov_and_corr(**plot_options)
    return figure, axes
def plot_correction_variations_in_grid(
    eigendecomposer_obj: EigenDecomposer,
    nbins=21,
    save: bool = False,
    filename: Union[None, str] = None,
) -> tuple[plt.Figure, plt.Axes]:
    """
    Plot correction variations in a grid layout.

    The decomposer's `saving_info` is registered on its `variator`, whose
    `plot_relative_variations_in_grid` method then produces the plot.

    Args:
        eigendecomposer_obj (EigenDecomposer): Decomposition object holding
            the correction variations.
        nbins (int, optional): Number of bins forwarded to the grid plot.
            Defaults to 21.
        save (bool, optional): Forwarded to the plotting method; if True the
            plot is saved. Defaults to False.
        filename (str, optional): Output file name used when saving.
            Defaults to None.

    Returns:
        tuple[plt.Figure, plt.Axes]: The figure and axes produced by the
        variator.
    """
    variator = eigendecomposer_obj.variator
    variator.register_saving_info(eigendecomposer_obj.saving_info)

    plot_options = {"nbins": nbins, "save": save, "filename": filename}
    figure, axes = variator.plot_relative_variations_in_grid(**plot_options)
    return figure, axes
def plot_correction_errors(
    eigendecomposer_obj: EigenDecomposer,
    save: bool = False,
    filename: Union[None, str] = None,
) -> tuple[plt.Figure, plt.Axes]:
    """
    Plot correction error comparisons.

    The decomposer's `saving_info` is registered on its `correction`, whose
    `plot_error_comparison` method then produces the plot.

    Args:
        eigendecomposer_obj (EigenDecomposer): Decomposition object holding
            the correction information.
        save (bool, optional): Forwarded to the plotting method; if True the
            plot is saved. Defaults to False.
        filename (str, optional): Output file name used when saving.
            Defaults to None.

    Returns:
        tuple[plt.Figure, plt.Axes]: The figure and axes produced by the
        correction object.
    """
    correction = eigendecomposer_obj.correction
    correction.register_saving_info(eigendecomposer_obj.saving_info)

    plot_options = {"save": save, "filename": filename}
    figure, axes = correction.plot_error_comparison(**plot_options)
    return figure, axes