from __future__ import annotations
from typing import List, Iterable, Optional, Dict, Union
from os import path
import numpy as np
from pandas import DataFrame
import matplotlib.pyplot as plt
from sysvar.variations import Variator
from sysvar.corrections import create_correction_object
from sysvar.eigendecomposer import (
EigenDecomposer,
ExistingEigenVariationsSaver,
)
from sysvar.covariance_calculator import CovarianceCalculator
from sysvar.channel_template_handler import ChannelTemplateHandler
import logging
logging.basicConfig(
format="%(levelname)s : %(funcName)s: %(lineno)d : %(message)s",
level=logging.INFO,
)
[docs]
def add_weights_to_dataframe(
df: DataFrame,
systematic: str,
MC_production: str,
prefix: str,
weightname: str,
overwrite: bool = False,
Nvar: int = 0,
):
"""
Add weight columns to a DataFrame.
This function augments `df` in-place by adding a central weight column whose
name is constructed from `prefix` and `weightname`, and filling it using
the configured correction for the given `systematic` and `MC_production`.
If `Nvar > 0`, it also adds `Nvar` variation columns (named
"{column_name}_var_{j}") and fills them from variations (toys) of the
central values of the corrections. A dedicated `Variator` object is used
internally to generate the toys.
Parameters:
df (DataFrame): pandas DataFrame to be augmented (modified in-place).
systematic (str): Name of the systematic/correction to apply.
MC_production (str): MC production tag used to locate the correction.
prefix (str): Prefix used when building the weight column name.
weightname (str): Base name of the weight column to add.
overwrite (bool, optional): If True, overwrite an existing column with
the same name. Defaults to False.
Nvar (int, optional): Number of variation columns to add. Must be a
non-negative integer. If 0, only the central value column is added.
Defaults to 0.
Returns:
None: The DataFrame `df` is modified in-place.
Raises:
ValueError: If `Nvar` is negative.
"""
if Nvar < 0:
raise ValueError("Nvar must be a positive integer")
def _add_weights(df, correction, prefix, column_name, variator=None):
df.loc[:, column_name] = 1.0
if variator is not None:
variation_columns = [f"{column_name}_var_{j}" for j in range(variator.Nvar)]
df.loc[:, variation_columns] = 1.0
for i, (v, q) in enumerate(
zip(correction.central_values, correction.build_queries(prefix))
):
mask = df.eval(q)
df.loc[mask, column_name] = v
if variator is not None:
df.loc[mask, variation_columns] = variator.variations[:, i]
correction = create_correction_object(systematic, MC_production)
column_name = correction._build_column_name(prefix, weightname)
variator = Variator(correction, Nvar) if Nvar > 0 else None
if column_name in df.columns and overwrite:
logging.info("%s exists but it will be overwriten", column_name)
_add_weights(df, correction, prefix, column_name, variator)
elif column_name in df.columns and not overwrite:
logging.warning(
"%s exists but it not will be ovewritten. Skipping. No weights are added. If you want to change this behaviour set the overwrite argument to True",
column_name,
)
elif column_name not in df.columns:
logging.info("%s does not exist. Adding it to dataframe", column_name)
_add_weights(df, correction, prefix, column_name, variator)
[docs]
def eigendecompose(
df: DataFrame,
settings: Dict,
syst_effect: str,
criterion: str = "max_differences",
prc: float = 1e-4,
max_variations: int | None = None,
save_variations: bool = False,
save_channel_covariance_matrices: bool = False,
verbose: bool = True,
seed: int = 8311311,
):
"""
Performs eigendecomposition on the input DataFrame based on specified settings,
systematic effect, and criterion, and returns the resulting `EigenDecomposer` object.
This function initializes an `EigenDecomposer` instance using the provided
DataFrame, settings, and systematic effect. It then applies a precision level
to the decomposition and identifies important eigendimension indices based on
the specified criterion. Optionally, it saves template variations.
Args:
df (DataFrame): The input data to be decomposed.
settings (Dict): Configuration settings for the decomposition.
syst_effect (str): The systematic effect to consider.
criterion (str, optional): Criterion for selecting important eigendimensions.
Defaults to "max_differences".
prc (float, optional): Precision level for the decomposition. Defaults to 1e-4.
max_variations (int | None, optional): Maximum number of variations to consider taking into account the precision criterion. If None, all variations up to the precision are considered. Defaults to None.
save_variations (bool, optional): If True, saves template variations. Defaults to False.
save_channel_covariance_matrices (bool, optional): If True, saves covariance matrices per channel. Defaults to False.
verbose (bool, optional): If True, prints additional information during execution. Defaults to True.
seed (int, optional): Random seed for reproducibility. Defaults to 8311311.
Returns:
EigenDecomposer: An instance of the `EigenDecomposer` class containing the decomposition results.
"""
egd = EigenDecomposer(df, settings, syst_effect, verbose=verbose, seed=seed)
egd.vary_templates()
egd.precision = prc
egd.max_variations = max_variations
egd.find_important_eigendimension_indices(criterion)
if save_variations:
egd.save_template_variations()
if save_channel_covariance_matrices:
egd.save_channel_covariance_matrices()
return egd
[docs]
def save_nominal_templates(df: DataFrame, settings: Dict, data=None):
"""Save nominal templates for an MC dataset.
Write nominal templates for a Monte Carlo (MC) dataset (and optional
experimental data) to the output file configured in `settings`. Only ROOT
(.root) files are currently supported. Channels, templates, signal
extraction variables, binning, and other required configuration are read
from the `settings` dictionary. The produced file structure is compatible
with cabinetry and can be used to build a pyhf model including systematic
uncertainties.
The function creates or recreates the configured output file on disk and
therefore will overwrite any existing file at that location. Because the
file is recreated, eigenvariation histograms saved before calling this
function would be lost; call this function before saving eigenvariations
for systematics.
Parameters:
df (DataFrame): MC dataset containing template information used to
build the nominal templates (e.g., event records or pre-binned
contents). This object is read but not modified by the function.
settings (Dict): Configuration dictionary. Must include the output
filename (currently a .root path) and the definitions for channels,
templates, signal-extraction variables and binning required to
produce the histograms.
data (optional): Experimental (observed) dataset to be histogrammed and
included in the output using the same channels, variables and
binning as the MC templates. If None, no observed-data histograms
are written.
Returns:
ChannelTemplateHandler: The handler object used to save the nominal
templates. This object can be returned for later use inspect
saved templates).
"""
# Create an eigendecomposer object without any systematic effect
ecth = ChannelTemplateHandler(df=df, settings=settings)
ecth.save_nominal_templates(data=data)
return ecth
[docs]
def save_existing_eigenvariations(
df: DataFrame, settings: Dict, syst_effect: str, verbose=True
):
"""Save existing eigenvariations for a specified systematic effect.
This function complements the nominal-template saving step:
nominal templates should already be present in the configured output file
before calling this function.
The saver will read variations from `df` and write eigenvariation histograms
into the same ROOT file structure expected by cabinetry/pyhf so that the model
can later be built including these systematic eigenvariations.
Instead of using the nominal weights for the histogram filling, the nominal weight
for the syst_effect is replaced by the variations present in `df`.
The number of variations that should be present in `df` is read from the settings
dictionary in the systematics part.
Args:
df (DataFrame): The dataset to extract variations from.
settings (Dict): Configuration settings for eigenvariation saving.
syst_effect (str): The systematic effect to save variations for.
verbose (bool, optional): If True, enables verbose logging. Defaults to True.
Returns:
None
"""
ees = ExistingEigenVariationsSaver(df, settings)
ees.syst_effect = syst_effect
ees.save_existing_eigenvariations(verbose=verbose)
[docs]
def calculate_covariance_matrix(
df: DataFrame,
settings: Dict,
syst_effect: str | Dict,
binning: Dict,
channels: List,
input_cov: np.ndarray = None,
save_cov: bool = False,
):
"""
Calculate the covariance matrix for a given dataset.
This function computes a covariance matrix based on the input data, configuration settings, and systematic effects. It provides support for pre-defined systematics or custom-defined ones and allows the user to specify binning and channels. Optionally, it can save the covariance matrix to a file.
Args:
df (DataFrame): The input data to calculate the covariance matrix from.
settings (Dict): Configuration settings, same as for the `EigenDecomposer`.
syst_effect (str | Dict): The name of the systematic effect to consider for the covariance matrix. For systematics from YAML files the name is enough. If this is a custom systematic then a dictionary with for the custom systematic is expected similarly to the dictionary necessary for the custom correction object in the eigendecomposition.
binning (Dict): Binning information for the covariance matrix. Keys should be the variable names present in the df and values lists of bin edges.
channels (List): List of channels to consider for the covariance matrix.
save_cov (bool, optional): If True, saves the covariance matrix. The path should be read from the settings dictionary. Defaults to False.
Returns:
the covariance matrix from the covariance matrix calculator
"""
cc = CovarianceCalculator(df, settings, syst_effect, binning, channels, input_cov)
cc.vary_templates()
if save_cov:
cc.save_covariance()
return cc.cov
[docs]
def plot_analysis_corr_matrix(
eigendecomposer_obj: EigenDecomposer,
save: bool = False,
filename: Union[None, str] = None,
) -> tuple[plt.Figure, plt.Axes]:
"""
Plot the correlation matrix of an eigendecomposition analysis.
Args:
eigendecomposer_obj (EigenDecomposer): The decomposition object containing correlation data.
save (bool, optional): If True, saves the plot to file. Defaults to False.
filename (str, optional): Output file name if saving. Defaults to None.
Returns:
None
"""
fig, ax = eigendecomposer_obj.plot_corr_matrix(save=save, filename=filename)
return fig, ax
[docs]
def plot_cov_diff(
eigendecomposer_obj: EigenDecomposer,
save: bool = False,
filename: Union[None, str] = None,
) -> tuple[plt.Figure, plt.Axes]:
"""
Plot the normalized covariance difference between original and eigendecomposed covariance matrix for an initial truncation guess.
Args:
eigendecomposer_obj (EigenDecomposer): The decomposition object containing covariance data.
save (bool, optional): If True, saves the plot. Defaults to False.
filename (str, optional): Output file name if saving. Defaults to None.
Returns:
None
"""
fig, ax = eigendecomposer_obj.plot_cov_diff(save=save, filename=filename)
return fig, ax
[docs]
def register_saving_info(eigendecomposer_obj: EigenDecomposer, saving_info: Dict):
"""
Register saving information in the eigendecomposer object.
Args:
eigendecomposer_obj (EigenDecomposer): The decomposition object to update.
saving_info (Dict): Dictionary containing saving parameters.
Returns:
None
"""
eigendecomposer_obj.register_saving_info(saving_info)
[docs]
def plot_up_and_down_variations(
eigendecomposer_obj: EigenDecomposer,
save: bool = False,
filename: Union[None, str] = None,
) -> List[tuple[plt.Figure, plt.Axes]]:
"""
Plot up/down variations for each template in the decomposition.
Args:
eigendecomposer_obj (EigenDecomposer): The decomposition object containing templates.
save (bool, optional): If True, saves the plots. Defaults to False.
filename (str, optional): Output file name if saving. Defaults to None.
Returns:
None
"""
figures = []
for t_name, t_obj in eigendecomposer_obj.templates.items():
t_obj.register_saving_info(eigendecomposer_obj.saving_info)
fig, ax = t_obj.plot_up_and_down_variations(
title=t_name, save=save, filename=filename
)
figures.append((fig, ax))
return figures
[docs]
def plot_templates_relative_variations_in_grid(
eigendecomposer_obj: EigenDecomposer,
save: bool = False,
filename: Union[None, str] = None,
) -> List[tuple[plt.Figure, plt.Axes]]:
"""
Plot relative template variations in a grid layout.
Args:
eigendecomposer_obj (EigenDecomposer): The decomposition object containing templates.
save (bool, optional): If True, saves the plots. Defaults to False.
filename (str, optional): Output file name if saving. Defaults to None.
Returns:
None
"""
figures = []
for t_name, t_obj in eigendecomposer_obj.templates.items():
t_obj.register_saving_info(eigendecomposer_obj.saving_info)
fig, ax = t_obj.plot_relative_variations_in_grid(
title=t_name, save=save, filename=filename
)
figures.append((fig, ax))
return figures
[docs]
def plot_correction_cov_and_corr(
eigendecomposer_obj: EigenDecomposer,
save: bool = False,
filename: Union[None, str] = None,
) -> tuple[plt.Figure, plt.Axes]:
"""
Plot correction covariance and correlation matrices.
Args:
eigendecomposer_obj (EigenDecomposer): The decomposition object containing correction data.
save (bool, optional): If True, saves the plot. Defaults to False.
filename (str, optional): Output file name if saving. Defaults to None.
Returns:
None
"""
eigendecomposer_obj.variator.register_saving_info(eigendecomposer_obj.saving_info)
fig, ax = eigendecomposer_obj.variator.plot_cov_and_corr(
save=save, filename=filename
)
return fig, ax
[docs]
def plot_correction_variations_in_grid(
eigendecomposer_obj: EigenDecomposer,
nbins=21,
save: bool = False,
filename: Union[None, str] = None,
) -> tuple[plt.Figure, plt.Axes]:
"""
Plot correction variations in a grid layout.
Args:
eigendecomposer_obj (EigenDecomposer): The decomposition object containing correction variations.
nbins (int, optional): Number of bins to use in the grid plot. Defaults to 21.
save (bool, optional): If True, saves the plots. Defaults to False.
filename (str, optional): Output file name if saving. Defaults to None.
Returns:
None
"""
eigendecomposer_obj.variator.register_saving_info(eigendecomposer_obj.saving_info)
fig, ax = eigendecomposer_obj.variator.plot_relative_variations_in_grid(
nbins=nbins, save=save, filename=filename
)
return fig, ax
[docs]
def plot_correction_errors(
eigendecomposer_obj: EigenDecomposer,
save: bool = False,
filename: Union[None, str] = None,
) -> tuple[plt.Figure, plt.Axes]:
"""
Plot correction error comparisons.
Args:
eigendecomposer_obj (EigenDecomposer): The decomposition object containing correction information.
save (bool, optional): If True, saves the plot. Defaults to False.
filename (str, optional): Output file name if saving. Defaults to None.
Returns:
None
"""
eigendecomposer_obj.correction.register_saving_info(eigendecomposer_obj.saving_info)
fig, ax = eigendecomposer_obj.correction.plot_error_comparison(
save=save, filename=filename
)
return fig, ax