bioimage_py.evaluation.variation_of_information

Variation of information (split / merge) and its per-object decomposition.

Both are pure reductions of a `ContingencyTable` built as `contingency_table(segmentation, groundtruth)` (axis A = segmentation, axis B = groundtruth). The split score is the conditional entropy `H(seg | gt)` (over-segmentation) and the merge score is `H(gt | seg)` (under-segmentation); their sum is the variation of information.

  1"""Variation of information (split / merge) and its per-object decomposition.
  2
  3Both are pure reductions of a :class:`ContingencyTable` built as
  4``contingency_table(segmentation, groundtruth)`` (axis A = segmentation, axis B = groundtruth). The
  5split score is the conditional entropy ``H(seg | gt)`` (over-segmentation) and the merge score is
  6``H(gt | seg)`` (under-segmentation); their sum is the variation of information.
  7"""
  8from __future__ import annotations
  9
 10from typing import Optional, Sequence, Tuple
 11
 12import numpy as np
 13import pandas as pd
 14
 15from ..runner.config import RunnerConfig
 16from ..sources import SourceLike
 17from ._common import build_table
 18from .contingency_table import ContingencyTable
 19
 20__all__ = ["vi_scores", "object_vi_scores", "variation_of_information", "object_vi"]
 21
 22
 23def _pair_sizes(table: ContingencyTable) -> Tuple[np.ndarray, np.ndarray]:
 24    """Return, per pair, the marginal sizes of its A-label and B-label (both float64)."""
 25    sa = table.sizes_a.astype("float64")[np.searchsorted(table.labels_a, table.pairs[:, 0])]
 26    sb = table.sizes_b.astype("float64")[np.searchsorted(table.labels_b, table.pairs[:, 1])]
 27    return sa, sb
 28
 29
 30def vi_scores(table: ContingencyTable, *, use_log2: bool = True) -> Tuple[float, float]:
 31    """Compute the split and merge variation of information from a contingency table.
 32
 33    Args:
 34        table: A contingency table built as ``contingency_table(segmentation, groundtruth)``.
 35        use_log2: Whether to use ``log2`` (bits) or natural ``log`` (nats).
 36
 37    Returns:
 38        The split variation of information (``H(seg | gt)``) and the merge variation of information
 39        (``H(gt | seg)``).
 40    """
 41    n = table.n_points
 42    if n == 0:
 43        return 0.0, 0.0
 44    log = np.log2 if use_log2 else np.log
 45    counts = table.counts.astype("float64")
 46    pa = table.sizes_a.astype("float64") / n
 47    pb = table.sizes_b.astype("float64") / n
 48    h_a = -np.sum(pa * log(pa))
 49    h_b = -np.sum(pb * log(pb))
 50    sa, sb = _pair_sizes(table)
 51    mutual = np.sum(counts / n * log(n * counts / (sa * sb)))
 52    return float(h_a - mutual), float(h_b - mutual)
 53
 54
 55def object_vi_scores(table: ContingencyTable, *, use_log2: bool = True) -> "pd.DataFrame":
 56    """Compute the per-groundtruth-object variation of information from a contingency table.
 57
 58    Based on https://arxiv.org/pdf/1708.02599.pdf (page 16).
 59
 60    Args:
 61        table: A contingency table built as ``contingency_table(segmentation, groundtruth)``.
 62        use_log2: Whether to use ``log2`` (bits) or natural ``log`` (nats).
 63
 64    Returns:
 65        A pandas DataFrame with one row per groundtruth object, sorted by label, with columns
 66        ``label`` (groundtruth id), ``vi_split`` and ``vi_merge``.
 67    """
 68    if table.pairs.shape[0] == 0:
 69        return pd.DataFrame({"label": pd.Series(dtype="uint64"),
 70                             "vi_split": pd.Series(dtype="float64"),
 71                             "vi_merge": pd.Series(dtype="float64")})
 72    log = np.log2 if use_log2 else np.log
 73    counts = table.counts.astype("float64")
 74    sa, sb = _pair_sizes(table)
 75
 76    # Group the pairs by their groundtruth (B) label.
 77    order = np.argsort(table.pairs[:, 1], kind="stable")
 78    b_sorted = table.pairs[:, 1][order]
 79    c, sa_o, sb_o = counts[order], sa[order], sb[order]
 80    starts = np.flatnonzero(np.concatenate(([True], b_sorted[1:] != b_sorted[:-1])))
 81
 82    vi_merge = np.add.reduceat(-(c / sb_o) * log(c / sb_o), starts)
 83    vi_split = np.add.reduceat(-(c / sb_o) * log(c / sa_o), starts)
 84    return pd.DataFrame({"label": b_sorted[starts].astype("uint64"),
 85                         "vi_split": vi_split, "vi_merge": vi_merge}).reset_index(drop=True)
 86
 87
 88def variation_of_information(
 89    segmentation: SourceLike,
 90    groundtruth: SourceLike,
 91    *,
 92    ignore_seg: Optional[Sequence[int]] = None,
 93    ignore_gt: Optional[Sequence[int]] = None,
 94    use_log2: bool = True,
 95    num_workers: int = 1,
 96    block_shape: Optional[Tuple[int, ...]] = None,
 97    job_type: str = "local",
 98    job_config: Optional[RunnerConfig] = None,
 99    mask: Optional[SourceLike] = None,
100) -> Tuple[float, float]:
101    """Compute the split and merge variation of information between two segmentations.
102
103    Args:
104        segmentation: Candidate segmentation to evaluate (a numpy/zarr/n5 array or a `Source`).
105        groundtruth: The groundtruth segmentation; same shape as ``segmentation``.
106        ignore_seg: Labels to ignore in the segmentation (their voxels are excluded).
107        ignore_gt: Labels to ignore in the groundtruth (their voxels are excluded).
108        use_log2: Whether to use ``log2`` (bits) or natural ``log`` (nats).
109        num_workers: Number of parallel workers used to build the contingency table.
110        block_shape: Shape of the processing blocks. Defaults to the input chunk shape.
111        job_type: Execution backend: one of ``"local"``, ``"subprocess"`` or ``"slurm"``.
112        job_config: Backend configuration (a `RunnerConfig` / `SlurmConfig`).
113        mask: Optional binary mask; voxels outside the mask are excluded.
114
115    Returns:
116        The split variation of information and the merge variation of information.
117    """
118    table = build_table(segmentation, groundtruth, ignore_seg=ignore_seg, ignore_gt=ignore_gt,
119                        num_workers=num_workers, block_shape=block_shape, job_type=job_type,
120                        job_config=job_config, mask=mask)
121    return vi_scores(table, use_log2=use_log2)
122
123
def object_vi(
    segmentation: SourceLike,
    groundtruth: SourceLike,
    *,
    ignore_seg: Optional[Sequence[int]] = None,
    ignore_gt: Optional[Sequence[int]] = None,
    use_log2: bool = True,
    num_workers: int = 1,
    block_shape: Optional[Tuple[int, ...]] = None,
    job_type: str = "local",
    job_config: Optional[RunnerConfig] = None,
    mask: Optional[SourceLike] = None,
) -> "pd.DataFrame":
    """Compute the per-groundtruth-object variation of information between two segmentations.

    The contingency table of the two inputs is built with ``build_table`` and then reduced
    with ``object_vi_scores``.

    Args:
        segmentation: Candidate segmentation to evaluate (a numpy/zarr/n5 array or a `Source`).
        groundtruth: The groundtruth segmentation; same shape as ``segmentation``.
        ignore_seg: Labels to ignore in the segmentation (their voxels are excluded).
        ignore_gt: Labels to ignore in the groundtruth (their voxels are excluded).
        use_log2: Whether to use ``log2`` (bits) or natural ``log`` (nats).
        num_workers: Number of parallel workers used to build the contingency table.
        block_shape: Shape of the processing blocks. Defaults to the input chunk shape.
        job_type: Execution backend: one of ``"local"``, ``"subprocess"`` or ``"slurm"``.
        job_config: Backend configuration (a `RunnerConfig` / `SlurmConfig`).
        mask: Optional binary mask; voxels outside the mask are excluded.

    Returns:
        A pandas DataFrame with one row per groundtruth object (columns ``label``, ``vi_split``,
        ``vi_merge``).
    """
    # Everything except ``use_log2`` only affects how the contingency table is built.
    table_options = {
        "ignore_seg": ignore_seg,
        "ignore_gt": ignore_gt,
        "num_workers": num_workers,
        "block_shape": block_shape,
        "job_type": job_type,
        "job_config": job_config,
        "mask": mask,
    }
    contingency = build_table(segmentation, groundtruth, **table_options)
    return object_vi_scores(contingency, use_log2=use_log2)
def vi_scores( table: bioimage_py.evaluation.ContingencyTable, *, use_log2: bool = True) -> Tuple[float, float]:
def vi_scores(table: ContingencyTable, *, use_log2: bool = True) -> Tuple[float, float]:
    """Compute the split and merge variation of information from a contingency table.

    Both scores are obtained as ``H(X) - I(seg; gt)``, i.e. the marginal entropy minus the mutual
    information. Conditional entropies are non-negative, so results that come out marginally
    below zero because of floating-point cancellation are clamped to ``0.0``.

    Args:
        table: A contingency table built as ``contingency_table(segmentation, groundtruth)``.
        use_log2: Whether to use ``log2`` (bits) or natural ``log`` (nats).

    Returns:
        The split variation of information (``H(seg | gt)``) and the merge variation of information
        (``H(gt | seg)``). An empty table (``n_points == 0``) yields ``(0.0, 0.0)``.
    """
    n = table.n_points
    if n == 0:
        return 0.0, 0.0
    log = np.log2 if use_log2 else np.log
    counts = table.counts.astype("float64")

    # Marginal entropies of the segmentation (A) and the groundtruth (B).
    pa = table.sizes_a.astype("float64") / n
    pb = table.sizes_b.astype("float64") / n
    h_a = -np.sum(pa * log(pa))
    h_b = -np.sum(pb * log(pb))

    # Mutual information: sum over pairs of p(a, b) * log(p(a, b) / (p(a) * p(b))).
    sa, sb = _pair_sizes(table)
    mutual = np.sum(counts / n * log(n * counts / (sa * sb)))

    # H - I can land a few ulps below zero (e.g. for identical segmentations); clamp those
    # rounding artefacts. ``max(nan, 0.0)`` keeps the NaN, so invalid tables are not masked.
    vi_split = max(float(h_a - mutual), 0.0)
    vi_merge = max(float(h_b - mutual), 0.0)
    return vi_split, vi_merge

Compute the split and merge variation of information from a contingency table.

Args:
- table: A contingency table built as `contingency_table(segmentation, groundtruth)`.
- use_log2: Whether to use `log2` (bits) or natural `log` (nats).

Returns: The split variation of information (H(seg | gt)) and the merge variation of information (H(gt | seg)).

def object_vi_scores( table: bioimage_py.evaluation.ContingencyTable, *, use_log2: bool = True) -> pandas.DataFrame:
def object_vi_scores(table: ContingencyTable, *, use_log2: bool = True) -> "pd.DataFrame":
    """Compute the per-groundtruth-object variation of information from a contingency table.

    For a groundtruth object ``b`` with size ``s_b`` and overlaps ``c_ab`` with the segmentation
    objects ``a`` (of size ``s_a``), the scores are

    * ``vi_merge = -sum_a (c_ab / s_b) * log(c_ab / s_b)``
    * ``vi_split = -sum_a (c_ab / s_b) * log(c_ab / s_a)``

    Based on https://arxiv.org/pdf/1708.02599.pdf (page 16).

    Args:
        table: A contingency table built as ``contingency_table(segmentation, groundtruth)``.
        use_log2: Whether to use ``log2`` (bits) or natural ``log`` (nats).

    Returns:
        A pandas DataFrame with one row per groundtruth object, sorted by label, with columns
        ``label`` (groundtruth id), ``vi_split`` and ``vi_merge``.
    """
    if table.pairs.shape[0] == 0:
        # No overlaps at all: return an empty, correctly typed frame.
        empty_columns = {
            "label": pd.Series(dtype="uint64"),
            "vi_split": pd.Series(dtype="float64"),
            "vi_merge": pd.Series(dtype="float64"),
        }
        return pd.DataFrame(empty_columns)

    log = np.log2 if use_log2 else np.log
    size_a, size_b = _pair_sizes(table)
    gt_labels = table.pairs[:, 1]

    # Bring all pairs of the same groundtruth (B) label next to each other.
    perm = np.argsort(gt_labels, kind="stable")
    gt_sorted = gt_labels[perm]
    overlap = table.counts.astype("float64")[perm]
    size_a = size_a[perm]
    size_b = size_b[perm]

    # First position of every run of equal groundtruth labels.
    run_begins = np.concatenate(([True], gt_sorted[1:] != gt_sorted[:-1]))
    group_starts = np.flatnonzero(run_begins)

    # Fraction of the groundtruth object covered by each overlapping segment.
    fraction = overlap / size_b
    merge_terms = -fraction * log(fraction)
    split_terms = -fraction * log(overlap / size_a)

    # ``reduceat`` sums the per-pair terms within each groundtruth group.
    vi_merge = np.add.reduceat(merge_terms, group_starts)
    vi_split = np.add.reduceat(split_terms, group_starts)

    scores = pd.DataFrame({
        "label": gt_sorted[group_starts].astype("uint64"),
        "vi_split": vi_split,
        "vi_merge": vi_merge,
    })
    return scores.reset_index(drop=True)

Compute the per-groundtruth-object variation of information from a contingency table.

Based on https://arxiv.org/pdf/1708.02599.pdf (page 16).

Args:
- table: A contingency table built as `contingency_table(segmentation, groundtruth)`.
- use_log2: Whether to use `log2` (bits) or natural `log` (nats).

Returns: A pandas DataFrame with one row per groundtruth object, sorted by label, with columns label (groundtruth id), vi_split and vi_merge.

def variation_of_information( segmentation: 'SourceLike', groundtruth: 'SourceLike', *, ignore_seg: Optional[Sequence[int]] = None, ignore_gt: Optional[Sequence[int]] = None, use_log2: bool = True, num_workers: int = 1, block_shape: Optional[Tuple[int, ...]] = None, job_type: str = 'local', job_config: Optional[bioimage_py.runner.RunnerConfig] = None, mask: 'Optional[SourceLike]' = None) -> Tuple[float, float]:
 89def variation_of_information(
 90    segmentation: SourceLike,
 91    groundtruth: SourceLike,
 92    *,
 93    ignore_seg: Optional[Sequence[int]] = None,
 94    ignore_gt: Optional[Sequence[int]] = None,
 95    use_log2: bool = True,
 96    num_workers: int = 1,
 97    block_shape: Optional[Tuple[int, ...]] = None,
 98    job_type: str = "local",
 99    job_config: Optional[RunnerConfig] = None,
100    mask: Optional[SourceLike] = None,
101) -> Tuple[float, float]:
102    """Compute the split and merge variation of information between two segmentations.
103
104    Args:
105        segmentation: Candidate segmentation to evaluate (a numpy/zarr/n5 array or a `Source`).
106        groundtruth: The groundtruth segmentation; same shape as ``segmentation``.
107        ignore_seg: Labels to ignore in the segmentation (their voxels are excluded).
108        ignore_gt: Labels to ignore in the groundtruth (their voxels are excluded).
109        use_log2: Whether to use ``log2`` (bits) or natural ``log`` (nats).
110        num_workers: Number of parallel workers used to build the contingency table.
111        block_shape: Shape of the processing blocks. Defaults to the input chunk shape.
112        job_type: Execution backend: one of ``"local"``, ``"subprocess"`` or ``"slurm"``.
113        job_config: Backend configuration (a `RunnerConfig` / `SlurmConfig`).
114        mask: Optional binary mask; voxels outside the mask are excluded.
115
116    Returns:
117        The split variation of information and the merge variation of information.
118    """
119    table = build_table(segmentation, groundtruth, ignore_seg=ignore_seg, ignore_gt=ignore_gt,
120                        num_workers=num_workers, block_shape=block_shape, job_type=job_type,
121                        job_config=job_config, mask=mask)
122    return vi_scores(table, use_log2=use_log2)

Compute the split and merge variation of information between two segmentations.

Args:
- segmentation: Candidate segmentation to evaluate (a numpy/zarr/n5 array or a `Source`).
- groundtruth: The groundtruth segmentation; same shape as `segmentation`.
- ignore_seg: Labels to ignore in the segmentation (their voxels are excluded).
- ignore_gt: Labels to ignore in the groundtruth (their voxels are excluded).
- use_log2: Whether to use `log2` (bits) or natural `log` (nats).
- num_workers: Number of parallel workers used to build the contingency table.
- block_shape: Shape of the processing blocks. Defaults to the input chunk shape.
- job_type: Execution backend: one of "local", "subprocess" or "slurm".
- job_config: Backend configuration (a `RunnerConfig` / `SlurmConfig`).
- mask: Optional binary mask; voxels outside the mask are excluded.

Returns: The split variation of information and the merge variation of information.

def object_vi( segmentation: 'SourceLike', groundtruth: 'SourceLike', *, ignore_seg: Optional[Sequence[int]] = None, ignore_gt: Optional[Sequence[int]] = None, use_log2: bool = True, num_workers: int = 1, block_shape: Optional[Tuple[int, ...]] = None, job_type: str = 'local', job_config: Optional[bioimage_py.runner.RunnerConfig] = None, mask: 'Optional[SourceLike]' = None) -> pandas.DataFrame:
def object_vi(
    segmentation: SourceLike,
    groundtruth: SourceLike,
    *,
    ignore_seg: Optional[Sequence[int]] = None,
    ignore_gt: Optional[Sequence[int]] = None,
    use_log2: bool = True,
    num_workers: int = 1,
    block_shape: Optional[Tuple[int, ...]] = None,
    job_type: str = "local",
    job_config: Optional[RunnerConfig] = None,
    mask: Optional[SourceLike] = None,
) -> "pd.DataFrame":
    """Compute the per-groundtruth-object variation of information between two segmentations.

    The contingency table of the two inputs is built with ``build_table`` and then reduced
    with ``object_vi_scores``.

    Args:
        segmentation: Candidate segmentation to evaluate (a numpy/zarr/n5 array or a `Source`).
        groundtruth: The groundtruth segmentation; same shape as ``segmentation``.
        ignore_seg: Labels to ignore in the segmentation (their voxels are excluded).
        ignore_gt: Labels to ignore in the groundtruth (their voxels are excluded).
        use_log2: Whether to use ``log2`` (bits) or natural ``log`` (nats).
        num_workers: Number of parallel workers used to build the contingency table.
        block_shape: Shape of the processing blocks. Defaults to the input chunk shape.
        job_type: Execution backend: one of ``"local"``, ``"subprocess"`` or ``"slurm"``.
        job_config: Backend configuration (a `RunnerConfig` / `SlurmConfig`).
        mask: Optional binary mask; voxels outside the mask are excluded.

    Returns:
        A pandas DataFrame with one row per groundtruth object (columns ``label``, ``vi_split``,
        ``vi_merge``).
    """
    # Everything except ``use_log2`` only affects how the contingency table is built.
    table_options = {
        "ignore_seg": ignore_seg,
        "ignore_gt": ignore_gt,
        "num_workers": num_workers,
        "block_shape": block_shape,
        "job_type": job_type,
        "job_config": job_config,
        "mask": mask,
    }
    contingency = build_table(segmentation, groundtruth, **table_options)
    return object_vi_scores(contingency, use_log2=use_log2)

Compute the per-groundtruth-object variation of information between two segmentations.

Args:
- segmentation: Candidate segmentation to evaluate (a numpy/zarr/n5 array or a `Source`).
- groundtruth: The groundtruth segmentation; same shape as `segmentation`.
- ignore_seg: Labels to ignore in the segmentation (their voxels are excluded).
- ignore_gt: Labels to ignore in the groundtruth (their voxels are excluded).
- use_log2: Whether to use `log2` (bits) or natural `log` (nats).
- num_workers: Number of parallel workers used to build the contingency table.
- block_shape: Shape of the processing blocks. Defaults to the input chunk shape.
- job_type: Execution backend: one of "local", "subprocess" or "slurm".
- job_config: Backend configuration (a `RunnerConfig` / `SlurmConfig`).
- mask: Optional binary mask; voxels outside the mask are excluded.

Returns: A pandas DataFrame with one row per groundtruth object (columns label, vi_split, vi_merge).