Source code for locan.analysis.nearest_neighbor

"""

Nearest-neighbor distance distribution analysis

Nearest-neighbor distance distributions provide information about deviations
from a spatial homogeneous Poisson process
(i.e. complete spatial randomness, CSR).
Point-event distances are given by the distance between a random point
(not being an event) and the nearest event.
The point-event distance distribution is estimated from a number of random
sample points and plotted in comparison to
the analytical function for equal localization density.

For a homogeneous 2D Poisson process with intensity :math:`\\rho` (expected
number of points per unit area) the distance
from a randomly chosen event to the nearest other event (nearest-neighbor
distance) is distributed according to the
following probability density (pdf) or cumulative density function (cdf) [1]_:

.. math::

   pdf(w) &= 2 \\rho \\pi w \\ exp(- \\rho \\pi w^2)

   cdf(w) &= 1 - exp (- \\rho \\pi w^2)


The same distribution holds for point-event distances if events are distributed
as a homogeneous Poisson process with
intensity :math:`\\rho`.

References
----------
.. [1] Philip M. Dixon, Nearest Neighbor Methods,
   Department of Statistics, Iowa State University,
   20 December 2001

"""

from __future__ import annotations

import logging
import sys
from typing import TYPE_CHECKING, Any, Literal

if sys.version_info >= (3, 11):
    from typing import Self
else:
    from typing_extensions import Self

if TYPE_CHECKING:
    import matplotlib as mpl

    from locan.data.locdata import LocData

import matplotlib.pyplot as plt
import numpy as np
import numpy.typing as npt
import pandas as pd
from scipy import stats
from sklearn.neighbors import NearestNeighbors

from locan.analysis import metadata_analysis_pb2
from locan.analysis.analysis_base import _Analysis
from locan.configuration import N_JOBS

__all__: list[str] = ["NearestNeighborDistances"]

logger = logging.getLogger(__name__)


# The algorithms


[docs] def pdf_nnDistances_csr_2D(x: npt.ArrayLike, density: float) -> npt.NDArray[np.float64]: """ Probability density function for nearest-neighbor distances of points distributed in 2D with complete spatial randomness. Parameters ---------- x distance density density of points Returns ------- float Probability density function pdf(x). """ x = np.asarray(x) return_value: npt.NDArray[np.float64] = ( 2 * density * np.pi * x * np.exp(-density * np.pi * x**2) ) return return_value
[docs] def pdf_nnDistances_csr_3D(x: npt.ArrayLike, density: float) -> npt.NDArray[np.float64]: """ Probability density function for nearest-neighbor distances of points distributed in 3D with complete spatial randomness. Parameters ---------- x distance density density of points Returns ------- npt.NDArray[np.float64] Probability density function pdf(x). """ x = np.asarray(x) a = (3 / 4 / np.pi / density) ** (1 / 3) return_value: npt.NDArray[np.float64] = ( 3 / a * (x / a) ** 2 * np.exp(-((x / a) ** 3)) ) return return_value
def _nearest_neighbor_distances( points: npt.ArrayLike, k: int = 1, other_points: npt.ArrayLike | None = None ) -> pd.DataFrame: if other_points is None: nn = NearestNeighbors(n_neighbors=k, metric="euclidean", n_jobs=N_JOBS).fit( points ) distances, indices = nn.kneighbors() else: nn = NearestNeighbors(n_neighbors=k, metric="euclidean", n_jobs=N_JOBS).fit( other_points ) distances, indices = nn.kneighbors(points) return pd.DataFrame( {"nn_distance": distances[..., k - 1], "nn_index": indices[..., k - 1]} ) # The specific analysis classes
[docs] class NearestNeighborDistances(_Analysis): """ Compute the k-nearest-neighbor distances within data or between data and other_data. The algorithm relies on sklearn.neighbors.NearestNeighbors. Parameters ---------- meta : locan.analysis.metadata_analysis_pb2.AMetadata Metadata about the current analysis routine. k : int Compute the kth nearest neighbor. Attributes ---------- count : int A counter for counting instantiations. parameter : dict A dictionary with all settings for the current computation. meta : locan.analysis.metadata_analysis_pb2.AMetadata Metadata about the current analysis routine. results : pandas.DataFrame Computed results. distribution_statistics : Distribution_stats | None Distribution parameters derived from MLE fitting of results. """ count = 0 def __init__( self, meta: metadata_analysis_pb2.AMetadata | None = None, k: int = 1 ) -> None: parameters = self._get_parameters(locals()) super().__init__(**parameters) self.dimension: int | None = None self.localization_density: float | None = None self.results: pd.DataFrame | None = None self.distribution_statistics: _DistributionFits | None = None
[docs] def compute(self, locdata: LocData, other_locdata: LocData | None = None) -> Self: """ Run the computation. Parameters ---------- locdata Localization data. other_locdata Other localization data from which nearest neighbors are taken. Returns ------- Self """ if not len(locdata): logger.warning("Locdata is empty.") return self self.dimension = locdata.dimension # setting the localization density of locdata if other_locdata is None: self.localization_density = locdata.properties["localization_density_bb"] else: if other_locdata.dimension != self.dimension: raise TypeError( "Dimensions for locdata and other_locdata must be identical." ) self.localization_density = other_locdata.properties[ "localization_density_bb" ] points = locdata.coordinates if other_locdata is None: other_points = None else: other_points = other_locdata.coordinates self.results = _nearest_neighbor_distances( points=points, **self.parameter, other_points=other_points ) return self
[docs] def fit_distributions(self, with_constraints: bool = True) -> None: """ Fit probability density functions to the distributions of `loc_property` values in the results using MLE (scipy.stats). If with_constraints is true we put the following constraints on the fit procedure: If distribution is expon then floc=np.min(self.analysis_class.results[self.loc_property].values). Parameters ---------- with_constraints Flag to use predefined constraints on fit parameters. """ if self: self.distribution_statistics = _DistributionFits(self) self.distribution_statistics.fit(with_constraints=with_constraints) else: logger.warning("No results available to fit.")
[docs] def hist( self, ax: mpl.axes.Axes | None = None, bins: int | list[int | float] | Literal["auto"] = "auto", density: bool = True, fit: bool = False, **kwargs: Any, ) -> mpl.axes.Axes: """ Provide histogram as :class:`matplotlib.axes.Axes` object showing hist(results). Parameters ---------- ax The axes on which to show the image. bins Bin specification as used in :func:`matplotlib.hist` density Flag for normalization as used in matplotlib.hist. True returns probability density function; None returns counts. fit Flag indicating to fit pdf of nearest-neighbor distances under complete spatial randomness. kwargs Other parameters passed to :func:`matplotlib.plot`. Returns ------- matplotlib.axes.Axes Axes object with the plot. """ if ax is None: ax = plt.gca() if self.results is None: return ax assert self.localization_density is not None # type narrowing # noqa: S101 _values, bin_values, _patches = ax.hist( self.results["nn_distance"], bins=bins, density=density, label="data" ) x_data = (bin_values[:-1] + bin_values[1:]) / 2 if self.dimension == 2: ax.plot( x_data, pdf_nnDistances_csr_2D(x_data, self.localization_density), "r-", label="CSR", **kwargs, ) elif self.dimension == 3: ax.plot( x_data, pdf_nnDistances_csr_3D(x_data, self.localization_density), "r-", label="CSR", **kwargs, ) else: logger.warning( f"No analytic probability density function for {self.dimension} dimensions available." ) # fit distributions: if fit: if self.distribution_statistics is None: self.fit_distributions() assert ( # type narrowing # noqa: S101 self.distribution_statistics is not None ) self.distribution_statistics.plot(ax=ax) ax.set( title="k-Nearest Neighbor Distances\n" # noqa: ISC003 + " (k = " + str(self.parameter["k"]) + ")", xlabel="distance (nm)", ylabel="pdf" if density else "counts", ) ax.legend(loc="best") return ax
# Auxiliary functions and classes
[docs] class NNDistances_csr_2d(stats.rv_continuous): """ Continuous distribution function for nearest-neighbor distances of points distributed in 2D under complete spatial randomness. Parameters ---------- density : float Shape parameter `density`, being the density of points. """ def _pdf(self, x: float, density: float) -> float: return_value: float = 2 * density * np.pi * x * np.exp(-density * np.pi * x**2) return return_value
[docs] class NNDistances_csr_3d(stats.rv_continuous): """ Continuous distribution function for nearest-neighbor distances of points distributed in 3D under complete spatial randomness. Parameters ---------- x : float distance density : float Shape parameter `density`, being the density of points. """ def _pdf(self, x: float, density: float) -> float: a = (3 / 4 / np.pi / density) ** (1 / 3) return_value: float = 3 / a * (x / a) ** 2 * np.exp(-((x / a) ** 3)) return return_value
class _DistributionFits: """ Handle for distribution fits. This class is typically instantiated by LocalizationProperty methods. It holds the statistical parameters derived by fitting the result distributions using MLE (scipy.stats). Statistical parameters are defined as described in :ref:(https://docs.scipy.org/doc/scipy/reference/tutorial/stats/continuous.html) Parameters ---------- analyis_class : NearestNeighborDistances The analysis class with result data to fit. Attributes ---------- analyis_class : NearestNeighborDistances The analysis class with result data to fit. loc_property : str The LocData property for which to fit an appropriate distribution distribution : str | scipy.stats.rv_continuous Distribution model to fit. parameters : list[str] Free parameters in `distribution`. """ def __init__(self, analysis_class: NearestNeighborDistances) -> None: self.analysis_class = analysis_class self.loc_property = "nn_distance" self.distribution: stats.rv_continuous | None = None self.parameters: list[str] = [] def fit(self, with_constraints: bool = True, **kwargs: Any) -> None: """ Fit model function to analysis_class.results. If with_constraints is true (default) we put the following constraints on the fit procedure: loc=0, scale=1 Parameters ---------- with_constraints Flag to use predefined constraints on fit parameters. kwargs Other parameters passed to the `distribution.fit()` method. """ if self.analysis_class.results is None: raise ValueError("Compute results before fitting.") if self.analysis_class.dimension == 2: self.distribution = NNDistances_csr_2d(name="NNDistances_csr_2d", a=0.0) elif self.analysis_class.dimension == 3: self.distribution = NNDistances_csr_3d(name="NNDistances_csr_3d", a=0.0) else: logger.warning( f"No fit model for {self.analysis_class.dimension} dimensions available." ) return self.parameters = [name.strip() for name in self.distribution.shapes.split(",")] # type: ignore[attr-defined] self.parameters += ["loc", "scale"] if with_constraints: kwargs_ = dict(dict(floc=0, fscale=1), **kwargs) else: kwargs_ = kwargs fit_results = self.distribution.fit( # type: ignore[attr-defined] data=self.analysis_class.results[self.loc_property].values, **kwargs_ ) for parameter, result in zip(self.parameters, fit_results): setattr(self, parameter, result) def plot(self, ax: mpl.axes.Axes | None = None, **kwargs: Any) -> mpl.axes.Axes: """ Provide plot as :class:`matplotlib.axes.Axes` object showing the probability distribution functions of fitted results. Parameters ---------- ax The axes on which to show the image. kwargs Other parameters passed to :func:`matplotlib.pyplot.plot`. Returns ------- matplotlib.axes.Axes Axes object with the plot. """ if ax is None: ax = plt.gca() if self.distribution is None: return ax # plot fit curve x_values = np.linspace( self.distribution.ppf(0.001, **self.parameter_dict()), self.distribution.ppf(0.999, **self.parameter_dict()), 100, ) ax.plot( x_values, self.distribution.pdf(x_values, **self.parameter_dict()), "r-", **dict( dict(lw=3, alpha=0.6, label=str(self.distribution.name) + " pdf"), **kwargs, ), ) return ax def parameter_dict(self) -> dict[str, float]: """Dictionary of fitted parameters.""" return {k: self.__dict__[k] for k in self.parameters}