Source code for locan.process.filter

"""

Filter localization data.

This module provides functions for filtering LocData objects.
The functions take LocData as input and compute new LocData objects.

"""

from __future__ import annotations

import sys
from collections.abc import Iterable
from typing import TYPE_CHECKING

import numpy as np
from sklearn.neighbors import NearestNeighbors

from locan import cluster_by_bin
from locan.configuration import N_JOBS
from locan.constants import HullType
from locan.data.images import Image
from locan.data.locdata import LocData
from locan.data.metadata_utils import _modify_meta
from locan.data.regions.region import Interval, Region, Region2D, RoiRegion
from locan.data.validation import _check_loc_properties
from locan.locan_types import RandomGeneratorSeed

if TYPE_CHECKING:
    from locan.data import Image

__all__: list[str] = [
    "Selector",
    "filter_condition",
    "select_by_condition",
    "select_by_region",
    "select_by_image_mask",
    "exclude_sparse_points",
    "random_subset",
    "localizations_in_cluster_regions",
]


[docs] class Selector: """ Define selection interval for a single localization property. Parameters ---------- loc_property Localization property activate Indicator to apply the selection or not lower_bound min fo selection interval upper_bound max of selection interval Attributes ---------- loc_property : str Localization property activate : bool Indicator to apply the selection or not lower_bound : int | float min fo selection interval upper_bound: int | float max of selection interval interval : Interval Class with interval specifications condition : str specification turned into condition string """ def __init__( self, loc_property: str, activate: bool, lower_bound: int | float, upper_bound: int | float, ) -> None: self.loc_property = loc_property self.activate = activate self._lower_bound = lower_bound self._upper_bound = upper_bound self.interval = Interval(lower_bound=lower_bound, upper_bound=upper_bound) def __repr__(self) -> str: return ( f"{self.__class__.__name__}(" f"loc_property='{self.loc_property}', " f"activate={self.activate}, " f"lower_bound={self.lower_bound}, " f"upper_bound={self.upper_bound}" f")" ) @property def lower_bound(self) -> int | float: return self._lower_bound @lower_bound.setter def lower_bound(self, value: int | float) -> None: self._lower_bound = value self.interval = Interval(lower_bound=value, upper_bound=self.upper_bound) @property def upper_bound(self) -> int | float: return self._upper_bound @upper_bound.setter def upper_bound(self, value: int | float) -> None: self._upper_bound = value self.interval = Interval(lower_bound=self.lower_bound, upper_bound=value) @property def condition(self) -> str: if self.activate is True: condition = f"{self.lower_bound} < {self.loc_property} < {self.upper_bound}" else: condition = "" return condition
[docs] def filter_condition(selectors: Iterable[Selector]) -> str: """ Get a condition string from selection specifications. Parameters ---------- selectors Specifications for loc_property selections Returns ------- str """ iterable = (selector.condition for selector in selectors if selector.activate) condition = " and ".join(iterable) return condition
[docs] def select_by_condition(locdata: LocData, condition: str) -> LocData: """ Select by specifying conditions on data properties. Parameters ---------- locdata Specifying the localization data from which to select. condition Conditions as input in select method. More precise: query specifications to be used with pandas query. Returns ------- LocData A new instance of LocData referring to the specified dataset. """ local_parameter = locals() # select new_indices = locdata.data.query(condition).index.values.tolist() # instantiate new_locdata = LocData.from_selection(locdata=locdata, indices=new_indices) # type: ignore[arg-type] # update metadata meta_ = _modify_meta( locdata, new_locdata, function_name=sys._getframe().f_code.co_name, parameter=local_parameter, meta=None, ) new_locdata.meta = meta_ return new_locdata
[docs] def select_by_region( locdata: LocData, region: Region, loc_properties: list[str] | None = None, reduce: bool = True, ) -> LocData: """ Select localizations from `locdata` that are within `region` and return a new LocData object. Selection is with respect to loc_properties or to localization coordinates that correspond to region dimension. Parameters ---------- locdata Localization data that is tested for being inside the region. region Tested region loc_properties Localization properties to be tested. reduce Return the reduced LocData object or keep references alive. Returns ------- LocData A new instance of LocData with all localizations within region of interest. Note ---- Points on boundary of regions are considered outside if region is a shapely object, but inside if region is a matplotlib (which is the case for RoiRegion) object. """ local_parameter = locals() if loc_properties is None: loc_properties_ = locdata.coordinate_keys[0 : region.dimension] else: loc_properties_ = loc_properties points = locdata.data[list(loc_properties_)].values if isinstance(region, (Region2D, RoiRegion)): indices_inside = region.contains(points) locdata_indices_to_keep = locdata.data.index[indices_inside] new_locdata = LocData.from_selection( locdata=locdata, indices=locdata_indices_to_keep ) new_locdata.region = region else: raise NotImplementedError("Only Region2D has been implemented.") # finish if reduce: new_locdata.reduce() # update metadata meta_ = _modify_meta( locdata, new_locdata, function_name=sys._getframe().f_code.co_name, parameter=local_parameter, meta=None, ) new_locdata.meta = meta_ return new_locdata
[docs] def select_by_image_mask( locdata: LocData, image: Image, loc_properties: list[str] | None = None, ) -> LocData: """ Select by masking using an image. Each pixel is interpreted as True if different from 0. Parameters ---------- locdata specifying the localization data from which to select. image Image serving as mask. loc_properties Localization properties to be filtered. Returns ------- LocData a new instance of LocData referring to the selected dataset. """ local_parameter = locals() loc_properties = _check_loc_properties(locdata, loc_properties) assert loc_properties is not None # type narrowing # noqa: S101 if image.ndim != len(loc_properties): raise TypeError( "Dimension of image must correspond to the length of loc_properties" ) bins, bin_indices, collection, counts = cluster_by_bin( locdata=locdata, loc_properties=loc_properties, bins=image.bins ) mask = [tuple(item) for item in np.transpose(np.nonzero(image.data)).tolist()] # type: ignore items = [ collection.references[i] # type: ignore for i, indices in enumerate(bin_indices) if tuple(indices - 1) in mask # type: ignore ] if items: new_index = np.concatenate([item.data.index for item in items]) new_locdata = LocData.concat(items) new_locdata.data.index = new_index else: new_locdata = LocData() # update metadata meta_ = _modify_meta( locdata, new_locdata, function_name=sys._getframe().f_code.co_name, parameter=local_parameter, meta=None, ) new_locdata.meta = meta_ return new_locdata
[docs] def exclude_sparse_points( locdata: LocData, other_locdata: LocData | None = None, radius: float = 50, min_samples: int = 5, ) -> LocData: """ Exclude localizations by thresholding a local density. A subset of localizations, that exhibit a small local density of localizations from locdata or alternatively from other_locdata, is identified as noise and excluded. Noise is identified by using a nearest-neighbor search (:class:`sklearn.neighbors.NearestNeighbors`) to find all localizations within a circle (sphere) of the given `radius`. If the number of localizations is below the threshold value `min_samples`, the localization is considered to be noise. The method identifies the same noise points as done by the clustering algorithm DBSCAN [1]_. Parameters ---------- locdata Specifying the localization data from which to exclude localization data. other_locdata Specifying the localization data on which to compute local density. radius Radius of a circle or sphere in which neighbors are identified (equivalent to epsilon in DBSCAN). min_samples The minimum number of samples in the neighborhood that need to be found for each localization to not be identified as noise (equivalent to minPoints in DBSCAN). Returns ------- LocData All localizations except those identified as sparse (noise) points. References ---------- .. [1] Martin Ester, Hans-Peter Kriegel, Jörg Sander, Xiaowei Xu, A density-based algorithm for discovering clusters in large spatial databases with noise. In: Evangelos Simoudis, Jiawei Han, Usama M. Fayyad (Hrsg.): Proceedings of the Second International Conference on Knowledge Discovery and Data Mining (KDD-96). AAAI Press, 1996, S. 226-231, ISBN 1-57735-004-9. """ local_parameter = locals() if other_locdata is None: nn = NearestNeighbors(metric="euclidean", n_jobs=N_JOBS).fit( locdata.coordinates ) neighbor_points_list = nn.radius_neighbors(radius=radius, return_distance=False) # if points is not provided the query point is not considered its own neighbor. else: nn = NearestNeighbors(metric="euclidean", n_jobs=N_JOBS).fit( other_locdata.coordinates ) neighbor_points_list = nn.radius_neighbors( locdata.coordinates, radius=radius, return_distance=False ) indices_to_keep = [len(pts) >= min_samples for pts in neighbor_points_list] locdata_indices_to_keep = locdata.data.index[indices_to_keep] new_locdata = LocData.from_selection(locdata, locdata_indices_to_keep) # update metadata meta_ = _modify_meta( locdata, new_locdata, function_name=sys._getframe().f_code.co_name, parameter=local_parameter, meta=None, ) new_locdata.meta = meta_ return new_locdata
[docs] def random_subset( locdata: LocData, n_points: int, replace: bool = True, seed: RandomGeneratorSeed = None, ) -> LocData: """ Take a random subset of localizations. Parameters ---------- locdata Specifying the localization data from which to select localization data. n_points Number of localizations to randomly choose from locdata. replace Indicate if sampling is with or without replacement seed Random number generation seed Returns ------- LocData A new instance of LocData carrying the subset of localizations. """ local_parameter = locals() if not locdata: return locdata rng = np.random.default_rng(seed) indices = rng.choice(locdata.data.index, size=n_points, replace=replace) new_locdata = LocData.from_selection(locdata, indices) # update metadata meta_ = _modify_meta( locdata, new_locdata, function_name=sys._getframe().f_code.co_name, parameter=local_parameter, meta=None, ) new_locdata = LocData.from_selection(locdata, indices, meta=meta_) return new_locdata
[docs] def localizations_in_cluster_regions( locdata: LocData, collection: LocData | list[LocData], hull_type: HullType | str = HullType.CONVEX_HULL, ) -> LocData: """ Identify localizations from `locdata` within the regions of all `collection` elements. Parameters ---------- locdata Localization data that is tested for being inside the region collection A set of Locdata objects collected in a collection or list. hull_type The hull type for each LocData object that is used to define the region. Returns -------- LocData A collection of LocData objects with all elements of locdata contained by the region. """ locdatas = [] if isinstance(hull_type, str): hull_type = HullType[hull_type.upper()].value else: hull_type = hull_type.value if isinstance(collection, LocData): if isinstance(collection.references, list): # this case covers pure collections for ref in collection.references: cregion = getattr(ref, hull_type).region locdata_selection = select_by_region(locdata=locdata, region=cregion) locdatas.append(locdata_selection) else: # this case covers selections of collections for index in collection.indices: # type: ignore cregion = getattr( collection.references.references[index], hull_type # type: ignore ).region locdata_selection = select_by_region(locdata=locdata, region=cregion) locdatas.append(locdata_selection) else: # this case covers list of LocData objects for ref in collection: cregion = getattr(ref, hull_type).region locdata_selection = select_by_region(locdata=locdata, region=cregion) locdatas.append(locdata_selection) new_collection = LocData.from_collection(locdatas) return new_collection