Tutorial about analyzing grouped cluster properties#

Localization properties vary within clusters. Analyzing such variations can help to characterize cluster populations. Here, we show examples for variations in convex hull properties or coordinate variances.

from pathlib import Path

%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from colorcet import m_fire, m_gray, m_dimgray

import locan as lc

/tmp/ipykernel_1019/3459984724.py:6: DeprecationWarning: 
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd

lc.show_versions(system=False, dependencies=False, verbose=False)

Locan:
   version: 0.20.0.dev41+g755b969

Python:
   version: 3.11.6

Synthetic data#

We simulate localization data that follows a Neyman-Scott distribution in 2D:

rng = np.random.default_rng(seed=1)

locdata = lc.simulate_dstorm(parent_intensity=1e-5, region=((0, 10_000), (0, 10_000)), cluster_mu=10, cluster_std=10, seed=rng)

locdata.print_summary()

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.
identifier: "1"
comment: ""
source: SIMULATION
state: RAW
element_count: 10380
frame_count: 0
creation_time {
  2024-03-14T11:06:09.138715Z
}

bin_range=((0, 2_000), (0, 2_000))

lc.render_2d_mpl(locdata, bin_size=20, bin_range=bin_range)

<Axes: title={'center': 'counts'}, xlabel='position_x', ylabel='position_y'>

../../_images/b313e2d93c624ced26290d4eb450d08f3ad32ed41bf4c19bae1259abd43a0bf4.png

True clusters#

First we look at ground truth clusters.

grouped = locdata.data.groupby("cluster_label")
clust = lc.LocData.from_collection([lc.LocData.from_selection(locdata, indices=group.index) for name, group in grouped])

Filter out clusters with less than 3 localizations since on convex hull can be computed for such clusters.

clust_selection = lc.select_by_condition(clust, condition="2 < localization_count")
references_ = [clust.references[i] for i in clust_selection.indices]
clust_selection.reduce()
clust_selection.references = references_
clust = clust_selection

Number of clusters: 771
Number of clustered localizations: 10172
Ratio cluster to noise localizations: 0.98

fig, ax = plt.subplots(nrows=1, ncols=1)
lc.render_2d_mpl(locdata, bin_size=20, bin_range=bin_range, cmap=m_gray.reversed())
clust.data.plot.scatter(x='position_x', y='position_y', ax=ax, color='Red', s=10, label='cluster centroids', xlim=bin_range[0], ylim=bin_range[1])
plt.show()

../../_images/c10487ced524925a6e3590304061faa1a4b0434cccf74c02b3848f38736b7dc4.png

clust.data.head()

	localization_count	position_x	uncertainty_x	position_y	uncertainty_y	region_measure_bb	localization_density_bb	subregion_measure_bb	region_measure	localization_density	subregion_measure
0	24	1399.549521	1.877356	9539.052779	1.853978	1025.930045	0.023393	128.121678	100000000	2.400000e-07	40000
2	3	8303.079096	10.889812	4081.419445	3.852366	456.241155	0.006575	96.259339	100000000	3.000000e-08	40000
3	7	5507.141828	2.444467	224.387484	2.132102	336.278485	0.020816	73.989892	100000000	7.000000e-08	40000
4	4	7561.723474	3.515554	5385.880528	5.842642	443.103490	0.009027	87.512555	100000000	4.000000e-08	40000
5	7	3279.865197	5.284643	7918.206789	2.848332	774.964110	0.009033	117.649012	100000000	7.000000e-08	40000

clust.properties

{'localization_count': 771,
 'position_x': 5078.461816893804,
 'uncertainty_x': 138.598117645907,
 'position_y': 5571.31780554545,
 'uncertainty_y': 421.90449657593706,
 'region_measure_bb': 99518129.6386362,
 'localization_density_bb': 7.7473320971727e-06,
 'subregion_measure_bb': 39903.55763820844}

Investigate the convex hull areas#

Localization clusters can be analyzed with respect to their convex hull region properties as function of localization_count as outlined in Ebert et al. (https://doi:10.1093/bioinformatics/btac700).

che = lc.ConvexHullExpectation(convex_hull_property='region_measure_ch', expected_variance=10**2).compute(locdata=clust)
che.results

<locan.analysis.convex_hull_expectation.ConvexHullExpectationResults at 0x7fd35a649490>

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
che.plot(ax=axes[0])
che.hist(ax=axes[1]);

../../_images/a22120b471e42d814573ed357e500bf76fe54fbc652aef2026c3c6d1bea9a6b4.png

Investigate the position variances#

Localization coordinates in localization clusters come with a certain variance. The variance is related to the localization precision or other localization properties but also varies with localization_count if determined as biased sample variance (i.e. without Bessel’s correction).

pve_biased = lc.PositionVarianceExpectation(loc_property="position_x", expectation=10**2, biased=True).compute(locdata=clust)
pve_biased.results

<locan.analysis.position_variance_expectation.PositionVarianceExpectationResults at 0x7fd35a4af190>

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
pve_biased.plot(ax=axes[0])
pve_biased.hist(ax=axes[1]);

../../_images/1ebaf12cc1d4f48163371ba831c80edfc10d1f1b9d696751109f610561c6d693.png

A similar analysis can be performed with unbiased variances in which Bessel’s correction is applied.

pve = lc.PositionVarianceExpectation(loc_property="position_x", expectation=10**2, biased=False).compute(locdata=clust)
pve.results

<locan.analysis.position_variance_expectation.PositionVarianceExpectationResults at 0x7fd359e616d0>

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
pve.plot(ax=axes[0])
pve.hist(ax=axes[1], log=True);

../../_images/5b63a7af0b090f25c5a045b13b7f1f017a47edaada5a8621eb2e738e6778fbf4.png

Investigate any grouped property#

A similar analysis can be carried out with any LocData property. For instance, let’s check the coordinate uncertainties of cluster centroids. The uncertainty in one dimension should follow the square root of the biased position variance for clusters with variable number of localizations.

It is important to consider the differences between variance and standard deviation. Position uncertainties are usually given as standard deviation with units equal to position units. Converting the ground truth for the coordinate standard deviation in each cluster (std as used in the simulations above) requires a Bessel correction on the squared std being the variance). In addition the coordinate uncertainties of cluster centroids should scale with the inverse square root of the number of localizations per cluster.

n_locs = np.arange(1, 1000)
ground_truth_std = 10
ground_truth_variance = ground_truth_std**2
biased_variance = ground_truth_variance * (1 - 1 / n_locs)
biased_uncertainty = np.sqrt(biased_variance)
expected_uncertainty = biased_uncertainty / np.sqrt(n_locs)
expectation = pd.Series(data=expected_uncertainty, index=n_locs)
expectation;

loc_property = "uncertainty_x"
other_loc_property = "localization_count"

gpe = lc.GroupedPropertyExpectation(loc_property=loc_property, other_loc_property=other_loc_property, expectation=expectation).compute(locdata=clust)
gpe.results

<locan.analysis.grouped_property_expectation.GroupedPropertyExpectationResults at 0x7fd353e70310>

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
gpe.plot(ax=axes[0])
gpe.hist(ax=axes[1]);

../../_images/e72f6207bfdd43dfec80fafdc2947a6981cf4fd487d3292c03b746f5677df688.png

Cluster localizations by dbscan#

When clustering data by dbscan slight deviations appear between expectation and computed properties.

noise, clust = lc.cluster_dbscan(locdata, eps=20, min_samples=3)

Number of clusters: 725
Number of noise localizations: 415
Number of clustered localizations: 9965
Ratio cluster to noise localizations: 0.96

fig, ax = plt.subplots(nrows=1, ncols=1)
lc.render_2d_mpl(locdata, bin_size=20, bin_range=bin_range, cmap=m_gray.reversed())
clust.data.plot.scatter(x='position_x', y='position_y', ax=ax, color='Red', s=10, label='cluster centroids', xlim=bin_range[0], ylim=bin_range[1])
plt.show()

../../_images/af61daefb2f728ec86c71df19c6c55859efa3ceef4d277ec17ee62f376ea83eb.png

Investigate the position variances#

pve_biased = lc.PositionVarianceExpectation(loc_property="position_x", expectation=10**2, biased=True).compute(locdata=clust)
pve_biased.results

<locan.analysis.position_variance_expectation.PositionVarianceExpectationResults at 0x7fd353ac8c10>

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
pve_biased.plot(ax=axes[0])
pve_biased.hist(ax=axes[1]);

../../_images/aadf0aeb2ff192e58e67c8b586c0c3e04c13e06ed1359c6d9539ec6ca0795cae.png

pve = lc.PositionVarianceExpectation(loc_property="position_x", expectation=10**2, biased=False).compute(locdata=clust)
pve.results

<locan.analysis.position_variance_expectation.PositionVarianceExpectationResults at 0x7fd35e16b8d0>

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
pve.plot(ax=axes[0])
pve.hist(ax=axes[1], log=True);

../../_images/62740cf9534344d810daa622d442247981656dd46c9dc86bdcaf17d8169392df.png

Investigate the convex hull areas#

che = lc.ConvexHullExpectation(convex_hull_property='region_measure_ch', expected_variance=10**2).compute(locdata=clust)
che.results

<locan.analysis.convex_hull_expectation.ConvexHullExpectationResults at 0x7fd3539916d0>

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
che.plot(ax=axes[0])
che.hist(ax=axes[1]);

../../_images/29a70a473a7a2b85581a6a2aad6f1bf7513896e42aae067e706433f504d2a3fe.png

Investigate the uncertainties for cluster centroids#

n_locs = np.arange(1, 1000)
ground_truth_std = 10
ground_truth_variance = ground_truth_std**2
biased_variance = ground_truth_variance * (1 - 1 / n_locs)
biased_uncertainty = np.sqrt(biased_variance)
expected_uncertainty = biased_uncertainty / np.sqrt(n_locs)
expectation = pd.Series(data=expected_uncertainty, index=n_locs)
expectation;

loc_property = "uncertainty_x"
other_loc_property = "localization_count"

gpe = lc.GroupedPropertyExpectation(loc_property=loc_property, other_loc_property=other_loc_property, expectation=expectation).compute(locdata=clust)
gpe.results

<locan.analysis.grouped_property_expectation.GroupedPropertyExpectationResults at 0x7fd359c0c890>

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
gpe.plot(ax=axes[0])
gpe.hist(ax=axes[1]);

../../_images/10c5997b10cee6140c40fd74ef197e12801c23b2e5dd96fe2acebde220d87e55.png