Tutorial about analyzing grouped cluster properties#

Localization properties vary within clusters. Analyzing such variations can help to characterize cluster populations. Here, we show examples for variations in convex hull properties or coordinate variances.

from pathlib import Path

%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from colorcet import m_fire, m_gray, m_dimgray

import locan as lc
/tmp/ipykernel_1019/3459984724.py:6: DeprecationWarning: 
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
lc.show_versions(system=False, dependencies=False, verbose=False)
Locan:
   version: 0.20.0.dev41+g755b969

Python:
   version: 3.11.6

Synthetic data#

We simulate localization data that follows a Neyman-Scott distribution in 2D:

rng = np.random.default_rng(seed=1)
locdata = lc.simulate_dstorm(parent_intensity=1e-5, region=((0, 10_000), (0, 10_000)), cluster_mu=10, cluster_std=10, seed=rng)

locdata.print_summary()
Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.
identifier: "1"
comment: ""
source: SIMULATION
state: RAW
element_count: 10380
frame_count: 0
creation_time {
  2024-03-14T11:06:09.138715Z
}
bin_range=((0, 2_000), (0, 2_000))
lc.render_2d_mpl(locdata, bin_size=20, bin_range=bin_range)
<Axes: title={'center': 'counts'}, xlabel='position_x', ylabel='position_y'>
../../_images/b313e2d93c624ced26290d4eb450d08f3ad32ed41bf4c19bae1259abd43a0bf4.png

True clusters#

First we look at ground truth clusters.

grouped = locdata.data.groupby("cluster_label")
clust = lc.LocData.from_collection([lc.LocData.from_selection(locdata, indices=group.index) for name, group in grouped])

Filter out clusters with less than 3 localizations since on convex hull can be computed for such clusters.

clust_selection = lc.select_by_condition(clust, condition="2 < localization_count")
references_ = [clust.references[i] for i in clust_selection.indices]
clust_selection.reduce()
clust_selection.references = references_
clust = clust_selection
Hide code cell source
n_clustered_loc = np.sum([ref.properties['localization_count'] for ref in clust.references])
print(f"Number of clusters: {clust.properties['localization_count']}")
print(f"Number of clustered localizations: {n_clustered_loc}")
print(f"Ratio cluster to noise localizations: {n_clustered_loc / len(locdata):.3}")
Number of clusters: 771
Number of clustered localizations: 10172
Ratio cluster to noise localizations: 0.98
fig, ax = plt.subplots(nrows=1, ncols=1)
lc.render_2d_mpl(locdata, bin_size=20, bin_range=bin_range, cmap=m_gray.reversed())
clust.data.plot.scatter(x='position_x', y='position_y', ax=ax, color='Red', s=10, label='cluster centroids', xlim=bin_range[0], ylim=bin_range[1])
plt.show()
../../_images/c10487ced524925a6e3590304061faa1a4b0434cccf74c02b3848f38736b7dc4.png
clust.data.head()
localization_count position_x uncertainty_x position_y uncertainty_y region_measure_bb localization_density_bb subregion_measure_bb region_measure localization_density subregion_measure
0 24 1399.549521 1.877356 9539.052779 1.853978 1025.930045 0.023393 128.121678 100000000 2.400000e-07 40000
2 3 8303.079096 10.889812 4081.419445 3.852366 456.241155 0.006575 96.259339 100000000 3.000000e-08 40000
3 7 5507.141828 2.444467 224.387484 2.132102 336.278485 0.020816 73.989892 100000000 7.000000e-08 40000
4 4 7561.723474 3.515554 5385.880528 5.842642 443.103490 0.009027 87.512555 100000000 4.000000e-08 40000
5 7 3279.865197 5.284643 7918.206789 2.848332 774.964110 0.009033 117.649012 100000000 7.000000e-08 40000
clust.properties
{'localization_count': 771,
 'position_x': 5078.461816893804,
 'uncertainty_x': 138.598117645907,
 'position_y': 5571.31780554545,
 'uncertainty_y': 421.90449657593706,
 'region_measure_bb': 99518129.6386362,
 'localization_density_bb': 7.7473320971727e-06,
 'subregion_measure_bb': 39903.55763820844}

Investigate the convex hull areas#

Localization clusters can be analyzed with respect to their convex hull region properties as function of localization_count as outlined in Ebert et al. (https://doi:10.1093/bioinformatics/btac700).

che = lc.ConvexHullExpectation(convex_hull_property='region_measure_ch', expected_variance=10**2).compute(locdata=clust)
che.results
<locan.analysis.convex_hull_expectation.ConvexHullExpectationResults at 0x7fd35a649490>
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
che.plot(ax=axes[0])
che.hist(ax=axes[1]);
../../_images/a22120b471e42d814573ed357e500bf76fe54fbc652aef2026c3c6d1bea9a6b4.png

Investigate the position variances#

Localization coordinates in localization clusters come with a certain variance. The variance is related to the localization precision or other localization properties but also varies with localization_count if determined as biased sample variance (i.e. without Bessel’s correction).

pve_biased = lc.PositionVarianceExpectation(loc_property="position_x", expectation=10**2, biased=True).compute(locdata=clust)
pve_biased.results
<locan.analysis.position_variance_expectation.PositionVarianceExpectationResults at 0x7fd35a4af190>
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
pve_biased.plot(ax=axes[0])
pve_biased.hist(ax=axes[1]);
../../_images/1ebaf12cc1d4f48163371ba831c80edfc10d1f1b9d696751109f610561c6d693.png

A similar analysis can be performed with unbiased variances in which Bessel’s correction is applied.

pve = lc.PositionVarianceExpectation(loc_property="position_x", expectation=10**2, biased=False).compute(locdata=clust)
pve.results
<locan.analysis.position_variance_expectation.PositionVarianceExpectationResults at 0x7fd359e616d0>
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
pve.plot(ax=axes[0])
pve.hist(ax=axes[1], log=True);
../../_images/5b63a7af0b090f25c5a045b13b7f1f017a47edaada5a8621eb2e738e6778fbf4.png

Investigate any grouped property#

A similar analysis can be carried out with any LocData property. For instance, let’s check the coordinate uncertainties of cluster centroids. The uncertainty in one dimension should follow the square root of the biased position variance for clusters with variable number of localizations.

It is important to consider the differences between variance and standard deviation. Position uncertainties are usually given as standard deviation with units equal to position units. Converting the ground truth for the coordinate standard deviation in each cluster (std as used in the simulations above) requires a Bessel correction on the squared std being the variance). In addition the coordinate uncertainties of cluster centroids should scale with the inverse square root of the number of localizations per cluster.

n_locs = np.arange(1, 1000)
ground_truth_std = 10
ground_truth_variance = ground_truth_std**2
biased_variance = ground_truth_variance * (1 - 1 / n_locs)
biased_uncertainty = np.sqrt(biased_variance)
expected_uncertainty = biased_uncertainty / np.sqrt(n_locs)
expectation = pd.Series(data=expected_uncertainty, index=n_locs)
expectation;
loc_property = "uncertainty_x"
other_loc_property = "localization_count"
gpe = lc.GroupedPropertyExpectation(loc_property=loc_property, other_loc_property=other_loc_property, expectation=expectation).compute(locdata=clust)
gpe.results
<locan.analysis.grouped_property_expectation.GroupedPropertyExpectationResults at 0x7fd353e70310>
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
gpe.plot(ax=axes[0])
gpe.hist(ax=axes[1]);
../../_images/e72f6207bfdd43dfec80fafdc2947a6981cf4fd487d3292c03b746f5677df688.png

Cluster localizations by dbscan#

When clustering data by dbscan slight deviations appear between expectation and computed properties.

noise, clust = lc.cluster_dbscan(locdata, eps=20, min_samples=3)
Hide code cell source
n_clustered_loc = np.sum([ref.properties['localization_count'] for ref in clust.references])
print(f"Number of clusters: {clust.properties['localization_count']}")
print(f"Number of noise localizations: {noise.properties['localization_count']}")
print(f"Number of clustered localizations: {n_clustered_loc}")
print(f"Ratio cluster to noise localizations: {n_clustered_loc / (n_clustered_loc + noise.properties['localization_count']):.3}")
Number of clusters: 725
Number of noise localizations: 415
Number of clustered localizations: 9965
Ratio cluster to noise localizations: 0.96
fig, ax = plt.subplots(nrows=1, ncols=1)
lc.render_2d_mpl(locdata, bin_size=20, bin_range=bin_range, cmap=m_gray.reversed())
clust.data.plot.scatter(x='position_x', y='position_y', ax=ax, color='Red', s=10, label='cluster centroids', xlim=bin_range[0], ylim=bin_range[1])
plt.show()
../../_images/af61daefb2f728ec86c71df19c6c55859efa3ceef4d277ec17ee62f376ea83eb.png

Investigate the position variances#

pve_biased = lc.PositionVarianceExpectation(loc_property="position_x", expectation=10**2, biased=True).compute(locdata=clust)
pve_biased.results
<locan.analysis.position_variance_expectation.PositionVarianceExpectationResults at 0x7fd353ac8c10>
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
pve_biased.plot(ax=axes[0])
pve_biased.hist(ax=axes[1]);
../../_images/aadf0aeb2ff192e58e67c8b586c0c3e04c13e06ed1359c6d9539ec6ca0795cae.png
pve = lc.PositionVarianceExpectation(loc_property="position_x", expectation=10**2, biased=False).compute(locdata=clust)
pve.results
<locan.analysis.position_variance_expectation.PositionVarianceExpectationResults at 0x7fd35e16b8d0>
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
pve.plot(ax=axes[0])
pve.hist(ax=axes[1], log=True);
../../_images/62740cf9534344d810daa622d442247981656dd46c9dc86bdcaf17d8169392df.png

Investigate the convex hull areas#

che = lc.ConvexHullExpectation(convex_hull_property='region_measure_ch', expected_variance=10**2).compute(locdata=clust)
che.results
<locan.analysis.convex_hull_expectation.ConvexHullExpectationResults at 0x7fd3539916d0>
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
che.plot(ax=axes[0])
che.hist(ax=axes[1]);
../../_images/29a70a473a7a2b85581a6a2aad6f1bf7513896e42aae067e706433f504d2a3fe.png

Investigate the uncertainties for cluster centroids#

n_locs = np.arange(1, 1000)
ground_truth_std = 10
ground_truth_variance = ground_truth_std**2
biased_variance = ground_truth_variance * (1 - 1 / n_locs)
biased_uncertainty = np.sqrt(biased_variance)
expected_uncertainty = biased_uncertainty / np.sqrt(n_locs)
expectation = pd.Series(data=expected_uncertainty, index=n_locs)
expectation;
loc_property = "uncertainty_x"
other_loc_property = "localization_count"
gpe = lc.GroupedPropertyExpectation(loc_property=loc_property, other_loc_property=other_loc_property, expectation=expectation).compute(locdata=clust)
gpe.results
<locan.analysis.grouped_property_expectation.GroupedPropertyExpectationResults at 0x7fd359c0c890>
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
gpe.plot(ax=axes[0])
gpe.hist(ax=axes[1]);
../../_images/10c5997b10cee6140c40fd74ef197e12801c23b2e5dd96fe2acebde220d87e55.png