Last active
April 29, 2024 11:35
-
-
Save robroc/24929ab015e4a0929bd358293fd1926d to your computer and use it in GitHub Desktop.
An example of clustering points on a map with HDBSCAN and using membership-weighted averages of the coordinates to find each cluster's center
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Thanks to Leland McInnes (@leland_mcinnes) for devising the weighting solution
import hdbscan
# https://hdbscan.readthedocs.io/en/latest/index.html
import numpy as np

# Assume df is a pandas dataframe with latitude and longitude columns in WGS84.

# Define a cluster selection epsilon to group points within this distance.
# In this case, it's 10 meters. The coordinates are clustered as angles in
# radians, so the distance (0.01 km) is divided by the radius of the Earth
# (6378 km) to express it as an angle.
eps = 0.01 / 6378

# Initiate HDBSCAN clusterer
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=200,
    metric='haversine',
    cluster_selection_epsilon=eps,
)

# Convert coordinates to radians for distance-based grouping and cluster them.
# Column order is (latitude, longitude).
clusterer.fit(np.radians(df[['latitude', 'longitude']]).to_numpy())

# Assign cluster labels to each point
df['clusters'] = clusterer.labels_

# Assign the membership strength to each point. The closer a point is to a
# high-density area, the stronger the membership.
# This will be the weighting factor.
df['membership_strength'] = clusterer.probabilities_


def weighted_mean(x):
    """Return the mean of *x* weighted by each point's membership strength.

    *x* is one column of one group (a pandas Series); its index is used to
    look up the matching rows of ``df['membership_strength']``.
    """
    weights = df.loc[x.index, 'membership_strength']
    return np.average(x, weights=weights)


# Exclude points with label -1. These are points that weren't assigned to a
# cluster and have membership strength of 0.
# Then group them by cluster labels and use the weighted average of the
# coordinates to find the cluster center.
# df has no ready-made cluster_size column to sum, so the size of each cluster
# is obtained by counting the rows in its group.
clustered_zones = (
    df.query('clusters != -1')
    .groupby('clusters')
    .agg(
        cluster_size=('latitude', 'size'),
        longitude=('longitude', weighted_mean),
        latitude=('latitude', weighted_mean),
    )
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment