name: geographic-clustering description: Grid-based geographic clustering with O(n) performance, medoid finding for map markers, and multi-factor risk scoring from event density, sentiment, and recency. license: MIT compatibility: TypeScript/JavaScript metadata: category: data-access time: 5h source: drift-masterguide

Geographic Clustering

Grid-based clustering with medoid finding and risk scoring from event density.

When to Use This Skill

Clustering thousands of geo-located events for visualization
Need O(n) performance instead of O(n²) distance-based clustering
Want actual data points as map markers (medoids, not synthetic centroids)
Calculating risk scores from cluster characteristics

Core Concepts

Grid-based clustering is much faster than distance-based algorithms:

O(n) grouping by grid cell vs O(n²) for distance-based
Medoid finding gives actual data point as center (better for map markers)
Multi-factor risk scoring combines density, sentiment, and recency

Implementation

TypeScript

interface ProcessedEvent {
  id: string;
  lat?: number;
  lon?: number;
  sentiment: number;
  seenDate: Date;
  sourceCountry: string;
}

interface Cluster {
  id: string;
  events: ProcessedEvent[];
  centroidLat: number;
  centroidLon: number;
  countryCode: string;
}

interface RiskPrediction {
  id: string;
  lat: number;
  lon: number;
  riskScore: number;
  sentiment: number;
  eventCount: number;
  topSignals: string[];
  summary: string;
  countryCode: string;
}

// Grid cell size in degrees (~100km at equator)
const GRID_SIZE = 1.0;

/**
 * Generate grid cell key for coordinates
 */
function getGridKey(lat: number, lon: number): string {
  const gridLat = Math.floor(lat / GRID_SIZE) * GRID_SIZE;
  const gridLon = Math.floor(lon / GRID_SIZE) * GRID_SIZE;
  return `${gridLat},${gridLon}`;
}

/**
 * Cluster events by geographic grid - O(n) complexity
 */
function clusterEvents(
  events: ProcessedEvent[], 
  minClusterSize: number = 3
): Cluster[] {
  const grid = new Map<string, ProcessedEvent[]>();

  // O(n) grouping by grid cell
  for (const event of events) {
    if (event.lat === undefined || event.lon === undefined) continue;
    
    const key = getGridKey(event.lat, event.lon);
    if (!grid.has(key)) {
      grid.set(key, []);
    }
    grid.get(key)!.push(event);
  }

  // Convert grid cells to clusters
  const clusters: Cluster[] = [];
  let clusterId = 1;

  for (const [key, clusterEvents] of grid) {
    if (clusterEvents.length < minClusterSize) continue;

    const centroidLat = clusterEvents.reduce((sum, e) => sum + (e.lat || 0), 0) 
                        / clusterEvents.length;
    const centroidLon = clusterEvents.reduce((sum, e) => sum + (e.lon || 0), 0) 
                        / clusterEvents.length;

    clusters.push({
      id: `cluster-${clusterId++}`,
      events: clusterEvents,
      centroidLat,
      centroidLon,
      countryCode: getDominantCountry(clusterEvents),
    });
  }

  return clusters;
}

/**
 * Find the medoid (most central actual point) in a cluster
 */
function findMedoid(events: ProcessedEvent[]): ProcessedEvent {
  if (events.length === 0) throw new Error('Cannot find medoid of empty cluster');
  if (events.length === 1) return events[0];

  const centroidLat = events.reduce((sum, e) => sum + (e.lat || 0), 0) / events.length;
  const centroidLon = events.reduce((sum, e) => sum + (e.lon || 0), 0) / events.length;

  let medoid = events[0];
  let minDist = Infinity;

  for (const event of events) {
    const dist = Math.hypot(
      (event.lat || 0) - centroidLat, 
      (event.lon || 0) - centroidLon
    );
    if (dist < minDist) {
      minDist = dist;
      medoid = event;
    }
  }

  return medoid;
}

/**
 * Get most common country code in cluster
 */
function getDominantCountry(events: ProcessedEvent[]): string {
  const counts: Record<string, number> = {};
  
  for (const event of events) {
    const country = event.sourceCountry || 'XX';
    counts[country] = (counts[country] || 0) + 1;
  }

  let maxCount = 0;
  let dominant = 'XX';
  
  for (const [country, count] of Object.entries(counts)) {
    if (count > maxCount) {
      maxCount = count;
      dominant = country;
    }
  }

  return dominant;
}

/**
 * Calculate risk score for a cluster (0-100)
 * Multi-factor: density + sentiment + recency
 */
function calculateRiskScore(cluster: Cluster): number {
  const eventCount = cluster.events.length;
  const avgSentiment = cluster.events.reduce((sum, e) => sum + e.sentiment, 0) 
                       / eventCount;

  // Factor 1: Event density (0-40 points)
  const eventFactor = Math.min(eventCount / 300, 1) * 40;

  // Factor 2: Sentiment (0-30 points)
  const sentimentFactor = Math.abs(Math.min(avgSentiment, 0)) * 30;

  // Factor 3: Recency (0-30 points)
  const now = Date.now();
  const avgAge = cluster.events.reduce((sum, e) => {
    return sum + (now - e.seenDate.getTime());
  }, 0) / eventCount;
  const hoursOld = avgAge / (1000 * 60 * 60);
  const recencyFactor = Math.max(0, 30 - hoursOld);

  const total = eventFactor + sentimentFactor + recencyFactor;
  return Math.round(Math.min(100, Math.max(0, total)));
}

/**
 * Convert clusters to risk predictions for API/UI
 */
function clustersToPredictions(clusters: Cluster[]): RiskPrediction[] {
  const predictions: RiskPrediction[] = [];

  for (const cluster of clusters) {
    const medoid = findMedoid(cluster.events);
    const riskScore = calculateRiskScore(cluster);
    const avgSentiment = cluster.events.reduce((sum, e) => sum + e.sentiment, 0) 
                         / cluster.events.length;

    const riskLevel = riskScore >= 80 ? 'Critical' 
                    : riskScore >= 60 ? 'High' 
                    : riskScore >= 40 ? 'Moderate' 
                    : 'Low';

    predictions.push({
      id: cluster.id,
      lat: medoid.lat || cluster.centroidLat,
      lon: medoid.lon || cluster.centroidLon,
      riskScore,
      sentiment: avgSentiment,
      eventCount: cluster.events.length,
      topSignals: generateSignals(cluster, riskScore),
      summary: `${riskLevel} risk in ${cluster.countryCode}: ${cluster.events.length} events`,
      countryCode: cluster.countryCode,
    });
  }

  return predictions.sort((a, b) => b.riskScore - a.riskScore);
}

function generateSignals(cluster: Cluster, riskScore: number): string[] {
  const signals: string[] = [];
  const avgSentiment = cluster.events.reduce((sum, e) => sum + e.sentiment, 0) 
                       / cluster.events.length;

  signals.push(`${cluster.events.length} events`);

  if (avgSentiment < -0.5) signals.push('Very negative tone');
  else if (avgSentiment < -0.2) signals.push('Negative tone');

  if (riskScore >= 80) signals.push('Critical risk');
  else if (riskScore >= 60) signals.push('High risk');

  return signals.slice(0, 3);
}

Usage Examples

Basic Clustering Pipeline

async function processEvents() {
  const events = await fetchEvents();
  
  // Cluster geographically (min 3 events per cluster)
  const clusters = clusterEvents(events, 3);
  
  // Convert to predictions for map
  const predictions = clustersToPredictions(clusters);
  
  // Return top 50 for visualization
  return predictions.slice(0, 50);
}

Best Practices

Grid-based O(n) - Much faster than distance-based for large datasets
Medoid vs Centroid - Use actual data point for map markers (won't be in ocean)
Multi-factor risk - Combine density, sentiment, and recency
Minimum cluster size - Filter noise with threshold (3+ events)
Configurable parameters - Make grid size and thresholds adjustable

Common Mistakes

Distance-based clustering on large datasets (O(n²) is too slow)
Synthetic centroids for map markers (may be in invalid locations)
Single-factor risk scoring (misses nuance)
No minimum cluster size (noise becomes clusters)
Hardcoded risk thresholds (should be configurable)

Related Patterns

deduplication - Deduplicate events before clustering
batch-processing - Process clusters efficiently
snapshot-aggregation - Aggregate cluster data for dashboards

Geographic Clustering

Skill Details

Repository Files

Geographic Clustering

When to Use This Skill

Core Concepts

Implementation

TypeScript

Usage Examples

Basic Clustering Pipeline

Best Practices

Common Mistakes

Related Patterns

Related Skills

Attack Tree Construction

Grafana Dashboards

Matplotlib

Scientific Visualization

Seaborn

Shap

Pydeseq2

Query Writing

Pydeseq2

Scientific Visualization

Skill Information