Step 3 of 7
Louvain Clustering
Hierarchical clustering with interactive visualization
Hierarchical clustering with interactive visualization
Using Louvain community detection, we build a two-level hierarchy of parent clusters and subclusters from keyword similarity networks.
# Louvain Clustering Algorithm
# Hierarchical community detection with quality filtering
import networkx as nx
import numpy as np
import umap.umap_ as umap
from community import community_louvain
from sklearn.neighbors import NearestNeighbors


def build_graph(X, threshold, n_neighbors=10):
    """Build a weighted similarity graph from row embeddings.

    1) Find the nearest neighbours of every row (Euclidean metric).
    2) Add an edge to each neighbour whose dot product with the row is
       at least ``threshold``; the dot product is stored as the edge weight.

    Args:
        X: 2-D array of embeddings, one row per sample.
        threshold: Minimum dot-product similarity for an edge to be kept.
        n_neighbors: Maximum number of neighbours considered per sample.

    Returns:
        An undirected ``networkx.Graph`` whose nodes are ``0 .. n_samples-1``.
        Samples without a sufficiently similar neighbour stay isolated.
    """
    n_samples = X.shape[0]

    G = nx.Graph()
    G.add_nodes_from(range(n_samples))
    if n_samples < 2:
        # Nothing to connect; also avoids fitting a neighbour index on
        # an empty or single-row matrix.
        return G

    # Ask for one extra neighbour because a point is normally returned as
    # its own nearest neighbour, but never more than the samples available
    # (kneighbors raises ValueError otherwise).
    k = min(n_neighbors + 1, n_samples)
    nbrs = NearestNeighbors(n_neighbors=k, metric='euclidean').fit(X)
    _, indices = nbrs.kneighbors(X)

    for i in range(n_samples):
        # Drop the point itself explicitly rather than assuming it sits in
        # column 0: with duplicate rows the self-match may appear elsewhere.
        neighbours = [int(j) for j in indices[i] if j != i][:n_neighbors]
        for j in neighbours:
            # NOTE(review): this is a plain dot product; it equals cosine
            # similarity only if the rows of X are L2-normalised -- confirm
            # against the caller.
            sim = float(np.dot(X[i], X[j]))
            if sim >= threshold:
                G.add_edge(i, j, weight=sim)

    return G


def run_louvain(G):
    """Run Louvain community detection on ``G``.

    Args:
        G: Graph whose nodes are the integers ``0 .. n-1`` (as produced by
            ``build_graph``); edge attribute ``weight`` is used as the weight.

    Returns:
        Integer array of length ``G.number_of_nodes()`` where entry ``i`` is
        the community id assigned to node ``i``.
    """
    partition = community_louvain.best_partition(G, weight='weight')
    # dtype=int keeps the result an integer array even for an empty graph.
    return np.array(
        [partition[i] for i in range(G.number_of_nodes())], dtype=int
    )


def parent_clustering(X, threshold=0.73):
    """First level: compute parent cluster labels for every row of ``X``.

    Builds the similarity graph with ``threshold`` and returns the Louvain
    community label of each sample.
    """
    G_parent = build_graph(X, threshold=threshold)
    parent_labels = run_louvain(G_parent)
    return parent_labels


def subcluster_if_large(X, parent_labels, max_size=50, sub_threshold=0.83):
    """Second level: split parent clusters larger than ``max_size``.

    Each oversized parent is re-clustered on its own rows with the stricter
    ``sub_threshold``. Every resulting subcluster receives a fresh label
    greater than any label used so far, so the original parent label no
    longer appears for those samples (even if only one subcluster is found).
    Parents with at most ``max_size`` members keep their label.

    Args:
        X: 2-D array of embeddings, one row per sample.
        parent_labels: Sequence or array of parent labels, aligned with the
            rows of ``X``.
        max_size: Parents with more members than this are subclustered.
        sub_threshold: Similarity threshold used inside a parent.

    Returns:
        New array of per-sample labels; ``parent_labels`` is not modified.
    """
    # Accept plain lists as well: comparing a list with ``==`` would yield a
    # single bool instead of an element-wise mask.
    parent_labels = np.asarray(parent_labels)
    article_labels = np.array(parent_labels, copy=True)
    if article_labels.size == 0:
        # ``max()`` of an empty array raises; there is nothing to split.
        return article_labels

    current_max_label = article_labels.max()

    for p_lbl in np.unique(parent_labels):
        idxs = np.where(parent_labels == p_lbl)[0]
        if len(idxs) <= max_size:
            continue

        # Subcluster with stricter threshold
        X_sub = X[idxs]
        G_sub = build_graph(X_sub, threshold=sub_threshold)
        sub_labs = run_louvain(G_sub)

        # Assign new, globally unique labels to each subcluster
        for sub_lab in np.unique(sub_labs):
            current_max_label += 1
            mask = sub_labs == sub_lab
            article_labels[idxs[mask]] = current_max_label

    return article_labels


def visualize_clusters(X, labels):
    """Project ``X`` to two dimensions with UMAP for visualization.

    Args:
        X: 2-D array of embeddings, one row per sample.
        labels: Cluster labels; accepted for interface symmetry but not used
            by this function.

    Returns:
        The 2-D embedding returned by ``UMAP.fit_transform``.
    """
    reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2)
    embedding_2d = reducer.fit_transform(X)
    return embedding_2d


# Backend endpoint will be added later...