Step 3 of 7
Louvain Clustering
Hierarchical clustering with interactive visualization
Hierarchical clustering with interactive visualization
Using Louvain community detection, we build a two-level hierarchy of parent clusters and subclusters from keyword similarity networks.
# Louvain Clustering Algorithm
# Hierarchical community detection with quality filtering
import networkx as nx
import numpy as np
import umap.umap_ as umap
from community import community_louvain
from sklearn.neighbors import NearestNeighbors
def build_graph(X, threshold, n_neighbors=10):
    """
    Build a similarity graph from embeddings.

    1) Find each sample's k nearest neighbours (Euclidean distance).
    2) Add an edge where the dot product of the two rows is >= threshold.

    Args:
        X: 2-D array of embeddings, one row per sample. The dot product used
            as the edge weight equals cosine similarity only when the rows
            are L2-normalised -- NOTE(review): confirm the caller normalises X.
        threshold: Minimum similarity (inclusive) for an edge to be kept.
        n_neighbors: Number of neighbours examined per sample. Capped at
            ``n_samples - 1`` so small inputs do not make the neighbour
            query fail.

    Returns:
        An undirected ``networkx.Graph`` with one node per row of ``X``
        (labelled ``0 .. n_samples - 1``) and a ``weight`` attribute on each
        edge holding the similarity.
    """
    n_samples = X.shape[0]
    G = nx.Graph()
    G.add_nodes_from(range(n_samples))
    if n_samples < 2:
        # Nothing to connect; also avoids querying more neighbours than exist.
        return G

    # One extra neighbour is requested because a point is normally returned
    # as its own nearest neighbour; never ask for more than the data holds.
    k = min(n_neighbors + 1, n_samples)
    nbrs = NearestNeighbors(n_neighbors=k, metric='euclidean').fit(X)
    _, indices = nbrs.kneighbors(X)

    for i in range(n_samples):
        # Drop the point itself explicitly instead of assuming it sits in
        # column 0: with duplicate rows the self-match may appear elsewhere,
        # which would otherwise create a self-loop and lose a real neighbour.
        neighbours = [int(j) for j in indices[i] if j != i][:k - 1]
        for j in neighbours:
            sim = float(np.dot(X[i], X[j]))
            if sim >= threshold:
                G.add_edge(i, j, weight=sim)
    return G
def run_louvain(G, random_state=None):
    """Run Louvain community detection.

    Args:
        G: Graph whose nodes are the integers ``0 .. n - 1`` (the labelling
            ``build_graph`` produces). Edge ``weight`` attributes are used
            as edge weights.
        random_state: Optional seed / ``RandomState`` forwarded to
            ``best_partition`` so the partition is reproducible. When left
            as ``None`` the call is made exactly as before.

    Returns:
        1-D integer NumPy array where entry ``i`` is the community id of
        node ``i``.
    """
    # Only forward the seed when one was given, so the default call stays
    # identical to the original regardless of the installed library version.
    extra = {} if random_state is None else {"random_state": random_state}
    partition = community_louvain.best_partition(G, weight='weight', **extra)

    n_nodes = G.number_of_nodes()
    # fromiter with an explicit dtype keeps the result integer-typed even
    # for an empty graph (np.array([]) would be float64).
    return np.fromiter(
        (partition[i] for i in range(n_nodes)), dtype=int, count=n_nodes
    )
def parent_clustering(X, threshold=0.73):
    """First level: assign every sample to a parent cluster.

    Builds the similarity graph over ``X`` at the given ``threshold`` and
    returns the Louvain community label of each sample.
    """
    similarity_graph = build_graph(X, threshold=threshold)
    return run_louvain(similarity_graph)
def subcluster_if_large(X, parent_labels, max_size=50, sub_threshold=0.83):
    """Second level: split oversized parent clusters into subclusters.

    Every parent cluster with more than ``max_size`` members is re-clustered
    on its own with the stricter ``sub_threshold``; each resulting
    subcluster receives a fresh label larger than any label already in use.
    Parents at or below ``max_size`` keep their original label.

    Args:
        X: Array of embeddings, indexable by an integer index array.
        parent_labels: Parent cluster label for each row of ``X``.
        max_size: Largest parent size that is left untouched.
        sub_threshold: Similarity threshold used when building the
            per-parent subgraph.

    Returns:
        A new label array (the input is not modified) with the final
        cluster label for each row.
    """
    # Keep the original labels separate from the array being rewritten so
    # membership masks never depend on labels assigned during this loop.
    parents = np.asarray(parent_labels)
    article_labels = np.array(parents, copy=True)
    if article_labels.size == 0:
        # .max() raises on an empty array; there is nothing to subcluster.
        return article_labels

    current_max_label = article_labels.max()
    for p_lbl in np.unique(parents):
        idxs = np.flatnonzero(parents == p_lbl)
        if len(idxs) <= max_size:
            continue

        # Subcluster with the stricter threshold.
        G_sub = build_graph(X[idxs], threshold=sub_threshold)
        sub_labs = run_louvain(G_sub)

        # Give every subcluster a brand-new global label.
        for sub_lab in np.unique(sub_labs):
            current_max_label += 1
            article_labels[idxs[sub_labs == sub_lab]] = current_max_label
    return article_labels
def visualize_clusters(X, labels):
    """Project ``X`` to two dimensions with UMAP for plotting.

    ``labels`` is accepted for interface compatibility but is not used by
    the projection itself.
    """
    projector = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1)
    return projector.fit_transform(X)
# Backend endpoint will be added later...