CS 431

Hierarchical Agglomerative Clustering Algorithm



// D = {x₁, ..., x_N} is a dataset of points.
// Returns M, the list of merges in the order they were performed
// (one entry per merge, N-1 entries in total).
// SIM(a, b) is the similarity measure. When an argument is a cluster,
// SIM denotes the cluster-level similarity (the linkage rule is not
// specified here).
AGGLOMERATIVE(D)

    // Initialization: pairwise similarity matrix and active-cluster flags
    for i = 1...N
        for j = 1...N
            C(i,j) ← SIM(x_i, x_j)
        end
        I(i) ← 1 // Indicates data point i is the "head" of a cluster
    end

    M ← {} // Keep track of merges

    for step = 1 ... N-1

        // Find the 2 most similar active clusters.
        // Start from -∞ (not 0) so that a pair is always selected, even
        // when every remaining similarity is zero or negative.
        maxSimilarity ← -∞
        maxPair ← null
        for i = 1...N
            for j = 1...N
                if i != j && I(i) == 1 && I(j) == 1
                    if C(i,j) > maxSimilarity
                        maxSimilarity ← C(i,j)
                        maxPair ← (i, j)
                    end
                end
            end
        end

        // Use the best pair found above, not the exhausted loop indices
        (a, b) ← maxPair

        M.append({a, b}) // We are merging clusters a and b

        // The a-th row and column now hold similarities for the new cluster.
        // A separate index k is used so the outer merge counter is untouched.
        for k = 1...N
            s ← SIM({a, b}, k) // Compute once, store symmetrically
            C(a,k) ← s
            C(k,a) ← s
        end

        I(b) ← 0 // Deactivate the "head" of the absorbed cluster
    end

    return M
end