I'm trying to clusterize a dataset using k-means. When I run my algorithm with just one iteration returns random clusters, as it should, but when I try multiple iterations, it returns only 0s. The matrix I'm using is a 50k x 140 binary matrix. Each row represents a user and each column represents an item.
def clusterizator(matriz, nDeClusters, it=10): # matrix, number of clusters, number of iterations
nOfLines = matriz.shape[0] # number of lines (users)
nOfColumns = matriz.shape[1] # number of columns (items)
clusterCurrently = np.zeros((nOfLines, 1)) # currently cluster assigned to each user
listOfCurrentlyAssigneds = [] # list with numberOfClusters size, each element is a list of currently elements assigned to this cluster
clusterCentroid = [] # centroid of each cluster
clusterCentroid = np.random.randint(2, size=(nDeClusters, nOfColumns)) # starts with randoms centroids
for repeat in xrange(it): # number of iterations
listOfCurrentlyAssigneds = [[] for i in xrange(nDeClusters)] # create empty lists for each cluster
for i in xrange(nOfLines): # for each user
closestCentroid = clusterMaisProximo(matriz[i], clusterCentroid) # calculates the closest centroid
clusterCurrently[i] = closestCentroid # assign the user to closest centroid
listOfCurrentlyAssigneds[closestCentroid].append(matriz[i]) # put user on that centroid list
for i in xrange(nDeClusters): # for each cluster
if listOfCurrentlyAssigneds[i] != []: # if the list is not empty
clusterCentroid[i] = centeroidnp(listOfCurrentlyAssigneds[i]) # calculates the new centroid
return clusterCurrently # return 1-column matrix with user x cluster
def distanciaEucl(elemento1, elemento2):
return np.linalg.norm(elemento2-elemento1) #calculates the distance between to items (or one user and one cluster)
def clusterMaisProximo(elemento, listaDeClusters): # receive one user and the cluster's centroids list, return the closest one
closest = 0
closestDist = distanciaEucl(elemento, listaDeClusters[0]) # starts with the cluster[0]
for i in xrange(len(listaDeClusters)-1): # for each cluster
dist = distanciaEucl(elemento, listaDeClusters[i+1]) # get the distance to currently cluster's centroid
if dist < closestDist: # if it is closer to the element
closest = i+1 # update new closest element
closestDist = dist # update new closest distance
return closest # return closest
# from https://stackoverflow.com/questions/23020659/fastest-way-to-calculate-the-centroid-of-a-set-of-coordinate-tuples-in-python-wi
# by Retozi (adapted)
def centeroidnp(lista): # get a list of elements (number of elements x items)
shape = list(lista[0].shape)
shape[:0] = [len(lista)]
arr = np.concatenate(lista).reshape(shape) # get an array from the list
length = arr.shape[0]
somas = np.zeros(arr.shape[1])
for i in xrange(arr.shape[1]): # for each item (dimension)
somas[i] = (np.sum(arr[:, i]))/length # sum all elements and divide by number of elements
return somas # return array that will be the new centroid position
I commented everything to try to make clear what each row is doing, some comments are dumb because at first my variables were written in Portuguese, then I translated to make it more clear.
I'm running it like this:
clust = clusterizator(train, 10, 2)
Example matrix:
train = [[0, 1, 1, 0], [1, 0, 0, 0], [0, 1, 1, 1], [1, 0, 0, 1], [1, 0, 0, 0]]