// Copyright 2015, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Hannah Bast <bast@cs.uni-freiburg.de>.

// NOTE: this file contains specifications and design suggestions in
// pseudo-code. It is not supposed to be compilable in any language. The
// specifications are mandatory, the design suggestions are not.
// Matrix dimensions are a suggestion, you can, e.g., use an m x k matrix
// instead of a k x n matrix as long as your algebra works out.

// Class for k-means clustering of documents, based on a simple inverted index.
class Kmeans:

  // Build inverted index with BM25 scores from given file. Just re-use your
  // method from ES8 (which, in turn, was just a slight extension of the method
  // from ES2).
  void build_inverted_index(String file_name)

  // Build term-document matrix from inverted index. Again, just re-use your
  // method from ES8. You can store the term-document matrix as a member of this
  // object and re-use it throughout the next methods where necessary.
  void build_td_matrix()

  // Cluster into k clusters using k-means and return the k final centroids. Use
  // the auxiliary functions below. In particular, make sure that you use
  // matrix/vector operations wherever possible.
  Matrix k_means(int k)

  // Auxiliary functions for the k-means implementation. In the following, k is
  // always the number of clusters, n is the number of documents, and m is the
  // number of terms. The types "Matrix" and "Vector" are just placeholders for
  // whatever data structure you use to store a matrix / vector.

  // Compute an m x k matrix with the initial (random) centroids. Make sure that
  // no two centroids are the same.
  Matrix intitialize_centroids(int k)

  // Compute a k x n matrix such that the entry at i, j contains the distance
  // between the i-th centroid and the j-th document. If the centroids and the
  // documents are L2-normalized to 1, this can (and should) be done with a
  // single matrix operation.
  Matrix compute_distances(Matrix docs, Matrix centroids)

  // Assign each document to its closest centroid. Return a k x n matrix such
  // that the entry at i, j is 1 if document j is closest to centroid i, and 0
  // otherwise. Understand that the matrix must contain exactly one 1 in each
  // column (n 1s altogether). Have a look at the np.argmin function to
  // implement this efficiently without iterating over the matrix.
  Matrix compute_assignment(Matrix distances)

  // Compute an m x k matrix with new centroids. Each centroid should be
  // the average of all the documents assigned to it in the given assignment.
  // This can also be done with a single matrix operation.
  Matrix compute_centroids(Matrix docs, Matrix assignment)

  // Functions to L2-normalize the rows / columns of the given matrix. You can
  // write this yourself (it's easy) or use a built-in function if available.
  // At the end of this file we show three functions to help you with that for
  // Python.
  void normalize_rows(Matrix matrix)
  void normalize_cols(Matrix matrix)


// Main program:
//
// 1. Arguments: <records> <k>
// 2. Construct inverted index from given file
// 3. Build term-document matrix
// 4. Run k-means with given k
// 5. Print the top-10 terms of each cluster.

void main


// Python code to normalize rows of a dense and sparse matrix.

def norm_row_l2(matrix):
    """L2-normalize the rows of a dense matrix in place.

    Every row is divided by its Euclidean length, so that each non-zero row
    has length 1 afterwards. Rows that consist only of zeros are left
    unchanged instead of being turned into NaNs by a division by zero.

    Args:
        matrix: 2-dimensional np.matrix or np.ndarray with a floating-point
            dtype. It is modified in place.

    Returns:
        None.

    >>> m = np.matrix([[1, 2], [2, 3]], dtype=float)
    >>> norm_row_l2(m)
    >>> np.allclose(m, [[0.4472136, 0.89442719], [0.5547002, 0.83205029]])
    True
    >>> z = np.array([[0.0, 0.0], [3.0, 4.0]])
    >>> norm_row_l2(z)
    >>> z.tolist()
    [[0.0, 0.0], [0.6, 0.8]]
    """
    squared = np.multiply(matrix, matrix)
    # np.asarray(...).ravel() gives a flat vector both for np.matrix (whose
    # row sums form an n x 1 matrix) and for np.ndarray (already flat).
    row_norms = np.sqrt(np.asarray(squared.sum(axis=1)).ravel())
    # Divide all-zero rows by 1 instead of 0, so that they stay zero.
    row_norms[row_norms == 0] = 1.0
    matrix /= row_norms[:, None]


def norm_sp_row_l2(matrix):
    """L2-normalize the rows of a sparse matrix in place.

    Every stored entry is divided by the Euclidean length of its row. Rows
    without non-zero entries are left unchanged.

    For a matrix in CSR format the stored values are scaled directly, which
    avoids building index arrays for all non-zero entries. Any other sparse
    format falls back to element-wise indexed assignment.

    Args:
        matrix: scipy sparse matrix (typically a csr_matrix) with a
            floating-point dtype. It is modified in place.

    Returns:
        None.

    >>> m = np.matrix([[0, 1, 2], [0, 2, 3]], dtype=float)
    >>> m = csr_matrix(m)
    >>> norm_sp_row_l2(m)
    >>> expected = [[0.0, 0.4472136, 0.89442719], [0.0, 0.5547002, 0.83205029]]
    >>> np.allclose(m.toarray(), expected)
    True
    """
    squared = matrix.multiply(matrix)
    # ravel() copes with row sums that come back as an n x 1 np.matrix
    # (sparse matrices) as well as with flat arrays (sparse arrays).
    row_norms = np.sqrt(np.asarray(squared.sum(axis=1)).ravel())
    if getattr(matrix, "format", None) == "csr":
        # A zero norm means that every stored value in the row is zero;
        # divide by 1 so that explicitly stored zeros do not become NaN.
        row_norms[row_norms == 0] = 1.0
        # In CSR, the stored values of row i occupy
        # data[indptr[i]:indptr[i + 1]], so repeating each norm once per
        # stored value lines it up with matrix.data.
        matrix.data /= np.repeat(row_norms, np.diff(matrix.indptr))
    else:
        row_indices, col_indices = matrix.nonzero()
        matrix[row_indices, col_indices] /= row_norms[row_indices]

def norm_sp_row_l1(matrix):
    """L1-normalize the rows of a sparse matrix in place.

    Every stored entry is divided by the L1 norm of its row, that is, by the
    sum of the absolute values in that row. Using absolute values keeps the
    result correct for rows that contain negative entries (a plain row sum
    could be too small or even zero there). Rows without non-zero entries
    are left unchanged.

    For a matrix in CSR format the stored values are scaled directly; any
    other sparse format falls back to element-wise indexed assignment.

    Args:
        matrix: scipy sparse matrix (typically a csr_matrix) with a
            floating-point dtype. It is modified in place.

    Returns:
        None.

    >>> m = np.matrix([[1, 2], [3, 3]], dtype=float)
    >>> m = csr_matrix(m)
    >>> norm_sp_row_l1(m)
    >>> np.allclose(m.toarray(), [[1 / 3, 2 / 3], [0.5, 0.5]])
    True
    """
    # tocsr() makes abs() available for every sparse format; ravel() copes
    # with row sums returned as an n x 1 np.matrix or as a flat array.
    row_norms = np.asarray(abs(matrix.tocsr()).sum(axis=1)).ravel()
    if getattr(matrix, "format", None) == "csr":
        # A zero norm means that every stored value in the row is zero;
        # divide by 1 so that explicitly stored zeros do not become NaN.
        row_norms[row_norms == 0] = 1
        # In CSR, the stored values of row i occupy
        # data[indptr[i]:indptr[i + 1]], so repeating each norm once per
        # stored value lines it up with matrix.data.
        matrix.data /= np.repeat(row_norms, np.diff(matrix.indptr))
    else:
        row_indices, col_indices = matrix.nonzero()
        matrix[row_indices, col_indices] /= row_norms[row_indices]