// Copyright 2014, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Hannah Bast.

// NOTE: this is a code design suggestion in pseudo-code. It is not supposed to
// be compilable in any language. You have to translate it to Java or C++
// yourself.

// A class implementing Naive Bayes learning and prediction for text documents.
class NaiveBayes {

  // PUBLIC MEMBERS.

  // Read documents from a text file with one document per line. The first
  // column in each line is the class label (a string). After that comes a tab
  // and the text of the document, with words separated by whitespace, and no
  // newline. The result is stored in the member variables documentWordIds,
  // documentClassIds, wordIds, and classIds; see below.
  void readDocumentsAndLabels(String fileName);

  // Learn prior probabilities from the documents with the given ids. The ids
  // refer to the members documentWordIds and documentClassIds below. The
  // counts needed to compute these probabilities are stored in the member
  // variables nc and nwc; see below (or, alternatively, the probabilities pc
  // and pwc themselves; see Implementation Note 2 below).
  void train(Array<int> documentIds);

  // Predict classes for the documents with the given ids. For each of these
  // documents return the (id of the) class label with the highest probability.
  Array<int> predict(Array<int> documentIds);

  // PRIVATE MEMBERS.

  // For each document, the ids of the words it contains. Note: if a document
  // contains a word multiple times, the respective word id is contained in the
  // Array multiple times.
  Array<Array<int>> documentWordIds;

  // For each document, the id of its class. Note: for this exercise, there is
  // only one class per document. In principle, Naive Bayes could also deal
  // with multiple class labels per object / document.
  Array<int> documentClassIds;

  // For each word, its word id. Note: this is not strictly needed, but as
  // usual, it is more convenient, and also more efficient, to work with word
  // ids.
  Map<String, int> wordIds;

  // For each class, its class id. The same comment as for wordIds above
  // applies.
  Map<String, int> classIds;

  // The counts from the training set, computed by the method train above.
  //
  // IMPLEMENTATION NOTE 1: Since many of the nwc values are zero, you might
  // also want to store the non-zero counts in a sparse representation. Then
  // store ncw instead of nwc (it's the same thing, just with rows and columns
  // switched), and use an Array<Array<Pair<int, int>>>, that is, one array of
  // (word id, count) pairs per class.
  //
  // IMPLEMENTATION NOTE 2: Instead of nc and nwc, you may also want to store
  // the prior probabilities pc and pwc, which are computed from nc and nwc.
  // Since we conceptually add each word once to each class, these prior
  // probabilities are always dense, however, so the trick from the note above
  // will not work then.
  Array<int> nc;
  Array<Array<int>> nwc;
}
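
// For illustration only: a minimal, self-contained Java sketch of how the core
// of train() and predict() might look, assuming the member layout above and
// add-one smoothing for pwc (the "conceptually add each word once to each
// class" from Implementation Note 2). All names here (NaiveBayesSketch,
// numWords, numClasses, logPc, logPwc, totalWordsInClass) are made up for this
// sketch; it is one possible translation, not the required implementation.
//
// import java.util.ArrayList;
// import java.util.List;
//
// class NaiveBayesSketch {
//   List<List<Integer>> documentWordIds = new ArrayList<>();
//   List<Integer> documentClassIds = new ArrayList<>();
//   int numWords;    // size of the word vocabulary
//   int numClasses;  // number of distinct class labels
//
//   double[] logPc;     // log prior probability of each class
//   double[][] logPwc;  // log probability of word w given class c
//
//   // Compute nc and nwc from the given documents, then turn them into
//   // (smoothed) log probabilities logPc and logPwc.
//   void train(List<Integer> documentIds) {
//     int[] nc = new int[numClasses];
//     int[][] nwc = new int[numWords][numClasses];
//     int[] totalWordsInClass = new int[numClasses];
//     for (int docId : documentIds) {
//       int c = documentClassIds.get(docId);
//       nc[c] += 1;
//       for (int w : documentWordIds.get(docId)) {
//         nwc[w][c] += 1;
//         totalWordsInClass[c] += 1;
//       }
//     }
//     logPc = new double[numClasses];
//     logPwc = new double[numWords][numClasses];
//     for (int c = 0; c < numClasses; c++) {
//       logPc[c] = Math.log((double) nc[c] / documentIds.size());
//       for (int w = 0; w < numWords; w++) {
//         // Add-one smoothing: each word is counted once extra per class.
//         logPwc[w][c] = Math.log((double) (nwc[w][c] + 1)
//             / (totalWordsInClass[c] + numWords));
//       }
//     }
//   }
//
//   // For each document, return the class id c maximizing
//   // log p(c) + sum over words w in the document of log p(w | c).
//   List<Integer> predict(List<Integer> documentIds) {
//     List<Integer> result = new ArrayList<>();
//     for (int docId : documentIds) {
//       int bestClass = 0;
//       double bestLogProb = Double.NEGATIVE_INFINITY;
//       for (int c = 0; c < numClasses; c++) {
//         double logProb = logPc[c];
//         for (int w : documentWordIds.get(docId)) {
//           logProb += logPwc[w][c];
//         }
//         if (logProb > bestLogProb) {
//           bestLogProb = logProb;
//           bestClass = c;
//         }
//       }
//       result.add(bestClass);
//     }
//     return result;
//   }
// }
//
// Working with log probabilities avoids numerical underflow when multiplying
// many small per-word probabilities; this is a common choice, not a
// requirement of the design above.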