kfcv.sizes = function(n, k=10) {
  # generate sample sizes for k-fold cross validation on a data set of
  # size n
  # author: Matthias C. M. Troffaes
  # date: 22 Nov 2010
  # license: GPLv3
  # usage:
  #
  #   kfcv.sizes(n, k=...)
  #
  sizes = c()
  for (i in 1:k) {
    first = 1 + (((i - 1) * n) %/% k)
    last = ((i * n) %/% k)
    sizes = append(sizes, last - first + 1)
  }
  sizes
}

kfcv.testing = function(n, k=10) {
  # generate testing sample indices for k-fold cross validation on a
  # data set of size n
  # author: Matthias C. M. Troffaes
  # date: 22 Nov 2010
  # license: GPLv3
  # usage:
  #
  #   kfcv.testing(n, k=...)
  #
  indices = list()
  sizes = kfcv.sizes(n, k=k)
  values = 1:n
  for (i in 1:k) {
    # take a random sample of given size
    s = sample(values, sizes[i])
    # append random sample to list of indices
    indices[[i]] = s
    # remove sample from values
    values = setdiff(values, s)
  }
  indices
}

kfcv.classifier = function(data, class, classifier, k=10) {
  # run k-fold cross validation with an arbitrary classifier
  # author: Matthias C. M. Troffaes
  # date: 25 Nov 2010
  # license: GPLv3
  # usage:
  #
  #   kfcv.classifier(data, class, classifier, k=...)
  #
  # where data is the data frame (each column is an attribute, and
  # each row is an observation), class is the column index for the
  # attribute to be predicted, and classifier is a function which
  # takes a training set, a test set, and a class column index
  result = list()
  # note: k must be passed on, otherwise the folds default to 10
  alltestingindices = kfcv.testing(dim(data)[1], k=k)
  for (i in 1:k) {
    testingindices = alltestingindices[[i]]
    train = data[-testingindices,]
    test = data[testingindices,]
    result[[i]] = classifier(train, test, class)
  }
  result
}

classifier.naivebayes = function(train, test, class) {
  # simple example of a classifier
  # requires library(e1071)
  model = naiveBayes(train[,-class], train[,class])
  # return the confusion matrix (predicted vs. actual) on the test set
  table(predict(model, test[,-class]), test[,class])
}

kfcv.sizes.test = function() {
  # test simple cases
  stopifnot(kfcv.sizes(10, k=2) == c(5, 5))
  stopifnot(kfcv.sizes(10, k=5) == c(2, 2, 2, 2, 2))
  stopifnot(kfcv.sizes(12, k=5) == c(2, 2, 3, 2, 3))
  # test that sum of sample sizes is total sample size
  for (k in 1:10) {
    for (n in 1:100) {
      sizes = kfcv.sizes(n, k=k)
      stopifnot(length(sizes) == k)
      stopifnot(sum(sizes) == n)
    }
  }
}

kfcv.testing.test = function() {
  # set seed so test is deterministic
  set.seed(10)
  # 3 fold sample from 10 indices
  indices = kfcv.testing(10, k=3)
  stopifnot(length(indices) == 3)
  stopifnot(indices[[1]] == c(6, 3, 4))
  stopifnot(indices[[2]] == c(8, 1, 2))
  stopifnot(indices[[3]] == c(7, 5, 10, 9))
}

kfcv.example = function() {
  # the data
  data = c("hello", "world", "dog", "cat", "fox", "rabbit",
           "gnome", "orc", "imp", "zombie", "vampire")
  # generate testing and training samples for 4-fold cross validation
  for (testingindices in kfcv.testing(length(data), k=4)) {
    testing = data[testingindices]
    training = data[-testingindices]
    print(testing)
    print(training)
    # print NULL as a separator between folds
    print(NULL)
  }
}

#kfcv.sizes.test()
#kfcv.testing.test()
#kfcv.example()
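
# a minimal usage sketch, assuming the e1071 package is installed: run
# 10-fold cross validation of the naive Bayes classifier on R's built-in
# iris data set, where column 5 (Species) is the class to be predicted;
# kfcv.classifier then returns one confusion matrix per fold
# (the function name kfcv.naivebayes.example is illustrative only)
kfcv.naivebayes.example = function() {
  library(e1071)
  results = kfcv.classifier(iris, 5, classifier.naivebayes, k=10)
  # print the confusion matrix (predicted vs. actual) of each fold
  for (i in seq_along(results)) print(results[[i]])
  invisible(results)
}

#kfcv.naivebayes.example()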