kfcv.sizes = function(n, k=10) {
  # generate sample sizes for k-fold cross validation on a data set of
  # size n
  # author: Matthias C. M. Troffaes
  # date: 22 Nov 2010
  # license: GPLv3
  # usage:
  #
  #   kfcv.sizes(n, k=...)
  #
  sizes = c()
  for (i in 1:k) {
    first = 1 + (((i - 1) * n) %/% k)
    last = ((i * n) %/% k)
    sizes = append(sizes, last - first + 1)
  }
  sizes
}

kfcv.testing = function(n, k=10) {
  # generate testing sample indices for k-fold cross validation on a
  # data set of size n
  # author: Matthias C. M. Troffaes
  # date: 22 Nov 2010
  # license: GPLv3
  # usage:
  #
  #   kfcv.testing(n, k=...)
  #
  indices = list()
  sizes = kfcv.sizes(n, k=k)
  values = 1:n
  for (i in 1:k) {
    # take a random sample of given size
    s = sample(values, sizes[i])
    # append random sample to list of indices
    indices[[i]] = s
    # remove sample from values
    values = setdiff(values, s)
  }
  indices
}

kfcv.classifier = function(data, class, classifier, k=10) {
  # run k-fold cross validation with an arbitrary classifier
  # author: Matthias C. M. Troffaes
  # date: 25 Nov 2010
  # license: GPLv3
  # usage:
  #
  #   kfcv.classifier(data, class, classifier, k=...)
  #
  # where data is the data frame (each column is an attribute, and
  # each row is an observation), class is the column index for the
  # attribute to be predicted, and classifier is a function which
  # takes a training set, a test set, and a class column index
  result = list()
  # note: k must be passed on, otherwise the folds default to 10
  alltestingindices = kfcv.testing(dim(data)[1], k=k)
  for (i in 1:k) {
    testingindices = alltestingindices[[i]]
    train = data[-testingindices,]
    test = data[testingindices,]
    result[[i]] = classifier(train, test, class)
  }
  result
}

classifier.naivebayes = function(train, test, class) {
  # simple example of a classifier
  # requires library(e1071)
  model = naiveBayes(train[,-class], train[,class])
  # return the confusion matrix (predicted vs. actual) on the test set
  table(predict(model, test[,-class]), test[,class])
}

kfcv.sizes.test = function() {
  # test simple cases
  stopifnot(kfcv.sizes(10, k=2) == c(5, 5))
  stopifnot(kfcv.sizes(10, k=5) == c(2, 2, 2, 2, 2))
  stopifnot(kfcv.sizes(12, k=5) == c(2, 2, 3, 2, 3))
  # test that sum of sample sizes is total sample size
  for (k in 1:10) {
    for (n in 1:100) {
      sizes = kfcv.sizes(n, k=k)
      stopifnot(length(sizes) == k)
      stopifnot(sum(sizes) == n)
    }
  }
}

kfcv.testing.test = function() {
  # set seed so test is deterministic
  set.seed(10)
  # 3 fold sample from 10 indices
  indices = kfcv.testing(10, k=3)
  stopifnot(length(indices) == 3)
  stopifnot(indices[[1]] == c(6, 3, 4))
  stopifnot(indices[[2]] == c(8, 1, 2))
  stopifnot(indices[[3]] == c(7, 5, 10, 9))
}

kfcv.example = function() {
  # the data
  data = c("hello", "world", "dog", "cat", "fox", "rabbit",
           "gnome", "orc", "imp", "zombie", "vampire")
  # generate testing and training samples for 4-fold cross validation
  for (testingindices in kfcv.testing(length(data), k=4)) {
    testing = data[testingindices]
    training = data[-testingindices]
    print(testing)
    print(training)
    # print NULL as a separator between folds
    print(NULL)
  }
}

#kfcv.sizes.test()
#kfcv.testing.test()
#kfcv.example()
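
# a minimal usage sketch, assuming the e1071 package is installed: run
# 10-fold cross validation of the naive Bayes classifier on R's built-in
# iris data set, where column 5 (Species) is the class to be predicted;
# kfcv.classifier then returns one confusion matrix per fold
# (the function name kfcv.naivebayes.example is illustrative only)
kfcv.naivebayes.example = function() {
  library(e1071)
  results = kfcv.classifier(iris, 5, classifier.naivebayes, k=10)
  # print the confusion matrix (predicted vs. actual) of each fold
  for (i in seq_along(results)) print(results[[i]])
  invisible(results)
}

#kfcv.naivebayes.example()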