Samuel Bosch

Warning this is more of a code dump instead of a blogpost but I'm still putting it out there hoping it's still useful for someone else as it has been sitting in my draft folder for far too long. Anyways the paper by Robert Hijmans is really worth reading if you're interested in the evaluation of species distribution models. If you have questions, don't hesitate to contact me.

Geographic null model from paper by Robert Hijsmans (link to paper).
package dismo (maintained by Robert J. Hijmans). and the package BIOMOD2.

Load libraries:

library("biomod2")
library("dismo")

The null_model_evaluation function takes as input a BIOMOD2 model.

null_model_evaluation <- function(models) {
  ## flow
  ## 1) create geographic null model from the points used as training data
  ## 2) compare AUC of the model with AUC of the geographic null model (use same presence/absence data)
  
  ## extract test data from models
  data <- get_formal_data(models)
  calib.lines <- get(load(models@calib.lines@link))
  nbOfRuns <- (length(calib.lines) / length(data@data.species))
  calib.lines.mat <- as.matrix(calib.lines, nrow=length(data@data.species), ncol=nbOfRuns)

  evaluations <- get_evaluations(models)
  dims <- dimnames(evaluations)
  rocIndex <- which(dims[1][[1]] == "ROC")
  testingDataIndex <- which( dims[2][[1]] == "Testing.data")
  
  species.data <- data@coord[which(data@data.species == 1),]
  background.data <- data@coord[is.na(data@data.species),]
  
  result <- array(dim=c(nbOfRuns, 2, length(dims[3][[1]])), dimnames=list(dims[4][[1]],c("model", "geo_null_model"), dims[3][[1]]))
  
  ## calculate geographic null models
  for( methodIndex in 1:length(dims[3][[1]])){
    for (runIndex in 1:nbOfRuns){
      model.auc <- evaluations[rocIndex, testingDataIndex,methodIndex,runIndex,1]
      result[runIndex,1,methodIndex] <- model.auc
      
      train.species.data <- species.data[calib.lines[1:nrow(species.data),runIndex],]
      test.species.data <- species.data[!calib.lines[1:nrow(species.data),runIndex],]
      if (nrow(test.species.data) > 0 & !is.na(model.auc)){
        train.background.data <- background.data[calib.lines[nrow(species.data)+1:nrow(calib.lines)-nrow(species.data),runIndex],]
        test.background.data <- background.data[!calib.lines[nrow(species.data)+1:nrow(calib.lines)-nrow(species.data),runIndex],]
        gd <- geoDist(train.species.data, lonlat=TRUE)
        #p <- predict(gd, env)
        #plot(p)
        #points(train.background.data, pch=16, col="purple")
        #points(test.background.data, pch=16, col="blue")
        #points(train.species.data, pch=16, col="green")
        #points(test.species.data, pch=16, col="red")
        gd.evaluation <- evaluate(gd, p=test.species.data, a=test.background.data)
        result[runIndex,2,methodIndex] <- gd.evaluation@auc
      }
    }
  }
  result
}

For my MarineSPEED dataset of species I'm trying to create non-random cross-validation datasets. In order to do this I was looking at the pwdSample and ssb functions from the dismo package. pwdSample calculates a pair-wise distance sampling and ssb calculates the spatial sorting bias. More information about these functions can be found in the excellent paper by Robert Hijmans (2012): "Cross-validation of species distribution models: removing spatial sorting bias and calibration with a null model".

Problem is that both functions calculate a distance matrix between two sets of points which starts to fail when the number of points is (very) large. To fix this I re-implemented the functions replacing the matrix calculation and subsequent row minimum finding with the same functionality but using a divide and conquer strategy by calculating distance matrices on subsets, taking the min distance and then taking the min distances of these min distances to get row based minima.

In other words I replaced:

fromd <- apply(distfun(fixed, reference), 1, min)

with:

mindist <- function(distfun, a, b) {
  partition_count <- (1 %/% (100 / NROW(b))) + 1
  parts <- dismo::kfold(x=b, k=partition_count)
  r <- c()
  for(i in 1:partition_count) {
    mind <- apply(distfun(a, b[parts==i,]), 1, min)
    r <- cbind(r, mind)
  }
  apply(r, 1, min)
}
  
fromd <- mindist(distfun, fixed, reference)

The full pwdSample function then becomes:

pwdSample_robust <- function(fixed, sample, reference, tr = 0.33, nearest= TRUE, n=1, lonlat = TRUE, warn = TRUE) {
  ## DIVIDE reference, calculate distance matrix, get min, COMBINE: take min of the min, do rest of the pwdSample logic
  distHaversine <- function(p1, p2) {
    r <- 6378137
    toRad <- pi/180
    p1 <- p1 * toRad
    p2 <- p2 * toRad
    p <- cbind(p1[, 1], p1[, 2], p2[, 1], p2[, 2])
    dLat <- (p[, 4] - p[, 2])
    dLon <- (p[, 3] - p[, 1])
    a <- sin(dLat/2) * sin(dLat/2) + cos(p[, 2]) * cos(p[, 
                                                         4]) * sin(dLon/2) * sin(dLon/2)
    dist <- 2 * atan2(sqrt(a), sqrt(1 - a)) * r
    as.vector(dist)
  }
  distGeo <- function(x, y) {
    n <- nrow(x)
    m <- nrow(y)
    dm <- matrix(ncol = m, nrow = n)
    for (i in 1:n) {
      dm[i, ] <- distHaversine(x[i, , drop = FALSE], y)
    }
    return(dm)
  }
  distPlane <- function(x, y) {
    dfun <- function(x, y) {
      sqrt((x[, 1] - y[, 1])^2 + (x[, 2] - y[, 2])^2)
    }
    n = nrow(x)
    m = nrow(y)
    dm = matrix(ncol = m, nrow = n)
    for (i in 1:n) {
      dm[i, ] = dfun(x[i, , drop = FALSE], y)
    }
    return(dm)
  }
  if (lonlat) {
    distfun <- distGeo
  }
  else {
    distfun <- distPlane
  }
  stopifnot(tr > 0)
  n <- round(n)
  stopifnot(n >= 1)
  if (inherits(fixed, "SpatialPoints")) 
    fixed <- coordinates(fixed)
  if (inherits(sample, "SpatialPoints")) 
    sample <- coordinates(sample)
  if (inherits(reference, "SpatialPoints")) 
    reference <- coordinates(reference)
  fixed <- as.matrix(fixed)[, 1:2]
  sample <- as.matrix(sample)[, 1:2]
  reference <- as.matrix(reference)[, 1:2]
  if (warn) {
    if (nrow(sample) < nrow(fixed)) {
      warning("nrow(sample) < nrow(fixed)")
    }
  }
  
  mindist <- function(distfun, a, b) {
    partition_count <- (1 %/% (100 / NROW(b))) + 1
    parts <- dismo::kfold(x=b, k=partition_count)
    r <- c()
    for(i in 1:partition_count) {
      mind <- apply(distfun(a, b[parts==i,]), 1, min)
      r <- cbind(r, mind)
    }
    apply(r, 1, min)
  }
  
  fromd <- mindist(distfun, fixed, reference) ##apply(distfun(fixed, reference), 1, min)
  tod <- mindist(distfun, sample, reference) ##apply(distfun(sample, reference), 1, min)
  ngb <- matrix(NA, nrow = length(fromd), ncol = n)
  iter <- sample(1:nrow(fixed))
  for (j in 1:n) {
    for (i in iter) {
      d <- abs(tod - fromd[i])
      if (min(d) < (tr * fromd[i])) {
        if (nearest) {
          x <- which.min(d)
        }
        else {
          x <- sample(which(d < (tr * fromd[i])), size = 1)
        }
        ngb[i, j] <- x
        tod[x] <- Inf
      }
    }
  }
  return(ngb)
}

and the ssb function then looks like this:

ssb <- function(p, a, reference, lonlat = TRUE, avg = TRUE) {
  distHaversine <- function(p1, p2) {
    r <- 6378137
    toRad <- pi/180
    p1 <- p1 * toRad
    p2 <- p2 * toRad
    p <- cbind(p1[, 1], p1[, 2], p2[, 1], p2[, 2])
    dLat <- (p[, 4] - p[, 2])
    dLon <- (p[, 3] - p[, 1])
    a <- sin(dLat/2) * sin(dLat/2) + cos(p[, 2]) * cos(p[, 
                                                         4]) * sin(dLon/2) * sin(dLon/2)
    dist <- 2 * atan2(sqrt(a), sqrt(1 - a)) * r
    as.vector(dist)
  }
  distGeo <- function(x, y) {
    n <- nrow(x)
    m <- nrow(y)
    dm <- matrix(ncol = m, nrow = n)
    for (i in 1:n) {
      dm[i, ] <- distHaversine(x[i, , drop = FALSE], y)
    }
    return(dm)
  }
  distPlane <- function(x, y) {
    dfun <- function(x, y) {
      sqrt((x[, 1] - y[, 1])^2 + (x[, 2] - y[, 2])^2)
    }
    n = nrow(x)
    m = nrow(y)
    dm = matrix(ncol = m, nrow = n)
    for (i in 1:n) {
      dm[i, ] = dfun(x[i, , drop = FALSE], y)
    }
    return(dm)
  }
  if (lonlat) {
    distfun <- distGeo
  }
  else {
    distfun <- distPlane
  }
  if (inherits(p, "SpatialPoints")) 
    p <- coordinates(p)
  if (inherits(a, "SpatialPoints")) 
    a <- coordinates(a)
  if (inherits(reference, "SpatialPoints")) 
    reference <- coordinates(reference)
  p <- as.matrix(p)[, 1:2]
  a <- as.matrix(a)[, 1:2]
  reference <- as.matrix(reference)[, 1:2]
  
  mindist <- function(distfun, a, b) {
    partition_count <- (1 %/% (1000 / NROW(b))) + 1
    parts <- dismo::kfold(x=b, k=partition_count)
    r <- c()
    for(i in 1:partition_count) {
      mind <- apply(distfun(a, b[parts==i,]), 1, min)
      r <- cbind(r, mind)
    }
    apply(r, 1, min)
  }
  pd <- mindist(distfun, p, reference) #   pd <- apply(distfun(p, reference), 1, min)
  ad <- mindist(distfun, a, reference) #   ad <- apply(distfun(a, reference), 1, min)
  if (avg) {
    res <- cbind(mean(pd), mean(ad))
    colnames(res) <- c("p", "a")
    return(res)
  }
  else {
    return(list(pd, ad))
  }
}

Samuel Bosch

Pages

Geographic null models for species distribution modeling: An implementation combining BIOMOD2 and dismo [Code Dump]

dismo pwdSample and ssb for a large number of species distribution records