dismo pwdSample and ssb for a large number of species distribution records

For my MarineSPEED dataset of species I'm trying to create non-random cross-validation datasets. In order to do this I was looking at the pwdSample and ssb functions from the dismo package. pwdSample calculates a pair-wise distance sampling and ssb calculates the spatial sorting bias. More information about these functions can be found in the excellent paper by Robert Hijmans (2012): "Cross-validation of species distribution models: removing spatial sorting bias and calibration with a null model".

Problem is that both functions calculate a distance matrix between two sets of points which starts to fail when the number of points is (very) large. To fix this I re-implemented the functions replacing the matrix calculation and subsequent row minimum finding with the same functionality but using a divide and conquer strategy by calculating distance matrices on subsets, taking the min distance and then taking the min distances of these min distances to get row based minima.

In other words I replaced:

fromd <- apply(distfun(fixed, reference), 1, min)

with:

mindist <- function(distfun, a, b) {
  partition_count <- (1 %/% (100 / NROW(b))) + 1
  parts <- dismo::kfold(x=b, k=partition_count)
  r <- c()
  for(i in 1:partition_count) {
    mind <- apply(distfun(a, b[parts==i,]), 1, min)
    r <- cbind(r, mind)
  }
  apply(r, 1, min)
}
  
fromd <- mindist(distfun, fixed, reference)

The full pwdSample function then becomes:

pwdSample_robust <- function(fixed, sample, reference, tr = 0.33, nearest= TRUE, n=1, lonlat = TRUE, warn = TRUE) {
  ## DIVIDE reference, calculate distance matrix, get min, COMBINE: take min of the min, do rest of the pwdSample logic
  distHaversine <- function(p1, p2) {
    r <- 6378137
    toRad <- pi/180
    p1 <- p1 * toRad
    p2 <- p2 * toRad
    p <- cbind(p1[, 1], p1[, 2], p2[, 1], p2[, 2])
    dLat <- (p[, 4] - p[, 2])
    dLon <- (p[, 3] - p[, 1])
    a <- sin(dLat/2) * sin(dLat/2) + cos(p[, 2]) * cos(p[, 
                                                         4]) * sin(dLon/2) * sin(dLon/2)
    dist <- 2 * atan2(sqrt(a), sqrt(1 - a)) * r
    as.vector(dist)
  }
  distGeo <- function(x, y) {
    n <- nrow(x)
    m <- nrow(y)
    dm <- matrix(ncol = m, nrow = n)
    for (i in 1:n) {
      dm[i, ] <- distHaversine(x[i, , drop = FALSE], y)
    }
    return(dm)
  }
  distPlane <- function(x, y) {
    dfun <- function(x, y) {
      sqrt((x[, 1] - y[, 1])^2 + (x[, 2] - y[, 2])^2)
    }
    n = nrow(x)
    m = nrow(y)
    dm = matrix(ncol = m, nrow = n)
    for (i in 1:n) {
      dm[i, ] = dfun(x[i, , drop = FALSE], y)
    }
    return(dm)
  }
  if (lonlat) {
    distfun <- distGeo
  }
  else {
    distfun <- distPlane
  }
  stopifnot(tr > 0)
  n <- round(n)
  stopifnot(n >= 1)
  if (inherits(fixed, "SpatialPoints")) 
    fixed <- coordinates(fixed)
  if (inherits(sample, "SpatialPoints")) 
    sample <- coordinates(sample)
  if (inherits(reference, "SpatialPoints")) 
    reference <- coordinates(reference)
  fixed <- as.matrix(fixed)[, 1:2]
  sample <- as.matrix(sample)[, 1:2]
  reference <- as.matrix(reference)[, 1:2]
  if (warn) {
    if (nrow(sample) < nrow(fixed)) {
      warning("nrow(sample) < nrow(fixed)")
    }
  }
  
  mindist <- function(distfun, a, b) {
    partition_count <- (1 %/% (100 / NROW(b))) + 1
    parts <- dismo::kfold(x=b, k=partition_count)
    r <- c()
    for(i in 1:partition_count) {
      mind <- apply(distfun(a, b[parts==i,]), 1, min)
      r <- cbind(r, mind)
    }
    apply(r, 1, min)
  }
  
  fromd <- mindist(distfun, fixed, reference) ##apply(distfun(fixed, reference), 1, min)
  tod <- mindist(distfun, sample, reference) ##apply(distfun(sample, reference), 1, min)
  ngb <- matrix(NA, nrow = length(fromd), ncol = n)
  iter <- sample(1:nrow(fixed))
  for (j in 1:n) {
    for (i in iter) {
      d <- abs(tod - fromd[i])
      if (min(d) < (tr * fromd[i])) {
        if (nearest) {
          x <- which.min(d)
        }
        else {
          x <- sample(which(d < (tr * fromd[i])), size = 1)
        }
        ngb[i, j] <- x
        tod[x] <- Inf
      }
    }
  }
  return(ngb)
}

and the ssb function then looks like this:


ssb <- function(p, a, reference, lonlat = TRUE, avg = TRUE) {
  distHaversine <- function(p1, p2) {
    r <- 6378137
    toRad <- pi/180
    p1 <- p1 * toRad
    p2 <- p2 * toRad
    p <- cbind(p1[, 1], p1[, 2], p2[, 1], p2[, 2])
    dLat <- (p[, 4] - p[, 2])
    dLon <- (p[, 3] - p[, 1])
    a <- sin(dLat/2) * sin(dLat/2) + cos(p[, 2]) * cos(p[, 
                                                         4]) * sin(dLon/2) * sin(dLon/2)
    dist <- 2 * atan2(sqrt(a), sqrt(1 - a)) * r
    as.vector(dist)
  }
  distGeo <- function(x, y) {
    n <- nrow(x)
    m <- nrow(y)
    dm <- matrix(ncol = m, nrow = n)
    for (i in 1:n) {
      dm[i, ] <- distHaversine(x[i, , drop = FALSE], y)
    }
    return(dm)
  }
  distPlane <- function(x, y) {
    dfun <- function(x, y) {
      sqrt((x[, 1] - y[, 1])^2 + (x[, 2] - y[, 2])^2)
    }
    n = nrow(x)
    m = nrow(y)
    dm = matrix(ncol = m, nrow = n)
    for (i in 1:n) {
      dm[i, ] = dfun(x[i, , drop = FALSE], y)
    }
    return(dm)
  }
  if (lonlat) {
    distfun <- distGeo
  }
  else {
    distfun <- distPlane
  }
  if (inherits(p, "SpatialPoints")) 
    p <- coordinates(p)
  if (inherits(a, "SpatialPoints")) 
    a <- coordinates(a)
  if (inherits(reference, "SpatialPoints")) 
    reference <- coordinates(reference)
  p <- as.matrix(p)[, 1:2]
  a <- as.matrix(a)[, 1:2]
  reference <- as.matrix(reference)[, 1:2]
  
  mindist <- function(distfun, a, b) {
    partition_count <- (1 %/% (1000 / NROW(b))) + 1
    parts <- dismo::kfold(x=b, k=partition_count)
    r <- c()
    for(i in 1:partition_count) {
      mind <- apply(distfun(a, b[parts==i,]), 1, min)
      r <- cbind(r, mind)
    }
    apply(r, 1, min)
  }
  pd <- mindist(distfun, p, reference) #   pd <- apply(distfun(p, reference), 1, min)
  ad <- mindist(distfun, a, reference) #   ad <- apply(distfun(a, reference), 1, min)
  if (avg) {
    res <- cbind(mean(pd), mean(ad))
    colnames(res) <- c("p", "a")
    return(res)
  }
  else {
    return(list(pd, ad))
  }
}