For my
MarineSPEED dataset of species I'm trying to create non-random cross-validation datasets. In order to do this I was looking at the pwdSample and ssb functions from the dismo package. pwdSample calculates a pair-wise distance sampling and ssb calculates the spatial sorting bias. More information about these functions can be found in the excellent paper by Robert Hijmans (2012):
"Cross-validation of species distribution models: removing spatial sorting bias and calibration with a null model".
Problem is that both functions calculate a distance matrix between two sets of points which starts to fail when the number of points is (very) large. To fix this I re-implemented the functions replacing the matrix calculation and subsequent row minimum finding with the same functionality but using a divide and conquer strategy by calculating distance matrices on subsets, taking the min distance and then taking the min distances of these min distances to get row based minima.
In other words I replaced:
fromd <- apply(distfun(fixed, reference), 1, min)
with:
mindist <- function(distfun, a, b) {
partition_count <- (1 %/% (100 / NROW(b))) + 1
parts <- dismo::kfold(x=b, k=partition_count)
r <- c()
for(i in 1:partition_count) {
mind <- apply(distfun(a, b[parts==i,]), 1, min)
r <- cbind(r, mind)
}
apply(r, 1, min)
}
fromd <- mindist(distfun, fixed, reference)
The full pwdSample function then becomes:
pwdSample_robust <- function(fixed, sample, reference, tr = 0.33, nearest= TRUE, n=1, lonlat = TRUE, warn = TRUE) {
## DIVIDE reference, calculate distance matrix, get min, COMBINE: take min of the min, do rest of the pwdSample logic
distHaversine <- function(p1, p2) {
r <- 6378137
toRad <- pi/180
p1 <- p1 * toRad
p2 <- p2 * toRad
p <- cbind(p1[, 1], p1[, 2], p2[, 1], p2[, 2])
dLat <- (p[, 4] - p[, 2])
dLon <- (p[, 3] - p[, 1])
a <- sin(dLat/2) * sin(dLat/2) + cos(p[, 2]) * cos(p[,
4]) * sin(dLon/2) * sin(dLon/2)
dist <- 2 * atan2(sqrt(a), sqrt(1 - a)) * r
as.vector(dist)
}
distGeo <- function(x, y) {
n <- nrow(x)
m <- nrow(y)
dm <- matrix(ncol = m, nrow = n)
for (i in 1:n) {
dm[i, ] <- distHaversine(x[i, , drop = FALSE], y)
}
return(dm)
}
distPlane <- function(x, y) {
dfun <- function(x, y) {
sqrt((x[, 1] - y[, 1])^2 + (x[, 2] - y[, 2])^2)
}
n = nrow(x)
m = nrow(y)
dm = matrix(ncol = m, nrow = n)
for (i in 1:n) {
dm[i, ] = dfun(x[i, , drop = FALSE], y)
}
return(dm)
}
if (lonlat) {
distfun <- distGeo
}
else {
distfun <- distPlane
}
stopifnot(tr > 0)
n <- round(n)
stopifnot(n >= 1)
if (inherits(fixed, "SpatialPoints"))
fixed <- coordinates(fixed)
if (inherits(sample, "SpatialPoints"))
sample <- coordinates(sample)
if (inherits(reference, "SpatialPoints"))
reference <- coordinates(reference)
fixed <- as.matrix(fixed)[, 1:2]
sample <- as.matrix(sample)[, 1:2]
reference <- as.matrix(reference)[, 1:2]
if (warn) {
if (nrow(sample) < nrow(fixed)) {
warning("nrow(sample) < nrow(fixed)")
}
}
mindist <- function(distfun, a, b) {
partition_count <- (1 %/% (100 / NROW(b))) + 1
parts <- dismo::kfold(x=b, k=partition_count)
r <- c()
for(i in 1:partition_count) {
mind <- apply(distfun(a, b[parts==i,]), 1, min)
r <- cbind(r, mind)
}
apply(r, 1, min)
}
fromd <- mindist(distfun, fixed, reference) ##apply(distfun(fixed, reference), 1, min)
tod <- mindist(distfun, sample, reference) ##apply(distfun(sample, reference), 1, min)
ngb <- matrix(NA, nrow = length(fromd), ncol = n)
iter <- sample(1:nrow(fixed))
for (j in 1:n) {
for (i in iter) {
d <- abs(tod - fromd[i])
if (min(d) < (tr * fromd[i])) {
if (nearest) {
x <- which.min(d)
}
else {
x <- sample(which(d < (tr * fromd[i])), size = 1)
}
ngb[i, j] <- x
tod[x] <- Inf
}
}
}
return(ngb)
}
and the ssb function then looks like this:
ssb <- function(p, a, reference, lonlat = TRUE, avg = TRUE) {
distHaversine <- function(p1, p2) {
r <- 6378137
toRad <- pi/180
p1 <- p1 * toRad
p2 <- p2 * toRad
p <- cbind(p1[, 1], p1[, 2], p2[, 1], p2[, 2])
dLat <- (p[, 4] - p[, 2])
dLon <- (p[, 3] - p[, 1])
a <- sin(dLat/2) * sin(dLat/2) + cos(p[, 2]) * cos(p[,
4]) * sin(dLon/2) * sin(dLon/2)
dist <- 2 * atan2(sqrt(a), sqrt(1 - a)) * r
as.vector(dist)
}
distGeo <- function(x, y) {
n <- nrow(x)
m <- nrow(y)
dm <- matrix(ncol = m, nrow = n)
for (i in 1:n) {
dm[i, ] <- distHaversine(x[i, , drop = FALSE], y)
}
return(dm)
}
distPlane <- function(x, y) {
dfun <- function(x, y) {
sqrt((x[, 1] - y[, 1])^2 + (x[, 2] - y[, 2])^2)
}
n = nrow(x)
m = nrow(y)
dm = matrix(ncol = m, nrow = n)
for (i in 1:n) {
dm[i, ] = dfun(x[i, , drop = FALSE], y)
}
return(dm)
}
if (lonlat) {
distfun <- distGeo
}
else {
distfun <- distPlane
}
if (inherits(p, "SpatialPoints"))
p <- coordinates(p)
if (inherits(a, "SpatialPoints"))
a <- coordinates(a)
if (inherits(reference, "SpatialPoints"))
reference <- coordinates(reference)
p <- as.matrix(p)[, 1:2]
a <- as.matrix(a)[, 1:2]
reference <- as.matrix(reference)[, 1:2]
mindist <- function(distfun, a, b) {
partition_count <- (1 %/% (1000 / NROW(b))) + 1
parts <- dismo::kfold(x=b, k=partition_count)
r <- c()
for(i in 1:partition_count) {
mind <- apply(distfun(a, b[parts==i,]), 1, min)
r <- cbind(r, mind)
}
apply(r, 1, min)
}
pd <- mindist(distfun, p, reference) # pd <- apply(distfun(p, reference), 1, min)
ad <- mindist(distfun, a, reference) # ad <- apply(distfun(a, reference), 1, min)
if (avg) {
res <- cbind(mean(pd), mean(ad))
colnames(res) <- c("p", "a")
return(res)
}
else {
return(list(pd, ad))
}
}