r - r - 按组分的地理距离 - 在每对行上应用一个函数

df1 <- data.frame(province = c(1, 1, 1, 2, 2, 2),

house = c(1, 2, 3, 4, 5, 6),

lat = c(-76.6, -76.5, -76.4, -75.4, -80.9, -85.7),

lon = c(39.2, 39.1, 39.3, 60.8, 53.3, 40.2))

library(geosphere)

distm(c(df1\$lon[1], df1\$lat[1]), c(df1\$lon[2], df1\$lat[2]), fun = distHaversine)

#11429.1

lapply(split(df1, df1\$province), function(df){

df <- Expand.Grid(df[, c("lat","lon")], df[, c("lat","lon")])

mean(distHaversine(df[, 1:2], df[, 3:4]))

})

1.性能测试

function (x, y, fun = distHaversine)

{

[...]

for (i in 1:n) {

dm[i, ] = fun(x[i, ], y)

}

return(dm)

}

distHaversine() 发送"整个对象"到C 时，distm() 将数据"行智能"发送到 distHaversine()，因这里在执行C 时强制 distHaversine() 执行相同的操作。 因此，不应该使用 distm() 。 在性能方面，我看到使用包装函数 distm() 带来更多危害，因为我看到。

2.解释"解决方案"中的代码：

a ) 分组：

b ) 分组"columns"

library(geosphere)

library(dplyr)

library(proxy)

df1 <- data.frame(province = as.integer(c(1, 1, 1, 2, 2, 2)),

house = as.integer(c(1, 2, 3, 4, 5, 6)),

lat = c(-76.6, -76.5, -76.4, -75.4, -80.9, -85.7),

lon = c(39.2, 39.1, 39.3, 60.8, 53.3, 40.2))

custom_haversine <- function(x, y) {

diff <- (y - x) * toRad

dLon <- diff[1L]

dLat <- diff[2L]

a <- sin(dLat/2) ^ 2 + cos(x[2L] * toRad) * cos(y[2L] * toRad) * sin(dLon/2) ^ 2

a <- min(a, 1)

# return

2 * atan2(sqrt(a), sqrt(1 - a)) * 6378137

}

pr_DB\$set_entry(FUN=custom_haversine, names="haversine", loop=TRUE, distance=TRUE)

average_dist <- df1 %>%

select(-house) %>%

group_by(province) %>%

group_map(~ data.frame(avg=mean(proxy::dist(.x[, c("lon","lat")], method="haversine"))))

//[[Rcpp::plugins(cpp11)]]

//[[Rcpp::depends(RcppParallel)]]

#include <cstddef>//size_t

#include <math.h>//sin, cos, sqrt, atan2, pow

#include <vector>

#include <Rcpp.h>

#include <RcppParallel.h>

using namespace std;

using namespace Rcpp;

using namespace RcppParallel;

//single to double indices for lower triangular of matrices without diagonal

void s2d(const size_t id, const size_t nrow, size_t& i, size_t& j) {

j = nrow - 2 - static_cast<size_t>(sqrt(-8 * id + 4 * nrow * (nrow - 1) - 7)/2 - 0.5);

i = id + j + 1 - nrow * (nrow - 1)/2 + (nrow - j) * ((nrow - j) - 1)/2;

}

class HaversineCalculator : public Worker

{

public:

HaversineCalculator(const NumericVector& lon,

const NumericVector& lat,

double& avg,

const int n)

: lon_(lon)

, lat_(lat)

, avg_(avg)

, n_(n)

, cos_lat_(lon.length())

{

//terms for distance calculation

for (size_t i = 0; i <cos_lat_.size(); i++) {

cos_lat_[i] = cos(lat_[i] * 3.1415926535897/180);

}

}

void operator()(size_t begin, size_t end) {

//for Kahan summation

double sum = 0;

double c = 0;

size_t i, j;

for (size_t ind = begin; ind <end; ind++) {

s2d(ind, lon_.length(), i, j);

//haversine distance

double d_lon = (lon_[j] - lon_[i]) * to_rad;

double d_lat = (lat_[j] - lat_[i]) * to_rad;

double d_hav = pow(sin(d_lat/2), 2) + cos_lat_[i] * cos_lat_[j] * pow(sin(d_lon/2), 2);

if (d_hav> 1) d_hav = 1;

d_hav = 2 * atan2(sqrt(d_hav), sqrt(1 - d_hav)) * 6378137;

//the average part

d_hav/= n_;

//Kahan sum step

double y = d_hav - c;

double t = sum + y;

c = (t - sum) - y;

sum = t;

}

mutex_.lock();

avg_ += sum;

mutex_.unlock();

}

private:

const RVector<double> lon_;

const RVector<double> lat_;

double& avg_;

const int n_;

vector<double> cos_lat_;

};

//[[Rcpp::export]]

double avg_haversine(const DataFrame& input, const int nthreads) {

NumericVector lon = input["lon"];

NumericVector lat = input["lat"];

double avg = 0;

int size = lon.length() * (lon.length() - 1)/2;

HaversineCalculator hc(lon, lat, avg, size);

parallelFor(0, size, hc, grain);

return avg;

}

library(dplyr)

library(Rcpp)

library(RcppParallel)

sourceCpp("haversine.cpp")

df1 %>%

group_by(province) %>%

group_map(~ data.frame(avg=avg_haversine(.x, parallel::detectCores())))

# A tibble: 2 x 2

# Groups: province [2]

province avg

<int> <dbl>

1 1 15379.

2 2 793612.

pr_DB\$set_entry(FUN=geosphere::distHaversine, names="distHaversine", loop=TRUE, distance=TRUE)

df1 %>%

select(-house) %>%

group_by(province) %>%

group_map(~ data.frame(avg=mean(proxy::dist(.x[, c("lon","lat")], method="distHaversine"))))

df <- data.frame(lon=runif(1e3, -90, 90), lat=runif(1e3, -90, 90))

system.time(proxy::dist(df, method="distHaversine"))

user system elapsed

34.353 0.005 34.394

system.time(proxy::dist(df, method="haversine"))

user system elapsed

0.789 0.020 0.809

system.time(avg_haversine(df, 4L))

user system elapsed

0.054 0.000 0.014

df <- data.frame(lon=runif(1e5, -90, 90), lat=runif(1e5, -90, 90))

system.time(avg_haversine(df, 4L))

user system elapsed

73.861 0.238 19.670

关于复杂性

int size = lon.length() * (lon.length() - 1)/2;