Alternatives to a for loop in R for dynamically filtering and summarising millions of points

cwdobuhd asked on 2023-07-31

I have a dataset of 5 million records in the following format:

library(dplyr)
    
Data <- tibble(X=runif(5000000, min=200000, max=400000),
                Y=runif(5000000, min=400000, max=500000),
                UID = 1:5000000,
                Count = runif(5000000, min=1, max=2000))

For each UID I need to summarise 1) the number of other points, 2) the sum of those points' Count, and 3) a density calculation, within 200m, 400m, 800m and 1600m. The required final output is a table containing each UID and its summary values.
This for loop illustrates the required output, but it would take weeks to complete:

Summary <- NULL

for (n in 1:nrow(Data)){
  Data_Row <- Data[n,]

  Row_200 <- Data %>%
    filter(between(X, Data_Row$X-200, Data_Row$X+200) & between(Y, Data_Row$Y-200, Data_Row$Y+200)) %>%
    summarise(Instances_200 = n(), Count_200 = sum(Count), Calculation_200 = Count_200/(Instances_200*0.000625))

  Row_400 <- Data %>%
    filter(between(X, Data_Row$X-400, Data_Row$X+400) & between(Y, Data_Row$Y-400, Data_Row$Y+400)) %>%
    summarise(Instances_400 = n(), Count_400 = sum(Count), Calculation_400 = Count_400/(Instances_400*0.000625))

  Row_800 <- Data %>%
    filter(between(X, Data_Row$X-800, Data_Row$X+800) & between(Y, Data_Row$Y-800, Data_Row$Y+800)) %>%
    summarise(Instances_800 = n(), Count_800 = sum(Count), Calculation_800 = Count_800/(Instances_800*0.000625))

  Row_1600 <- Data %>%
    filter(between(X, Data_Row$X-1600, Data_Row$X+1600) & between(Y, Data_Row$Y-1600, Data_Row$Y+1600)) %>%
    summarise(Instances_1600 = n(), Count_1600 = sum(Count), Calculation_1600 = Count_1600/(Instances_1600*0.000625))

  Summary_Row <- bind_cols(Data[n, "UID"], Row_200, Row_400, Row_800, Row_1600)

  Summary <- bind_rows(Summary, Summary_Row)
}


I'm aware of things like lapply and purrr::map, but I don't know how to use them to do this dynamic filtering and summarising any faster.
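
For reference, the same per-UID filter and summarise can be wrapped in a function and driven with lapply; a minimal sketch is below (the helper name summarise_uid and the 100-row subset are purely illustrative). This only tidies the loop, it does not remove the underlying all-pairs filtering cost, so it will not be meaningfully faster:

summarise_uid <- function(n, radius = 200) {
  Data_Row <- Data[n, ]
  Data %>%
    filter(between(X, Data_Row$X - radius, Data_Row$X + radius),
           between(Y, Data_Row$Y - radius, Data_Row$Y + radius)) %>%
    summarise(UID = Data_Row$UID,
              Instances = n(),
              Count_Sum = sum(Count),
              Calculation = Count_Sum / (Instances * 0.000625))
}

# e.g. the 200m summary for the first 100 UIDs only
Summary_200 <- bind_rows(lapply(1:100, summarise_uid))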


ttisahbt1#

Building on @Jon Spring's comment, I used this answer as inspiration to speed the process up. The answer below now takes roughly 12 hours to run on my real data (rather than the days/weeks the code in my question would need). It is somewhat memory hungry (I'm on a Windows desktop with 32 GB of RAM), which is why I've tried to minimise what gets stored (i.e. why I write each loop's output to CSV and bring them all back in at the end). Using data.table definitely sped things up, but I'm sure there is still a faster/more efficient way to do this; however, it does what I need (even if my code is messy!).

# geosphere v1.5-19 required
install.packages('geosphere', repos='https://rspatial.r-universe.dev')

# Required libraries
library(data.table)
library(dplyr)
library(geosphere)
library(tidyr)
library(readr)
library(purrr)
   
set.seed(123)
   
# Input data
Data <- data.table(
    X=runif(5000000, min=200000, max=400000),
    Y=runif(5000000, min=400000, max=500000),
    UID = 1:5000000,
    Count = sample(1:2000,5000000,replace=TRUE))

# Add 50m grid cell ID for each point
Data[, ID := OSGB(Data[,c("X", "Y")], "50m")]   

# Summarise number of records for each 50m grid cell
Data_Summary <- Data[, .(Count = .N), by=.(ID)]

# Generate centroid coordinates for 50 metre grid squares that cover the 'Data' extent
Coordinates <- CJ(X = seq((floor(min(Data$X)/1000)*1000)+25, ceiling(max(Data$X)/1000)*1000, 50),
                Y = seq((floor(min(Data$Y)/1000)*1000)+25, ceiling(max(Data$Y)/1000)*1000, 50))

# Add 50m grid cell ID for each centroid
Coordinates[, ID := OSGB(Coordinates[,c("X", "Y")], "50m")]

# Join 50m grid cell centroid coordinates to 'Data_Summary'
Data_Summary_with_Centroids <- Coordinates[Data_Summary, on = .(ID), nomatch = NULL]    

# Define maximum buffer size
Buffer <- 1650

# Generate offset XY coordinates for each grid centroid
Data_Summary_with_Centroids[,`:=`(X_NW = X-Buffer, Y_NW = Y+Buffer, 
                        X_NE = X+Buffer, Y_NE = Y+Buffer,
                        X_SW = X-Buffer, Y_SW = Y-Buffer,
                        X_SE = X+Buffer, Y_SE = Y-Buffer)]

# Assign 5km grid cell ID to all pairs of coordinates                       
Data_Summary_with_Centroids[,`:=`(HashCN = OSGB(Data_Summary_with_Centroids[,c("X", "Y")], "5km"), 
                        HashNW = OSGB(Data_Summary_with_Centroids[,c("X_NW", "Y_NW")], "5km"),
                        HashNE = OSGB(Data_Summary_with_Centroids[,c("X_NE", "Y_NE")], "5km"),
                        HashSW = OSGB(Data_Summary_with_Centroids[,c("X_SW", "Y_SW")], "5km"),
                        HashSE = OSGB(Data_Summary_with_Centroids[,c("X_SE", "Y_SE")], "5km"))]

# Remove offset XY coordinates
Data_Summary_with_Centroids[, c("X_NW", "Y_NW","X_NE", "Y_NE","X_SW", "Y_SW","X_SE", "Y_SE"):=NULL] 

# List of all unique 5km grid cell IDs
Unique_5km_Hash <- unique(Data_Summary_with_Centroids$HashCN)

XY_Columns <- c("X", "Y")

for (n in 1:length(Unique_5km_Hash)){
    
    # Subset so only grid cells that have this loop's 5km grid cell ID in any of HashCN, HashNW, HashNE, HashSW or HashSE remain
    Hash <- Data_Summary_with_Centroids[Data_Summary_with_Centroids[, Reduce(`|`, lapply(.SD, `==`, Unique_5km_Hash[n])), 
    .SDcols = c("HashCN", "HashNW","HashNE","HashSW","HashSE")]][, c("HashNW", "HashNE","HashSW", "HashSE"):=NULL] 
    
    # Select only the columns needed for 'crossing'
    Hash_Cross <- Hash[, ..XY_Columns]

    # Expand to include all possible combinations of values
    suppressMessages(Hash_5km <- Hash_Cross %>% 
        crossing(., ., .name_repair = "unique"))
        
    rm(Hash_Cross)
    # Clear memory
    invisible(gc())
    
    # Calculate distances, discard those over 1600m, keep only 'origin' records that are in 'HashCN' and add Count
    Hash_5km <- Hash_5km %>%
        mutate(Distance = sqrt((X...1 - X...3)^2 + (Y...2 - Y...4)^2)) %>%
        filter(Distance <=1600) %>%
        left_join(.,Hash %>% select(X, Y, ID, HashCN), join_by(X...1 == X, Y...2 == Y)) %>%
        filter(HashCN == Unique_5km_Hash[n]) %>%
        left_join(.,Hash %>% select(X, Y, Count), join_by(X...3 == X, Y...4 == Y)) %>%
        select(ID, Count, Distance)

    # Clear memory
    invisible(gc())

    # Create summary statistics of other grid cells within 200m
    Hash_200 <- Hash_5km %>%
        filter(Distance <=200) %>%
        group_by(ID) %>%
        summarise(Instances_200 = n(), Count_200 = sum(Count), Calculation_200 = Count_200/(Instances_200*0.25))

    # Create summary statistics of other grid cells within 400m
    Hash_400 <- Hash_5km %>%
        filter(Distance <=400) %>%
        group_by(ID) %>%
        summarise(Instances_400 = n(), Count_400 = sum(Count), Calculation_400 = Count_400/(Instances_400*0.25))

    # Create summary statistics of other grid cells within 800m 
    Hash_800 <- Hash_5km %>%
        filter(Distance <=800) %>%
        group_by(ID) %>%
        summarise(Instances_800 = n(), Count_800 = sum(Count), Calculation_800 = Count_800/(Instances_800*0.25))

    # Create summary statistics of other grid cells within 1600m    
    Hash_1600 <- Hash_5km %>%
        group_by(ID) %>%
        summarise(Instances_1600 = n(), Count_1600 = sum(Count), Calculation_1600 = Count_1600/(Instances_1600*0.25))

    # Join summary statistics together
    Output <- Hash_200 %>%
        left_join(Hash_400, by='ID') %>%
        left_join(Hash_800, by='ID') %>%
        left_join(Hash_1600, by='ID') 

    # Export output
    write_csv(Output, paste0("C:/Temp/Hash/",Unique_5km_Hash[n],".csv"))

    rm(Hash_5km)
    rm(Hash)
    rm(Hash_200)
    rm(Hash_400)
    rm(Hash_800)
    rm(Hash_1600)
    rm(Output)
    invisible(gc())
}

# Read in all CSV files into a single tibble
Grid_Calculations <- list.files(path = "C:/Temp/Hash/", pattern = "*.csv", full.names = TRUE) %>% 
  map_dfr(read_csv, show_col_types = FALSE)


**Edit:** Added set.seed() and changed the start of the loop slightly so that only the relevant columns are used with 'crossing' (this was more memory efficient for me).
Output

First five rows of the first loop:
| Instances_200 | Count_200 | Calculation_200 | Instances_400 | Count_400 | Calculation_400 | Instances_800 | Count_800 | Calculation_800 | Instances_1600 | Count_1600 | Calculation_1600 |
| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
| 20 | 24 | 4.8 | 88 | 119 | 5.40909090909 | 360 | 475 | 5.2777777777777 | 1503 | 2041 | 5.43180306054557 |
| 22 | 28 | 5.090909090909 | 87 | 119 | 5.47126436781609 | 360 | 475 | 5.2777777777777 | 1500 | 2049 | 5.464 |
| 20 | 30 | 6 | 86 | 117 | 5.44186046511627 | 364 | 479 | 5.26373626373626 | 1494 | 2032 | 5.44042838018741 |
| 21 | 30 | 5.71428571428571 | 85 | 113 | 5.31764705882352 | 364 | 479 | 5.26373626373626 | 1491 | 2037 | 5.46478873239436 |
| 28 | 42 | 6 | 85 | 113 | 5.31764705882352 | 369 | 485 | 5.25745257452574 | 1487 | 2020 | 5.43375924680564 |


mklgxw1f2#

I was trying to understand the OP's answer post, in particular the output table.
Focusing on the first row of the output table: I see SC550650NE reported as having 20 instances of other points within 200 of (X=255085.6, Y=465070.1), with a summed Count of 24.

> Data[ID=="SC550650NE"]
          X        Y    UID Count         ID   distance
1: 255085.6 465070.1 832751  1908 SC550650NE 0.03272809

So I calculated every point's distance from (X=255085.6, Y=465070.1), then listed those within 200 in ascending order. I count 29, not 20. And when I sum the Count I get 25,463, not 24.

> Data[,distance:=sqrt((X-255085.6)^2 + (Y-465070.1)^2)]
> Data[distance <= 200][order(distance)]
           X        Y     UID Count         ID     distance
 1: 255085.6 465070.1  832751  1908 SC550650NE   0.03272809
 2: 255093.6 465124.7 1371253  1513 SC550651SE  55.16468957
 3: 255034.2 465011.8 3512521   396 SC550650SW  77.73592845
 4: 255049.8 464993.6 2603353   255 SC550649NW  84.47194005
 5: 255091.7 465162.7  193893  1784 SC550651NE  92.82767523
 6: 254968.1 465083.5 2245518   186 SC549650NE 118.25898594
 7: 255180.5 464970.4 3376500   826 SC551649NE 137.66330325
 8: 255148.5 464947.4 3473247  1526 SC551649SW 137.86046404
 9: 255136.8 465202.7 2560816   212 SC551652SW 142.12504000
10: 255089.6 465224.3 2788575  1429 SC550652SE 154.24575868
11: 255213.6 465158.4 2051343   331 SC552651NW 155.52946932
12: 255234.1 465015.9  601896  1175 SC552650SW 158.08971475
13: 255231.4 464993.8 2562794  1209 SC552649NW 164.57602726
14: 255236.8 465003.9 2579822   624 SC552650SW 165.05736525
15: 255163.8 465219.0 2793140   491 SC551652SE 168.19129349
16: 254997.8 465224.2 3686855  1948 SC549652SE 177.38964783
17: 255055.7 464895.0 3850088  1904 SC550648NE 177.60704244
18: 254952.1 465189.1 4992168   668 SC549651NE 178.80180844
19: 255237.6 465165.5 3778801   950 SC552651NW 179.50253647
20: 255148.1 464900.4 2842521   204 SC551649SW 180.86815284
21: 255035.4 464891.2  933305   854 SC550648NW 185.79738782
22: 255245.6 465175.2 1453263   446 SC552651NW 191.44182598
23: 255227.8 465200.5 1557782   851 SC552652SW 192.90236356
24: 254894.7 465031.3  657336  1247 SC548650SE 194.79510938
25: 255102.8 464875.4 3051082   148 SC551648NW 195.41964698
26: 254914.2 464971.4 3090917    28 SC549649NW 197.75429388
27: 255030.4 464879.7  895456   109 SC550648NW 198.20424559
28: 255281.9 465101.9 3970383  1810 SC552651SE 198.85139040
29: 255108.3 465268.3 1968621   431 SC551652NW 199.51847902
           X        Y     UID Count         ID     distance
> Data[distance <= 200, sum(Count)]
[1] 25463

Edit

5 million rows

So, after (unwisely, sorry SO!) using the comments for discussion, we have a better picture of the situation. @Chris has a solution that gives an approximation, but that is a concession made to bring the compute time down. From the discussion we also learned that the distance metric can be square or circular, and that there are 4 cores available on the machine.
I will offer two solutions, one for a circular metric and one for a square metric. The circular metric is faster.
Both follow the approach in this SO post.

Circular metric

  • e.g. distance = sqrt( (x1-x2)^2 + (y1-y2)^2 ) <= 200
library(tictoc)
library(data.table)


Pick one of the num.rows values. Start small and work your way up to get a feel for the timing. I also have 32 GB (on a Mac), so I expect the timings to be comparable. Roughly speaking, doubling the number of rows quadruples the compute time.
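
As a rough sanity check on those timings (my own back-of-envelope, assuming compute time scales roughly with the square of num.rows), you can extrapolate a full-size estimate from a small timed run:

small_n    <- 4e4    # rows in a small timed run
small_secs <- 12     # observed on 4 nodes for 4e4 rows (see the list below)
full_n     <- 5e6
small_secs * (full_n / small_n)^2 / 3600  # ~52 hours; the verified 5e6 run took ~42.5 hours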

### Pick one:
###
### num.rows <-     4e4     ###   12 seconds on 4 nodes
### num.rows <-  0.078125e6 ###   43 seconds on 4 nodes
### num.rows <-  0.15625e6  ###  154 seconds on 4 nodes
### num.rows <-  0.3125e6   ###   10.5 min   on 4 nodes
### num.rows  <- 0.625e6    ###   44.  min   on 4 nodes
### num.rows  <- 1.25e6     ###  ~3 hours
### num.rows  <- 2.5e6      ###  ~12 hours
## num.rows  <- 5.0e6      ###  ~ 42.53 hours on 4 nodes VERIFIED.
 
set.seed(123)
   
# Input data
tbl.data <- data.table(
    X=runif(num.rows, min=200000, max=400000),
    Y=runif(num.rows, min=400000, max=500000),
    UID = 1:num.rows,
    Count = sample(1:2000,num.rows,replace=TRUE))

setkey(tbl.data, UID)

tbl.data

setkey(tbl.data, UID)


This approach relies on passing the dataset's entire X, Y and Count columns, denoted Xvec, Yvec and Cvec respectively. From there we create a function instances() that first computes the distance from a single point (x.in, y.in) to every other point in (Xvec, Yvec), then tallies the instances and the sumCount (the density "Calculation" is done at the end and does not need to be done in this step or parallelised).
The function instances() is deployed inside a data.table call, set up in the function instances_stacked(). Because we pass the dataset's entire X, Y and Count columns and operate row by row on the original data, we can parallelise the problem. If parallel::detectCores() gave us 5 million cores, each row would have its own core and this would finish in a flash. We have 4, however, so we spend a moment creating a variable group_quarters that splits the dataset into 4 chunks, deployed via foo2() and split_df() (both adapted from the example in the SO post above).

Xvec <- tbl.data$X
Yvec <- tbl.data$Y
Cvec <- tbl.data$Count

## based on https://stackoverflow.com/a/44814035/2727349
instances <- function(x=Xvec,y=Yvec, count=Cvec, x.in, y.in){
  
  calcdist <- sqrt( (x-x.in)^2+ (y-y.in)^2 )
  
  truth_1600 <- calcdist <= 1600
  result_1600 <- c("instances_1600" = sum( truth_1600 ), "sumCount_1600"= sum( Cvec*truth_1600))
  
  
  truth_0800 <- calcdist <=  800
  result_0800 <- c("instances_0800" = sum( truth_0800 ), "sumCount_0800"= sum( Cvec*truth_0800))

  
  truth_0400 <- calcdist <=  400
  result_0400 <- c("instances_0400" = sum( truth_0400 ), "sumCount_0400"= sum( Cvec*truth_0400))

  
  truth_0200 <- calcdist <=  200
  result_0200 <- c("instances_0200" = sum( truth_0200 ), "sumCount_0200"= sum( Cvec*truth_0200))
  
  result=c(result_0200, result_0400, result_0800, result_1600)
  return(result)
  
}
instances_stacked <- function(DT, x=Xvec,y=Yvec, count=Cvec){
  
  DT[,
     as.list(instances( x.in=X, y.in=Y)),
     by=c("UID")
     ]
  
}

split_df <- function(d, var) {
  base::split(d, get(var, as.environment(d)))
}
## define group_quarters roughly as 25% population
tbl.data[UID <  0.25 * num.rows                         , group_quarters:="1st q"]
tbl.data[UID <  0.50 * num.rows & UID >= 0.25 * num.rows, group_quarters:="2nd q"]
tbl.data[UID <  0.75 * num.rows & UID >= 0.50 * num.rows, group_quarters:="3rd q"]
tbl.data[                         UID >= 0.75 * num.rows, group_quarters:="4th q"]
tbl.data[, uniqueN(group_quarters)]
tbl.data[, table(group_quarters)]
foo2 <- function(dt) {
  dt2 <- split_df(dt, "group_quarters")
  require(parallel)
  cl <- parallel::makeCluster(4) ## 4 chosen based on parallel::detectCores()
  print(cl)
  clusterExport(cl, varlist= "instances_stacked") ## this was the `foo` in https://stackoverflow.com/a/44814035/2727349
  clusterExport(cl, varlist= "instances")
  clusterExport(cl, varlist= c("Xvec","Yvec","Cvec")) 
  clusterExport(cl, varlist= "dt2", envir = environment())
  clusterEvalQ(cl, library("data.table"))

  dt2 <- parallel::parLapply(cl, X= dt2, fun=instances_stacked)

  parallel::stopCluster(cl)
  return(rbindlist(dt2))
}

The call to foo2() deploys the analysis in parallel and combines everything back into results_wide. Then we can do the density calculations and look at some of the results.

tic("foo2() runs on 4 data chunks (1/4 sized) in parallel: ")
results_wide <- foo2(tbl.data)
## next 4 lines done after the parallel part
results_wide[,calculation_1600:=sumCount_1600/(instances_1600*0.25)]
results_wide[,calculation_0800:=sumCount_0800/(instances_0800*0.25)]
results_wide[,calculation_0400:=sumCount_0400/(instances_0400*0.25)]
results_wide[,calculation_0200:=sumCount_0200/(instances_0200*0.25)]
toc()
## let's see the results
head(results_wide)
## for num.rows<-5e6, 
## should match the intro example at top of the post:
results_wide[UID==832751]

Which gives (after 42½ hours):

## for num.rows <- 5e6, should match the 
## intro example at top of the post for
## Data[ID=="SC550650NE"]
> results_wide[UID==832751] 
      UID instances_0200 sumCount_0200 instances_0400 sumCount_0400
1: 832751             29         25463            122        110993
   instances_0800 sumCount_0800 instances_1600 sumCount_1600 calculation_1600
1:            480        467032           2048       2054509         4012.713
   calculation_0800 calculation_0400 calculation_0200
1:         3891.933         3639.115         3512.138

Square metric

  • e.g. the question post's (X-200, X+200) & (Y-200, Y+200)
    ...note... I overload some of the function names, so it is best to put this code in a separate script and run it in an R session separate from the circular-metric code

Similar to the circular-metric approach above, but executed slightly differently because it is based on bounds rather than distances. Basically, we make one big stacked copy of the data and compute the specific bounds for each distance. Those four "bounds" are natural candidates for chunking the data for parallelisation; this code goes a step further and splits each of the four in half, giving 8 groups, but we still run on 4 nodes. I tested a fair amount on 8 nodes because my test environment allowed it; I developed this code before I knew the OP had a 4-core environment.

library(data.table)
library(tictoc)
library(dplyr)
set.seed(123)
### Pick one:
### num.rows <- 4e4 ###     15 seconds on 8 nodes | 23 seconds on 4 nodes
 num.rows <- 1e5 ###        86 seconds on 8 nodes | 140.034 seconds on 4 nodes
### num.rows <- 2e5 ###    347 seconds on 8 nodes
### num.rows <- 4e5 ###   1370 seconds on 8 nodes   
### num.rows <- 8e5 ###   1370*4 =  90 minutes?
### num.rows <- 1.6e6 ###   1370*4*4 =  360 minutes = 6 hours (confirmed: 21742.233 / 60 / 60 = 6 hours)
### num.rows <- 2.5e6 ###     47523 / 60 / 60 = 13 hours and 12 minutes VERIFIED.
### num.rows <- 5.0e6 ###   4*47523 / 60 / 60 = ~53 hours estimated.

 # Input data
tbl.data <- data.table(
    X=runif(num.rows, min=200000, max=400000),
    Y=runif(num.rows, min=400000, max=500000),
    UID = 1:num.rows,
    Count = sample(1:2000,num.rows,replace=TRUE))

setkey(tbl.data, UID)

tbl.data

setkey(tbl.data, UID)

Create the super stack:

tbl.data.1600 <- copy(tbl.data)
tbl.data.0800 <- copy(tbl.data)
tbl.data.0400 <- copy(tbl.data)
tbl.data.0200 <- copy(tbl.data)

tbl.data.1600[, within_metric:=1600]
tbl.data.0800[, within_metric:= 800]
tbl.data.0400[, within_metric:= 400]
tbl.data.0200[, within_metric:= 200]

tbl.data.1600[, group:=1]
tbl.data.0800[, group:=2]
tbl.data.0400[, group:=3]
tbl.data.0200[, group:=4]

tbl.data.stack <- copy(rbind(tbl.data.1600, tbl.data.0800, tbl.data.0400, tbl.data.0200))

setkey(tbl.data.stack, group, UID)

tbl.data.stack[,X_low_bound:=X-within_metric]
tbl.data.stack[,X_upp_bound:=X+within_metric]

tbl.data.stack[,Y_low_bound:=Y-within_metric]
tbl.data.stack[,Y_upp_bound:=Y+within_metric]

Xvec <- tbl.data.stack[group==1]$X
Yvec <- tbl.data.stack[group==1]$Y
Cvec <- tbl.data.stack[group==1]$Count

I have included this commented-out block; it is how you would run this without parallelising. Feel free to skip ahead to the next block.

###################
## BEGIN ##########
###################
## no parallel. ###
## just stacked ###
###################

# instances_stacked <- function(x=Xvec,y=Yvec, count=Cvec, xlb, xub, ylb, yub){
#   
#   truth <- data.table::between(x, xlb, xub) & data.table::between(y, ylb, yub)
#   
#   c("instances" = sum( truth ), "sumCount"= sum( Cvec*truth))
#   
# }
# 
# 
# tic("external default vector approach -- STACKED -- all 4: ")
# results_stacked <- 
# tbl.data.stack[,
#          as.list(instances_stacked( xlb=X_low_bound, xub=X_upp_bound, ylb=Y_low_bound, yub=Y_upp_bound)),
#          by=c("group","UID")
# ]
# 
# results_stacked[,calculation:=sumCount/(instances*0.25)]
# toc()
# 
# head(results_stacked)

###################
## END.  ##########
###################
## no parallel. ###
## just stacked ###
###################

Below is the parallel code. I split into 8 groups, but changed the makeCluster line to 4 nodes.

###################
## BEGIN ##########
###################
## SOCKET.      ###
## ROCKET.      ###
###################
## follow the leader: https://stackoverflow.com/a/44814035/2727349
## step 1: change instances_stacked (which is our `foo`) so that it returns a list.  Also change it to where
##           it takes data table as an argument.
## step 2: use split_df -- keep in mind might need to make a concatenated grouping of "group" and "UID"

instances <- function(x=Xvec,y=Yvec, count=Cvec, xlb, xub, ylb, yub){
  
  truth <- data.table::between(x, xlb, xub) & data.table::between(y, ylb, yub)
  
  c("instances" = sum( truth ), "sumCount"= sum( Cvec*truth))
  
  
  
  
}

instances_stacked <- function(DT, x=Xvec,y=Yvec, count=Cvec){
  
  
  DT[,
     as.list(instances( xlb=X_low_bound, xub=X_upp_bound, ylb=Y_low_bound, yub=Y_upp_bound)),
     by=c("within_metric","UID")
     ]

  
}

split_df <- function(d, var) {
  base::split(d, get(var, as.environment(d)))
}

## define grps to be distance-bound and UID combo
tbl.data.stack[,grps:=paste0(within_metric,"_",UID)]
tbl.data.stack[, uniqueN(grps)]

## define group_half to be distance-bound and UID < 1/2 num.rows
tbl.data.stack[,group_half:=paste0(within_metric,"_", (UID < 0.5 * num.rows)+0L)]
tbl.data.stack[, uniqueN(group_half)]

foo2 <- function(dt) {
  #dt2 <- split_df(dt, "grps")
  #dt2 <- split_df(dt, "group")
  dt2 <- split_df(dt, "group_half")
  
  require(parallel)
  ##cl <- parallel::makeCluster(min(nrow(dt), parallel::detectCores()))
  #cl <- parallel::makeCluster(tbl.data.stack[, uniqueN(group_half)])
  cl <- parallel::makeCluster(4)
  print(cl)
  clusterExport(cl, varlist= "instances_stacked") ## this was the `foo` in https://stackoverflow.com/a/44814035/2727349
    clusterExport(cl, varlist= "instances")
  clusterExport(cl, varlist= c("Xvec","Yvec","Cvec")) 
  clusterExport(cl, varlist= "dt2", envir = environment())
  clusterEvalQ(cl, library("data.table"))

  dt2 <- parallel::parLapply(cl, X= dt2, fun=instances_stacked)

  parallel::stopCluster(cl)
  return(rbindlist(dt2))
}

tic("external default vector approach -- SOCKET -- all 4 in parallel: ")

results_stacked_socket <- foo2(tbl.data.stack)

results_stacked_socket[,calculation:=sumCount/(instances*0.25)]
toc()

setkey(results_stacked_socket, UID, within_metric)
head(results_stacked_socket)

###################
## END.  ##########
###################
## SOCKET.      ###
## ROCKET.      ###
###################

toiithl63#

As someone who freely admits others can do this better than I can, I will be watching for the solutions other people propose.
That said, I see some advantages in using the sf and spdep packages, both of which are designed for spatial data.
To start with, two functions.
Create an sf object:

x <- sf::st_as_sf(Data, coords = c("X", "Y"))

Distances can be calculated with

st_distance()
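
As a rough usage sketch (my own example, not from this answer): sf::st_is_within_distance() returns, for each feature, the indices of the features within a given distance, which covers the instance and Count summaries directly. The 100-point subset is only to keep the example cheap; running it over all 5 million points at once would likely be impractical.

library(sf)

x <- sf::st_as_sf(Data, coords = c("X", "Y"))

# Indices of all points within 200 of each of the first 100 points
nb_200 <- sf::st_is_within_distance(x[1:100, ], x, dist = 200)

Instances_200 <- lengths(nb_200)                                # includes the point itself
Count_200     <- sapply(nb_200, function(i) sum(Data$Count[i]))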


Of course, a solution must be found that does not compute all pairwise distances. A simple algorithm is to sort the points into bins half the size of the desired radius; then you only need to check the points in neighbouring bins.
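
A minimal sketch of that binning idea for a single radius (r = 200), using data.table; the column names gx/gy and the pairs table are my own, and this shows the mechanics rather than a tuned implementation:

library(data.table)

r <- 200
w <- r / 2                           # bin width = half the desired radius
dt <- as.data.table(Data)
dt[, `:=`(gx = floor(X / w), gy = floor(Y / w))]

# With bin width r/2, a point within r is at most 2 bins away in x and y,
# so join each point to the points in its surrounding 5 x 5 block of bins.
offsets <- CJ(dx = -2:2, dy = -2:2)
pairs <- rbindlist(lapply(seq_len(nrow(offsets)), function(i) {
  lhs <- dt[, .(UID, X, Y, jx = gx + offsets$dx[i], jy = gy + offsets$dy[i])]
  rhs <- dt[, .(X2 = X, Y2 = Y, Count2 = Count, jx = gx, jy = gy)]
  rhs[lhs, on = .(jx, jy), nomatch = NULL, allow.cartesian = TRUE]
}))

# Exact distance test only on the candidate pairs (the point itself is
# included, as in the question's loop), then summarise per UID
pairs <- pairs[sqrt((X - X2)^2 + (Y - Y2)^2) <= r]
Summary_200 <- pairs[, .(Instances_200 = .N, Count_200 = sum(Count2)), by = UID]

For the full 5 million points the candidate-pair table would still be large, so in practice you would process it in chunks (for example by the 5 km blocks used in the earlier answer).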


ecfsfe2w4#

Using data.table might help, but for this case I think the compute time will still be too long.
Maybe you want to try it anyway:

library(data.table)
tbl.data <- as.data.table(Data)

tbl.data[, Instances_200 := mapply(function (x,y) {tbl.data[X<x+200 & X > x -200
                                                               & Y < y+200 & Y > y -200, .N ]}, X, Y)]

tbl.data[, Count_200 := mapply(function (x,y) {tbl.data[X<x+200 & X > x -200
                                                            & Y < y+200 & Y > y -200, sum(Count) ]}, X, Y)]

tbl.data[, Calculation_200 := Count_200/(Instances_200*0.000625)]

And so on for the other distances.
