在R中使用自定义序列排序数据

j2qf4p5b  于 2023-04-09  发布在  其他
关注(0)|答案(2)|浏览(121)

我有一个问题与排序数据的基础上的文件的名称。首先,我有一个列表文件包含20 netcdf数据。这是数据列表:

file_all <- list.files(pattern=glob2rx("*.nc"))
> file_all
 [1] "bs-cnrmaladin-djf-tas-cru.nc"   "bs-cnrmaladin-jja-tas-cru.nc"  
 [3] "bs-cnrmaladin-mam-tas-cru.nc"   "bs-cnrmaladin-son-tas-cru.nc"  
 [5] "bs-ecearthcclm-djf-tas-cru.nc"  "bs-ecearthcclm-jja-tas-cru.nc" 
 [7] "bs-ecearthcclm-mam-tas-cru.nc"  "bs-ecearthcclm-son-tas-cru.nc" 
 [9] "bs-hadgemhirham-djf-tas-cru.nc" "bs-hadgemhirham-jja-tas-cru.nc"
[11] "bs-hadgemhirham-mam-tas-cru.nc" "bs-hadgemhirham-son-tas-cru.nc"
[13] "bs-mpiracmo-djf-tas-cru.nc"     "bs-mpiracmo-jja-tas-cru.nc"    
[15] "bs-mpiracmo-mam-tas-cru.nc"     "bs-mpiracmo-son-tas-cru.nc"    
[17] "bs-noresmremo-djf-tas-cru.nc"   "bs-noresmremo-jja-tas-cru.nc"  
[19] "bs-noresmremo-mam-tas-cru.nc"   "bs-noresmremo-son-tas-cru.nc"

我需要先对它进行排序,因为我需要顺序数据。我想要的顺序是:

djf -> mam -> jja -> son

这是我期望的输出顺序:

[1] "bs-cnrmaladin-djf-tas-cru.nc"   "bs-cnrmaladin-mam-tas-cru.nc"  
[3] "bs-cnrmaladin-jja-tas-cru.nc"   "bs-cnrmaladin-son-tas-cru.nc"  
[5] "bs-ecearthcclm-djf-tas-cru.nc"  "bs-ecearthcclm-mam-tas-cru.nc" 
[7] "bs-ecearthcclm-jja-tas-cru.nc"  "bs-ecearthcclm-son-tas-cru.nc" 
[9] "bs-hadgemhirham-djf-tas-cru.nc" "bs-hadgemhirham-mam-tas-cru.nc"
[11] "bs-hadgemhirham-jja-tas-cru.nc" "bs-hadgemhirham-son-tas-cru.nc"
[13] "bs-mpiracmo-djf-tas-cru.nc"     "bs-mpiracmo-mam-tas-cru.nc"    
[15] "bs-mpiracmo-jja-tas-cru.nc"     "bs-mpiracmo-son-tas-cru.nc"    
[17] "bs-noresmremo-djf-tas-cru.nc"   "bs-noresmremo-mam-tas-cru.nc"  
[19] "bs-noresmremo-jja-tas-cru.nc"   "bs-noresmremo-son-tas-cru.nc"

如果有人知道如何排序与此序列请帮助我。提前感谢

smdnsysy

smdnsysy1#

您可以提取字符串的这一部分并将其与您的订单匹配,即

my_order <- c("djf", "mam", "jja", "son")
i1 <- gsub(".*-(.*)-tas.*", "\\1", file_all)
i2 <- gsub("bs-([^-]*)-.*", "\\1", file_all)

file_all[order(i2, match(i1, my_order))]

 [1] "bs-cnrmaladin-djf-tas-cru.nc"   "bs-cnrmaladin-mam-tas-cru.nc"   "bs-cnrmaladin-jja-tas-cru.nc"   "bs-cnrmaladin-son-tas-cru.nc"  
 [5] "bs-ecearthcclm-djf-tas-cru.nc"  "bs-ecearthcclm-mam-tas-cru.nc"  "bs-ecearthcclm-jja-tas-cru.nc"  "bs-ecearthcclm-son-tas-cru.nc" 
 [9] "bs-hadgemhirham-djf-tas-cru.nc" "bs-hadgemhirham-mam-tas-cru.nc" "bs-hadgemhirham-jja-tas-cru.nc" "bs-hadgemhirham-son-tas-cru.nc"
[13] "bs-mpiracmo-djf-tas-cru.nc"     "bs-mpiracmo-mam-tas-cru.nc"     "bs-mpiracmo-jja-tas-cru.nc"     "bs-mpiracmo-son-tas-cru.nc"    
[17] "bs-noresmremo-djf-tas-cru.nc"   "bs-noresmremo-mam-tas-cru.nc"   "bs-noresmremo-jja-tas-cru.nc"   "bs-noresmremo-son-tas-cru.nc"

以下是一些备选方案:
使用strsplit。这可以提高效率,因为它不使用正则表达式。

split_files <- strsplit(strings, "-")
i3 <- sapply(split_files, "[[", 3)
i4 <- sapply(split_files, "[[", 2)
file_all[order(i4, match(i3, my_order))]
xzlaal3s

xzlaal3s2#

有点冗长,但您可以使用基于tidyverse的解决方案:

library(dplyr)
library(tidyr)

file_all %>% 
  data.frame(model = .) %>% 
  separate_wider_delim(model, delim = "-", names_sep = "", cols_remove = FALSE) %>% 
  mutate(model3 = match(model3, my_order)) %>% 
  arrange(model2, model3) %>% 
  pull(modelmodel)

这个返回

[1] "bs-cnrmaladin-djf-tas-cru.nc"   "bs-cnrmaladin-mam-tas-cru.nc"   "bs-cnrmaladin-jja-tas-cru.nc"  
 [4] "bs-cnrmaladin-son-tas-cru.nc"   "bs-ecearthcclm-djf-tas-cru.nc"  "bs-ecearthcclm-mam-tas-cru.nc" 
 [7] "bs-ecearthcclm-jja-tas-cru.nc"  "bs-ecearthcclm-son-tas-cru.nc"  "bs-hadgemhirham-djf-tas-cru.nc"
[10] "bs-hadgemhirham-mam-tas-cru.nc" "bs-hadgemhirham-jja-tas-cru.nc" "bs-hadgemhirham-son-tas-cru.nc"
[13] "bs-mpiracmo-djf-tas-cru.nc"     "bs-mpiracmo-mam-tas-cru.nc"     "bs-mpiracmo-jja-tas-cru.nc"    
[16] "bs-mpiracmo-son-tas-cru.nc"     "bs-noresmremo-djf-tas-cru.nc"   "bs-noresmremo-mam-tas-cru.nc"  
[19] "bs-noresmremo-jja-tas-cru.nc"   "bs-noresmremo-son-tas-cru.nc"

数据

file_all <- c("bs-cnrmaladin-djf-tas-cru.nc"   ,"bs-cnrmaladin-jja-tas-cru.nc"  ,
              "bs-cnrmaladin-mam-tas-cru.nc"   ,"bs-cnrmaladin-son-tas-cru.nc"  ,
              "bs-ecearthcclm-djf-tas-cru.nc"  ,"bs-ecearthcclm-jja-tas-cru.nc" ,
              "bs-ecearthcclm-mam-tas-cru.nc"  ,"bs-ecearthcclm-son-tas-cru.nc" ,
              "bs-hadgemhirham-djf-tas-cru.nc" ,"bs-hadgemhirham-jja-tas-cru.nc",
              "bs-hadgemhirham-mam-tas-cru.nc" ,"bs-hadgemhirham-son-tas-cru.nc",
              "bs-mpiracmo-djf-tas-cru.nc"     ,"bs-mpiracmo-jja-tas-cru.nc"    ,
              "bs-mpiracmo-mam-tas-cru.nc"     ,"bs-mpiracmo-son-tas-cru.nc"    ,
              "bs-noresmremo-djf-tas-cru.nc"   ,"bs-noresmremo-jja-tas-cru.nc"  ,
              "bs-noresmremo-mam-tas-cru.nc"   ,"bs-noresmremo-son-tas-cru.nc" )

my_order <- c("djf", "mam", "jja", "son")

基准测试

由于我很好奇,我做了一个小的基准测试@Sotos答案与这个答案的变体和基于tidyverse的解决方案:

library(microbenchmark)

my_order <- c("djf", "mam", "jja", "son")
my_files_2 <- sample(file_all, 100000, replace = TRUE)

microbenchmark(
  sotos = my_files_2[order(match(gsub(".*-(.*)-tas.*", "\\1", my_files_2), my_order))],
  sotos_2 = { split_files <- strsplit(my_files_2, "-")
  i3 <- unlist(lapply(split_files, "[[", 3))
  i4 <- unlist(lapply(split_files, "[[", 2))
  my_files_2[order(i4, match(i3, my_order))] },
  sotos_3 = { split_files <- strsplit(my_files_2, "-", fixed = TRUE)
  i3 <- unlist(lapply(split_files, "[[", 3))
  i4 <- unlist(lapply(split_files, "[[", 2))
  my_files_2[order(i4, match(i3, my_order))] },
  alt = my_files_2[unlist(lapply(my_order, \(n) which(grepl(n, my_files_2))))],
  tidyverse = my_files_2 %>% 
    data.frame(model = .) %>% 
    separate_wider_delim(model, delim = "-", names_sep = "", cols_remove = FALSE) %>% 
    mutate(model3 = match(model3, my_order)) %>% 
    arrange(model2, model3) %>% 
    pull(modelmodel)
)

结果让我相当吃惊

#> Unit: milliseconds
#>       expr      min       lq      mean    median        uq      max neval
#>      sotos 126.5779 127.5660 129.03582 128.17515 128.83190 199.8997   100
#>    sotos_2 403.7318 412.5335 430.80871 419.50455 433.35235 511.1252   100
#>    sotos_3 220.1570 228.3046 244.84893 235.50230 246.01510 314.8171   100
#>        alt 113.9459 114.7081 115.86338 115.50140 116.37555 121.4351   100
#>  tidyverse  75.5209  83.2439  93.82626  85.46735  92.48615 183.7602   100

以下是基于my_list_2大小的几个基准测试结果:

my_files_2 <- sample(file_all, 100, replace = TRUE)
#> Unit: microseconds
#>       expr    min      lq     mean  median      uq     max neval
#>      sotos  155.6  162.95  175.719  173.70  184.05   263.3   100
#>    sotos_2  399.6  416.35  446.641  427.35  448.80   705.4   100
#>    sotos_3  217.6  229.60  278.047  237.20  252.25  3846.8   100
#>        alt  139.6  150.00  169.377  154.75  160.30  1452.0   100
#>  tidyverse 7912.6 8265.80 8927.205 8529.85 8987.45 14056.3   100

my_files_2 <- sample(file_all, 1000, replace = TRUE)
#> Unit: milliseconds
#>       expr    min      lq     mean  median      uq     max neval
#>      sotos 1.2786 1.31485 1.368087 1.33890 1.37390  1.9579   100
#>    sotos_2 3.6715 3.77725 3.849989 3.83145 3.90755  4.3005   100
#>    sotos_3 1.8969 1.94865 2.004843 1.97525 2.01915  2.4871   100
#>        alt 1.1500 1.18695 1.229627 1.20130 1.22640  2.4962   100
#>  tidyverse 8.6329 8.99985 9.710873 9.21335 9.63085 14.6684   100

my_files_2 <- sample(file_all, 10000, replace = TRUE)
#> Unit: milliseconds
#>       expr     min       lq     mean   median       uq     max neval
#>      sotos 12.5224 12.81450 12.92100 12.89505 13.00510 13.5006   100
#>    sotos_2 37.5119 38.09420 38.98338 38.34025 38.76710 51.7555   100
#>    sotos_3 19.3073 19.70905 20.63162 19.88925 20.29015 25.1160   100
#>        alt 11.2854 11.54560 11.74392 11.63175 11.72850 15.7317   100
#>  tidyverse 14.4663 15.03035 16.42199 15.27875 15.49390 83.9643   100

~20000的样本大小下,tidyverse解决方案似乎领先。

相关问题