R语言 动态“跨行”相对频率

0md85ypi  于 12个月前  发布在  其他
关注(0)|答案(4)|浏览(83)

我有一个包含 n 个变量的数据集(示例中是 3 个,但实际个数不固定),我想计算每一行中各变量占该行总和的相对频率。这些变量的命名方式始终一致:相同的前缀后跟连续的数字。

# Example data: 20 rows and three integer columns (x1..x3), each value drawn
# with replacement from 1..10.
have <- data.frame(
  x1 = sample.int(10, 20, replace = TRUE),
  x2 = sample.int(10, 20, replace = TRUE),
  x3 = sample.int(10, 20, replace = TRUE)
)

# Desired result, written out by hand for the three example columns: each
# value divided by the sum of x1, x2 and x3 in the same row.
want <- have |>
  mutate(
    row_total = x1 + x2 + x3,
    x1_prop = x1 / row_total,
    x2_prop = x2 / row_total,
    x3_prop = x3 / row_total,
    row_total = NULL # drop the helper so only the *_prop columns are added
  )

我认为可以用dplyr的mutate(across(...))来解决,但我一直写不对语法……

# Failing attempt kept from the question. The parenthesis right after
# everything() closes across() before any function is supplied, so
# `. / rowSums(.)` is passed to mutate() as a separate argument. The native
# pipe |> does not bind a `.` placeholder, so `.` is presumably undefined there.
want <-
  have |>
  mutate(across(everything()), . / rowSums(.)) # does not work
wwodge7n

wwodge7n1#

几件事:

  • 在across中用~定义匿名函数(也可以使用\(x)或function(x))
  • 在across调用中使用pick(everything()),表示要对所有列按行求和。如果只使用.x,则只会得到当前列的和。
# For every column, divide by the per-row total of all columns and store the
# result in a new "<column>_prop" column next to the originals.
have |>
  mutate(across(
    everything(),
    # pick(everything()) hands the whole set of columns to rowSums(), so the
    # denominator is the row total rather than a sum over the single column .x.
    ~ .x / rowSums(pick(everything())),
    # {.col} is the documented glue placeholder for the current column name.
    .names = "{.col}_prop"
  ))

#    x1 x2 x3   x1_prop   x2_prop    x3_prop
# 1   9  5  2 0.5625000 0.3125000 0.12500000
# 2   6  4  6 0.3750000 0.2500000 0.37500000
# 3   9  6  1 0.5625000 0.3750000 0.06250000
# 4   8  7  8 0.3478261 0.3043478 0.34782609
# 5   1  8  7 0.0625000 0.5000000 0.43750000
# 6   9  3  9 0.4285714 0.1428571 0.42857143
# 7   7  4  8 0.3684211 0.2105263 0.42105263
# 8   3  5  5 0.2307692 0.3846154 0.38461538
# 9  10  8 10 0.3571429 0.2857143 0.35714286
# 10  5  2  6 0.3846154 0.1538462 0.46153846
# 11  3 10  1 0.2142857 0.7142857 0.07142857
# 12 10  8  2 0.5000000 0.4000000 0.10000000
# 13  3  7  6 0.1875000 0.4375000 0.37500000
# 14  4  4 10 0.2222222 0.2222222 0.55555556
# 15  2  7  4 0.1538462 0.5384615 0.30769231
# 16  2  3  2 0.2857143 0.4285714 0.28571429
# 17  3  9  1 0.2307692 0.6923077 0.07692308
# 18  8  6  2 0.5000000 0.3750000 0.12500000
# 19  2  2  8 0.1666667 0.1666667 0.66666667
# 20  7  4  3 0.5000000 0.2857143 0.21428571
e0bqpujr

e0bqpujr2#

例如,尝试rowSums

> # Divide each value by its row total, suffix the names with "_prop", append to the data.
> cbind(
+   have,
+   setNames(have / rowSums(have), sprintf("%s_prop", names(have)))
+ )
   x1 x2 x3   x1_prop   x2_prop    x3_prop
1   8  4  8 0.4000000 0.2000000 0.40000000
2   7  7  9 0.3043478 0.3043478 0.39130435
3   2  6  8 0.1250000 0.3750000 0.50000000
4   8  2 10 0.4000000 0.1000000 0.50000000
5   7  3  6 0.4375000 0.1875000 0.37500000
6   8  7  6 0.3809524 0.3333333 0.28571429
7   5  3  7 0.3333333 0.2000000 0.46666667
8   4  8  7 0.2105263 0.4210526 0.36842105
9   7  3  4 0.5000000 0.2142857 0.28571429
10 10  3  5 0.5555556 0.1666667 0.27777778
11  6  9  7 0.2727273 0.4090909 0.31818182
12 10  2  6 0.5555556 0.1111111 0.33333333
13  8  5  3 0.5000000 0.3125000 0.18750000
14 10  5  5 0.5000000 0.2500000 0.25000000
15  4 10  2 0.2500000 0.6250000 0.12500000
16  8 10  1 0.4210526 0.5263158 0.05263158
17  8  8  3 0.4210526 0.4210526 0.15789474
18  7  1  2 0.7000000 0.1000000 0.20000000
19 10  8  8 0.3846154 0.3076923 0.30769231
20  9  7  9 0.3600000 0.2800000 0.36000000
zazmityj

zazmityj3#

下面是一个base R方法,对数据使用prop.table():

# Example data: 20 rows and three integer columns (x1..x3), each value drawn
# with replacement from 1..10.
have <- data.frame(
  x1 = sample.int(10, 20, replace = TRUE),
  x2 = sample.int(10, 20, replace = TRUE),
  x3 = sample.int(10, 20, replace = TRUE)
)

# prop.table() with margin 1 divides every entry by the sum of its row.
props <- prop.table(as.matrix(have), margin = 1)
# Mark the proportion columns with a "_prop" suffix before binding them on.
colnames(props) <- paste0(colnames(props), "_prop")
cbind(have, props)
#>    x1 x2 x3   x1_prop    x2_prop   x3_prop
#> 1   8  5  3 0.5000000 0.31250000 0.1875000
#> 2   3  5  6 0.2142857 0.35714286 0.4285714
#> 3   9  6  9 0.3750000 0.25000000 0.3750000
#> 4   8 10  7 0.3200000 0.40000000 0.2800000
#> 5   6 10  8 0.2500000 0.41666667 0.3333333
#> 6  10  5  4 0.5263158 0.26315789 0.2105263
#> 7  10  8  4 0.4545455 0.36363636 0.1818182
#> 8   7  5  8 0.3500000 0.25000000 0.4000000
#> 9   5  2  8 0.3333333 0.13333333 0.5333333
#> 10  3  4  8 0.2000000 0.26666667 0.5333333
#> 11  3  2  1 0.5000000 0.33333333 0.1666667
#> 12  8  1  5 0.5714286 0.07142857 0.3571429
#> 13  7 10  3 0.3500000 0.50000000 0.1500000
#> 14  5 10  5 0.2500000 0.50000000 0.2500000
#> 15  1  4  4 0.1111111 0.44444444 0.4444444
#> 16  9  6  1 0.5625000 0.37500000 0.0625000
#> 17  5  7  7 0.2631579 0.36842105 0.3684211
#> 18  7  3  7 0.4117647 0.17647059 0.4117647
#> 19  2  9  5 0.1250000 0.56250000 0.3125000
#> 20  4  1  5 0.4000000 0.10000000 0.5000000

创建于2023-09-20使用reprex v2.0.2
以下是使用现有数据对上述几种方法进行的基准测试:

library(dplyr)  
  # Benchmark input: 20 rows and three integer columns (x1..x3), each value
  # drawn with replacement from 1..10.
  have <- data.frame(
    x1 = sample.int(10, 20, replace = TRUE),
    x2 = sample.int(10, 20, replace = TRUE),
    x3 = sample.int(10, 20, replace = TRUE)
  )
  
# Benchmark candidate 1: a single across() call whose lambda computes the row
# total with pick(everything()) for each column it is applied to.
#
# data: data frame to process. Defaults to the global `have`, so existing
#       f1() calls keep their behaviour.
# Returns `data` with one "<column>_prop" column added per input column.
f1 <- function(data = have) {
  data |>
    mutate(across(
      everything(),
      ~ .x / rowSums(pick(everything())),
      # {.col} is the documented glue placeholder for the current column name.
      .names = "{.col}_prop"
    ))
}

# Benchmark candidate 2: compute the row total once, keep it as `total`, then
# divide every x-prefixed column by it.
#
# data: data frame to process. Defaults to the global `have`, so existing
#       f2() calls keep their behaviour.
# Returns `data` plus a `total` column and one "<column>_prop" column per
# x-prefixed column.
f2 <- function(data = have) {
  data |>
    mutate(
      # pick() selects columns without applying a function; dplyr >= 1.1.0
      # recommends it over across() called with no function.
      total = rowSums(pick(starts_with("x"))),
      across(starts_with("x"), \(x) x / total, .names = "{.col}_prop")
    )
}

# Benchmark candidate 3: base R row proportions via prop.table().
#
# data: data frame of numeric columns. Defaults to the global `have`, so
#       existing f3() calls keep their behaviour.
# Returns `data` with one "<column>_prop" column bound on per input column.
f3 <- function(data = have) {
  # Margin 1 divides every entry by the sum of its row.
  props <- prop.table(as.matrix(data), margin = 1)
  colnames(props) <- paste0(colnames(props), "_prop")
  cbind(data, props)
}
library(microbenchmark)
# Time the three implementations against the current global `have`.
microbenchmark(f1(), f2(), f3())
#> Warning in microbenchmark(f1(), f2(), f3()): less accurate nanosecond times to
#> avoid potential integer overflows
#> Unit: microseconds
#>  expr      min       lq      mean   median       uq       max neval cld
#>  f1() 2609.199 2701.962 3180.9354 2869.119 3203.515 15608.577   100 a  
#>  f2() 1381.741 1470.875 1903.8781 1546.090 1804.389  9406.753   100  b 
#>  f3()   79.171   98.113  135.9298  113.652  132.143  1543.486   100   c

创建于2023-09-20使用reprex v2.0.2
我在其他很多地方也看到过这种情况:数据量小时,tidyverse操作自身的开销会让它们显得更慢;而数据量变大后,tidyverse操作反而优于其他一些做法。下面是同样的基准测试,但使用20000个观测值而不是20个。

# Larger benchmark input: 20000 rows and three integer columns (x1..x3), each
# value drawn with replacement from 1..10.
have <- data.frame(
  x1 = sample.int(10, 20000, replace = TRUE),
  x2 = sample.int(10, 20000, replace = TRUE),
  x3 = sample.int(10, 20000, replace = TRUE)
)

# Re-run the same benchmark; f1(), f2() and f3() now read the 20000-row `have`.
microbenchmark(f1(), f2(), f3())
#> Warning in microbenchmark(f1(), f2(), f3()): less accurate nanosecond times to
#> avoid potential integer overflows
#> Unit: milliseconds
#>  expr       min        lq      mean    median        uq       max neval cld
#>  f1()  3.164011  3.326145  3.929811  3.671857  3.939916  7.224569   100 a  
#>  f2()  1.646273  1.768474  2.138184  1.893769  2.175686  8.752803   100  b 
#>  f3() 12.781504 14.658791 16.305533 15.854946 16.253732 58.099706   100   c

创建于2023-09-20使用reprex v2.0.2

fnvucqvd

fnvucqvd4#

# Compute the row total once, keep it as `total`, then divide every x-prefixed
# column by it to produce the "<column>_prop" columns.
have |>
  mutate(
    # pick() selects columns without applying a function; dplyr >= 1.1.0
    # recommends it over across() called with no function.
    total = rowSums(pick(starts_with("x"))),
    across(starts_with("x"), \(x) x / total, .names = "{.col}_prop")
  )
#    x1 x2 x3 total    x1_prop   x2_prop    x3_prop
# 1   8  3  2    13 0.61538462 0.2307692 0.15384615
# 2   7  5  1    13 0.53846154 0.3846154 0.07692308
# 3   2  5  5    12 0.16666667 0.4166667 0.41666667
# 4   5 10 10    25 0.20000000 0.4000000 0.40000000
# 5   1  7  4    12 0.08333333 0.5833333 0.33333333
# 6   5  5  1    11 0.45454545 0.4545455 0.09090909
# 7  10  7  9    26 0.38461538 0.2692308 0.34615385
# 8   1  6  5    12 0.08333333 0.5000000 0.41666667
# ...

相关问题