在base R或tidyverse中,是否有替代reshape2::melt的多维数组?

gv8xihay  于 2023-06-03  发布在  其他
关注(0)|答案(4)|浏览(230)

bounty还有4天到期。回答此问题可获得+100声望奖励。Ben Bolker希望吸引更多的注意力这个问题:这是stackoverflow.com/questions/75228268/…的一个同伴问题,但是矩阵的优雅答案在高维中不起作用。评论里有一些答案,可以清理一下贴出来或者有人能想出更好的...

假设我有一个三维数组g,维度为[x,y,z]reshape2::melt(g)将产生具有给出索引x,y,zvalue的列的 Dataframe ,其中value包含先前阵列的每个条目中的值。
假设reshape2superseded,那么在base R中是否有一个“one function”替代reshape2::melt的功能,或者我缺少的一个更积极支持的tidyverse包?
reshape2建议人们使用tidyr,但我似乎无法在tidyr中找到多维数组的解决方案。有cubelyr,但似乎也不是很活跃,这些天。
我可以编写一个自定义解决方案,只需要为这类问题寻找一个稳定的、具有reshape2::melt简单功能的解决方案。

library(reshape2)

g_as_array <- array(rnorm(9), dim = c(3,3,3)) # create a 3D array

g_as_data_frame <- reshape2::melt(g_as_array) # melt down to "tidy" format

head(g_as_data_frame)
#>   Var1 Var2 Var3      value
#> 1    1    1    1  1.4092362
#> 2    2    1    1 -2.1606972
#> 3    3    1    1  0.4334404
#> 4    1    2    1  0.2390544
#> 5    2    2    1 -0.9673617
#> 6    3    2    1  0.5668378

reprex package(v2.0.1)于2022-08-25创建

pbossiut

pbossiut1#

a <- array(1:27, dim = c(3,3,3))

library(reshape2)
DF1 <- melt(a)

DF2 <- data.frame(
  expand.grid(lapply(dim(a), seq_len)),
  value = as.vector(a)
)

identical(DF1, DF2)
#[1] TRUE

如果数组具有维名称:

a <-array(letters[1:27], dim = c(3, 3, 3), dimnames = list(letters[1:3],
                                                           letters[4:6],
                                                           letters[7:9]))

library(reshape2)
DF1 <- melt(a)
    
DF2 <- data.frame(
  expand.grid(dimnames(a)),
  value = as.vector(a)
)

identical(DF1, DF2)
#[1] TRUE

如果并非所有维都有名称,则需要首先填写缺少的名称,例如:

Map(\(x, y) if (is.null(x)) seq_len(y) else x , dimnames(a), dim(a))
bq8i3lrv

bq8i3lrv2#

一个选项是使用arrayInd

A <- array(1:8, c(2,2,2))

data.frame(arrayInd(seq_along(A), dim(A)), value = as.vector(A))
#  X1 X2 X3 value
#1  1  1  1     1
#2  2  1  1     2
#3  1  2  1     3
#4  2  2  1     4
#5  1  1  2     5
#6  2  1  2     6
#7  1  2  2     7
#8  2  2  2     8

或者类似于@ThomasIsCoding使用which

data.frame(which(array(TRUE, dim(A)), arr.ind = TRUE), value = as.vector(A))
#  dim1 dim2 dim3 value
#1    1    1    1     1
#2    2    1    1     2
#3    1    2    1     3
#4    2    2    1     4
#5    1    1    2     5
#6    2    1    2     6
#7    1    2    2     7
#8    2    2    2     8

如果数组具有维度名称。

A <- array(1:8, c(2,2,2), list(X=c("a","b"), Y=c("c","d"), Z=c("e","f")))

i <- arrayInd(seq_along(A), dim(A), dimnames(A), TRUE)
data.frame(mapply(`[`, dimnames(A), asplit(i, 2)), value = as.vector(A))
#  X Y Z value
#1 a c e     1
#2 b c e     2
#3 a d e     3
#4 b d e     4
#5 a c f     5
#6 b c f     6
#7 a d f     7
#8 b d f     8

但这可以通过as.data.frame(ftable(A))@Jon Spring或as.data.frame.table(A)@Onyambu来实现,如评论中所示。
如果查看as.data.frame.table的源代码,您会发现它使用的是expand.grid

as.data.frame.table(A)    #@Onyambu.
#as.data.frame(ftable(A)) #@Jon Spring
#  X Y Z Freq
#1 a c e    1
#2 b c e    2
#3 a d e    3
#4 b d e    4
#5 a c f    5
#6 b c f    6
#7 a d f    7
#8 b d f    8

但如果需要 * 数字索引 *,则可以使用此。

sapply(as.data.frame.table(A), unclass)
#     X Y Z Freq
#[1,] 1 1 1    1
#[2,] 2 1 1    2
#[3,] 1 2 1    3
#[4,] 2 2 1    4
#[5,] 1 1 2    5
#[6,] 2 1 2    6
#[7,] 1 2 2    7
#[8,] 2 2 2    8

或者一个变体-感谢@Onyambu的提示!

type.convert(as.data.frame.table(`dimnames<-`(A, NULL),
             base = list(as.character(seq_len(max(dim(A)))))), as.is = TRUE)
#  Var1 Var2 Var3 Freq
#1    1    1    1    1
#2    2    1    1    2
#3    1    2    1    3
#4    2    2    1    4
#5    1    1    2    5
#6    2    1    2    6
#7    1    2    2    7
#8    2    2    2    8

另一种选择是使用%%%/%“手工”计算。

cbind(1 + mapply(`%%`,
    Reduce(`%/%`, dim(A)[-length(dim(A))], 0:(length(A)-1), accumulate = TRUE),
    dim(A)), Value=as.vector(A))
#           Value
#[1,] 1 1 1     1
#[2,] 2 1 1     2
#[3,] 1 2 1     3
#[4,] 2 2 1     4
#[5,] 1 1 2     5
#[6,] 2 1 2     6
#[7,] 1 2 2     7
#[8,] 2 2 2     8

#Alternative
. <- 0:(length(A)-1)
cbind(1 +
    t(t(cbind(., outer(., cumprod(dim(A)[-length(dim(A))]), `%/%`))) %% dim(A)),
    Value=A)

或者使用rep

list2DF(c(Map(\(i, j, n) rep(rep(1:i, each=j), length.out=n),
    dim(A),
    c(1, cumprod(dim(A)[-length(dim(A))])),
    length(A)), Value=list(as.vector(A))))
#        Value
#1 1 1 1     1
#2 2 1 1     2
#3 1 2 1     3
#4 2 2 1     4
#5 1 1 2     5
#6 2 1 2     6
#7 1 2 2     7
#8 2 2 2     8
kkih6yb8

kkih6yb83#

下面是一些使用which技巧的R基替代方案,它们应该适用于一般数组,即数字和字符:

  1. which(1^is.na(g) > 0, arr.ind = TRUE)
cbind(as.data.frame(which(1^is.na(g) > 0, arr.ind = TRUE)), value = c(g))
  1. which(TRUE | is.na(g), arr.ind = TRUE)
cbind(as.data.frame(which(TRUE | is.na(g), arr.ind = TRUE)), value = c(g))
  1. nchar(g, "width") > -1
cbind(as.data.frame(which(nchar(g, "width") > -1, arr.ind = TRUE)), value = c(g))

我们将获得

dim1 dim2 dim3 value
1     1    1    1     a
2     2    1    1     b
3     3    1    1     c
4     1    2    1     d
5     2    2    1     e
6     3    2    1     f
7     1    3    1     g
8     2    3    1     h
9     3    3    1     i
10    1    1    2     j
11    2    1    2     k
12    3    1    2     l
13    1    2    2     m
14    2    2    2     n
15    3    2    2     o
16    1    3    2     p
17    2    3    2     q
18    3    3    2     r
19    1    1    3     s
20    2    1    3     t
21    3    1    3     u
22    1    2    3     v
23    2    2    3     w
24    3    2    3     x
25    1    3    3     y
26    2    3    3     z
27    3    3    3  <NA>

虚拟数据

> (g <- array(letters[1:27], dim = c(3, 3, 3)))
, , 1

     [,1] [,2] [,3]
[1,] "a"  "d"  "g"
[2,] "b"  "e"  "h"
[3,] "c"  "f"  "i"

, , 2

     [,1] [,2] [,3]
[1,] "j"  "m"  "p"
[2,] "k"  "n"  "q"
[3,] "l"  "o"  "r"

, , 3

     [,1] [,2] [,3]
[1,] "s"  "v"  "y"
[2,] "t"  "w"  "z"
[3,] "u"  "x"  NA
pbpqsu0x

pbpqsu0x4#

对标,好玩而已

以下是针对不同维度的数组的一些有趣的基准测试观察结果(不考虑维度名称以进行简化),其中考虑了对所发布问题的多个现有解决方案。
下面是关于随机数组的维数参数的基准测试函数

library(microbenchmark)
library(data.table)

fbench <- function(dims) {
    expgrd <- function(g) {
        data.frame(expand.grid(lapply(dim(g), seq_len)), value = as.vector(g))
    }

    arrind <- function(g) {
        data.frame(arrayInd(seq_along(g), dim(g)), value = as.vector(g))
    }

    which0 <- function(g) {
        data.frame(which(array(TRUE, dim(g)), arr.ind = TRUE), value = as.vector(g))
    }

    which1 <- function(g) {
        cbind(as.data.frame(which(1^is.na(g) > 0, arr.ind = TRUE)), value = c(g))
    }

    which2 <- function(g) {
        cbind(as.data.frame(which(TRUE | is.na(g), arr.ind = TRUE)), value = c(g))
    }

    which3 <- function(g) {
        cbind(as.data.frame(which(nchar(g, "width") > -1, arr.ind = TRUE)), value = c(g))
    }

    dftable0 <- function(g) {
        list2DF(lapply(as.data.frame.table(g), unclass))
    }

    dftable1 <- function(g) {
        list2DF(lapply(as.data.frame(ftable(g)), unclass))
    }

    dttable <- function(g) {
        as.data.table(g, sorted = FALSE, na.rm = FALSE)
    }

    set.seed(0)
    g <- array(sample(prod(dims)), dim = dims)
    mbm <- microbenchmark(
        expgrd(g),
        arrind(g),
        which0(g),
        which1(g),
        which2(g),
        which3(g),
        dftable0(g),
        dftable1(g),
        dttable(g),
        check = "equivalent"
    )

    boxplot(mbm, main = sprintf("dim = [%s]", toString(dims)))
}

1.对于dim <- rep(5, 3),我们运行fbench(dims)并获得

1.对于dims <- rep(5, 4),我们运行fbench(dims)并获得

1.对于dims <- rep(5, 5),我们运行fbench(dims)并获得

1.对于dims <- rep(5, 6),我们运行fbench(dims)并获得

1.对于dims <- rep(5, 7),我们运行fbench(dims)并获得

1.对于dims <- rep(5, 8),我们运行fbench(dims)并获得

相关问题