R reform()非常慢

mf98qq94  于 2023-10-13  发布在  其他
关注(0)|答案(3)|浏览(69)

我需要对数据进行简单的整形,从长到宽,这需要在R中工作。对于这个用例,reshape()似乎非常慢(尽管Assert它非常快https://stackoverflow.com/a/12073077/3017280)。这个例子是我的数据的合理近似。我知道在这个例子中我不需要两个Index列,但是我在真实的数据中需要。在我的笔记本电脑上,10,000行需要3秒,40,000行需要200多秒。真实的数据有超过一百万行,所以repeat()显然是不可行的。有人能解释一下为什么在这种情况下需要这么长时间吗?我使用split / lapply / Reduce + merge解决了这个问题,这是笨拙的,但非常快。

n <- 5000
dfLong <- data.frame(Index1 = rep(sample(1E6:2E6, n), 4),
                  Index2 = rep(sample(3E6:4E6, n), 4),
                  Key = rep(1:4, each = n),
                  Date = sample(seq.Date(as.Date("2020-01-01"),
                                         as.Date("2021-12-31"), 
                                         by = "1 day"),
                                size = n * 4, replace = TRUE),
                  Score = sample(0:48, n * 4, replace = TRUE))
                                
system.time(dfWide <- reshape(data = dfLong,
          v.names = c("Date", "Score"),
          timevar = "Key",
          idvar = c("Index1", "Index2"),
          sep = "_Q",
          direction = "wide"))
n3h0vuf2

n3h0vuf21#

如果您查看reshape使用profvis包调用的函数,您可以看到几乎所有的总时间都花在函数的这一行上。interaction函数仅用于将两个id列合并合并为一列。

data[, tempidname] <- interaction(data[, idvar], 
                drop = TRUE)

而不是interaction,您可以使用do.call(paste0, data[, idvar])。您可以使用一个函数来创建一个interaction等于这个更快的函数的环境。

new_reshape <- function(...){
  interaction <- function(x, drop) do.call(paste0, x)
  environment(reshape) <- environment()
  reshape(...)
}

现在快多了

system.time(dfWide <- reshape(data = dfLong,
          v.names = c("Date", "Score"),
          timevar = "Key",
          idvar = c("Index1", "Index2"),
          sep = "_Q",
          direction = "wide"))
 #   user  system elapsed 
 # 35.292   0.538  36.236 

system.time(new_dfWide <- new_reshape(data = dfLong,
          v.names = c("Date", "Score"),
          timevar = "Key",
          idvar = c("Index1", "Index2"),
          sep = "_Q",
          direction = "wide"))

  # user  system elapsed 
  # 0.071   0.009   0.081  

all.equal(new_dfWide, dfWide)
# [1] TRUE

您可以使用plyr:::ninteraction来实现更快的速度。这个函数唯一的非基础依赖是plyr:::id_var,它没有依赖,这意味着如果你不能安装软件包,你可以很容易地复制粘贴这个函数定义(添加一个注解)。

new_reshape <- function(...){
  # interaction = plyr:::ninteraction
  # id_var = plyr:::id_var
  interaction <- 
    function (.variables, drop = FALSE) 
    {
        lengths <- vapply(.variables, length, integer(1))
        .variables <- .variables[lengths != 0]
        if (length(.variables) == 0) {
            n <- nrow(.variables) %||% 0L
            return(structure(seq_len(n), n = n))
        }
        if (length(.variables) == 1) {
            return(id_var(.variables[[1]], drop = drop))
        }
        ids <- rev(lapply(.variables, id_var, drop = drop))
        p <- length(ids)
        ndistinct <- vapply(ids, attr, "n", FUN.VALUE = numeric(1), 
            USE.NAMES = FALSE)
        n <- prod(ndistinct)
        if (n > 2^31) {
            char_id <- do.call("paste", c(ids, sep = "\r"))
            res <- match(char_id, unique(char_id))
        }
        else {
            combs <- c(1, cumprod(ndistinct[-p]))
            mat <- do.call("cbind", ids)
            res <- c((mat - 1L) %*% combs + 1L)
        }
        attr(res, "n") <- n
        if (drop) {
            id_var(res, drop = TRUE)
        }
        else {
            structure(as.integer(res), n = attr(res, "n"))
        }
    }  
  id_var <- 
  function (x, drop = FALSE) 
  {
      if (length(x) == 0) 
          return(structure(integer(), n = 0L))
      if (!is.null(attr(x, "n")) && !drop) 
          return(x)
      if (is.factor(x) && !drop) {
          x <- addNA(x, ifany = TRUE)
          id <- as.integer(x)
          n <- length(levels(x))
      }
      else {
          levels <- sort(unique(x), na.last = TRUE)
          id <- match(x, levels)
          n <- max(id)
      }
      structure(id, n = n)
  }
  environment(reshape) <- environment()
  reshape(...)
}
system.time(new_dfWide <- new_reshape(data = dfLong,
          v.names = c("Date", "Score"),
          timevar = "Key",
          idvar = c("Index1", "Index2"),
          sep = "_Q",
          direction = "wide"))

  #  user  system elapsed 
  # 0.015   0.000   0.015
8zzbczxx

8zzbczxx2#

我不知道我曾经声称stats::reshape是最快的。
相比之下,stats::reshape在我的i9/64 GB-ram系统上没有那么快:

system.time(
dfWide <- reshape(data = dfLong,
          v.names = c("Date", "Score"),
          timevar = "Key",
          idvar = c("Index1", "Index2"),
          sep = "_Q",
          direction = "wide")
)
#    user  system elapsed 
#   19.63    0.03   19.73

但其他重塑功能做得更好:

system.time(
  tidyrWide <- pivot_wider(
    dfLong, c("Index1", "Index2"),
    names_prefix = "Q", names_from = "Key",
    values_from = c("Date", "Score"))
)
#    user  system elapsed 
#    0.01    0.00    0.02 

nms <- names(dfWide)
tidyrWide <- subset(tidyrWide, select = nms) # column order
dfOrder <- do.call(order, dfWide)
tidyrOrder <- do.call(order, tidyrWide)
all.equal(dfWide[dfOrder,], as.data.frame(tidyrWide)[tidyrOrder,], check.attributes = FALSE)
# [1] TRUE

同样,data.table::dcast也同样快:

dtLong <- as.data.table(dfLong)
system.time(
  dtWide <- data.table::dcast(
    Index1 + Index2 ~ paste0("Q", Key),
    data = dtLong, value.var = c("Date", "Score"))
)
#    user  system elapsed 
#    0.00    0.01    0.02 

dtWide <- subset(dtWide, select = nms) # column order
dtOrder <- do.call(order, dtWide)
all.equal(dfWide[dfOrder,nms], as.data.frame(dtWide)[dtOrder,nms], check.attributes = FALSE)
# [1] TRUE
nkhmeac6

nkhmeac63#

考虑一个高级修改版本的@Moody_Mudskipper的matrix_spread,使用基R。由于matrix将简化像Date这样的复杂类型,因此需要进行一些临时更改:

功能

matrix_spread <- function(df1, id, key, value, sep){
  unique_ids <-  unique(df1[[key]])
  mats <- lapply(df1[value], function(x) 
    matrix(x, ncol=length(unique_ids), byrow = FALSE)
  )
  df2 <- do.call(
    data.frame, list(unique(df1[id]), mats)
  )
  
  # RENAME COLS
  names(df2)[(length(id)+1):ncol(df2)] <- as.vector(
    sapply(value, function(x, y) paste0(x, sep, y), unique_ids)
  )
  # REORDER COLS
  df2 <- df2[c(id, as.vector(
    outer(c(value), unique_ids, function(x, y) paste0(x, sep, y))
  ))]
  
  return(df2)
}

申请

system.time(
  dfWide2 <- matrix_spread(
    df1 = dfLong, 
    id = c("Index1", "Index2"),
    key = "Key",
    value = c("Date", "Score"),
    sep = "_Q"
  )
)
#  user  system elapsed 
# 0.022   0.000   0.023 

# CONVERT INTEGERS TO DATES
dfWide2[grep("Date", names(dfWide2))] <- lapply(
  dfWide2[grep("Date", names(dfWide2))],
  as.Date,
  origin = "1970-01-01"
)

# REPLICATES OP'S reshape
identical(data.frame(dfWide), dfWide2)
# [1] TRUE

注意:如果跨多列运行,请确保通过matrix_spread运行前 * 按key和所有id变量对数据进行排序。

相关问题