R语言 按多个分隔符拆分列(保留它们),并拆分为不等数量的列

fcg9iug3  于 2023-11-14  发布在  其他
关注(0)|答案(5)|浏览(195)

假设我有一个这样的数据框(这是我实际问题的简化版本):

# Example data: one expression string per ID.
ID <- c(1, 2, 3)
value <- c("1+4-3", "2+7-6+4-3", "-1+3")
df <- data.frame(ID = ID, value = value)

ID  value
1   1+4-3
2   2+7-6+4-3
3   -1+3

我需要按多个分隔符(+ 和 -)将 value 列拆分为多个列,同时把分隔符本身保留在单独的列中。
期望得到的数据框如下:

ID  x1  x2  x3  x4  x5   x6   x7   x8   x9
1   1   +   4   -   3    <NA> <NA> <NA> <NA>
2   2   +   7   -   6    +    4    -    3
3   -   1   +   3   <NA> <NA> <NA> <NA> <NA>


此外,我不知道需要多少个结果列(可能不是示例中的9,而是50)。
实现这一目标的最佳途径是什么?
谢谢

fiei3ece

fiei3ece1#

您可以使用 tidyr 包中的 separate_wider_delim():

library(tidyr)

df |>
  separate_wider_delim(
    value,
    # Zero-width pattern: matches every boundary between a digit and a
    # non-digit, so the operators survive as pieces of their own.
    delim = stringr::regex("(?<=\\d)(?=\\D)|(?<=\\D)(?=\\d)"),
    names_sep = "",
    # Rows with fewer pieces are aligned at the start and padded with NA.
    too_few = "align_start",
    # Rename the generated value1, value2, ... columns to X1, X2, ...
    names_repair = \(nm) sub("value", "X", nm)
  )

# # A tibble: 3 × 10
#      ID X1    X2    X3    X4    X5    X6    X7    X8    X9   
#   <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1     1 1     +     4     -     3     NA    NA    NA    NA   
# 2     2 2     +     7     -     6     +     4     -     3    
# 3     3 -     1     +     3     NA    NA    NA    NA    NA

字符串

vuktfyat

vuktfyat2#

如果你的数字只由数字字符(0–9)组成,可以尝试:

# Attach the packages this pipeline relies on: dplyr (mutate, n), stringr
# (str_extract_all) and tidyr (unnest, pivot_wider).
library(dplyr)
library(stringr)
library(tidyr)

df %>%
  # Tokenise each string: a run of digits is one token, and any other single
  # character (e.g. an operator) is a token of its own.
  mutate(value = str_extract_all(value, "\\d+|\\D")) %>%
  # One row per token.
  unnest(value) %>%
  # Number the tokens within each ID; the number becomes the column suffix.
  mutate(name = seq_len(n()), .by = ID) %>%
  pivot_wider(names_from = name, values_from = value, names_prefix = "X")

运行结果如下:

# A tibble: 2 × 10
     ID X1    X2    X3    X4    X5    X6    X7    X8    X9
  <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1     1 1     +     4     -     3     NA    NA    NA    NA
2     2 2     +     7     -     6     +     4     -     3

szqfcxe2

szqfcxe23#

我的方法:

# Example data; the third value shows the effect on numbers with more than
# one digit.
ID <- c(1, 2, 3)
value <- c("1+4-3", "2+7-6+4-3", "25+110/2*214")
df <- data.frame(ID = ID, value = value)

df |>
  dplyr::mutate(
    # Split every string at word/non-word boundaries in one vectorised call,
    # then drop the empty pieces from each result.
    X = lapply(
      stringr::str_split(value, pattern = "\\b"),
      \(pieces) pieces[nzchar(pieces)]
    )
  ) |>
  # Spread the list column into X1, X2, ... columns.
  tidyr::unnest_wider(X, names_sep = "")

字符串

# A tibble: 3 × 11
     ID value        X1    X2    X3    X4    X5    X6    X7    X8    X9   
  <dbl> <chr>        <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1     1 1+4-3        1     +     4     -     3     NA    NA    NA    NA   
2     2 2+7-6+4-3    2     +     7     -     6     +     4     -     3    
3     3 25+110/2*214 25    +     110   /     2     *     214   NA    NA


如果去掉管道末尾的 unnest_wider() 这一步,会得到如下结果;在我看来,这在某些方面可能更整洁:

ID        value                         X
1  1        1+4-3             1, +, 4, -, 3
2  2    2+7-6+4-3 2, +, 7, -, 6, +, 4, -, 3
3  3 25+110/2*214  25, +, 110, /, 2, *, 214

92vpleto

92vpleto4#

您可以:

library(tidyverse)

df |>
  # Split at every zero-width position that sits directly before or after a
  # non-digit character. Each operator becomes its own piece, while a run of
  # digits such as "72" stays together instead of being cut into single
  # characters. The lookarounds require a character on both sides, so no
  # empty piece is produced at the start or end of the string.
  separate_longer_delim(
    cols = value,
    delim = regex("(?<=.)(?=\\D)|(?<=\\D)(?=.)")
  ) |>
  # Position of each piece within its ID; used as the column suffix.
  mutate(pos = row_number(), .by = ID) |>
  pivot_wider(
    values_from = value,
    names_from = "pos",
    names_prefix = "X"
  )

# A tibble: 3 × 10
     ID X1    X2    X3    X4    X5    X6    X7    X8    X9   
  <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1     1 1     +     4     -     3     NA    NA    NA    NA   
2     2 2     +     7     -     6     +     4     -     3    
3     3 -     1     +     3     NA    NA    NA    NA    NA

字符串

798qvoo8

798qvoo85#

[已编辑,以支持多位数字]
使用 base R:

# Example data (includes multi-digit numbers)
ID <- c(1, 2)
value <- c("1+4-3", "2+72-6+42-3")
df <- data.frame(ID = ID, value = value)

#' Split an expression string into number and operator tokens
#'
#' @param x A character string such as "2+72-6".
#' @return A character vector of tokens in their original left-to-right
#'   order: each run of digits is one token and each "+" or "-" is one token.
#'   Any other character is dropped. Returns character(0) when nothing
#'   matches.
mysplit <- function(x) {
  # A single alternation pattern yields the matches already in string order,
  # so no re-ordering by match position is needed. This also avoids the
  # spurious NA tokens that appear when "no match" (-1) positions from
  # separate digit/operator searches are fed into order().
  unlist(regmatches(x, gregexpr("[0-9]+|[+-]", x)), use.names = FALSE)
}

# Split every value into tokens. lapply() always returns a list, whereas
# sapply() would silently collapse the result into a matrix whenever all rows
# happen to have the same number of tokens, breaking the padding step below.
# The list is named by the input strings so they end up as row names.
tokens <- lapply(setNames(nm = df$value), mysplit)

# Pad each token vector with NA up to the longest one, then stack the vectors
# as the rows of a character matrix (one row per record).
mlength <- max(lengths(tokens))
s <- do.call(rbind, lapply(tokens, `length<-`, mlength))
# Name the columns explicitly so a single-column result is also called X1.
colnames(s) <- paste0("X", seq_len(ncol(s)))

# Return dataframe
data.frame(ID = df$ID, s)

            ID X1 X2 X3 X4 X5   X6   X7   X8   X9
1+4-3        1  1  +  4  -  3 <NA> <NA> <NA> <NA>
2+72-6+42-3  2  2  + 72  -  6    +   42    -    3

字符串
编辑

相关问题