R:从 Dataframe 中的引号内提取字母字符串

moiiocjp  于 2023-02-14  发布在  其他
关注(0)|答案(4)|浏览(224)

我有一个数据框

# Input: a single column whose values are JSON-style arrays of quoted strings
data.frame(
  string = c(
    '["jewelry","tailor","Jewelry"]',
    '["apple","banana","orange"]'
  )
)

我想创建三列和两行...我想让 Dataframe 看起来像

# Desired shape: one column per quoted element, one row per input string
data.frame(
  string1 = c('jewelry', 'apple'),
  string2 = c('tailor', 'banana'),
  string3 = c('jewelry', 'orange')
)
qkf9rpyu

qkf9rpyu1#

在 base R 中,我们可以先去掉方括号和双引号,然后用 read.csv 将该列读入 data.frame

# Strip the square brackets and double quotes so every row becomes plain
# comma-separated text, then let read.csv() split it into three named columns.
read.csv(
  text = gsub('\\[|\\]|"', "", df1[["string"]]),
  header = FALSE,
  col.names = paste0("string", 1:3)
)
  • 输出
string1 string2 string3
1 jewelry  tailor Jewelry
2   apple  banana  orange

或者使用tidyverse

library(dplyr)
library(stringr)
library(tidyr)
# Remove brackets and double quotes from `string`, then split what is left on
# commas into three named columns.
df1 %>%
  mutate(string = str_remove_all(string, '\\[|\\]|"')) %>%
  separate_wider_delim(
    cols = string,
    delim = ',',
    names = c("string1", "string2", "string3")
  )
  • 输出
# A tibble: 2 × 3
  string1 string2 string3
  <chr>   <chr>   <chr>  
1 jewelry tailor  Jewelry
2 apple   banana  orange

数据

# Example input used by the snippets above: each row is a JSON-style array
# of three quoted strings stored as a single character value.
df1 <- data.frame(
  string = c(
    '["jewelry","tailor","Jewelry"]',
    '["apple","banana","orange"]'
  )
)
nbysray5

nbysray52#

如果您事先知道最大列数,并且各行在顺序或列分配上没有空缺(即不存在少于3个元素的行,本例正是如此),也可以使用另一种 tidyverse 方法:

# Example data: one character column, each value a JSON-style array of
# three quoted strings.
myDf <- data.frame(
  string = c(
    '["jewelry","tailor","Jewelry"]',
    '["apple","banana","orange"]'
  )
)

library(dplyr)
library(tidyr)

# Split `string` into the new columns a, b and c. Each parenthesised capture
# group in the regex supplies the text for one of the new columns.
myDf %>%
  tidyr::extract(
    col = string,
    into = c("a", "b", "c"),
    regex = '"(\\w*)","(\\w*)","(\\w*)"'
  )

        a      b       c
1 jewelry tailor Jewelry
2   apple banana  orange
wh6knrhe

wh6knrhe3#

这看起来像一个有效的python/json列表。
使用 reticulate:

library(tidyverse)

# Parse each row's string with Python (via reticulate) and spread the parsed
# elements into separate columns.
# NOTE(review): py_eval() evaluates the string as a Python expression, so this
# should only be run on trusted data -- confirm the input source before reuse.
df1%>%
  rowwise() %>%
  # list() wraps the per-row result so it is stored in a list column
  transmute(string=list(reticulate::py_eval(string)))%>%
  # names_sep = '' builds the new column names as string1, string2, ...
  unnest_wider(string, names_sep = '')

#> # A tibble: 2 × 3
#>   string1 string2 string3
#>   <chr>   <chr>   <chr>  
#> 1 jewelry tailor  Jewelry
#> 2 apple   banana  orange

使用jsonlite

# Join the per-row arrays with commas and wrap them in one outer pair of
# brackets, so a single fromJSON() call parses every row at once.
a <- jsonlite::fromJSON(
  paste('[', paste(df1[["string"]], collapse = ','), ']')
)
# Convert the parsed result to a data frame and name the columns
# string1, string2, ... according to how many columns came back.
setNames(
  data.frame(a),
  paste0('string', seq(ncol(a)))
)

#>   string1 string2 string3
#> 1 jewelry  tailor Jewelry
#> 2   apple  banana  orange

或者甚至:

# Parse every string separately, then stack the parsed vectors row by row.
d <- do.call(
  rbind,
  lapply(df1[["string"]], jsonlite::fromJSON)
)
# Convert to a data frame and name the columns string1, string2, ...
setNames(
  data.frame(d),
  paste0('string', seq(ncol(d)))
)

  string1 string2 string3
1 jewelry  tailor Jewelry
2   apple  banana  orange
relj7zay

relj7zay4#

先用 jsonlite 将字符串解析为列表列,再用 unnest_wider 展开:

library(dplyr)
library(tidyr)
library(purrr)
library(jsonlite)

# Example input: a character column holding JSON-style arrays of strings.
df <- data.frame(
  string = c(
    '["jewelry","tailor","Jewelry"]',
    '["apple","banana","orange"]'
  )
)
                            
# Parse every JSON string into a list (producing a list column), then widen
# that column so each parsed element gets its own column.
df %>%
  mutate(string = map(string, parse_json)) %>%
  unnest_wider(col = string, names_sep = "")
#> # A tibble: 2 × 3
#>   string1 string2 string3
#>   <chr>   <chr>   <chr>  
#> 1 jewelry tailor  Jewelry
#> 2 apple   banana  orange

创建于2023年2月13日,使用reprex v2.0.2

相关问题