将一个 Dataframe 中的一列中的字符串与R中另一个 Dataframe 中的两列中的任一列进行匹配

pkwftd7m  于 2023-04-27  发布在  其他
关注(0)|答案(4)|浏览(164)

我有两个 Dataframe df1和df2。df1包括一个列df1$product,其中包含产品名称。第二个 Dataframe 包括三个列:df2$product_code、df2$product_name和df2$product_aka_name。我需要在df1中添加一个列df1$product_code,该列基于产品的调用方式(假定它可以使用df2中列出的两个不同名称)。
到目前为止,我已经尝试使用火柴()在整理df2$product_name和df2$product_aka_name之后我也试过用grep()但几个小时后不得不停止,没有任何结果。最后,我研究了fuzzyjoin,但它似乎不适合我的问题。真实的的数据集分别有700,000行和20列df1,和40,000行和5列的df2。所以我会需要一些快速和有效的...
地雷危险教育可如下:

df1 <- data.frame(product = c("Abcd", "Efgh", "Ijkl", "Mnop", "Qrst", "Uvwx"), col2 = c("DLK", "CBN", "ABC", "ZHU", "HSC", "LJK"), col3 = c("qdsf88", "sdf63", "dd995", "dgsg1", "xxx587", "dfr55"))
df1
          product      col2            col3
1         Abcd         DLK             qdsf88
2         Efgh         CBN             sdf63
3         Ijkl         ABC             dd995
4         Mnop         ZHU             dgsg1
5         Qrst         HSC             xxx587
6         Uvwx         LJK             dfr55

df2 <- data.frame(product_code = c("1001", "1002", "1003", "1004", "1005", "1006", "1007", "1008"), product_name = c("Fcde", "Abcd", "Efgh", "Mlfr", "Mnop", "Plor", "Kdlr", "Vfsd"), product_aka_name = c(NA, NA, NA, "Qrst", NA, "Uvwx", "Azer", "Qwer"))
df2
 product_code product_name product_aka_name
1         1001         Fcde             <NA>
2         1002         Abcd             <NA>
3         1003         Efgh             <NA>
4         1004         Mlfr             Qrst
5         1005         Mnop             <NA>
6         1006         Plor             Uvwx
7         1007         Kdlr             Azer
8         1008         Vfsd             Qwer

我需要的结果如下:

df1 <- data.frame(product = c("Abcd", "Efgh", "Ijkl", "Mnop", "Qrst", "Uvwx"), col2 = c("DLK", "CBN", "ABC", "ZHU", "HSC", "LJK"), col3 = c("qdsf88", "sdf63", "dd995", "dgsg1", "xxx587", "dfr55"), product_code = c("1002", "1003", NA, "1005", "1004", "1006"))
df1
 product col2   col3 product_code
1    Abcd  DLK qdsf88         1002
2    Efgh  CBN  sdf63         1003
3    Ijkl  ABC  dd995         <NA>
4    Mnop  ZHU  dgsg1         1005
5    Qrst  HSC xxx587         1004
6    Uvwx  LJK  dfr55         1006

非常感谢你的帮助。

ef1yzkbh

ef1yzkbh1#

分两步完成,并使用NA更新它们。

i <- match(df1$product, df2$product_name)
j <- is.na(i)
i[j] <- match(df1$product[j], df2$product_aka_name)
df2$product_code[i]
#[1] "1002" "1003" NA     "1005" "1004" "1006"

如果这是为了慢使用fastmatch::fmatch而不是match,可能使用which(is.na(i))而不是is.na(i)
或者创建一个索引向量,设置名称并使用此子集。

df2$product_code[setNames(rep(seq_len(nrow(df2)), 2),
        c(df2$product_name, df2$product_aka_name))[df1$product]]
#[1] "1002" "1003" NA     "1005" "1004" "1006"

基准

library(data.table)
dt0 <- copy(df1)
dt2 <- copy(df2)
setDT(dt0); setDT(dt2)
library(dplyr)
library(tidyr)

microbenchmark::microbenchmark(setup = dt1 <- copy(dt0),
"data.table" = {
dt1[dt2,
    on = c("product" = "product_name"),
    code1 := i.product_code
][dt2,
    on = c("product" = "product_aka_name"),
    code2 := i.product_code
][, 
    product_code := fcoalesce(
        code1, code2
    )
][, `:=`(
    code1 = NULL,
    code2 = NULL
 )]
dt1$product_code},
base = {i <- match(df1$product, df2$product_name)
  j <- is.na(i)
  i[j] <- match(df1$product[j], df2$product_aka_name)
  df2$product_code[i] },
"dplyr-TarJae" = {df1 %>% 
  left_join(df2, by = c("product" = "product_name")) %>% 
  left_join(df2, by = c("product" = "product_aka_name")) %>% 
  mutate(product_code = coalesce(product_code.x, product_code.y), .keep = "unused") %>% 
    select(-c(product_aka_name, product_name)) },
"dplyr-Ben Norris" = {df1 %>%
  left_join( # keeps all rows in df1 even if returns NA
    df2 %>%
      pivot_longer(cols = ends_with("name"), # pivots the columns with product names
                   values_to = "product", # names the new column to match df1
                   names_to = NULL) # discards old column names
  )}
)

结果

Unit: microseconds
             expr       min        lq        mean     median        uq       max neval
       data.table  1885.500  2050.626  2238.04686  2223.5400  2281.879  5869.183   100
             base    12.285    14.954    19.24151    19.7455    21.667    38.147   100
     dplyr-TarJae  5724.538  6041.483  6515.73361  6367.4275  6552.855 14358.221   100
 dplyr-Ben Norris 16517.120 17091.760 18016.42934 17427.3015 18800.145 22272.853   100
mzmfm0qo

mzmfm0qo2#

下面是一个使用dplyrtiydr函数的连接策略。它的工作原理是将df2旋转成更长的形式,其中每行是与产品代码匹配的名称(包括aka名称):

library(dplyr)
library(tidyr)
df1 %>%
  left_join( # keeps all rows in df1 even if returns NA
    df2 %>%
      pivot_longer(cols = ends_with("name"), # pivots the columns with product names
                   values_to = "product", # names the new column to match df1
                   names_to = NULL) # discards old column names
  )

Joining, by = "product"
  product col2   col3 product_code
1    Abcd  DLK qdsf88         1002
2    Efgh  CBN  sdf63         1003
3    Ijkl  ABC  dd995         <NA>
4    Mnop  ZHU  dgsg1         1005
5    Qrst  HSC xxx587         1004
6    Uvwx  LJK  dfr55         1006
d6kp6zgx

d6kp6zgx3#

我们可以用dplyr来实现:

library(dplyr)

df1 %>% 
  left_join(df2, by = c("product" = "product_name")) %>% 
  left_join(df2, by = c("product" = "product_aka_name")) %>% 
  mutate(product_code = coalesce(product_code.x, product_code.y), .keep = "unused") %>% 
  select(-c(product_aka_name, product_name))

product col2   col3 product_code
1    Abcd  DLK qdsf88         1002
2    Efgh  CBN  sdf63         1003
3    Ijkl  ABC  dd995         <NA>
4    Mnop  ZHU  dgsg1         1005
5    Qrst  HSC xxx587         1004
6    Uvwx  LJK  dfr55         1006
flvlnr44

flvlnr444#

当你正在寻找一些快速和高效的东西时,我会使用data.table

library(data.table)

setDT(df1); setDT(df2)

df1[df2,
    on = c("product" = "product_name"),
    code1 := i.product_code
][df2,
    on = c("product" = "product_aka_name"),
    code2 := i.product_code
][, 
    product_code := fcoalesce(
        code1, code2
    )
][, `:=`(
    code1 = NULL,
    code2 = NULL
)]

df1
#    product col2   col3 product_code
# 1:    Abcd  DLK qdsf88         1002
# 2:    Efgh  CBN  sdf63         1003
# 3:    Ijkl  ABC  dd995         <NA>
# 4:    Mnop  ZHU  dgsg1         1005
# 5:    Qrst  HSC xxx587         1004
# 6:    Uvwx  LJK  dfr55         1006

data.table允许您modify by reference,这通常会避免创建列或 Dataframe 的副本。这通常意味着对于大型数据集,它比等效的tidyverse方法更快。

相关问题