在R中比较字符向量中的值

ehxuflar  于 2023-04-09  发布在  其他
关注(0)|答案(1)|浏览(153)

我有两个 Dataframe

categories(分类):

# dput()-style literal of a 12-row data.frame with four columns:
#   category_code   - integer codes 1..12
#   category_name   - one display name per code
#   radius          - one integer radius per code
#   category_values - list-column; each element is a character vector of
#                     keywords belonging to that category
# NOTE(review): the expression is not assigned to a name here; presumably it
# is the `categories` object used by the code further down - confirm.
# Keyword casing is mixed (e.g. "Gelände", "Wald"), so any matching against
# these values has to be case-insensitive to see them.
structure(list(category_code = 1:12, category_name = c("Kanton", 

"Bezirk", "Gemeinde", "Ort", "Ortsteil,Siedlung", "Quartier", 
"Region", "Gebiet", "Flurname", "Strasse,Platz", "Hof,Haus", 
"Berg"), radius = c(25000L, 8000L, 2500L, 1000L, 500L, 400L, 
15000L, 300L, 300L, 150L, 40L, 300L), category_values = list(
    c("kanton", "bistum"), "bezirk", c("gemeinde", "stadt", "gletscher", 
    "see", "gewässer"), c("ort", "dorf", "alp", "gewaesser", 
    "Gelände", "administrative", "seeteil", "schutzzone"), c("ortsteil", 
    "siedlung", "areal", "flug", "Wald"), c("quartier", "weiler", 
    "abhang", "bauzone", "gemeingut", "lehen", "nutzungszone", 
    "pacht"), "region", c("gebiet", "tal", "ebene", "gebirge", 
    "massiv", "herrschaft"), c("flur", "mündung", "steinbruch", 
    "insel", "weiher", "deponie", "lokalname", "kulturland", 
    "quartierteil", "land", "allmend", "park", "lichtung", "rodung", 
    "bucht", "pflanzenbestand"), c("strasse", "weg", "kreuzung", 
    "gasse", "graben", "platz", "bach", "kanal", "verzweigung", 
    "pfad", "fluss", "fliessend", "spur"), c("haus", "hof", "bauernhof", 
    "gebaeude", "gebäude", "wohngebäude", "wohngebaeude", "scheune", 
    "ruine", "burg", "brunnen", "schloss", "käserei", "mühle", 
    "turm", "sägerei", "kirche", "anstalt", "heim", "gefängnis", 
    "bild", "stock", "denkmal", "anlage", "stall", "wirtschaft", 
    "restaurant", "wehr", "halle", "grenzstein", "einkaufszentrum", 
    "grotte", "hoehle", "museum", "leitung", "bäckerei", "parkplatz", 
    "wasserfall", "seilbahn", "lift", "kapelle", "bauwerk", "haltestelle", 
    "aussichtspunkt", "ziegelei", "zisterne", "stadion", "baum", 
    "werk", "grundstück", "teich", "hafen", "treppe", "quelle", 
    "kloster", "findling", "mauer", "trotte", "stätte", "bahn", 
    "bunker", "apotheke", "metzgerei", "block", "walke", "tunnel", 
    "infrastruktur"), c("gipfel", "grat", "first", "huegel", 
    "hügel", "graben", "tobel", "pass", "sattel", "rücken", 
    "aussicht", "berg", "runse", "fels", "geländeerhebung", 
    "geländevertiefung", "firnschnee", "fischereistelle"))), row.names = c(NA, 
-12L), class = "data.frame")

现在我想把 categories$category_values 与 names_df$category_string 进行比较。对于每一行的所有匹配项,应将其中最小的 categories$category_code(取值范围 1–12)写入 names_df$category_code,并将对应的 radius 写入一个新字段,例如 names_df$radius。
匹配应当不区分大小写,并且支持部分匹配(类似 SQL 中的 ILIKE '%…%')。例如,categories$category_values 中的 "gewässer" 也应当匹配 names_df$category_string 中的 "Fliessgewässer"。
以下是我目前为止的代码,但它没有正确地完成这项工作:

# For every row of names_df, find the categories whose keywords occur
# (case-insensitively, as substrings) in names_df$category_string, then store
#   - all matching codes in matched_codes[[i]],
#   - the lowest matching code in names_df$category_code[i],
#   - the radius belonging to that code in names_df$radius[i].
# Rows without any match keep NA in both columns.
#
# Reads:  categories (category_code, radius, category_values), names_df
# Writes: matched_codes, names_df$category_code, names_df$radius

# Lower-case the keyword vectors once instead of once per row.
keywords_lower <- lapply(categories$category_values, tolower)

# Preallocate the outputs; the loop below only fills them in.
matched_codes <- vector("list", nrow(names_df))
names_df$category_code <- NA_integer_
names_df$radius <- NA_integer_

# loop through each row in names_df
for (i in seq_along(names_df$category_string)) {
  haystack <- tolower(names_df$category_string[[i]])

  # grepl() only uses the first element of a multi-element pattern, so every
  # keyword has to be tested on its own. fixed = TRUE gives plain substring
  # matching (no regex metacharacter surprises).
  is_match <- vapply(
    keywords_lower,
    function(keywords) {
      any(vapply(
        keywords,
        function(keyword) any(grepl(keyword, haystack, fixed = TRUE)),
        logical(1)
      ))
    },
    logical(1)
  )

  # keep every matching category code for this row
  matched_codes[[i]] <- sort(categories$category_code[is_match])

  if (any(is_match)) {
    # Pick the matching row with the smallest code (not merely the first
    # row), so this stays correct if categories is ever reordered.
    hit_rows <- which(is_match)
    best_row <- hit_rows[which.min(categories$category_code[hit_rows])]
    names_df$category_code[i] <- categories$category_code[best_row]
    names_df$radius[i] <- categories$radius[best_row]
  }
}

谢谢你的帮助

mhd8tkvw

mhd8tkvw1#

这里我使用了 dplyr 和 stringr。困难之处在于要把字符串与一个由字符向量组成的列表进行比较,因此我把每个列表元素合并成一个正则表达式(用 | 连接),然后逐行与每个 category_string 进行比较。接着取最小的匹配索引,再与 categories 数据框连接。

library(dplyr)
library(stringr)

# One alternation regex per categories row ("kw1|kw2|..."), built once
# rather than once per names_df row.
category_patterns <- vapply(
  categories$category_values,
  function(values) paste0(values, collapse = "|"),
  character(1)
)

# For each names_df row: the lowest category_code whose keywords occur in
# category_string, plus the radius of that category via the join.
# Matching is case-insensitive, so upper-case keywords such as "Wald" or
# "Gelände" also hit lower-case input (e.g. a string containing "wald" now
# resolves to category 5 instead of falling through to a higher code).
# Rows without any match get NA for category_code and radius.
names_df %>%
  rowwise() %>%
  mutate(
    category_code = {
      hits <- which(str_detect(
        category_string,
        regex(category_patterns, ignore_case = TRUE)
      ))
      # Map row positions back to real codes and avoid min(integer(0)),
      # which would return Inf with a warning.
      if (length(hits) > 0) {
        min(categories$category_code[hits])
      } else {
        NA_integer_
      }
    }
  ) %>%
  ungroup() %>%
  left_join(
    categories %>% select(category_code, radius),
    by = "category_code"
  )

# A tibble: 5 × 7
# Rowwise: 
#  id    `topographic-name` category_string                    category `year-#from` category_code radius
#  <chr> <chr>              <chr>                              <chr>    <chr>               #<int>  <int>
#1 1     Cholenholz         "c(\"flurname\", \",\", \"wald\")" 9        NA                      9    300
#2 2     Lisen              "c(\"bauernhof,\", \"ort\")"       4        NA                      4   1000
#3 3     Lochboden          "c(\"bauernhof,\", \"flurname\")"  9        NA                      9    300
#4 4     Löchli             "c(\"bauernhof,\", \"flurname\")"  9        NA                      9    300
#5 5     Lochweid           "c(\"alp,\", \"flurname\")"        9        NA                      4   1000

相关问题