R语言 如何准确地获取两列之间的最高值,同时考虑其他列中的多个缺失值?

r1wp621o  于 2023-07-31  发布在  其他
关注(0)|答案(3)|浏览(95)

我在下面创建了一个数据框,表示家庭中的个人(hid)。我给他们分配了一个社会等级(class),从1到4编码。列relation表示户主(编码为0)和伴侣(编码为1)。列cntry代表不同的国家:英国、美国。did列表示国家/地区年份:美国2008年、美国2009年、英国2008年、英国2009年。
我想创建一个新的专栏,以采取双方的最高社会阶层。我已经这样做了,但我在输出中仍然有一些问题。
主要问题:如果个人的家庭ID(hid)缺少值,代码会自动为他们分配最高值。我想把这些人保留在数据框架中,同时也让他们保持原来的社会阶层。家庭ID号码3原始的class列被编码为3,但被分配为4,即使我们不知道家庭中其他人的社会阶级是什么,因为它缺失了。
有人能帮帮忙吗?
复制代码:

library(missForest)
library(dplyr)
library(tidyr)
# Set seed for reproducibility
set.seed(123)

# Create Variables
class <- sample(1:4, 10000, replace = TRUE)
hid <- rep(1:(10000/2), each = 2) 
relation <-  rep(0:1, 10000/2)
did <- rep(sample(2006:2009, 10000/2, replace = TRUE), each = 2)
cntry <- sample(c("US", "UK"), 10000, replace = TRUE)

# Combine into a data frame
df <- data.frame(class = class,
                 hid = hid,
                 relation = relation,
                 did = did,
                 cntry = cntry)

df <- prodNA(df, noNA = 0.1)

dominant_class_df <- df %>%     
  drop_na("class")  %>% 
  filter(relation < 3) %>%
  group_by(hid,did) %>%      
  summarise(dominant_class = max(class,na.rm=T)) %>%
  mutate(dominant_class = if_else(is.infinite(dominant_class), NA_real_, dominant_class))

df2 <- df %>%
  left_join(dominant_class_df, by = c("hid", "did"))

字符串

dtcbnfnu

dtcbnfnu1#

像这样的吗

new_class <- function(x, hid) {
  f <- (seq_along(x) - 1L)%/% 2L
  out <- by(data.frame(x, hid), f, FUN = \(X) {
    y <- X[["x"]]
    hid <- X[["hid"]]
    if(any(!is.na(hid))) {
      if(any(!is.na(y))) {
        m <- max(y, na.rm = TRUE)
        y[!is.na(hid)] <- m
        y
      } else y
    } else c(NA_integer_, NA_integer_)
  })
  out |> unlist() |> unname()
}

df2 <- df %>% 
  mutate(dominant_class = new_class(class, hid))
head(df2, n = 14L)
#>    class hid relation  did cntry dominant_class
#> 1      3   1        0 2008    UK              3
#> 2      3   1        1 2008    US              3
#> 3      3   2        0   NA    US              3
#> 4      2   2       NA 2009  <NA>              3
#> 5      3  NA        0 2006    UK              3
#> 6      2   3        1 2006    UK              3
#> 7      2   4        0 2009    US              2
#> 8      2   4        1 2009    UK              2
#> 9      3   5        0 2007    US              3
#> 10     1   5        1 2007    US              3
#> 11     4   6        0 2007    US              4
#> 12     2  NA        1 2007    UK              2
#> 13     2   7        0 2008    US              2
#> 14     1   7        1 2008    US              2

字符串
创建于2023-07-27带有reprex v2.0.2

bweufnob

bweufnob2#

看起来left_join()可能会导致您的问题;如果你使用mutate()而不是summarise(),你会得到你期望的结果吗?
例如:

library(missForest)
library(tidyverse)
set.seed(123)

class <- sample(1:4, 10000, replace = TRUE)
hid <- rep(1:(10000/2), each = 2) 
relation <-  rep(0:1, 10000/2)
did <- rep(sample(2006:2009, 10000/2, replace = TRUE), each = 2)
cntry <- sample(c("US", "UK"), 10000, replace = TRUE)

df <- data.frame(class = class,
                 hid = hid,
                 relation = relation,
                 did = did,
                 cntry = cntry)
df <- prodNA(df, noNA = 0.1)

dominant_class_df <- df %>%     
  drop_na("class")  %>% 
  filter(relation < 3) %>%
  group_by(hid,did) %>%      
  summarise(dominant_class = max(class,na.rm=T)) %>%
  mutate(dominant_class = if_else(is.infinite(dominant_class), NA_real_, dominant_class))

df2 <- df %>%
  left_join(dominant_class_df, by = c("hid", "did"))

###########

df %>%     
#  drop_na("class") %>% # do you need this?
  filter(relation < 3) %>%
  group_by(hid, did) %>%      
  mutate(dominant_class = ifelse(!is.na(hid), max(class, na.rm = TRUE), class)) %>%
  mutate(dominant_class = ifelse(!is.finite(dominant_class), NA, dominant_class)) %>%
  print(n = 25)
#> Warning: There were 308 warnings in `mutate()`.
#> The first warning was:
#> ℹ In argument: `dominant_class = ifelse(!is.na(hid), max(class, na.rm = TRUE),
#>   class)`.
#> ℹ In group 16: `hid = 14`, `did = NA`.
#> Caused by warning in `max()`:
#> ! no non-missing arguments to max; returning -Inf
#> ℹ Run `dplyr::last_dplyr_warnings()` to see the 307 remaining warnings.
#> # A tibble: 8,994 × 6
#> # Groups:   hid, did [5,417]
#>    class   hid relation   did cntry dominant_class
#>    <int> <int>    <int> <int> <chr>          <dbl>
#>  1     3     1        0  2008 UK                 3
#>  2     3     1        1  2008 US                 3
#>  3     3     2        0    NA US                 3
#>  4     3    NA        0  2006 UK                 3
#>  5     2     3        1  2006 UK                 2
#>  6     2     4        0  2009 US                 2
#>  7     2     4        1  2009 UK                 2
#>  8     3     5        0  2007 US                 3
#>  9     1     5        1  2007 US                 3
#> 10     4     6        0  2007 US                 4
#> 11     2    NA        1  2007 UK                 2
#> 12     2     7        0  2008 US                 2
#> 13     1     7        1  2008 US                 2
#> 14     2     8        0  2009 UK                 3
#> 15     3     8        1  2009 US                 3
#> 16     4     9        0  2007 UK                 4
#> 17    NA     9        1  2007 US                 4
#> 18     3    10        0  2008 US                 3
#> 19     3    10        1    NA UK                 3
#> 20     1    11        0  2006 UK                 4
#> 21     4    11        1  2006 <NA>               4
#> 22     1    NA        0  2008 UK                 1
#> 23     1    12        1  2008 US                 1
#> 24     1    NA        0  2007 UK                 1
#> 25     3    13        1  2007 UK                 3
#> # ℹ 8,969 more rows

字符串
创建于2023-07-26带有reprex v2.0.2

b09cbbtk

b09cbbtk3#

另一种方法:为NA提供新的HID-HID:

df  <- df |> mutate(hid = ifelse(is.na(hid),
                                 sprintf('row_%s', row_number()), 
                                 sprintf('hid_%s', hid))
                    )

字符串
...像以前一样创建dominant_class_df
结果:

> dominant_class_df |> filter(hid == 'hid_3')
# A tibble: 1 x 3
# Groups:   hid [1]
  hid     did dominant_class
  <chr> <int>          <dbl>
1 hid_3  2006              2

相关问题