R语言 标记另一列中特定值前后的观测值

4ngedf3f  于 2022-12-20  发布在  其他
关注(0)|答案(5)|浏览(127)

假设我有一个df:

# Example data: 20 rows, `flag` all 0 and `include` all 1,
# then rows 4, 8 and 16 are marked as flagged.
df <- data.frame(flag = rep(0, 20), include = rep(1, 20))
df$flag[c(4, 8, 16)] <- 1
df

   flag include
1     0       1
2     0       1
3     0       1
4     1       1
5     0       1
6     0       1
7     0       1
8     1       1
9     0       1
10    0       1
11    0       1
12    0       1
13    0       1
14    0       1
15    0       1
16    1       1
17    0       1
18    0       1
19    0       1
20    0       1

我想要做的是,如果该行在flag == 1所在行的+/-两行内,则将include标志更改为0。

flag include
1     0       1
2     0       0
3     0       0
4     1       1
5     0       0
6     0       0
7     0       0
8     1       1
9     0       0
10    0       0
11    0       1
12    0       1
13    0       1
14    0       0
15    0       0
16    1       1
17    0       0
18    0       0
19    0       1
20    0       1

我想到了一些“创新”(读作:效率低下和过于复杂)的方法,但我认为一定有一个简单的方法,我忽略了。
如果答案还能推广到+/- n行就更好了,因为我的实际数据量更大,可能需要在+/- 10行的范围内搜索。

wz8daaqr

wz8daaqr1#

data.table的另一个选项:

library(data.table)
n <- 2
# row numbers of the flagged rows (flag equal to one)
flag_one <- which(df$flag == 1)

# every row index within +/- n of a flagged row, minus the flagged rows
idx <- setdiff(outer(flag_one, -n:n, "+"), flag_one)

# drop indices that fall outside the table, then set include to 0 by reference
setDT(df)[idx[idx >= 1 & idx <= nrow(df)], include := 0][]

# or, as @Frank commented, the last step in base R would be
# df$include[idx[idx >= 1 & idx <= nrow(df)]] = 0

#    flag include
# 1:    0       1
# 2:    0       0
# 3:    0       0
# 4:    1       1
# 5:    0       0
# 6:    0       0
# 7:    0       0
# 8:    1       1
# 9:    0       0
#10:    0       0
#11:    0       1
#12:    0       1
#13:    0       1
#14:    0       0
#15:    0       0
#16:    1       1
#17:    0       0
#18:    0       0
#19:    0       1
#20:    0       1

放入一个函数:

#' Zero out `include` for rows close to a flagged row
#'
#' @param df A data frame with columns `flag` and `include`.
#' @param n Window half-width: rows up to `n` positions before or after a row
#'   whose `flag` equals 1 are affected.
#' @return `df` with `include` set to 0 on the affected rows. Rows whose
#'   `flag` equals 1 are left untouched.
update_n <- function(df, n) {
  flagged <- which(df$flag == 1)
  # All positions flagged + offset for offset in -n..n, without the flagged
  # rows themselves.
  neighbours <- setdiff(as.vector(outer(flagged, -n:n, "+")), flagged)
  # Discard positions that fall before the first or after the last row.
  inside <- neighbours >= 1 & neighbours <= nrow(df)
  df$include[neighbours[inside]] <- 0
  df
}
xzlaal3s

xzlaal3s2#

肯定还有其他更简单的方法,但我能想到的第一种方法是使用sapply和which:

# Set include to 0 for rows within two positions of a flagged row.
# - Flagged rows are removed from the index set, so two flags that lie within
#   two rows of each other do not zero each other.
# - Indices outside 1..nrow(df) are dropped; without this a flag near the first
#   or last row produces invalid subscripts.
# - unlist() keeps the code working when no row is flagged, in which case
#   sapply() returns an empty list instead of a matrix.
flagged <- which(df$flag == 1)
nearby <- as.integer(unlist(
  sapply(flagged, function(x) c(x - 2, x - 1, x + 1, x + 2))
))
nearby <- setdiff(nearby, flagged)
df$include[nearby[nearby >= 1 & nearby <= nrow(df)]] <- 0

df
#   flag include
#1     0       1
#2     0       0
#3     0       0
#4     1       1
#5     0       0
#6     0       0
#7     0       0
#8     1       1
#9     0       0
#10    0       0
#11    0       1
#12    0       1
#13    0       1
#14    0       0
#15    0       0
#16    1       1
#17    0       0
#18    0       0
#19    0       1
#20    0       1

我们首先找出flag为1的所有索引,然后围绕每个索引创建所需的数字序列,并将include的索引设置为0。
对于变量n,我们可以

n <- 2
# Same idea for an arbitrary half-width n: collect every index within +/- n of
# a flagged row, then remove ALL flagged rows (not only the current one, so
# overlapping windows leave flagged rows alone) and any index that falls
# outside 1..nrow(df). unlist() covers the cases where sapply() returns a list
# (no flagged rows) rather than a matrix.
flagged <- which(df$flag == 1)
nearby <- as.integer(unlist(
  sapply(flagged, function(x) seq(x - n, x + n))
))
nearby <- setdiff(nearby, flagged)
df$include[nearby[nearby >= 1 & nearby <= nrow(df)]] <- 0
svujldwt

svujldwt3#

# Returns a copy of df$include with 0 at every unflagged row that has a flagged
# row within two positions. The neighbourhood is clipped to 1..NROW(df) and
# row i itself is removed from it, so a flag on the first or last row is not
# counted as its own neighbour; flagged rows are never replaced.
replace(x = df$include,
        list = vapply(seq_len(NROW(df)), function(i) {
          nbrs <- setdiff(max(1, i - 2):min(i + 2, NROW(df)), i)
          df$flag[i] != 1 && any(df$flag[nbrs] == 1)
        }, logical(1)),
        values = 0)
# [1] 1 0 0 1 0 0 0 1 0 0 1 1 1 0 0 1 0 0 1 1

对于n行,

# Generalisation to a half-width of n rows (n must already be defined).
# Returns a copy of df$include with 0 at every unflagged row that has a flagged
# row within n positions. The neighbourhood is clipped to 1..NROW(df) and
# row i itself is removed from it, so a flag on the first or last row is not
# counted as its own neighbour; flagged rows are never replaced.
replace(x = df$include,
        list = vapply(seq_len(NROW(df)), function(i) {
          nbrs <- setdiff(max(1, i - n):min(i + n, NROW(df)), i)
          df$flag[i] != 1 && any(df$flag[nbrs] == 1)
        }, logical(1)),
        values = 0)
ha5z0ras

ha5z0ras4#

另一种方法是使用zoo::rollapply,为了确定一行是否在flag == 1所在行的+/-两行之内,我们检查窗口中的最大flag是否为1。
我们需要rollapply而不是rollmax,因为我们需要指定partial = T

#' Is each position within `n` rows of a flagged position?
#'
#' Slides a centred window of width `2 * n + 1` over `flag` and tests whether
#' the window maximum equals 1. `partial = TRUE` lets the window shrink at both
#' ends of the vector so the result has one value per element of `flag`.
#' Note that a flagged position is inside its own window.
#'
#' @param flag Numeric vector of flags.
#' @param n Window half-width (number of rows on each side).
#' @return Logical vector, `TRUE` where the window maximum is 1.
is_within_flag_window <- function(flag, n) {
  # TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
  zoo::rollapply(flag, width = (2 * n) + 1, partial = TRUE, FUN = max) == 1
}

# include becomes 0 only for unflagged rows whose +/- 2 window contains a flag;
# flagged rows and all remaining rows get 1.
df %>%
  mutate(
    include = ifelse(flag != 1 & is_within_flag_window(flag, 2), 0, 1)
  )
jogvjijk

jogvjijk5#

使用which和outer:

# Zero include at every offset -2..2 around each flagged row (offset 0, i.e.
# the flagged row itself, is included here). Indices are not bounds-checked.
df$include[outer(which(df$flag == 1), -2:2, "+")] <- 0

上一步把flag == 1所在的行(偏移量为0的位置)也置成了0,因此需要把这些行的include恢复为1。注意,如果两个flag之间的距离不超过两行(窗口相互重叠),这一步尤其重要。

# Put include back to 1 on the flagged rows themselves.
df$include[which(df$flag == 1)] <- 1

   flag include
1     0       1
2     0       0
3     0       0
4     1       1
5     0       0
6     0       0
7     0       0
8     1       1
9     0       0
10    0       0
11    0       1
12    0       1
13    0       1
14    0       0
15    0       0
16    1       1
17    0       0
18    0       0
19    0       1
20    0       1

如果flag = 1出现在数据集开头或结尾的一两行内,上面的索引会超出有效范围,R将抛出错误。可以先用pmax和pmin把索引限制在1到nrow(df)之间:

## assign i for convenience/readability
i <- pmax(1, pmin(nrow(df), outer(which(df$flag==1), -2:2, `+`)))
df$include[i] <- 0

然后和之前一样,把flag == 1所在行的include恢复为1。

相关问题