R语言 根据另一列中的值计算列中某些值的出现次数

cdmah0mi  于 2023-04-27  发布在  其他
关注(0)|答案(4)|浏览(144)

我在R中有一个很大的dataframe,在第一列中有Names(重复),然后有几列具有不同的字符值。dataframe是这样的:

Names <- c("Benjamin Blue", "Benjamin Blue", "Benjamin Blue", "Sarah Red", "Sarah Red", "Mark Black", "Mark Black", "Mark Black", "Leonie White")
c1 <- c("Health", "Health", "Infrastructure", "Traffic", "Security", "Security", "Security", "Social", "" )
c2 <- c("Social", "", "Traffic", "", "Traffic", "Energy", "Health", "Social", "Security")
c3 <- c("", "", "Infrastructure", "Energy", "Energy", "", "Health", "Securtiy", "Social")
df_test <- data.frame(Names, c1, c2, c3)
df_test

如何计算所有个体(Benjamin BlueSarah RedMark BlackLeonie White)在c1c2c3列中命名主题TrafficSocialHealth的次数?
我的结果应该是这样的:

Names_result <- c("Benjamin Blue", "Sarah Red", "Mark Black", "Leonie White")
Traffic <- c(1, 2, 0, 0)
Social <- c(1, 0, 2, 1)
Health <- c(2, 0, 2, 0)

我尝试了以下代码:

library(dplyr)

df_test %>%
rowwise() %>% 
mutate(Traffic = sum(na.omit(c_across(c1:c3)) == "Social"),
       Traffic = ifelse(all(is.na(c_across(c1:c3))), NA, Traffic))

df_test %>% 
rowwise() %>% 
mutate(Social = sum(na.omit(c_across(c1:c3)) == "Social"),
       Social = ifelse(all(is.na(c_across(c1:c3))), NA, Social))

df_test %>% 
rowwise() %>% 
mutate(Health = sum(na.omit(c_across(c1:c3)) == "Health"),
       Health = ifelse(all(is.na(c_across(c1:c3))), NA, Health))

但是这并没有将一行中每个个体的值相加,也不会导致一个 Dataframe 。

mbskvtky

mbskvtky1#

我们可以先将“wide”格式重塑为“long”格式,filter只包含相关项,然后pivot_wider重塑回“wide”格式。
我们可以使用values_fn = length来“计数”每一项的出现,values_fill = 0pivot_wider中用0填充缺失的项。

library(tidyverse)

df_test %>% pivot_longer(-Names, names_to = NULL, values_to = "value") %>% 
  filter(value %in% c("Traffic", "Social", "Health")) %>% 
  pivot_wider(names_from = value, values_fn = length, values_fill = 0)

# A tibble: 4 × 4
  Names         Health Social Traffic
  <chr>          <int>  <int>   <int>
1 Benjamin Blue      2      1       1
2 Sarah Red          0      0       2
3 Mark Black         2      2       0
4 Leonie White       0      1       0
r6l8ljro

r6l8ljro2#

base解决方案:

cbind(df_test[1], stack(df_test, -1))[1:2] |>
  table() |>
  as.data.frame.matrix() |>
  subset(select = c(Traffic, Social, Health))

#               Traffic Social Health
# Benjamin Blue       1      1      2
# Leonie White        0      1      0
# Mark Black          0      2      2
# Sarah Red           2      0      0
nbysray5

nbysray53#

以下是使用pivot_longercountpivot_wider的一种方法:

library(dplyr)
library(tidyr)

df_test %>% 
  pivot_longer(cols = c1:c3) %>% 
  filter(value != "") %>% 
  count(Names, value) %>% 
  pivot_wider(names_from = value,
              values_from = n,
              values_fill = 0)

#> # A tibble: 4 × 8
#>   Names         Health Infrastructure Social Traffic Security Energy Securtiy
#>   <chr>          <int>          <int>  <int>   <int>    <int>  <int>    <int>
#> 1 Benjamin Blue      2              2      1       1        0      0        0
#> 2 Leonie White       0              0      1       0        1      0        0
#> 3 Mark Black         2              0      2       0        2      1        1
#> 4 Sarah Red          0              0      0       2        1      2        0

另一种方法是使用dplyover::over。免责声明:我是维护者,软件包不在CRAN上。

library(dplyr)
library(dplyover) # https://github.com/TimTeaFan/dplyover

df_test %>% 
  group_by(Names) %>% 
  summarise(over(c("Traffic", "Social", "Health"),
                 ~ sum(unlist(pick(c1:c3)) %in% .x)))

#> # A tibble: 4 × 4
#>   Names         Traffic Social Health
#>   <chr>           <int>  <int>  <int>
#> 1 Benjamin Blue       1      1      2
#> 2 Leonie White        0      1      0
#> 3 Mark Black          0      2      2
#> 4 Sarah Red           2      0      0

数据来自OP

Names <- c("Benjamin Blue", "Benjamin Blue", "Benjamin Blue", "Sarah Red", "Sarah Red", "Mark Black", "Mark Black", "Mark Black", "Leonie White")
c1 <- c("Health", "Health", "Infrastructure", "Traffic", "Security", "Security", "Security", "Social", "" )
c2 <- c("Social", "", "Traffic", "", "Traffic", "Energy", "Health", "Social", "Security")
c3 <- c("", "", "Infrastructure", "Energy", "Energy", "", "Health", "Securtiy", "Social")
df_test<- data.frame(Names, c1, c2, c3)

创建于2023-04-26带有reprex v2.0.2

u4dcyp6a

u4dcyp6a4#

data.table解决方案

library(data.table)
setDT(df_test)

choices <- c("Traffic", "Social", "Health")

dcast(df_test[, .(topic = unlist(.SD[, c1:c3])), Names][topic %in% choices, ], Names ~ topic, fun.aggregate = length)

dcast(melt(df_test, id = "Names")[value %in% choices], Names ~ value, fun.aggregate = length)

结果

Names Health Social Traffic
1: Benjamin Blue      2      1       1
2:  Leonie White      0      1       0
3:    Mark Black      2      2       0
4:     Sarah Red      0      0       2

相关问题