改变R中df的结构

y3bcpkx1  于 2023-03-10  发布在  其他
关注(0)|答案(2)|浏览(116)

我有一个来自秘鲁人口普查的数据框,看起来像这样:(抱歉,我无法翻译每一行)

# Sample of the raw census export. Each "AREA # ..." row in column2 opens the
# section for one district; the rows below it list occupation groups with
# their Men / Women / Total counts in column3, column4 and column5. All cells
# are kept as character strings, exactly as they appear in the export.
df <- data.frame(
  # column1 carries no data in this sample.
  column1 = rep(NA, 32),
  # Row labels: area markers, the question title and the occupation groups.
  column2 = c(
    "AREA # 010101", NA,
    "P5a+: Last week, according to large group, what is the main occupation?",
    NA,
    "Miembros del Poder Ejecutivo, Legislativo, Judicial y personal directivo de la administración pública y privada",
    "Scientific and intellectual professionals",
    "Profesionales técnicos",
    "Jefes y empleados administrativos",
    "Trabajadores de los servicios y vendedores de comercios y mercados",
    "Agricultores y trabajadores calificados agropecuarios, forestales y pesqueros",
    "Trabajadores de la construcción, edificación, productos artesanales, electricidad y las telecomunicaciones",
    "Operadores de maquinaria industrial, ensambladores y conductores de transporte",
    "Ocupaciones elementales",
    "Ocupaciones militares y policiales",
    "Total", NA, "No Aplica :", NA,
    "AREA # 010102", NA,
    "P5a+: Last week, according to large group, what is the main occupation?",
    NA,
    "Miembros del Poder Ejecutivo, Legislativo, Judicial y personal directivo de la administración pública y privada",
    "Scientific and intellectual professionals",
    "Profesionales técnicos",
    "Jefes y empleados administrativos",
    "Trabajadores de los servicios y vendedores de comercios y mercados",
    "Agricultores y trabajadores calificados agropecuarios, forestales y pesqueros",
    "Trabajadores de la construcción, edificación, productos artesanales, electricidad y las telecomunicaciones",
    "Ocupaciones elementales",
    "Ocupaciones militares y policiales",
    "Total"
  ),
  # District header, the "Men" column header and the counts for men.
  column3 = c(
    "010101 Amazonas, Chachapoyas, district: Chachapoyas",
    NA, "P: Gender", "Men",
    "59", "1510", "623", "551", "1022", "567", "1509", "820", "1136", "212",
    "8009", NA, "17898", NA,
    "010102 Amazonas, Chachapoyas, district: Asunción",
    NA, "P: Gender", "Men",
    "1", "5", "1", "1", "1", "4", "1", "17", "3", "34"
  ),
  # Counts for women; "-" appears where the export shows no value.
  column4 = c(
    NA, NA, NA, "Women",
    "28", "1411", "491", "915", "1864", "113", "136", "9", "1091", "61",
    "6119", NA, NA, NA,
    NA, NA, NA, "Women",
    "-", "2", "1", "-", "2", "8", "-", "6", "-", "19"
  ),
  # Row totals (men + women).
  column5 = c(
    NA, NA, NA, "Total",
    "87", "2921", "1114", "1466", "2886", "680", "1645", "829", "2227", "273",
    "14128", NA, NA, NA,
    NA, NA, NA, "Total",
    "1", "7", "2", "1", "3", "12", "1", "23", "3", "53"
  )
)

我想得到一个新的数据框,汇总其中的一些具体信息,如下所示:

# Target layout: one row per district, with two ratios and the worker total.
# data.frame() keeps its default check.names = TRUE here, so the spaces and
# parentheses in the column names below are turned into dots.
df_to_obtain <- data.frame(
  # District names come from the "... district: <name>" header rows.
  `District` = c("Chachapoyas", "Asunción"),
  # Women in the group / everyone in the group: 1411 / 2921 and 2 / 7.
  `Women Scientific and intellectual professionals(PR)` = c(
    0.4830537,
    0.2857143
  ),
  # Everyone in the group / all workers: 2921 / 14128 and 7 / 53.
  `Scientific and intellectual professionals(PR)` = c(
    0.2067525,
    0.1320755
  ),
  # The "Total" row of each district.
  `Total workers` = c(14128, 53)
)

有没有办法得到它?

rks48beu

rks48beu1#

我们可以这样做:

library(stringr)
library(dplyr)
# One output row per "AREA # ..." section: the district name, the share of
# women among scientific and intellectual professionals, that group's share
# of all workers, and the total number of workers.
df %>%
  group_by(
    # A new section starts at every "AREA #" label. str_detect() returns NA
    # for missing labels; coalesce() counts those rows as "not a header".
    grp = cumsum(coalesce(str_detect(column2, "AREA #"), FALSE))
  ) %>%
  summarise(
    # The first row of a section carries "... district: <name>" in column3.
    district = str_extract(first(column3), ".*district: (\\w+)", group = 1),
    # Pick the raw (character) cells first; they are converted below.
    sci_women = column4[
      column2 %in% "Scientific and intellectual professionals"
    ],
    sci_all = column5[
      column2 %in% "Scientific and intellectual professionals"
    ],
    all_workers = column5[column2 %in% "Total"],
    .groups = "drop"
  ) %>%
  mutate(
    Women.Scientific.and.intellectual.professionals.PR. =
      as.numeric(sci_women) / as.numeric(sci_all),
    `Scientific and intellectual professionals(PR)` =
      as.numeric(sci_all) / as.numeric(all_workers),
    Total_workers = as.numeric(all_workers)
  ) %>%
  # Drop the section counter and the intermediate raw cells.
  select(-grp, -sci_women, -sci_all, -all_workers)
  • 输出
# A tibble: 2 × 4
  district    Women.Scientific.and.intellectual.professionals.PR. `Scientific and intellectual professionals(PR)` Total_workers
  <chr>                                                     <dbl>                                           <dbl>         <dbl>
1 Chachapoyas                                               0.483                                           0.207         14128
2 Asunción                                                  0.286                                           0.132            53
f0ofjuux

f0ofjuux2#

library(tidyverse)
# Share of women among "Scientific and intellectual professionals", per
# district. In `df` the row labels are in column2, the district header rows
# are in column3, and the Women / Total counts are in column4 / column5.
df %>%
  # Header rows look like "<code> <region>, <province>, district: <name>".
  # Every other row yields NA here and inherits the name via fill() below.
  mutate(district = str_extract(column3, "(?<=district: ).+")) %>%
  fill(district) %>%
  # %in% never returns NA, so rows with a missing label are simply dropped.
  filter(column2 %in% "Scientific and intellectual professionals") %>%
  # Convert only after filtering, so header cells such as "Women" and
  # "Total" are not pushed through as.numeric().
  mutate(across(column4:column5, as.numeric)) %>%
  group_by(district) %>%
  summarize("Women scientific and intellectual professionals(PR)" =
              sum(column4, na.rm = TRUE) / sum(column5, na.rm = TRUE))
# With the sample data: Asunción = 2 / 7, Chachapoyas = 1411 / 2921.

结果

# A tibble: 2 × 2
  district    `Women scientific and intellectual professionals(PR)`
  <chr>                                                       <dbl>
1 Asunción                                                    0.286
2 Chachapoyas                                                 0.483

相关问题