How do I read files iteratively in R and get separate output for each group?

chhkpiq4 · posted 2023-05-20 · in: Other
Follow (0) | Answers (1) | Views (117)

I have multiple .gz files (one file per month, each containing minute-by-minute data) stored in multiple folders (one folder per station). The files are laid out as follows:

C:/Users/Desktop/Data/Test_OP (i.e., Main folder or wd)
 Ala (i.e. Folder 1)
   Ala_2021-03_0100_JC_qualitya.txt.gz
   Ala_2021-05_0100_JC_qualitya.txt.gz
   Ala_2022-02_0100_JC_qualitya.txt.gz
 
 Chi (i.e. Folder 2)
   TEX_2019-06_0100+0500_JC_qualitya.txt.gz
   TEX_2020-07_0100+0300_JC_qualitya.txt.gz
   ...
   ...

I have un-gzipped them to get the .txt files in their respective folders and processed them to get averages of all columns over every 15 rows, using the R code below. As output, however, I get a single .csv file containing the 15-row averages of all files together (i.e., the data from all files is lumped together). I would like the same output in the .csv file, but separated by station, as described below. Also, the code I am using reads all files at once; I would like to process the files iteratively, station by station. Could anyone please help me modify the code to achieve the desired output?

Desired output
Ala
Time_sp C1 [C1c] C2 [C2c] C3 [C3c] C4 [C3c] C5 [h]               
2021-03-01 00:00:00+00 257 285 255 .....
2021-03-01 00:15:00+00  .....
2021-03-01 00:30:00+00  .....
..

Chi
Time_sp  C1 [CC]  C2 [C2]  C3 [C2]  C4  C5 [h]
2020-03-01 00:00:00+00  267  215  250 .....
2020-03-01 00:15:00+00  .....
2020-03-01 00:30:00+00  .....
..

The code I am using in R is as follows:

setwd("D:/Test2/")

decompress <- function(file, dest = sub("\\.gz$", "", file)) {
  # Set up source and destination connections
  src <- gzfile(file, "rb")
  on.exit(close(src), add = TRUE)
  dst <- file(dest, "wb")
  on.exit(close(dst), add = TRUE)

  # Copy decompressed contents from source to destination
  BATCH_SIZE <- 10 * 1024^2
  repeat {
    bytes <- readBin(src, raw(), BATCH_SIZE)
    if (length(bytes) != 0) {
      writeBin(bytes, dst)
    } else {
      break
    }
  }
  invisible(dest)
}

files <- list.files(pattern = "*.gz", full.names = TRUE, recursive = TRUE)
for (file in files) {
  decompress(file)
}

library(data.table)
library(tidyr)
library(dplyr)

#List of files
filelist <- list.files("D:/Test2/", full.names = TRUE, recursive = TRUE,
                       pattern = ".txt$")

#Read the files
dt <- lapply(filelist, function(file) {
  lines <- readLines(file)
  comment_end = match("*/", lines)
  fread(file, skip = comment_end)
})

#Adjust Column names
dt.tidied <- lapply(dt, FUN = function(x){
  #adjust ? to degree
  setnames(x, old = "T2 [?C]", new = "T2 [°C]", skip_absent = TRUE)

  colnames(x) <- gsub("\\[", "(", colnames(x))
  colnames(x) <- gsub("\\]", ")", colnames(x))

  #return
  return(x)
})

 #bind, filling missing columns to NA
merged <- rbindlist(dt.tidied, fill = TRUE, use.names = TRUE)

mn <- merged %>% separate(`Date/Time`, into = c("Date", "Time"), sep = "T")

mnf <- mn %>% 
  as_tibble() %>%
  group_by(group = as.integer(gl(n(), 15, n()))) %>%
  summarise(across(everything(),
                   ~ if (mean(is.na(.x)) > 0.8) NA else mean(.x, na.rm = TRUE)))

mnf

write.csv(mnf, 'D:/Test2/15min_full.csv')

The dput() of the input data is as follows:

structure(list(`Date/Time` = c("2021-03-01T00:01:00", "2021-03-01T00:02:00", 
"2021-03-01T00:03:00", "2021-03-01T00:04:00", "2021-03-01T00:05:00", 
"2021-03-01T00:06:00", "2021-03-01T00:07:00", "2021-03-01T00:08:00", 
"2021-03-01T00:09:00", "2021-03-01T00:10:00", "2021-03-01T00:11:00", 
"2021-03-01T00:12:00", "2021-03-01T00:13:00", "2021-03-01T00:14:00", 
"2021-03-01T00:15:00", "2021-03-01T00:16:00", "2021-03-01T00:17:00", 
"2021-03-01T00:18:00", "2021-03-01T00:19:00", "2021-03-01T00:20:00", 
"2021-03-01T00:21:00", "2021-03-01T00:22:00", "2021-03-01T00:23:00", 
"2021-03-01T00:24:00", "2021-03-01T00:25:00", "2021-03-01T00:26:00", 
"2021-03-01T00:27:00", "2021-03-01T00:28:00", "2021-03-01T00:29:00", 
"2021-03-01T00:30:00"), `XY [XY]` = c(0.990641, 
0.990641, 0.990641, 0.990641, 0.990641, 0.990641, 0.990641, 0.990641, 
0.990641, 0.990641, 0.990641, 0.990641, 0.990641, 0.990641, 0.990641, 
0.990641, 0.990641, 0.990641, 0.990641, 0.990641, 0.990641, 0.990641, 
0.990641, 0.990641, 0.990641, 0.990641, 0.990641, 0.990641, 0.990641, 
0.990641), `C1 [CC]` = c(257L, 257L, 257L, 257L, 257L, 257L, 257L, 
257L, 257L, 257L, 257L, 257L, 257L, 257L, 257L, 257L, 257L, 
257L, 257L, 257L, 257L, 257L, 257L, 257L, 257L, 257L, 257L, 
257L, 257L, 257L), Cc = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), `C2 [C2]` = c(285L, 
285L, 285L, 285L, 285L, 285L, 285L, 285L, 285L, 285L, 285L, 
285L, 285L, 285L, 285L, 285L, 285L, 285L, 285L, 285L, 285L, 
285L, 285L, 285L, 285L, 285L, 285L, 285L, 285L, 285L), Dc = c(0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), 
`C3 [C2]` = c(255L, 255L, 255L, 255L, 255L, 255L, 255L, 
255L, 255L, 255L, 255L, 255L, 255L, 255L, 255L, 255L, 255L, 
255L, 255L, 255L, 255L, 255L, 255L, 255L, 255L, 255L, 255L, 
255L, 255L, 255L), Ac = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), C4 = c(0.463735, 0.463735, 
0.463735, 0.463735, 0.463735, 0.463735, 0.463735, 0.463735, 
0.463735, 0.463735, 0.463735, 0.463735, 0.463735, 0.463735, 
0.463735, 0.463735, 0.463735, 0.463735, 0.463735, 0.463735, 
0.463735, 0.463735, 0.463735, 0.463735, 0.463735, 0.463735, 
0.463735, 0.463735, 0.463735, 0.463735), `C5 [h]` = c(1013L, 
1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 
1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 
1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 
1013L, 1013L), `C6 [%]` = c(43L, 43L, 43L, 43L, 43L, 43L, 
43L, 43L, 43L, 43L, 43L, 43L, 43L, 43L, 43L, 43L, 43L, 43L, 
43L, 43L, 43L, 43L, 43L, 43L, 43L, 43L, 43L, 43L, 43L, 43L
), `C7 [E2]` = c(390L, 390L, 390L, 390L, 390L, 390L, 
390L, 390L, 390L, 390L, 390L, 390L, 390L, 390L, 390L, 390L, 
390L, 390L, 390L, 390L, 390L, 390L, 390L, 390L, 390L, 390L, 
390L, 390L, 390L, 390L), Jc = c(0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), `D [S]` = c(62.3716, 
62.3716, 62.3716, 62.3716, 62.3716, 62.3716, 62.3716, 62.3716, 
62.3716, 62.3716, 62.3716, 62.3716, 62.3716, 62.3716, 62.3716, 
62.3716, 62.3716, 62.3716, 62.3716, 62.3716, 62.3716, 62.3716, 
62.3716, 62.3716, 62.3716, 62.3716, 62.3716, 62.3716, 62.3716, 
62.3716), `Sw [S2]` = c(1392.95, 1392.95, 1392.95, 1392.95, 
1392.95, 1392.95, 1392.95, 1392.95, 1392.95, 1392.95, 1392.95, 
1392.95, 1392.95, 1392.95, 1392.95, 1392.95, 1392.95, 1392.95, 
1392.95, 1392.95, 1392.95, 1392.95, 1392.95, 1392.95, 1392.95, 
1392.95, 1392.95, 1392.95, 1392.95, 1392.95), `SW [Q2]` = 
c(389.164, 389.164, 389.164, 389.164, 389.164, 389.164, 389.164, 
389.164, 389.164, 389.164, 389.164, 389.164, 389.164, 389.164, 
389.164, 389.164, 389.164, 389.164, 389.164, 389.164, 389.164, 
389.164, 389.164, 389.164, 389.164, 389.164, 389.164, 389.164, 
389.164, 389.164), `OA [H2]` = c(646.61, 646.61, 646.61, 646.61, 
646.61, 646.61, 646.61, 646.61, 646.61, 646.61, 646.61, 646.61, 
646.61, 646.61, 646.61, 646.61, 646.61, 646.61, 646.61, 646.61, 
646.61, 646.61, 646.61, 646.61, 646.61, 646.61, 646.61, 646.61, 
646.61, 646.61), `T2 [C]` = c(3.7, 3.7, 3.7, 3.7, 3.7, 
3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 
3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 
3.7), Lc = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA, 
-30L))
Answer 1 (wkftcu5l)

If you include the file path in your data frame, you can group and filter by state and month. I'm also using readr::read_* here, since it handles lists of files well, can include the path in the result without any extra effort, and can also read compressed files directly.
So basically:

  • Include the file path in the data frame
  • Group and aggregate the dataset as needed
  • Split it into separate data frames
  • Write each part to a separate file

Toy data generation is included in the last part; in principle, the processing could look like this:

library(fs)
library(tidyr)
library(purrr)
library(readr)
library(dplyr)
library(stringr)

# read all files to a single df, include file_path and state in resulting df
main_d <- path("Test_OP")
all_storms <- dir_ls(main_d, glob = "*.tsv.gz", recurse = TRUE) |>
  read_tsv(id = "file_path", show_col_types = FALSE) |>
  # read_tsv skip argument works the same as in read.tsv:
  # read_tsv(id = "file_path", skip = 27, show_col_types = FALSE)
  mutate(state = str_split_i(file_path, "/",2), 
         month = str_split_i(file_path, "/",3), .after = 1) 
all_storms
#> # A tibble: 9,000 × 5
#>    file_path                     state month  wind pressure
#>    <chr>                         <chr> <chr> <dbl>    <dbl>
#>  1 Test_OP/Ala/Feb/storms.tsv.gz Ala   Feb      85      940
#>  2 Test_OP/Ala/Feb/storms.tsv.gz Ala   Feb      35     1007
#>  3 Test_OP/Ala/Feb/storms.tsv.gz Ala   Feb      55      990
#>  4 Test_OP/Ala/Feb/storms.tsv.gz Ala   Feb      40      997
#>  5 Test_OP/Ala/Feb/storms.tsv.gz Ala   Feb      45      988
#>  6 Test_OP/Ala/Feb/storms.tsv.gz Ala   Feb      40     1000
#>  7 Test_OP/Ala/Feb/storms.tsv.gz Ala   Feb      45     1000
#>  8 Test_OP/Ala/Feb/storms.tsv.gz Ala   Feb     130      927
#>  9 Test_OP/Ala/Feb/storms.tsv.gz Ala   Feb      55      984
#> 10 Test_OP/Ala/Feb/storms.tsv.gz Ala   Feb     105      959
#> # ℹ 8,990 more rows

# some dummy processing on grouped data
storms_agr <- all_storms |>
  group_by(state, month) |> 
  summarise(avg_w = mean(wind)) |> 
  # currently grouped by state
  pivot_wider(names_from = month, values_from = avg_w) |>
  ungroup() 
#> `summarise()` has grouped output by 'state'. You can override using the
#> `.groups` argument.
storms_agr
#> # A tibble: 3 × 4
#>   state   Feb   Jan   Mar
#>   <chr> <dbl> <dbl> <dbl>
#> 1 Ala    48.2  49.3  49.5
#> 2 Chi    51.0  49.5  48.6
#> 3 Mis    50.7  51.3  50.0

# split by state and save each part to a separate file
split(storms_agr, ~ state) |>
  iwalk(~ write_csv(.x, path(main_d, .y, ext = "csv")))

# resulting files:
dir_tree(main_d, recurse = FALSE)
#> Test_OP
#> ├── Ala
#> ├── Ala.csv
#> ├── Chi
#> ├── Chi.csv
#> ├── Mis
#> └── Mis.csv
# one sample:
read_csv("Test_OP/Ala.csv", show_col_types = FALSE)
#> # A tibble: 1 × 4
#>   state   Feb   Jan   Mar
#>   <chr> <dbl> <dbl> <dbl>
#> 1 Ala    48.2  49.3  49.5

If you'd rather make a minimal change, you could replace your

xy <- do.call(rbind, lapply(fnames, read.table, header = TRUE, sep = "\t",
                            check.names = FALSE, skip = 27))

with (untested)

xy <- list_rbind(lapply(fnames, read.table, header=TRUE, sep = "\t", check.names = FALSE, skip=27), names_to = "file_path")

to get the file names into the resulting data frame and continue from there.
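One caveat with that untested snippet: lapply() returns an unnamed list, so names_to = "file_path" has no paths to pick up on its own. A minimal sketch (also untested, assuming fnames is the character vector of .txt paths referred to above) that names the list first:

tables <- lapply(fnames, read.table, header = TRUE, sep = "\t",
                 check.names = FALSE, skip = 27)

# Naming the list lets list_rbind() record each path in a `file_path` column,
# which can then be split into state and month as in the readr version above.
xy <- purrr::list_rbind(purrr::set_names(tables, fnames), names_to = "file_path")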
For larger datasets, it makes sense not to read all the files at once but to iterate over the state directories and process each file (or state) separately.
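A rough sketch of what that per-state iteration could look like, reusing the toy Test_OP / storms layout generated below; the wind average and avg_w column are stand-ins, and for the real data the read step (skip, column fixes) and the 15-row averaging from the question would go inside the loop instead:

library(fs)
library(readr)
library(dplyr)
library(purrr)

main_d <- path("Test_OP")

# Process one state directory at a time, so only that state's files are in
# memory at any point, and write one CSV per state.
dir_ls(main_d, type = "directory") |>
  walk(function(state_dir) {
    state_name <- path_file(state_dir)
    dir_ls(state_dir, glob = "*.tsv.gz", recurse = TRUE) |>
      read_tsv(id = "file_path", show_col_types = FALSE) |>
      summarise(avg_w = mean(wind)) |>
      mutate(state = state_name, .before = 1) |>
      write_csv(path(main_d, state_name, ext = "csv"))
  })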
Generating the sample data:

# create sample files
setwd(tempdir())
main_d <- path("Test_OP") |> dir_create()
expand_grid(state = c("Ala", "Chi", "Mis"), 
            month = month.abb[1:3]) |>
  tibble::deframe() |>
  iwalk(~ {
    tsv_dest <- dir_create(main_d, .y, .x)
    dplyr::storms |>
      select(wind, pressure) |>
      slice_sample(n = 1000) |>
      write_tsv(path(tsv_dest, "storms", ext = "tsv.gz"))
  })
dir_tree(main_d)
#> Test_OP
#> ├── Ala
#> │   ├── Feb
#> │   │   └── storms.tsv.gz
#> │   ├── Jan
#> │   │   └── storms.tsv.gz
#> │   └── Mar
#> │       └── storms.tsv.gz
#> ├── Chi
#> │   ├── Feb
#> │   │   └── storms.tsv.gz
#> │   ├── Jan
#> │   │   └── storms.tsv.gz
#> │   └── Mar
#> │       └── storms.tsv.gz
#> └── Mis
#>     ├── Feb
#>     │   └── storms.tsv.gz
#>     ├── Jan
#>     │   └── storms.tsv.gz
#>     └── Mar
#>         └── storms.tsv.gz

Created on 2023-05-15 with reprex v2.0.2
