我有多个.gz文件(每个月1个文件,它包含每分钟的数据)存储在多个文件夹(1个文件夹为1个状态)。文件顺序如下:
C:/Users/Desktop/Data/Test_OP (i.e., Main folder or wd)
Ala (i.e. Folder 1)
Ala_2021-03_0100_JC_qualitya.txt.gz
Ala_2021-05_0100_JC_qualitya.txt.gz
Ala_2022-02_0100_JC_qualitya.txt.gz
Chi (i.e. Folder 2)
TEX_2019-06_0100+0500_JC_qualitya.txt.gz
TEX_2020-07_0100+0300_JC_qualitya.txt.gz
...
...
我将这些 .gz 文件解压后,在各自的文件夹中得到了若干 .txt 文件,并对它们进行处理,计算每 15 行所有列的平均值。我用下面的 R 代码实现了这一点。但作为输出,我得到了一个 .csv 文件,其中所有文件的 15 行平均值混在一起(即所有文件的数据被合并输出)。然而,我希望 .csv 文件中的输出内容相同,但按站点(station)分开,如下所示。另外,我目前的代码一次性读取所有文件,我希望改为按站点逐一迭代处理文件。请大家帮我修改代码,以实现所需的输出。
Desired output
Ala
Time_sp C1 [C1c] C2 [C2c] C3 [C3c] C4 [C3c] C5 [h]
2021-03-01 00:00:00+00 257 285 255 .....
2021-03-01 00:15:00+00 .....
2021-03-01 00:30:00+00 .....
..
Chi
Time_sp C1 [CC] C2 [C2] C3 [C2] C4 C5 [h]
2020-03-01 00:00:00+00 267 215 250 .....
2020-03-01 00:15:00+00 .....
2020-03-01 00:30:00+00 .....
..
我在R中使用的代码如下:
# Working directory for the whole script; all relative paths below resolve
# against it. NOTE(review): setwd() in a script is fragile (machine-specific
# path, changes global state) — prefer absolute paths; confirm before changing.
setwd("D:/Test2/")
# Stream-decompress a single .gz file to `dest` (default: same path with the
# trailing ".gz" stripped), copying in fixed-size chunks so arbitrarily large
# archives never have to fit in memory.
#
# Returns `dest` invisibly so the call composes quietly in loops/pipes.
decompress <- function(file, dest = sub("\\.gz$", "", file)) {
  # Open both connections up front; on.exit(add = TRUE) guarantees each one
  # is closed even if a read or write fails mid-copy.
  input <- gzfile(file, "rb")
  on.exit(close(input), add = TRUE)
  output <- file(dest, "wb")
  on.exit(close(output), add = TRUE)
  chunk_size <- 10 * 1024^2  # 10 MiB per read
  # readBin() returns a zero-length raw vector at end of input, which ends
  # the loop.
  while (length(chunk <- readBin(input, raw(), chunk_size)) > 0) {
    writeBin(chunk, output)
  }
  invisible(dest)
}
# Recursively find every gzip archive below the working directory and
# decompress each one next to its source file.
# `pattern` is a regular expression, not a glob: the dot must be escaped and
# the match anchored to the end of the name. The original "*.gz" is glob
# syntax and, read as a regex, can match unintended file names.
files <- list.files(pattern = "\\.gz$", full.names = TRUE, recursive = TRUE)
for (file in files) {
  decompress(file)
}
library(data.table)
library(dplyr)
library(tidyr)
#List of files
# List of decompressed data files. `pattern` is a regex: escape the dot and
# keep the end-of-string anchor so only names actually ending in ".txt"
# match (the original ".txt$" also matched e.g. "report_xtxt").
filelist <- list.files("D:/Test2/", full.names = TRUE, recursive = TRUE,
                       pattern = "\\.txt$")
#Read the files
# Read each file, skipping its leading comment header.
# The header is assumed to end at the first line that is exactly "*/";
# everything after that line is the data table.
dt <- lapply(filelist, function(file) {
  lines <- readLines(file)
  comment_end <- match("*/", lines)
  # Guard: a file with no "*/" terminator gives comment_end = NA, and
  # fread(skip = NA) errors out. Read such files from the top instead.
  fread(file, skip = if (is.na(comment_end)) 0L else comment_end)
})
#Adjust Column names
# Harmonise column names across files so they can later be bound by name.
dt.tidied <- lapply(dt, function(tbl) {
  # Repair the mis-encoded degree symbol; skip_absent lets files without
  # this column pass through untouched.
  setnames(tbl, old = "T2 [?C]", new = "T2 [°C]", skip_absent = TRUE)
  # Swap square brackets for parentheses in every column name.
  colnames(tbl) <- gsub("\\]", ")", gsub("\\[", "(", colnames(tbl)))
  tbl
})
#bind, filling missing columns to NA
# Bind all per-file tables into one table; use.names matches columns by name
# (not position) and fill = TRUE pads columns missing from a file with NA.
merged <- rbindlist(dt.tidied, fill = TRUE, use.names = TRUE)
# Split the ISO-8601 "Date/Time" stamp (e.g. "2021-03-01T00:01:00") into
# separate Date and Time character columns at the literal "T" separator.
# NOTE: `%>%`, group_by/summarise/across/n() below come from dplyr, which
# must be attached.
mn <- merged %>% separate(`Date/Time`, into = c("Date", "Time"), sep =
"T")
# Average every column over consecutive blocks of 15 rows (gl(n, 15, n)
# labels rows 1-15 as group 1, 16-30 as group 2, ...). A block whose column
# is more than 80% NA yields NA instead of a mean of mostly-missing data.
# NOTE(review): mean() on the character Date and Time columns returns NA
# with a warning, so those columns come out as NA in the result — confirm
# this is intended.
mnf <- mn %>%
as_tibble() %>%
group_by(group = as.integer(gl(n(), 15, n()))) %>%
summarise(across(everything(), ~ if(mean(is.na(.x)) > 0.8) NA else
mean(.x, na.rm = TRUE)))
# Print the 15-minute means, then persist them to disk.
mnf
write.csv(mnf, 'D:/Test2/15min_full.csv')
输入数据的dput()如下:
structure(list(`Date/Time` = c("2021-03-01T00:01:00", "2021-03-
01T00:02:00", "2021-03-01T00:03:00", "2021-03-01T00:04:00", "2021-03-
01T00:05:00", "2021-03-01T00:06:00", "2021-03-01T00:07:00", "2021-03-
01T00:08:00", "2021-03-01T00:09:00", "2021-03-01T00:10:00", "2021-03-
01T00:11:00", "2021-03-01T00:12:00", "2021-03-01T00:13:00", "2021-03-
01T00:14:00", "2021-03-01T00:15:00", "2021-03-01T00:16:00", "2021-03-
01T00:17:00", "2021-03-01T00:18:00", "2021-03-01T00:19:00", "2021-03-
01T00:20:00", "2021-03-01T00:21:00", "2021-03-01T00:22:00", "2021-03-
01T00:23:00", "2021-03-01T00:24:00", "2021-03-01T00:25:00", "2021-03-
01T00:26:00", "2021-03-01T00:27:00", "2021-03-01T00:28:00", "2021-03-
01T00:29:00", "2021-03-01T00:30:00"), `XY [XY]` = c(0.990641,
0.990641, 0.990641, 0.990641, 0.990641, 0.990641, 0.990641, 0.990641,
0.990641, 0.990641, 0.990641, 0.990641, 0.990641, 0.990641, 0.990641,
0.990641, 0.990641, 0.990641, 0.990641, 0.990641, 0.990641, 0.990641,
0.990641, 0.990641, 0.990641, 0.990641, 0.990641, 0.990641, 0.990641,
0.990641), `C1 [CC]` = c(257L, 257L, 257L, 257L, 257L, 257L, 257L,
257L, 257L, 257L, 257L, 257L, 257L, 257L, 257L, 257L, 257L,
257L, 257L, 257L, 257L, 257L, 257L, 257L, 257L, 257L, 257L,
257L, 257L, 257L), Cc = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), `C2 [C2]` = c(285L,
285L, 285L, 285L, 285L, 285L, 285L, 285L, 285L, 285L, 285L,
285L, 285L, 285L, 285L, 285L, 285L, 285L, 285L, 285L, 285L,
285L, 285L, 285L, 285L, 285L, 285L, 285L, 285L, 285L), Dc = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
`C3 [C2]` = c(255L, 255L, 255L, 255L, 255L, 255L, 255L,
255L, 255L, 255L, 255L, 255L, 255L, 255L, 255L, 255L, 255L,
255L, 255L, 255L, 255L, 255L, 255L, 255L, 255L, 255L, 255L,
255L, 255L, 255L), Ac = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), C4 = c(0.463735, 0.463735,
0.463735, 0.463735, 0.463735, 0.463735, 0.463735, 0.463735,
0.463735, 0.463735, 0.463735, 0.463735, 0.463735, 0.463735,
0.463735, 0.463735, 0.463735, 0.463735, 0.463735, 0.463735,
0.463735, 0.463735, 0.463735, 0.463735, 0.463735, 0.463735,
0.463735, 0.463735, 0.463735, 0.463735), `C5 [h]` = c(1013L,
1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L,
1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L,
1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L,
1013L, 1013L), `C6 [%]` = c(43L, 43L, 43L, 43L, 43L, 43L,
43L, 43L, 43L, 43L, 43L, 43L, 43L, 43L, 43L, 43L, 43L, 43L,
43L, 43L, 43L, 43L, 43L, 43L, 43L, 43L, 43L, 43L, 43L, 43L
), `C7 [E2]` = c(390L, 390L, 390L, 390L, 390L, 390L,
390L, 390L, 390L, 390L, 390L, 390L, 390L, 390L, 390L, 390L,
390L, 390L, 390L, 390L, 390L, 390L, 390L, 390L, 390L, 390L,
390L, 390L, 390L, 390L), Jc = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), `D [S]` = c(62.3716,
62.3716, 62.3716, 62.3716, 62.3716, 62.3716, 62.3716, 62.3716,
62.3716, 62.3716, 62.3716, 62.3716, 62.3716, 62.3716, 62.3716,
62.3716, 62.3716, 62.3716, 62.3716, 62.3716, 62.3716, 62.3716,
62.3716, 62.3716, 62.3716, 62.3716, 62.3716, 62.3716, 62.3716,
62.3716), `Sw [S2]` = c(1392.95, 1392.95, 1392.95, 1392.95,
1392.95, 1392.95, 1392.95, 1392.95, 1392.95, 1392.95, 1392.95,
1392.95, 1392.95, 1392.95, 1392.95, 1392.95, 1392.95, 1392.95,
1392.95, 1392.95, 1392.95, 1392.95, 1392.95, 1392.95, 1392.95,
1392.95, 1392.95, 1392.95, 1392.95, 1392.95), `SW [Q2]` =
c(389.164, 389.164, 389.164, 389.164, 389.164, 389.164, 389.164,
389.164, 389.164, 389.164, 389.164, 389.164, 389.164, 389.164,
389.164, 389.164, 389.164, 389.164, 389.164, 389.164, 389.164,
389.164, 389.164, 389.164, 389.164, 389.164, 389.164, 389.164,
389.164, 389.164), `OA [H2]` = c(646.61, 646.61, 646.61, 646.61,
646.61, 646.61, 646.61, 646.61, 646.61, 646.61, 646.61, 646.61,
646.61, 646.61, 646.61, 646.61, 646.61, 646.61, 646.61, 646.61,
646.61, 646.61, 646.61, 646.61, 646.61, 646.61, 646.61, 646.61,
646.61, 646.61), `T2 [C]` = c(3.7, 3.7, 3.7, 3.7, 3.7,
3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7,
3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7, 3.7,
3.7), Lc = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-30L))
1条答案
按热度按时间wkftcu5l1#
如果你将文件路径包含到你的 DataFrame 中,就可以按站点和月份进行分组和过滤。我在这里也使用
readr::read_*
,因为它可以很好地处理文件列表,可以包含结果的路径,而不需要任何额外的努力,也可以处理压缩文件。所以基本上:
最后一部分包括玩具数据生成,原则上处理可能看起来像这样:
如果你宁愿做最小的改动,你可以替换你的
未测试(Untested)
以获得结果 Dataframe 的文件名并从那里继续。
对于较大的数据集,避免一次性读取所有文件是有意义的,可以改为迭代各站点目录并逐个处理文件。
生成样本数据:
创建于2023-05-15带有reprex v2.0.2