这是chunk.c文件的代码,它将一个大文件(filename.txt)按行划分为较小的分块文件。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#define DEFAULT_PREFIX "x"
#define DEFAULT_CHUNK_SIZE 1000
#define ALPHABET_SIZE 26
#define MAX_DIGITS 2
void print_usage() {
printf("Usage: chunk [-l line_count | -w word_count | -c character_count] [-p prefix] [-s suffix] [-f filename.txt | < filename.txt]\n");
}
int main(int argc, char *argv[]) {
char *prefix = DEFAULT_PREFIX;
int chunk_size = DEFAULT_CHUNK_SIZE;
int suffix_start = 0;
char *filename = NULL;
// Parse command line arguments
int opt;
while ((opt = getopt(argc, argv, "l:p:s:f:")) != -1) {
switch (opt) {
case 'l':
chunk_size = atoi(optarg);
break;
case 'p':
prefix = optarg;
break;
case 's':
suffix_start = atoi(optarg);
break;
case 'f':
filename = optarg;
break;
default:
print_usage();
return 1;
}
}
// Open input file
int input_fd = STDIN_FILENO;
if (filename != NULL) {
input_fd = open(filename, O_RDONLY);
if (input_fd == -1) {
printf("Error: could not open file '%s': %s\n", filename, strerror(errno));
return -1;
}
}
// Read input file and write output files
int line_count = 0;
int chunk_count = 0;
char suffix[MAX_DIGITS + 1];
suffix[MAX_DIGITS] = '\0';
int output_fd = -1;
while (1) {
if (line_count == 0) {
// Close previous output file
if (output_fd != -1) {
close(output_fd);
output_fd = -1;
}
// Open new output file (get new filename)
snprintf(suffix, MAX_DIGITS + 1, "%02d", suffix_start + chunk_count);
char *filename = malloc(strlen(prefix) + strlen(suffix) + 1);
strcpy(filename, prefix);
strcat(filename, suffix);
output_fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR |
S_IRGRP | S_IWGRP | S_IROTH);
if (output_fd == -1) {
printf("Error: could not create file '%s': %s\n", filename,
strerror(errno));
return -1;
}
free(filename);
chunk_count++;
} // close if loop
// Read input
char buffer[chunk_size];
ssize_t bytes_read = read(input_fd, buffer, chunk_size);
if (bytes_read == -1) {
printf("Error: could not read input: %s\n", strerror(errno));
return -1;
}
if (bytes_read == 0) {
break;
}
// write output
ssize_t bytes_written = write(output_fd, buffer, bytes_read);
if (bytes_written == -1) {
printf("Error: could not write output : %s\n", strerror(errno));
return -1;
}
// Update line count
for (int i = 0; i < bytes_written; i++) {
if (buffer[i] == '\n') {
line_count++;
}
}
// Check if it's time to start a new chunk
if (line_count >= chunk_size) {
line_count = 0;
}
} // close while loop
// Close input and output files
if (input_fd != STDIN_FILENO) {
close(input_fd);
}
if (output_fd != -1) {
close(output_fd);
}
return 0;
} // close main
运行该示例,预期结果为
$ chunk -l 100 -f z_answer.jok.txt -p part- -s 00
$ echo $? # check exit status
0
$ wc *part* z_answer.jok.txt
100 669 4052 part-00
100 725 4221 part-01
100 551 3373 part-02
100 640 3763 part-03
100 588 3685 part-04
100 544 3468 part-05
90 473 3017 part-06
690 4190 25579 z_answer.jok.txt
1380 8380 51158 total
但是当我运行上面的代码时,结果是这样的。
$ chunk -l 100 -f z_answer.jok.txt -p part- -s 00
$ echo $? # check exit status
0
$ wc *part* z_answer.jok.txt
102 675 4100 part-00
101 745 4300 part-01
100 554 3400 part-02
101 640 3800 part-03
103 609 3800 part-04
100 534 3400 part-05
83 434 2779 part-06
690 4190 25579 z_answer.jok.txt
1380 8381 51158 total
我认为问题是在//更新行计数部分,并试图修复,但仍然卡住。任何想法,可以划分行的数字,用户设置?
1条答案
按热度按时间kyxcudwk1#
OP需要一个新的算法。
每个缓冲液,多次:寻找一个
\n
,写这一行,可能关闭/打开一个文件。