在c#中,将大文件(filename.txt)按行划分为较小的分块文件的代码存在问题

flmtquvp  于 2023-03-29  发布在  C#
关注(0)|答案(1)|浏览(109)

这是chunk.c文件的代码,它将一个大文件(filename.txt)按行划分为较小的分块文件。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>

#define DEFAULT_PREFIX "x"
#define DEFAULT_CHUNK_SIZE 1000
#define ALPHABET_SIZE 26
#define MAX_DIGITS 2

void print_usage() {
    printf("Usage: chunk [-l line_count | -w word_count | -c character_count] [-p prefix] [-s suffix] [-f filename.txt | < filename.txt]\n");
}

int main(int argc, char *argv[]) {
    char *prefix = DEFAULT_PREFIX;
    int chunk_size = DEFAULT_CHUNK_SIZE;
    int suffix_start = 0;
    char *filename = NULL;

    // Parse command line arguments
    int opt;
    while ((opt = getopt(argc, argv, "l:p:s:f:")) != -1) {
        switch (opt) {
            case 'l':
                chunk_size = atoi(optarg);
                break;
  
            case 'p':
                prefix = optarg;
                break;
  
            case 's':
                suffix_start = atoi(optarg);
                break;
  
            case 'f':
                filename = optarg;
                break;
  
            default:
                print_usage();
                return 1;
        }
    }

    // Open input file
    int input_fd = STDIN_FILENO;
    if (filename != NULL) {
        input_fd = open(filename, O_RDONLY);
        if (input_fd == -1) {
            printf("Error: could not open file '%s': %s\n", filename, strerror(errno));
            return -1;
        }
    }

    // Read input file and write output files
    int line_count = 0;
    int chunk_count = 0;
    char suffix[MAX_DIGITS + 1];
    suffix[MAX_DIGITS] = '\0';
    int output_fd = -1;

    while (1) {
        if (line_count == 0) {
            // Close previous output file
            if (output_fd != -1) {
                close(output_fd);
                output_fd = -1;
            }

            // Open new output file (get new filename)
            snprintf(suffix, MAX_DIGITS + 1, "%02d", suffix_start + chunk_count);
            char *filename = malloc(strlen(prefix) + strlen(suffix) + 1);
            strcpy(filename, prefix);
            strcat(filename, suffix);
            
            output_fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR | 
                        S_IRGRP | S_IWGRP | S_IROTH);
            if (output_fd == -1) {
                printf("Error: could not create file '%s': %s\n", filename, 
                        strerror(errno));
                return -1;
            }
            free(filename);

            chunk_count++;
        } // close if loop 

        // Read input
        char buffer[chunk_size];
        ssize_t bytes_read = read(input_fd, buffer, chunk_size);
        if (bytes_read == -1) {
            printf("Error: could not read input: %s\n", strerror(errno));
            return -1;
        }
        if (bytes_read == 0) {
            break;
        }
        
        // write output
        ssize_t bytes_written = write(output_fd, buffer, bytes_read);
        if (bytes_written == -1) {
            printf("Error: could not write output : %s\n", strerror(errno));
            return -1;
        }
    
        // Update line count
        for (int i = 0; i < bytes_written; i++) {
            if (buffer[i] == '\n') {
                line_count++;
            }
        }
        // Check if it's time to start a new chunk
        if (line_count >= chunk_size) {
            line_count = 0;
        }
    } // close while loop

    // Close input and output files
    if (input_fd != STDIN_FILENO) {
        close(input_fd);
    }
    if (output_fd != -1) {
        close(output_fd);
    }

    return 0;
} // close main

运行该示例,预期结果为

$ chunk -l 100 -f z_answer.jok.txt -p part- -s 00
$ echo $?   # check exit status
0
$ wc *part* z_answer.jok.txt 
  100   669  4052 part-00
  100   725  4221 part-01
  100   551  3373 part-02
  100   640  3763 part-03
  100   588  3685 part-04
  100   544  3468 part-05
   90   473  3017 part-06
  690  4190 25579 z_answer.jok.txt
 1380  8380 51158 total

但是当我运行上面的代码时,结果是这样的。

$ chunk -l 100 -f z_answer.jok.txt -p part- -s 00
$ echo $?   # check exit status
0
$ wc *part* z_answer.jok.txt 
  102   675  4100 part-00
  101   745  4300 part-01
  100   554  3400 part-02
  101   640  3800 part-03
  103   609  3800 part-04
  100   534  3400 part-05
   83   434  2779 part-06
  690  4190 25579 z_answer.jok.txt
 1380  8381 51158 total

我认为问题是在//更新行计数部分,并试图修复,但仍然卡住。任何想法,可以划分行的数字,用户设置?

kyxcudwk

kyxcudwk1#

OP需要一个新的算法。
每个缓冲液,多次:寻找一个\n,写这一行,可能关闭/打开一个文件。

// Pseudo code
read_chunk = 4k (Size is independent of lines/file)
while ((length = read(buffer, read_chunk)) > 0) {
  if needed, open next destination file
  start = buffer
  end = start + length
  while (start < end) {
    line_end = strchr(start, '\n')
    if (line_end) line_end++; line_count++;
    else line_end = end
    // write a (maybe partial) line 
    write(start, line_end - start)
    if (line_count big enough)
      reset line_count
      close destination file
    start = line_end
  }
}
close file, if needed

相关问题