在C中使用文件和列表处理波兰语字符

6vl6ewon 于 2023-06-21 发布在其他

关注(0)|答案(3)|浏览(71)

我需要得到适当的波兰语字符“ąćśół "。我使用了一些解决方案，如setlocale，system chcp，wchar_t。只要我不使用文件/列表，一切都很顺利。wscanf，wprintf和wchar_t工作得很好。
但是，如果我试图从文件中读取一些内容并将其保存到列表中（即使是数组中），然后试图将其输出到屏幕上，就无法得到正确的波兰语字符；在使用列表的情况下，我还会不时得到不同的结果，例如**z**、**A2**，就像到处都是随机字符。我一直尝试使用fscanf和fgets以及它们的w（宽字符）变体来得到正确的结果，但都不起作用。我做错什么了吗？

#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#include <locale.h>

/* Node of a singly linked list; each node stores one line of wide-character text. */
typedef struct dyk {
    wchar_t     line[200];   /* text storage: room for 200 wide characters */
    struct dyk *next;        /* link to the following node */
} dyk;

/*
 * Opens the file `name`, reads its first line into a newly allocated list
 * node, prints that line, then reads one token from standard input and
 * prints the result of comparing the token with the line.
 *
 * name: path of the file to open (opened in "r+" mode).
 *
 * NOTE(review): this is the code exactly as posted; it mixes narrow and wide
 * functions. The individual problems are flagged inline below.
 */
void printdyk(char name[100]){
    dyk *wyp;
    /* NOTE(review): the malloc() result is never checked and never freed. */
    wyp = malloc(sizeof(dyk));
    /* NOTE(review): this outer yt is unused; the else-branch declares another yt that shadows it. */
    wchar_t yt[100];
    FILE *dyktando;
    dyktando = fopen(name, "r+");
    if(dyktando == NULL){
        wprintf(L"Błąd otwarcia pliku!\n");                 //Can't open file
    }else{
        /* NOTE(review): fgets() is the narrow (byte) reader, yet the destination is a
           wchar_t array; &wyp->line is a pointer to the whole array rather than to its
           first element, and sizeof(dyk) is the byte size of the entire struct, not the
           number of wide characters in `line`. */
        fgets(&wyp->line, sizeof(dyk), dyktando);           //reading from file and send to the list
        /* NOTE(review): in ISO C, %s inside wprintf() expects a char string; a wchar_t
           string needs %ls. */
        wprintf(L"%s\n", wyp->line);                        //write text from the list on the screen
        wchar_t yt[100];
        /* NOTE(review): in ISO C, %s inside wscanf() stores into a char array; %ls is the
           wide form. &yt is a pointer to the array, and no field width limits the input. */
        wscanf(L"%s", &yt);                                 //testing strings comparing, so I have to put some variables
        /* NOTE(review): strcmp() compares char strings (and <string.h> is not included
           above); both operands here are wchar_t arrays. */
        int n=strcmp(yt, wyp->line);                        //str compare
        /* NOTE(review): narrow printf() on stdout after the wide wprintf() calls above
           mixes byte and wide output on one stream. */
        printf("%d", n);                                //result, it gives me -1 every time
    }
    /* NOTE(review): this also runs when fopen() failed, i.e. with dyktando == NULL. */
    fclose(dyktando);
}

我用一个txt文件测试了这个函数，文件内容只有一个字符“”，但程序无法正确读取该文件。在main函数的开头，我写了这两行：

system("chcp 852");
setlocale(LC_ALL, ".852");

我使用的是Code::Blocks和mingw32-gcc编译器，没有使用任何编译选项。

c

来源：https://stackoverflow.com/questions/54200889/polish-characters-in-c-using-files-and-lists

3条答案

按热度按时间

2eafrhcq1#

您没有在代码中的任何地方使用wchar_t兼容函数。特别是：

fgets(&wyp->line, sizeof(dyk), dyktando);           //reading from file and send to the list

兼容wchar_t的版本是fgetws。另外，wyp->line（不带&运算符）是正确的参数。

int n=strcmp(yt, wyp->line);                        //str compare

应该使用wcscmp。
还要注意的是，当一个函数需要的长度是字符数而不是字节数时（例如fgetws），对wchar_t数组使用sizeof得到的值是不正确的。

赞(0）回复(0）举报 2023-06-21

eblbsuwk2#

OP（Amatheon）的一个评论指出，真正的潜在问题是如何使用宽字符函数正确读取文件。
为了确保最大的兼容性和可移植性，让我们限制为C99。考虑以下示例程序：

#include <stdlib.h>
#include <stdint.h>
#include <locale.h>
#include <string.h>
#include <stdio.h>
#include <wchar.h>
#include <wctype.h>
#include <errno.h>

#include <sys/types.h>   /* ssize_t (POSIX; not declared by the ISO C headers above) */

/* errno reporting is opt-in: EINVAL, EIO and ENOMEM are only referenced when
   USE_ERRNO_CONSTANTS is defined.  Otherwise SET_ERRNO() is a no-op expression,
   so it stays safe in any statement context. */
#ifdef   USE_ERRNO_CONSTANTS
#define  SET_ERRNO(value)  (errno = (value))
#else
#define  SET_ERRNO(value)  ((void)0)
#endif

/* Growth policy: returns the next capacity (in wide characters) for a buffer
   that currently holds 'used' characters.  We allocate in chunks so realloc()
   is not called too often, without over-allocating for long lines.  The
   result is larger than 'used' unless the arithmetic wrapped around. */
static size_t wide_line_next_capacity(size_t used)
{
    if (used < 254)
        return 256;
    if (used < 65536)
        return 2 * used;
    return (used | 65535) + 65521;
}

/* Resizes *bufptr to hold 'want' wide characters.
   Returns 0 and updates *bufptr and *capptr on success.
   Returns -1 on failure, leaving both untouched so the caller's buffer
   stays valid. */
static int wide_line_resize(wchar_t **bufptr, size_t *capptr, size_t used, size_t want)
{
    wchar_t *grown;

    /* 'want <= used' means the caller's size arithmetic wrapped around;
       the second test keeps the byte count from overflowing size_t. */
    if (want <= used || want > SIZE_MAX / sizeof (wchar_t))
        return -1;

    /* Go through a temporary so a failed realloc() cannot clobber the pointer. */
    grown = realloc(*bufptr, want * sizeof (wchar_t));
    if (!grown)
        return -1;

    *bufptr = grown;
    *capptr = want;
    return 0;
}

/* Reads wide characters from 'stream' up to and including 'delim', or up to
   end of input, into a dynamically (re)allocated buffer.

   lineptr:  in/out, pointer to the buffer.  Ignored on entry (and reset to
             NULL) when *sizeptr is zero.
   sizeptr:  in/out, capacity of *lineptr in wide characters.
   delim:    wide character that ends the record; it is stored in the buffer.
   stream:   input stream, read with fgetwc().

   Returns the number of wide characters stored, including the delimiter but
   not the terminating L'\0'.  Returns 0 when end of input was reached before
   any character was read.  Returns -1 on invalid arguments, on a stream that
   is (or ends up) in the error state, or when memory cannot be allocated;
   errno is set only if USE_ERRNO_CONSTANTS is defined.  On allocation failure
   the previous buffer and size are left in place. */
ssize_t get_wide_delimited(wchar_t **lineptr, size_t *sizeptr, wint_t delim, FILE *stream)
{
    size_t  used = 0;
    wint_t  wc;

    if (!lineptr || !sizeptr || !stream) {
        /* Invalid function parameters. NULL pointers are not allowed. */
        SET_ERRNO(EINVAL);
        return -1;
    }
    if (ferror(stream)) {
        /* Stream is already in error state. */
        SET_ERRNO(EIO);
        return -1;
    }

    /* A zero size means "no buffer yet": whatever pointer was passed is ignored. */
    if (*sizeptr == 0)
        *lineptr = NULL;

    while (1) {

        wc = fgetwc(stream);
        if (wc == WEOF || wc == delim)
            break;

        /* Make room for this character before storing it. */
        if (used >= *sizeptr &&
            wide_line_resize(lineptr, sizeptr, used, wide_line_next_capacity(used)) != 0) {
            /* Out of memory. */
            SET_ERRNO(ENOMEM);
            return -1;
        }

        (*lineptr)[used++] = (wchar_t)wc;
    }

    /* WEOF is returned both at end of input and on a read error; tell them apart. */
    if (wc == WEOF && ferror(stream)) {
        SET_ERRNO(EIO);
        return -1;
    }

    /* Ensure there is enough room for the delimiter and end-of-string mark.
       For simplicity this uses the minimum reallocation, used + 2. */
    if (*sizeptr < 2 || used > *sizeptr - 2) {
        if (wide_line_resize(lineptr, sizeptr, used, used + 2) != 0) {
            /* Out of memory. */
            SET_ERRNO(ENOMEM);
            return -1;
        }
    }

    /* Append the delimiter, unless we stopped at end of input. */
    if (wc != WEOF)
        (*lineptr)[used++] = (wchar_t)wc;

    /* Append the end-of-string nul wide char,
       but do not include it in the returned length. */
    (*lineptr)[used] = L'\0';

    return (ssize_t)used;
}

/* Reads one record ending in a newline (L'\n') from 'stream'.
   Forwards lineptr, sizeptr and stream unchanged to get_wide_delimited()
   with L'\n' as the delimiter, and returns that call's result. */
ssize_t get_wide_line(wchar_t **lineptr, size_t *sizeptr, FILE *stream)
{
    const wint_t newline = L'\n';

    return get_wide_delimited(lineptr, sizeptr, newline, stream);
}

/* Prints every line of each file named on the command line, using wide I/O.
   Each line is reported with its 1-based number, with leading whitespace
   skipped and everything from the first newline or carriage return cut off,
   followed by its length in wide characters.

   Returns EXIT_SUCCESS once all files have been processed.  Returns
   EXIT_FAILURE after printing the usage text, or as soon as a file cannot be
   opened, cannot be switched to wide orientation, hits a read error, or
   fails to close. */
int main(int argc, char *argv[])
{
    wchar_t  *line = NULL;   /* buffer managed by get_wide_line() */
    size_t    size = 0;      /* its capacity, in wide characters */
    wchar_t  *p;
    int       i;

    if (setlocale(LC_ALL, "") == NULL)
        fprintf(stderr, "Warning: Your C library does not support your current locale.\n");
    if (fwide(stdout, 1) <= 0)
        fprintf(stderr, "Warning: Your C library does not support wide standard output.\n");

    if (argc < 2 || strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) {
        fputs("\n", stderr);
        fprintf(stderr, "Usage: %s [ -h | --help ]\n", argv[0]);
        fprintf(stderr, "       %s FILENAME [ FILENAME ... ]\n", argv[0]);
        fputs("\n", stderr);
        fputs("This program will output the named files, using wide I/O.\n", stderr);
        fputs("\n", stderr);
        return EXIT_FAILURE;
    }

    for (i = 1; i < argc; i++) {
        const char    *name = argv[i];
        unsigned long  count = 0;
        FILE          *fp = fopen(name, "r");

        if (fp == NULL) {
            fprintf(stderr, "%s: %s.\n", name, strerror(errno));
            return EXIT_FAILURE;
        }

        /* Explicitly request wide orientation before the first read. */
        if (fwide(fp, 1) <= 0) {
            fprintf(stderr, "%s: Wide input is not supported from this file.\n", name);
            fclose(fp);
            return EXIT_FAILURE;
        }

        while (get_wide_line(&line, &size, fp) > 0) {
            count++;

            /* Walk a second pointer through the text.  'line' itself must
               keep pointing at the start of the dynamically allocated
               buffer so it can be reused and eventually freed. */
            p = line;

            /* Skip leading whitespace. */
            while (iswspace(*p))
                p++;

            /* Cut the text at the first newline or carriage return.
               (An embedded nul wide character, L'\0', also ends it.) */
            p[wcscspn(p, L"\r\n")] = L'\0';

            wprintf(L"%s: Line %lu: '%ls', %zu characters.\n", name, count, p, wcslen(p));
        }

        /* The loop ends both at end of input and on error; check which. */
        if (ferror(fp)) {
            fprintf(stderr, "%s: Read error.\n", name);
            fclose(fp);
            return EXIT_FAILURE;
        }
        if (fclose(fp) != 0) {
            fprintf(stderr, "%s: Delayed read error.\n", name);
            return EXIT_FAILURE;
        }

        wprintf(L"%s: Total %lu lines read.\n", name, count);
        fflush(stdout);
    }

    /* Discard the line buffer; reset both so it cannot be reused by accident. */
    free(line);
    line = NULL;
    size = 0;

    return EXIT_SUCCESS;
}

因为EINVAL、EIO和ENOMEM errno常量在C标准中没有定义，所以如果您定义了USE_ERRNO_CONSTANTS预处理器值，get_wide_line()和get_wide_delimited()只会设置errno。
get_wide_line()和get_wide_delimited()是对ISO/IEC TR 24731-2:2010中getwline()和getwdelim()函数的重新实现，它们是POSIX.1 getline()和getdelim()函数的宽字符等效版本。与fgets()或fgetws()不同，它们使用动态分配的缓冲区来保存行，因此除了可用内存之外，没有固定的行长度限制。
我已经明确地将这些代码标记为使用Creative Commons Zero（CC0）许可证：不保留任何权利。这意味着你可以在自己的代码中使用它，并采用任何你想要的许可证。
注：我真的很希望用户推动他们的供应商和C标准委员会成员在下一版本的C标准中将这些包含在标准C库部分中。从上面可以看到，它们已经可以在标准C中实现了;只是C库本身可以更有效地做同样的事情。GNU C库就是一个完美的例子（尽管由于缺乏标准化，他们也在拖延实现）。试想一下，如果人们使用getline()/getdelim()/getwline()/getwdelim()而不是fgets()/fgetws()，会避免多少缓冲区溢出错误!也避免了必须考虑在每种情况下最大合理的线长度是多少。双赢!
（事实上，我们可以将返回类型改为size_t，并使用0代替-1作为错误指示。这样，对C标准文本的修改就仅限于增加这四个函数。如此重要而又简单的一组函数，竟在没有任何合理理由的情况下被如此冷漠而无知地忽视，这让我无比难过和愤怒。请尽你所能，持续不断、毫不留情地向你的供应商以及任何你能接触到的C标准委员会成员抱怨这一点。你和他们都值得拥有这些函数。）
该程序的关键部分是

if (!setlocale(LC_ALL, ""))

这告诉C库使用用户指定的区域设置。
请不要将区域设置值硬编码到您的程序中。在大多数操作系统中，您所需要做的就是在运行程序之前将LANG或LC_ALL环境变量更改为您想要使用的区域设置。
您可能会认为 "好吧，这次我可以硬编码它，因为这是用于此数据的区域设置"，但即使这样也可能是错误的，因为可以随时创建新的区域设置。当字符集部分被硬编码时，这尤其令人讨厌。例如，在西欧使用的ISO 8859单字节字符集是ISO 8859 - 15，而不是ISO 8859 - 1，因为ISO 8859 - 15中有€字符，而ISO 8859 - 1中没有。如果你在程序中硬编码了ISO 8859 - 1，那么它根本无法正确处理€字符。

if (fwide(stream, 1) < 1)用于stdout和文件句柄

虽然C库在内部确实根据您第一次在文件句柄上使用的I/O函数的类型来执行与fwide()调用等效的操作，但显式检查要好得多。
特别是，如果C库不能支持对该句柄所表示的文件或流进行宽I/O，fwide()将返回负数。（除非第二个参数也是零，否则它永远不应该返回零；由于标准化方面的问题，我建议在这种情况下采用严格的返回值检查，以揪出这样一类供应商：他们在字面上仍然满足标准文本，却尽可能地给试图编写可移植代码的程序员制造麻烦，就像微软正在做的那样。他们甚至在C标准委员会中安插了自己的代表，以便调整C11，使其避开他们不想支持的C99特性，并让他们自己那些此前无人使用的非标准扩展获得批准，而不是帮助开发人员编写可移植的C代码。是的，我一点也不信任他们的行为。）

ssize_t len = get_wide_line(&line, &size, handle);

如果在第一次调用get_wide_line()或get_wide_delimited()之前初始化wchar_t *line = NULL;和size_t size = 0;，则该函数将根据需要动态调整缓冲区的大小。
当且仅当发生错误时，返回值为负。（成功读取一行时函数不会返回零；仅当在输入结束处没有读到任何字符时，函数才返回零，示例程序中的循环正是以此结束。）
当成功读取一行时，返回值反映缓冲区中宽字符的数量，包括分隔符（对于get_wide_line()，分隔符为换行符L'\n'），并且总是正值（大于零）。缓冲区中的内容以宽字符串结束符L'\0'终止，但它不计入返回值。
请注意，当分隔符不是L'\0'时，缓冲区可能包含嵌入的宽nul字符L'\0'。在这种情况下，len > wcslen(line)。

上面的示例程序跳过每个输入行的任何前导空格，并在第一个换行符（L'\n'）、回车符（L'\r'）或nul（L'\0'）处修剪该行。因此，只检查返回值len是否成功（大于零的正返回值）。

free(line); line = NULL; size = 0;

在不再需要该行内容的任何时候，可以丢弃该行。我建议显式地将行指针设置为NULL，并将大小设置为零，以避免释放后使用的错误。此外，这允许任何后续的get_wide_line()或get_wide_delimited()正确地动态分配新的缓冲区。

在宽输入函数失败后调用ferror(handle)

就像窄流和EOF一样，有两种情况下宽输入函数可能会返回WEOF（或返回-1，具体取决于函数）：因为没有更多的输入，或者因为发生了读取错误。
没有任何理由去编写忽略读写错误、不向用户报告的计算机程序。当然，这类错误很罕见，但还没有罕见到程序员可以理所当然地认为它们永远不会发生。（事实上，闪存的电路很脆弱，又装在单薄的塑料外壳里，还要承受人体重量级别的压力（我自己就一次又一次地坐到过它们上面），出错并不罕见。）忽略这些错误是恶劣的做法，就像做饭的人懒得洗手，结果时不时引发粪便细菌感染的爆发一样。不要成为相当于粪便细菌传播者的那种程序员。
假设你有一个粗心的讲师，他不允许你使用上面的get_wide_line()或get_wide_delimited()函数。
别担心。只要把line限制在某个固定的上限（以宽字符计）之内，我们就可以用fgetws()实现同样的程序。比这更长的行将被当作两行或更多行来读取：

#include <stdlib.h>
#include <locale.h>
#include <string.h>
#include <stdio.h>
#include <wchar.h>
#include <wctype.h>
#include <errno.h>

/* Longest line, in wide characters, that is read in one piece.
   May be overridden at compile time. */
#ifndef  MAX_WIDE_LINE_LEN
#define  MAX_WIDE_LINE_LEN  1023
#endif

/* Prints every line of each file named on the command line, using wide I/O
   and a fixed-size line buffer filled by fgetws().
   Each chunk read is reported with its 1-based number, with leading
   whitespace skipped and everything from the first newline or carriage
   return cut off, followed by its length in wide characters.

   Returns EXIT_SUCCESS once all files have been processed.  Returns
   EXIT_FAILURE after printing the usage text, or as soon as a file cannot be
   opened, cannot be switched to wide orientation, hits a read error, or
   fails to close. */
int main(int argc, char *argv[])
{
    wchar_t   line[MAX_WIDE_LINE_LEN + 1];   /* +1 for the terminating L'\0' */
    wchar_t  *p;
    int       i;

    if (setlocale(LC_ALL, "") == NULL)
        fprintf(stderr, "Warning: Your C library does not support your current locale.\n");
    if (fwide(stdout, 1) <= 0)
        fprintf(stderr, "Warning: Your C library does not support wide standard output.\n");

    if (argc < 2 || strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) {
        fputs("\n", stderr);
        fprintf(stderr, "Usage: %s [ -h | --help ]\n", argv[0]);
        fprintf(stderr, "       %s FILENAME [ FILENAME ... ]\n", argv[0]);
        fputs("\n", stderr);
        fputs("This program will output the named files, using wide I/O.\n", stderr);
        fputs("\n", stderr);
        return EXIT_FAILURE;
    }

    for (i = 1; i < argc; i++) {
        const char    *name = argv[i];
        unsigned long  count = 0;
        FILE          *fp = fopen(name, "r");

        if (fp == NULL) {
            fprintf(stderr, "%s: %s.\n", name, strerror(errno));
            return EXIT_FAILURE;
        }

        /* Explicitly request wide orientation before the first read. */
        if (fwide(fp, 1) <= 0) {
            fprintf(stderr, "%s: Wide input is not supported from this file.\n", name);
            fclose(fp);
            return EXIT_FAILURE;
        }

        while (1) {

            /* Because line is a true array, (sizeof line / sizeof line[0])
               is its element count.  That trick does not work for a pointer
               to dynamically allocated memory; there the allocated number of
               wide characters must be kept in a separate variable and
               passed here instead. */
            p = fgetws(line, sizeof line / sizeof line[0], fp);
            if (!p)
                break;

            /* Got another line. */
            count++;

            /* Skip leading whitespace. */
            while (iswspace(*p))
                p++;

            /* Cut the text at the first newline or carriage return.
               (An embedded nul wide character, L'\0', also ends it.) */
            p[wcscspn(p, L"\r\n")] = L'\0';

            wprintf(L"%s: Line %lu: '%ls', %zu characters.\n", name, count, p, wcslen(p));
        }

        /* fgetws() returns NULL both at end of input and on error; check which. */
        if (ferror(fp)) {
            fprintf(stderr, "%s: Read error.\n", name);
            fclose(fp);
            return EXIT_FAILURE;
        }
        if (fclose(fp) != 0) {
            fprintf(stderr, "%s: Delayed read error.\n", name);
            return EXIT_FAILURE;
        }

        wprintf(L"%s: Total %lu lines read.\n", name, count);
        fflush(stdout);
    }

    return EXIT_SUCCESS;
}

除了用于读取每一行的函数之外，不同之处在于，我没有将while循环条件保持为while ((p = fgetws(line, ...))) { ... }，而是改为我认为更具可读性的while (1) { p = fgetws(line, ...); if (!p) break; ...形式。
我故意先展示较长、较复杂的代码，最后展示较简单的代码，希望您能看到：较复杂的那份代码实际上拥有更简单的main()——前提是我们不只是数代码行数或做类似的蠢事，而是看看其中有多少出错的机会。
正如OP自己在评论中所写的那样，传递给fgets()或fgetws()的缓冲区大小是一个实实在在的问题。有一些经验法则，但它们在代码被修改时都很容易失效（尤其是涉及数组和指针之间的差异时）。对于getline()/getdelim()/getwline()/getwdelim()/get_wide_line()/get_wide_delimited()，经验法则就是wchar_t *line = NULL; size_t size = 0; ssize_t len;以及len = get_wide_line(&line, &size, handle);。没有任何变体，易于记忆和使用。此外，它还摆脱了任何固定的长度限制。