在patricia/radix树上插入单词时遇到问题

oiopk7p5  于 2023-06-28  发布在  其他
关注(0)|答案(1)|浏览(105)

我正试图为patricia/radix树的大学应用程序编写代码,该树将从txt文件中的每行读取的单词插入树中,因此如果我读取一个文件,该文件说

roman
romance
romantic

其建议被实现为

(root)
      |
    roman$
   /     \
 ce$      tic$

我还添加了检查点,这样我就可以测试程序在死亡之前能走多远,所以我的结构被设置为这样:

typedef struct no no; // declaration of the struct no

typedef struct no {
    char *str; // pointer to store a string
    struct no *child[26]; // Array of pointers to all possible children of the tree
    int isword; // Flag to know if its a full word
    int isroot; // Flag to know if its the root of the tree
} tree;

我的insertintree声明:

no* InsertWord(no *r, char *str){
    printf("checkpoint 5 %s\n", str);
    //caso 1, we are at the root
    if(r->isroot){
        r->child[r->str[0] - 'a'] = InsertWord(r->str[r->str[0] - 'a'], str);
        return r;
    }
    //caso 2, found an empty node
    if(r->str == NULL){
        strcpy(r->str, str);
        r->isword = 1;
        printf("word %s inserted\n", str);
        return r;
    }
    //caso 3, word is a perfect fit for the prefix
    if(strcmp(r->str, str)){
        r->isword = 1;
        printf("word %s inserted\n", str);
        return r;
    }
    int i;
    for(i = 0; str[i] == r->str[i]; i++);
    //caso 4, the prefix fits the first part of the word, but there is still more to be inserted
    if(i == strlen(r->str)){
        r->child[str[i] - 'a'] = InsertWord(r->child[str[i] - 'a'], str + i);
        return r;
    }
    //caso 5, the word fits the prefix, but there is still more prefix
    if(i == strlen(str)){
        char *aux1;
        char *aux2;
        strncpy(aux1, r->str, i);
        strcpy(aux2, r->str + i);
        no *aux = r;
        for(int j = 0; j < 26; j++){
            free(r->filhos[j]);
        }
        r->child[aux2[0] - 'a'] = aux;
        strcpy(r->str, aux1);
        strcpy(r->child[aux2[0] - 'a'], aux2);
        return r;
    }
    //caso 6, the first part of the word fits with the first part of the preix, but then they differ
    if(i < strlen(str) && i < strlen(r->str)){
        char *aux1;
        char *aux2;
        strncpy(aux1, r->str, i);
        strcpy(aux2, r->str + i);
        no *aux = r;
        for(int j = 0; j < 26; j++){
            free(r->child[j]);
        }
        r->child[aux2[0] - 'a'] = aux;
        strcpy(r->str, aux1);
        strcpy(r->child[aux2[0] - 'a'], aux2);
        r->child[str[i] - 'a'] = InsertWord(r->child[str[i] - 'a'], str + i);
        return r;
    }
}

我现在的输出看起来像这样

checkpoint 1
checkpoint 2
checkpoint 3 roman
checkpoint 4 roman
checkpoint 5
checkpoint 5

然后程序死亡,检查点1到4都与读取文件和验证字的函数相关,它们似乎都运行良好
我试着添加一个动态分配到程序,似乎没有帮助的问题,我知道我应该提供一个最小的可复制的例子,但我不知道问题在哪里(代码不是原来在英语我尽力提供翻译,但如果任何部分仍然是葡萄牙语,我没有注意到我很抱歉)
由于它是我的要求,我将添加其余的函数,调用这个函数,它有点长,但在这里你去

no *ValidateWord(no *r, char *str) {
    int i;
    printf("checkpoint 3 %s\n", str);
    for (i = 0; str[i] != '\0'; i++) {
        if (str[i] < 'A' || (str[i] > 'Z' && str[i] < 'a') || str[i] > 'z') {
            return r;
        } else if (str[i] >= 'A' && str[i] <= 'Z') {
            str[i] = str[i] + 32;
        }
    }
    printf("checkpoint 4 %s\n", str);
    return InsertWord(r, str);
}

no *LoadFile(no *r) {
    printf("checkpoint 1");
    printf("Insert the name of the chosen file: ");
    char namefile[100];
    scanf("%s", namefile);
    FILE *fp1;
    fp1 = fopen(namefile, "r");
    if (fp1 == NULL) {
        printf("couldnt open the file.\n");
        return r;
    }
    char str[100];
    while (fgets(str, sizeof(str), fp1)) {
        printf("checkpoint 2 %s\n", str);
        str[strcspn(str, "\n")] = '\0';
        r = ValidateWord(r, str);
    }
    fclose(fp1);
    return r;
}

这是调用所有这些的主文件

void menu(no* r) {
    int c = 0;
    printf("choose an operation:\n");
    printf("Load File - 1\nCheck Words - 2\nPrint Dictionary - 3\nLoad Stopwords File - 4\nExit - 0\n\n");
    scanf("%d", &c);
    while (c != 0) {
        if (c == 1)
            r = LoadFile(r); // Calls the LoadFile function
        else if (c == 2)
            CheckWords(r); // calls the function CheckWords
        else if (c == 3)
            PrintDictionary(r); // Calls the fucntion PrintDictionary
        else if (c == 4)
            r = LoadStopwordsFile(r); // calls the LoadStopwordsFile function
        printf("choose an operation:\n");
    printf("Load File - 1\nCheck Words - 2\nPrint Dictionary - 3\nLoad Stopwords File - 4\nExit - 0\n\n");
        scanf("%d", &c);
    }
}

int main() {
    no* r = (no*) malloc(sizeof(no)); // alocates memory for the root
    if (r == NULL) {
        printf("Error alocating memory.\n");
        return 1; // closes the aplication
    }
    r->isroot = 1; // defines the isroot flag to true
    menu(&r); // calls the menu function
    free(r); // frees the memory alocated for the root
    return 0;
}
nuypyhwy

nuypyhwy1#

Patrica树是隐式二进制基数树的一种特殊情况,它比这里的树更紧凑(当然是学究式的)。
我不确定你是否需要isroot;是做什么用的?空树?
isword$的带外符号;你不需要(或想要)两个符号来表示同一件事。(对于以null结尾的字符串,自然的trie条目是使用'\0'作为结尾后缀$
Morin, Patrick. "Data Structures for Strings"
作为一种优化,边缘标签可以通过使用两个指向字符串的指针(用于第一个和最后一个元素)以恒定大小存储
另一种选择是不复制字符串,而只是指向子字符串,而不是复制和操作字符串。
编辑:在您跟踪的Java code in the tutorial中,它使用带外isword并复制子字符串。这看起来像是在分配节点,而不是字符串的副本,Java会自动处理这些副本。
我认为以下是一个很好的类比:子串存储在struct末尾的flexible array member中;这样,节点和串的副本处于连续的存储器分配中,这减少了碎片和对malloc的调用。

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>

// A flexible array member for the substring.
struct node {
    struct node *child[26];
    int isword;
    char str[];
};
struct trie { struct node *root; };

// graph helper
static void graph_logic(const struct node *const node,
    const struct node *const parent, FILE *const fp) {
    fprintf(fp,
        "\tnode%p [label = \"%s\"];\n"
        "\tnode%p -> node%p [label=\"%s\"];\n",
        (const void *)node, node->isword ? "word" : "not",
        (const void *)parent, (const void *)node, node->str);
    for(struct node *const*i = node->child,
        *const*end = i + sizeof node->child / sizeof *node->child;
        i < end; i++) if(*i) graph_logic(*i, node, fp);
}
// graphs a trie in GraphViz format: https://graphviz.org/
static void graph(const struct trie *const trie, const char *const fn) {
    FILE *fp;
    assert(trie && fn);
    if(!(fp = fopen(fn, "w"))) { perror(fn); return; }
    fprintf(fp, "digraph {\n"
        "\tgraph [truecolor=true, bgcolor=transparent, fontname=modern];\n"
        "\tnode [shape=none, fontname=modern];\n"
        "\tnode%p [label=\"root\"];\n", (void *)0);
    if(trie->root) graph_logic(trie->root, 0, fp);
    fprintf(fp, "}\n");
    fclose(fp);
}

static int InsertWord(struct trie *t, char *str){
    // The tree is empty.
    if(t->root == NULL){
        struct node *r;
        const size_t len = strlen(str);
        if(!(r = malloc(sizeof t->root + len + 1))) return 0; // FAM
        *r = (struct node){0};
        r->isword = 1;
        memcpy(r->str, str, len + 1);
        t->root = r;
        printf("word %s inserted as the first word\n", str);
        return 1;
    }
    assert(0); // continue this function . . .
    /* (For each edge.)
        (For each character, check that it's a match for the edge, if it's
        not, create two nodes from the split. Free the original.)
        (Or if it's the empty string, the word is already there.)
        (May be helpful to port `getFirstMismatchLetter`.) */
    return 0;
}
static int ValidateWord(struct trie *t, char *str) {
    int i;
    assert(t && str);
    printf("ValidateWord checkpoint: %s\n", str);
    // simplified
    for (i = 0; str[i] != '\0'; i++) if(!islower(str[i])) return 0;
    return InsertWord(t, str);
}

// this inputs https://en.wikipedia.org/wiki/Radix_tree
static int test(const char *const fn) {
    FILE *fp = 0;
    int success = 1;
    if(!(fp = fopen(fn, "w"))) goto catch;
    fprintf(fp,
        "romane\n"
        "romanus\n"
        "romulus\n"
        "rubens\n"
        "ruber\n"
        "rubicon\n"
        "rubicundus\n");
    fclose(fp), fp = 0;
    if(!freopen(fn, "r", stdin)) goto catch;
    goto finally;
catch:
    success = 0;
    perror(fn);
finally:
    if(fp) fclose(fp), fp = 0;
    return success;
}

int main(void) {
    char str[100];
    struct trie t = {0};
    struct { int no; char fn[100]; } debug = { 0, { 0 } };

    // ideally, tests should be automatic
    if(!test("trie.txt")) exit(EXIT_FAILURE);

    while(fgets(str, sizeof(str), stdin)) {
        str[strcspn(str, "\n")] = '\0';
        // Print a graph so we can see what's going on.
        sprintf(debug.fn, "trie%d.gv", debug.no++), graph(&t, debug.fn);
        int is_insert = ValidateWord(&t, str);
        assert(is_insert);
    }
    // memory leak: all the nodes need to be freed from the bottom.
    free(t.root);
    return 0;
}

相关问题