c++ 运行包含哈希表和fasta文件的cpp代码时出现std::bad_alloc错误

fwzugrvs  于 2023-03-20  发布在  其他
关注(0)|答案(2)|浏览(156)
#include <iostream>
#include <fstream>
#include <string>
#include <vector>

using namespace std;

// Define a struct to hold the sequence information
struct Sequence {
    string header;
    string sequence;
};

// Define the hash table class
class HashTable {
public:
    HashTable(int size) {
        table.resize(size);
        table_size = size;
    }

    void insert(string key, string value) {
        // Compute the hash value for the key
        int hash_value = hash(key) % table_size;

        // Create a new sequence object and add it to the hash table
        Sequence seq;
        seq.header = key;
        seq.sequence = value;
        table[hash_value].push_back(seq);
    }

    bool contains(string key) {
        // Compute the hash value for the key
        int hash_value = hash(key) % table_size;

        // Check if the key is in the hash table
        for (auto& seq : table[hash_value]) {
            if (seq.header == key) {
                return true;
            }
        }
        return false;
    }

    string get_sequence(string key) {
        // Compute the hash value for the key
        int hash_value = hash(key) % table_size;

        // Find the sequence with the given key
        for (auto& seq : table[hash_value]) {
            if (seq.header == key) {
                return seq.sequence;
            }
        }

        // If the key is not found, return an empty string
        return "";
    }

private:
    // The hash function computes the sum of the ASCII values of the characters in the key
    int hash(string key) {
        int sum = 0;
        for (char c : key) {
            sum += c;
        }
        return sum;
    }

    // The hash table is implemented as a vector of vectors of Sequence objects
    vector<vector<Sequence>> table;
    int table_size;
};

int main() {
    // Read the genome dataset from the file
    ifstream file("C:\\Users\\HP\\OneDrive\\Desktop\\actual_genome.fa");
    string line;
    string header = "";
    string sequence = "";
    HashTable table(56000000); // Use a large table size to accommodate the human genome

    while (getline(file, line)) {
        if (line[0] == '>') {
            // If this is a new header line, add the previous sequence to the hash table (if there is one)
            if (header != "" && sequence != "") {
                table.insert(header, sequence);
            }
            header = line;
            sequence = "";
        } else {
            // Add the current line to the current sequence
            sequence += line;
        }
    }

    // Add the last sequence to the hash table
    if (header != "" && sequence != "") {
        table.insert(header, sequence);
    }

    // Example usage: print the sequence for chromosome 1
    if (table.contains(">Fragment_1")) {
        cout << table.get_sequence(">Fragment_1") << endl;
    } else {
        cout << "Sequence not found." << endl;
    }

    return 0;
}

这是代码。不是我写的。它抛出错误“在抛出'std::bad_alloc' what()的示例后调用终止”:std::bad_alloc”当我使用具有55000000行的actual_genome文件时,但当我创建具有更少条目的另一个.fasta文件时,它工作正常。
我试过将hashtable table()的值增加到5600000来存储5500000的值,但是它不起作用。

r8xiu3jd

r8xiu3jd1#

你的Table的结构开销超过1GB(== 56 M * sizeof(vector)),而且这还只是在插入一个元素之前,每个条目至少需要2*sizeof(string),也就是每个条目需要16到48个字节;这只是针对空的string值。假设每个bucket正好得到1个条目,那么您已经丢弃了整个可用RAM。难怪std::bad_alloc会崩溃。事实上,这是预期的最佳结果。即使您使用std::unordered_map<string,string>,结果迟早也是相同的。您应该为该数据量找出一个完全重新设计的方法。

sczxawaw

sczxawaw2#

我认为错误的分配是由于试图分配太多的内存(堆有限制),或者如果内存已经分配,但在可以分配之前没有正确释放(泄漏)。

相关问题