#include <iostream>
#include <fstream>
#include <string>
#include <vector>
using namespace std;
// Define a struct to hold the sequence information
struct Sequence {
string header;
string sequence;
};
// Define the hash table class
class HashTable {
public:
HashTable(int size) {
table.resize(size);
table_size = size;
}
void insert(string key, string value) {
// Compute the hash value for the key
int hash_value = hash(key) % table_size;
// Create a new sequence object and add it to the hash table
Sequence seq;
seq.header = key;
seq.sequence = value;
table[hash_value].push_back(seq);
}
bool contains(string key) {
// Compute the hash value for the key
int hash_value = hash(key) % table_size;
// Check if the key is in the hash table
for (auto& seq : table[hash_value]) {
if (seq.header == key) {
return true;
}
}
return false;
}
string get_sequence(string key) {
// Compute the hash value for the key
int hash_value = hash(key) % table_size;
// Find the sequence with the given key
for (auto& seq : table[hash_value]) {
if (seq.header == key) {
return seq.sequence;
}
}
// If the key is not found, return an empty string
return "";
}
private:
// The hash function computes the sum of the ASCII values of the characters in the key
int hash(string key) {
int sum = 0;
for (char c : key) {
sum += c;
}
return sum;
}
// The hash table is implemented as a vector of vectors of Sequence objects
vector<vector<Sequence>> table;
int table_size;
};
int main() {
// Read the genome dataset from the file
ifstream file("C:\\Users\\HP\\OneDrive\\Desktop\\actual_genome.fa");
string line;
string header = "";
string sequence = "";
HashTable table(56000000); // Use a large table size to accommodate the human genome
while (getline(file, line)) {
if (line[0] == '>') {
// If this is a new header line, add the previous sequence to the hash table (if there is one)
if (header != "" && sequence != "") {
table.insert(header, sequence);
}
header = line;
sequence = "";
} else {
// Add the current line to the current sequence
sequence += line;
}
}
// Add the last sequence to the hash table
if (header != "" && sequence != "") {
table.insert(header, sequence);
}
// Example usage: print the sequence for chromosome 1
if (table.contains(">Fragment_1")) {
cout << table.get_sequence(">Fragment_1") << endl;
} else {
cout << "Sequence not found." << endl;
}
return 0;
}
这是代码。不是我写的。它抛出错误“在抛出'std::bad_alloc' what()的示例后调用终止”:std::bad_alloc”当我使用具有55000000行的actual_genome文件时,但当我创建具有更少条目的另一个.fasta文件时,它工作正常。
我试过将hashtable table()的值增加到5600000来存储5500000的值,但是它不起作用。
2条答案
按热度按时间r8xiu3jd1#
你的
Table
的结构开销超过1GB(== 56 M *sizeof(vector)
),而且这还只是在插入一个元素之前,每个条目至少需要2*sizeof(string)
,也就是每个条目需要16到48个字节;这只是针对空的string
值。假设每个bucket正好得到1个条目,那么您已经丢弃了整个可用RAM。难怪std::bad_alloc
会崩溃。事实上,这是预期的最佳结果。即使您使用std::unordered_map<string,string>
,结果迟早也是相同的。您应该为该数据量找出一个完全重新设计的方法。sczxawaw2#
我认为错误的分配是由于试图分配太多的内存(堆有限制),或者如果内存已经分配,但在可以分配之前没有正确释放(泄漏)。