javascript: unzip a large file and save it to MongoDB

798qvoo8 posted on 2023-06-04 in Java

I'm trying to download a very large zip file, unzip it and transfer the files to Google Cloud Storage, then fetch each file from the bucket and load it into MongoDB, so that memory doesn't get overloaded. I think I'm doing everything correctly, but I still get this error:
<--- Last few GCs --->
[8143:0x150040000]   100257 ms: Scavenge 6856.9 (7026.4) -> 6850.6 (7030.9) MB, 22.5 / 0.4 ms  (average mu = 0.138, current mu = 0.113) allocation failure;
[8143:0x150040000]   100331 ms: Scavenge 6861.2 (7030.9) -> 6855.3 (7036.2) MB, 22.5 / 0.4 ms  (average mu = 0.138, current mu = 0.113) allocation failure;
[8143:0x150040000]   100522 ms: Scavenge 6865.2 (7036.2) -> 6859.7 (7041.2) MB, 17.8 / 0.2 ms  (average mu = 0.138, current mu = 0.113) external memory pressure;
<--- JS stacktrace --->
FATAL ERROR: Ineffective mark-compacts near heap limit Allocation failed - JavaScript heap out of memory
Here is the code I have so far:

import axios from 'axios'
import AdmZip = require('adm-zip');
import { Storage } from '@google-cloud/storage'
import mongoose from 'mongoose'
import { Readable } from 'stream'

const storage = new Storage()
const bucketName = process.env.BUCKET_NAME

// Fetch company filings data
async function fetchCompanyFilingsData() {
    const response = await axios.get(
        'https://www.sec.gov/Archives/edgar/daily-index/bulkdata/submissions.zip',
        { responseType: 'arraybuffer' }
    )
    if (!response.data) {
        throw new Error('No data received from URL.')
    }
    return response.data
}

// Unzip and upload data to Google Cloud Storage
async function unzipAndUploadData(buffer: Buffer) {
    const zip = new AdmZip(buffer)
    const files = zip.getEntries().filter((file) => /\.json$/i.test(file.name))

    if (!files.length) {
        throw new Error('No JSON files found in ZIP archive.')
    }

    for (const file of files) {
        const bucket = storage.bucket(bucketName)
        const blob = bucket.file(file.name)

        const blobStream = blob.createWriteStream()

        blobStream.on('error', (err) => {
            console.error(err)
        })

        blobStream.on('finish', () => {
            console.log(`Uploaded ${file.name} to Google Cloud Storage`)
            blobStream.end() // Close the writable stream
        })

        // Convert content (which is a Buffer) to a Stream
        const readableStream = new Readable()
        readableStream.push(file.getData())
        readableStream.push(null)

        // Pipe the stream to blobStream
        readableStream.pipe(blobStream)
    }
}

// Parse JSON files and save data to MongoDB
async function saveDataToMongoDB(db) {
    const filingsCollection = mongoose.connection.db.collection('edgar-filings')
    const bucket = storage.bucket(bucketName)
    const [files] = await bucket.getFiles()

    for (const file of files) {
        if (file.name.endsWith('.json')) {
            const content = file.createReadStream() // Read the file as a readable stream

            let data = ''
            content.on('data', (chunk) => {
                data += chunk.toString()
            })

            content.on('end', async () => {
                const jsonData = JSON.parse(data)

                const existingFiling = await filingsCollection.findOne({ cik: jsonData.cik })
                if (!existingFiling) {
                    const result = await filingsCollection.insertOne(jsonData)
                    console.log('Inserted filing:', result)
                } else {
                    console.log('Filing already exists:', jsonData)
                }
            })
        }
    }
}

export async function processCompanyFilingsData(db): Promise<void> {
    try {
        const buffer = await fetchCompanyFilingsData()
        await unzipAndUploadData(buffer)

        await mongoose.connect(process.env.MONGODB_URI) // Replace with your MongoDB connection string
        await saveDataToMongoDB(db)
        await mongoose.disconnect()

        console.log('Data saved to MongoDB')
    } catch (error) {
        console.error(error)
    }
}

iyr7buue1#

The error message you're seeing indicates that your JavaScript heap has run out of memory. This can happen when you process very large files or handle many files at the same time. To address it, there are a few optimizations you can try:

Increase the memory limit for Node.js: You can try increasing the memory limit for your Node.js process by setting the --max-old-space-size flag when running your script. For example:

node --max-old-space-size=4096 your-script.js

This raises the memory limit to 4 GB (4096 MB). Adjust the value to your needs.

Process files sequentially: Instead of processing all the files simultaneously, you can modify your code to process them one by one. This reduces the memory usage since only one file is being processed at a time. You can use a loop and an await statement to ensure each file is processed before moving on to the next one.

Stream file contents instead of buffering them: Rather than reading an entire file's content into memory and parsing it as one big string, work with streams wherever possible. Cloud Storage's createReadStream() method returns a readable stream that can be piped directly into another writable stream (for example a local file or an upload) without holding the whole content in memory at once; see the sketch after this list.
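As a small illustration of that idea (a minimal sketch, not taken from the answer's code; the function name and parameters are made up for the example), a bucket object can be copied to a local file through pipeline() without ever holding the whole content in memory:

import { createWriteStream } from 'node:fs';
import { pipeline } from 'node:stream/promises';
import { Storage } from '@google-cloud/storage';

const storage = new Storage();

// Sketch: copy a bucket object to a local file without buffering it in memory.
async function downloadObjectStreaming(bucketName: string, objectName: string, localPath: string): Promise<void> {
  const source = storage.bucket(bucketName).file(objectName).createReadStream();

  // pipeline() handles back-pressure and propagates errors from either stream
  await pipeline(source, createWriteStream(localPath));
}

Note that inserting a parsed JSON document into MongoDB still requires the whole document in memory, so streaming mainly pays off for the download, upload, and copy steps.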

By applying these optimizations you can reduce memory usage and handle large files more efficiently. Here is an example of how you could modify your code:

import axios from 'axios';
import AdmZip from 'adm-zip';
import { Storage } from '@google-cloud/storage';
import mongoose from 'mongoose';
import { Readable } from 'stream';

const storage = new Storage();
const bucketName = process.env.BUCKET_NAME;

async function fetchCompanyFilingsData() {
  const response = await axios.get(
    'https://www.sec.gov/Archives/edgar/daily-index/bulkdata/submissions.zip',
    { responseType: 'arraybuffer' }
  );
  if (!response.data) {
    throw new Error('No data received from URL.');
  }
  return response.data;
}

async function unzipAndUploadData(buffer: Buffer) {
  const zip = new AdmZip(buffer);
  const files = zip.getEntries().filter((file) => /\.json$/i.test(file.name));

  if (!files.length) {
    throw new Error('No JSON files found in ZIP archive.');
  }

  for (const file of files) {
    const bucket = storage.bucket(bucketName);
    const blob = bucket.file(file.name);
    const blobStream = blob.createWriteStream();

    blobStream.on('error', (err) => {
      console.error(err);
    });

    await new Promise((resolve, reject) => {
      blobStream.on('finish', resolve);
      blobStream.on('error', reject);

      const readableStream = new Readable();
      readableStream.push(file.getData());
      readableStream.push(null);

      readableStream.pipe(blobStream);
    });

    console.log(`Uploaded ${file.name} to Google Cloud Storage`);
  }
}

async function saveDataToMongoDB() {
  const filingsCollection = mongoose.connection.db.collection('edgar-filings');
  const bucket = storage.bucket(bucketName);
  const [files] = await bucket.getFiles();

  for (const file of files) {
    if (file.name.endsWith('.json')) {
      await new Promise<void>((resolve, reject) => {
        const contentStream = file.createReadStream();
        let data = '';

        contentStream.on('data', (chunk) => {
          data += chunk.toString();
        });

        contentStream.on('end', async () => {
          try {
            const jsonData = JSON.parse(data);

            const existingFiling = await filingsCollection.findOne({ cik: jsonData.cik });
            if (!existingFiling) {
              const result = await filingsCollection.insertOne(jsonData);
              console.log('Inserted filing:', result);
            } else {
              console.log('Filing already exists:', jsonData.cik);
            }

            resolve();
          } catch (err) {
            // Reject on JSON or database errors so the promise doesn't hang
            reject(err);
          }
        });

        contentStream.on('error', reject);
      });
    }
  }
}

export async function processCompanyFilingsData() {
  try {
    const buffer = await fetchCompanyFilingsData();
    await unzipAndUploadData(buffer);

    await mongoose.connect(process.env.MONGODB_URI);
    await saveDataToMongoDB();
    await mongoose.disconnect();

    console.log('Data saved to MongoDB');
  } catch (error) {
    console.error(error);
  }
}

These modifications should help keep memory usage down when processing large files and storing them in MongoDB.
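One remaining caveat: AdmZip still needs the entire archive in a single Buffer, so the ZIP itself must fit in memory. If that stays a problem, a streaming unzip library could extract entries while the download is still in flight. Below is a hedged sketch of that variant; the third-party unzipper package, its Parse({ forceStream: true }) mode, and the helper function are assumptions for illustration, not part of the answer above.

import axios from 'axios';
import unzipper from 'unzipper'; // assumed third-party package
import { Storage } from '@google-cloud/storage';
import { pipeline } from 'node:stream/promises';

const storage = new Storage();

// Sketch: stream the ZIP download through a streaming unzip parser and upload
// each JSON entry to Cloud Storage as it arrives, one entry at a time.
async function streamZipToBucket(url: string, bucketName: string): Promise<void> {
  const bucket = storage.bucket(bucketName);

  // Ask axios for a stream instead of an ArrayBuffer so the download is not buffered.
  const response = await axios.get(url, { responseType: 'stream' });

  // forceStream lets the parser be consumed as an async iterator of entries.
  const entries = response.data.pipe(unzipper.Parse({ forceStream: true }));

  for await (const entry of entries) {
    if (/\.json$/i.test(entry.path)) {
      // Pipe the entry straight into a bucket object; only one entry is in flight at a time.
      await pipeline(entry, bucket.file(entry.path).createWriteStream());
      console.log(`Uploaded ${entry.path} to Google Cloud Storage`);
    } else {
      // Entries that are not consumed must be drained so the parser can move on.
      entry.autodrain();
    }
  }
}

With this shape, neither the archive nor any individual file is ever fully buffered in the Node.js process, and the --max-old-space-size workaround becomes much less important.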
