C++:读取数 GB 数据后,NVMe SSD 读取速度下降

lhcgjxsq  于 2023-03-20  发布在  其他
关注(0)|答案(1)|浏览(129)

我有一块希捷 FireCuda 530 4 TB 固态硬盘(M.2 NVMe),其规格标称顺序读取速度最高可达 7.3 GB/s,IOPS 为 1M。我想写一个程序,让读取速度尽可能接近标称值。操作系统是 Ubuntu 22.04,CPU 为 AMD Ryzen Threadripper 3990X,主板为技嘉(Gigabyte)TRX40 Designare,内存为 256 GB DDR4 2667 MHz。
下面是我目前的程序(我也有基于mmap()的解决方案,但即使在理论上它也不应该更快--参见https://github.com/srogatch/nvme-max-read,了解基于mmap的版本):

#include <sys/mman.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
#include <sys/uio.h>

#include <string>
#include <iostream>
#include <filesystem>
#include <memory>
#include <cstring>
#include <chrono>
#include <vector>
#include <thread>
#include <cstdint>
#include <sstream>
// Shorthand for the standard filesystem library.
namespace fs = std::filesystem;

// Directory that main() scans for files to read.
const std::string gc_src_dir = "/scratch/LLMs/models/bloom";
// Granularity, in bytes, used for all buffer-size and address rounding below.
constexpr size_t gc_page_size = 4096;
// Low-bit mask derived from gc_page_size; the add-and-mask rounding used in
// this file relies on gc_page_size being a power of two.
constexpr uintptr_t gc_page_mask = gc_page_size - 1;
// Number of OpenMP threads the read routines request, and the number of
// slices the work is divided into.
constexpr size_t gc_n_workers = 64;

// Reports whether `text` finishes with `suffix`, comparing raw characters.
// An empty suffix matches every text, including the empty one.
bool EndsWith(const std::string& text, const std::string& suffix) {
    const size_t text_len = text.size();
    const size_t suffix_len = suffix.size();
    // A suffix longer than the text can never match; rejecting it here also
    // keeps the start offset computed below from wrapping around.
    if(suffix_len > text_len) {
        return false;
    }
    return text.compare(text_len - suffix_len, suffix_len, suffix) == 0;
}

// Rounds `ptr` up to the next address that is a multiple of gc_page_size.
// An address that already is such a multiple is returned unchanged.
const uint8_t* AlignPageUp(const void* ptr) {
    const uintptr_t address = reinterpret_cast<uintptr_t>(ptr);
    // Adding the mask and then clearing the low bits performs the round-up.
    const uintptr_t rounded = (address + gc_page_mask) & ~gc_page_mask;
    return reinterpret_cast<const uint8_t*>(rounded);
}

// Overload for writable memory: applies the same rounding as the
// const-pointer overload and hands the result back as a mutable pointer.
uint8_t* AlignPageUp(void* ptr) {
    // Going through a pointer-to-const selects the const overload explicitly.
    const void* const read_only = ptr;
    const uint8_t* const rounded = AlignPageUp(read_only);
    // The caller passed mutable memory, so dropping const again is safe.
    return const_cast<uint8_t*>(rounded);
}

// Rounds `ptr` down to the closest address at or below it that is a multiple
// of gc_page_size.
const uint8_t* AlignPageDown(const void* ptr) {
    const uintptr_t address = reinterpret_cast<uintptr_t>(ptr);
    // Clearing the low bits drops the offset within the page.
    const uintptr_t floored = address & ~gc_page_mask;
    return reinterpret_cast<const uint8_t*>(floored);
}

// Reads the first n_bytes of the file behind `fd` into a throw-away,
// page-aligned buffer, using vectored reads issued from gc_n_workers threads.
//
// The range is cut into at most gc_n_workers * 1024 pieces whose length is a
// multiple of gc_page_size, one iovec per piece. Each worker passes its run
// of up to 1024 consecutive iovecs to a single preadv2() call that starts at
// the file offset of the run's first piece. The data is discarded; the
// function exists so the caller can time the reads.
//
// fd       open file descriptor. main() opens it with O_DIRECT, which is why
//          buffer addresses, lengths and file offsets are kept page-aligned.
// n_bytes  number of bytes to read, normally the file size. 0 is a no-op.
//
// Errors (allocation failure, failed or short reads) are reported on stderr;
// nothing is returned to the caller.
void Scattered(const int fd, const size_t n_bytes) {
    // See https://stackoverflow.com/questions/27271801/c-the-permitted-maximum-of-the-iovcnt-argument-in-writev
    constexpr size_t buffers_per_worker = 1024;
    constexpr size_t n_buffers = gc_n_workers * buffers_per_worker;
    if(n_bytes == 0) {
        // Nothing to read. Returning here also keeps bytes_per_buffer from
        // becoming 0, which would be used as a divisor below.
        return;
    }
    const size_t even_page_bytes = (n_bytes + gc_page_mask) & (~gc_page_mask);
    size_t bytes_per_buffer = (n_bytes + n_buffers - 1) / n_buffers;
    bytes_per_buffer = (bytes_per_buffer + gc_page_mask) & (~gc_page_mask);
    const size_t used_buffers = (n_bytes + bytes_per_buffer - 1) / bytes_per_buffer;

    // One iovec per piece, kept on the heap: the full table can reach
    // n_buffers entries, which is a lot to place on the stack.
    std::vector<iovec> buffer_table(used_buffers);
    iovec* const buffers = buffer_table.data();

    // Over-allocate by gc_page_mask bytes so the start can be rounded up to a
    // page boundary.
    void* const raw_storage = malloc(bytes_per_buffer * used_buffers + gc_page_mask);
    if(raw_storage == nullptr) {
        perror("Failed to allocate read buffer");
        return;
    }
    uint8_t* const storage = AlignPageUp(raw_storage);

#pragma omp parallel for
    for(size_t i=0; i<used_buffers; i++) {
        const size_t first_byte = bytes_per_buffer * i;
        // The last piece is trimmed to the page-rounded end of the range.
        const size_t limit_byte = std::min(even_page_bytes, bytes_per_buffer * (i+1));
        buffers[i].iov_len = limit_byte - first_byte;
        buffers[i].iov_base = storage + first_byte;
    }

#pragma omp parallel for num_threads(gc_n_workers)
    for(size_t i_worker = 0; i_worker < gc_n_workers; i_worker++) {
        const size_t first_buffer = i_worker * buffers_per_worker;
        const size_t limit_buffer = std::min((i_worker+1) * buffers_per_worker, used_buffers);
        if(first_buffer >= limit_buffer) {
            continue;
        }
        // Each worker reads its own slice of the file: the offset must match
        // the position of its first buffer, otherwise every worker would read
        // the same leading region of the file again.
        const size_t first_byte = first_buffer * bytes_per_buffer;
        const size_t wanted_bytes = std::min(n_bytes, limit_buffer * bytes_per_buffer) - first_byte;
        const ssize_t n_read = preadv2(fd, buffers + first_buffer,
            static_cast<int>(limit_buffer - first_buffer),
            static_cast<off_t>(first_byte), RWF_HIPRI);
        if(n_read == -1) {
            perror("Failed to read file");
        } else if(static_cast<size_t>(n_read) < wanted_bytes) {
            // A single call may return less than requested; make that visible
            // because it would silently inflate the measured throughput.
            fprintf(stderr, "Short read: got %zd of %zu bytes at offset %zu\n",
                n_read, wanted_bytes, first_byte);
        }
    }
    free(raw_storage);
}

// Reads the first n_bytes of the file behind `fd` into a throw-away,
// page-aligned buffer, using one contiguous pread() range per worker thread.
//
// The range is divided into gc_n_workers slices whose length is a multiple
// of gc_page_size; each OpenMP thread reads one slice. The data is
// discarded; the function exists so the caller can time the reads.
//
// fd       open file descriptor. Request lengths are rounded up to whole
//          pages (the buffer has one spare page for that), which suits a
//          descriptor opened with O_DIRECT.
// n_bytes  number of bytes to read, normally the file size.
//
// Errors are reported on stderr; nothing is returned to the caller.
void MultiLargeRead(const int fd, const size_t n_bytes) {
    // gc_page_size spare bytes absorb the rounded-up final request and
    // gc_page_mask spare bytes allow rounding the start up to a page boundary.
    void* const raw_storage = malloc(n_bytes + gc_page_size + gc_page_mask);
    if(raw_storage == nullptr) {
        perror("Failed to allocate read buffer");
        return;
    }
    uint8_t* const storage = AlignPageUp(raw_storage);
    size_t bytes_per_worker = (n_bytes + gc_n_workers - 1) / gc_n_workers;
    bytes_per_worker = (bytes_per_worker + gc_page_mask) & (~gc_page_mask);
#pragma omp parallel for num_threads(gc_n_workers)
    for(size_t i=0; i<gc_n_workers; i++) {
        const size_t limit_byte = std::min(n_bytes, (i+1)*bytes_per_worker);
        size_t next_byte = i * bytes_per_worker;
        // pread() may return fewer bytes than requested, so keep going until
        // the slice is complete. Workers whose slice starts at or beyond
        // n_bytes skip the loop entirely.
        while(next_byte < limit_byte) {
            const size_t request_bytes = (limit_byte - next_byte + gc_page_mask) & (~gc_page_mask);
            const ssize_t n_read = pread(fd, storage + next_byte, request_bytes,
                static_cast<off_t>(next_byte));
            if(n_read == -1) {
                perror("Failed to read file");
                break;
            }
            if(n_read == 0) {
                // End of file reached before n_bytes: nothing more to read.
                break;
            }
            next_byte += static_cast<size_t>(n_read);
        }
    }
    free(raw_storage);
}

// Benchmark driver: for every entry in gc_src_dir whose name ends in
// ".safetensors" or ".bin", opens it with O_DIRECT, reads it completely via
// Scattered() and prints the file size, elapsed time and throughput.
// Files that cannot be opened or stat'ed are reported and skipped.
// Always returns 0.
int main() {
    for (const auto & entry : fs::directory_iterator(gc_src_dir)) {
        if(!EndsWith(entry.path(), ".safetensors") && !EndsWith(entry.path(), ".bin")) {
            continue;
        }
        std::cout << entry.path() << std::endl;

        const int fd = open(entry.path().c_str(), O_RDONLY | O_LARGEFILE | O_DIRECT);
        if(fd == -1) {
            perror("Failed to open");
            continue;
        }
        struct stat file_stat;
        if (fstat(fd, &file_stat) == -1) {
            perror("Failed to stat");
            // Release the descriptor before skipping this file; otherwise one
            // descriptor would leak per failing file.
            if(close(fd) == -1) {
                perror("Failed to close");
            }
            continue;
        }
        //posix_fadvise(fd, 0, file_stat.st_size, POSIX_FADV_RANDOM);

        // Time only the read itself, using a monotonic clock.
        std::chrono::steady_clock::time_point tmLast = std::chrono::steady_clock::now();
        Scattered(fd, file_stat.st_size);
        std::chrono::steady_clock::time_point tmNow = std::chrono::steady_clock::now();
        const double nSec = std::chrono::duration_cast<std::chrono::nanoseconds>(tmNow - tmLast).count() / 1e9;
        // Decimal gigabytes (1e9 bytes) per second.
        const double GBperSec = (file_stat.st_size / nSec) / 1e9;
        std::cout << file_stat.st_size << " bytes in " << nSec << " seconds: "
            << GBperSec << " billion bytes per second." << std::endl;

        // Advise the kernel that this file's data is not needed again
        // (presumably to keep the page cache from serving later reads).
        posix_fadvise(fd, 0, file_stat.st_size, POSIX_FADV_DONTNEED);
        if(close(fd) == -1) {
            perror("Failed to close");
        }
    }
    return 0;
}

/scratch/LLMs/models/bloom包含从https://huggingface.co/bigscience/bloom/tree/main下载的Bloom大型语言模型。
我使用以下命令运行上面的程序:

g++ -fopenmp -O3 nvme-read-fileio.cpp -o nvme-read-fileio
echo 3 | sudo tee /proc/sys/vm/drop_caches
./nvme-read-fileio

程序开始时的读取速度约为 4.5 GB/s,但随后性能下降到 700-800 MB/s。SSD 的温度没有超过 74 摄氏度。我原以为 SLC 缓存只在写入 SSD 时才起作用,但显然存在某种缓存(不在操作系统中,因为调用 posix_fadvise() 之后操作系统的缓存占用一直保持在较低水平)。
有人能解释一下这是怎么回事吗?这些是否是持续读取NVMe SSD的预期数字?
SSD的性能日志为:

./run-fileio.sh 
[sudo] password for serge: 
3
"/scratch/LLMs/models/bloom/model_00046-of-00072.safetensors"
4932875563 bytes in 1.06306 seconds: 4.64028 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00050-of-00072.bin"
4932877665 bytes in 1.05543 seconds: 4.67381 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00012-of-00072.safetensors"
4932875573 bytes in 1.09619 seconds: 4.50001 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00020-of-00072.safetensors"
4932875563 bytes in 1.08084 seconds: 4.56391 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00066-of-00072.safetensors"
4932875565 bytes in 1.11794 seconds: 4.41245 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00072-of-00072.safetensors"
57530 bytes in 0.0143936 seconds: 0.00399693 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00036-of-00072.bin"
4932877665 bytes in 1.07298 seconds: 4.59736 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00055-of-00072.safetensors"
4932875563 bytes in 1.0579 seconds: 4.66288 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00002-of-00072.safetensors"
4932875549 bytes in 1.08532 seconds: 4.54509 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00041-of-00072.bin"
4932877665 bytes in 1.07056 seconds: 4.60776 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00017-of-00072.safetensors"
4932875563 bytes in 1.11623 seconds: 4.41921 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00069-of-00072.safetensors"
4932875573 bytes in 1.24681 seconds: 3.95639 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00037-of-00072.safetensors"
4932875573 bytes in 1.5898 seconds: 3.10283 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00053-of-00072.bin"
4932877665 bytes in 1.08551 seconds: 4.54429 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00071-of-00072.bin"
4932877665 bytes in 2.83951 seconds: 1.73723 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00040-of-00072.bin"
4932877665 bytes in 1.09829 seconds: 4.4914 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00055-of-00072.bin"
4932877665 bytes in 1.13006 seconds: 4.36515 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00036-of-00072.safetensors"
4932875563 bytes in 1.11194 seconds: 4.43627 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00064-of-00072.bin"
4932877665 bytes in 2.82379 seconds: 1.7469 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00071-of-00072.safetensors"
4932875557 bytes in 1.09051 seconds: 4.52346 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00023-of-00072.bin"
4932877665 bytes in 2.83307 seconds: 1.74117 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00018-of-00072.safetensors"
4932875573 bytes in 1.10318 seconds: 4.47151 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00033-of-00072.safetensors"
4932875573 bytes in 2.85241 seconds: 1.72937 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00059-of-00072.safetensors"
4932875563 bytes in 1.11315 seconds: 4.43147 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00070-of-00072.bin"
4932877665 bytes in 1.05519 seconds: 4.67487 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00044-of-00072.bin"
4932877665 bytes in 3.30498 seconds: 1.49256 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00056-of-00072.safetensors"
4932875565 bytes in 1.49834 seconds: 3.29222 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00003-of-00072.bin"
4932877601 bytes in 1.09202 seconds: 4.51722 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00011-of-00072.safetensors"
4932875551 bytes in 3.22613 seconds: 1.52904 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00024-of-00072.safetensors"
4932875561 bytes in 2.73117 seconds: 1.80614 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00015-of-00072.bin"
4932877665 bytes in 2.78437 seconds: 1.77163 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00047-of-00072.bin"
4932877665 bytes in 1.08964 seconds: 4.52709 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00050-of-00072.safetensors"
4932875551 bytes in 1.05356 seconds: 4.68211 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00022-of-00072.bin"
4932877665 bytes in 1.12053 seconds: 4.40229 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00008-of-00072.bin"
4932877601 bytes in 3.23984 seconds: 1.52257 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00011-of-00072.bin"
4932877601 bytes in 2.7694 seconds: 1.78121 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00025-of-00072.bin"
4932877665 bytes in 2.35111 seconds: 2.0981 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00005-of-00072.bin"
4932877601 bytes in 2.94068 seconds: 1.67746 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00001-of-00072.safetensors"
7193289031 bytes in 1.91779 seconds: 3.75082 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00051-of-00072.safetensors"
4932875573 bytes in 3.97354 seconds: 1.24143 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00035-of-00072.bin"
4932877665 bytes in 2.97325 seconds: 1.65908 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00064-of-00072.safetensors"
4932875537 bytes in 2.3653 seconds: 2.08552 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00030-of-00072.safetensors"
4932875531 bytes in 1.07454 seconds: 4.5907 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00004-of-00072.safetensors"
4932875557 bytes in 4.56415 seconds: 1.08079 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00015-of-00072.safetensors"
4932875531 bytes in 3.24319 seconds: 1.52099 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00048-of-00072.safetensors"
4932875555 bytes in 2.37022 seconds: 2.08119 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00027-of-00072.safetensors"
4932875563 bytes in 1.072 seconds: 4.60155 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00060-of-00072.bin"
4932877665 bytes in 1.07123 seconds: 4.60486 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00045-of-00072.safetensors"
4932875569 bytes in 6.21179 seconds: 0.794116 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00043-of-00072.bin"
4932877665 bytes in 3.3426 seconds: 1.47576 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00044-of-00072.safetensors"
4932875555 bytes in 1.14831 seconds: 4.29577 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00032-of-00072.bin"
4932877665 bytes in 1.08985 seconds: 4.52621 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00008-of-00072.safetensors"
4932875519 bytes in 1.09357 seconds: 4.51081 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00052-of-00072.bin"
4932877665 bytes in 6.71973 seconds: 0.734089 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00068-of-00072.safetensors"
4932875563 bytes in 1.05955 seconds: 4.65564 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00013-of-00072.safetensors"
4932875573 bytes in 1.08236 seconds: 4.55753 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00029-of-00072.safetensors"
4932875563 bytes in 6.20449 seconds: 0.795049 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00039-of-00072.bin"
4932877665 bytes in 1.21287 seconds: 4.0671 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00016-of-00072.bin"
4932877665 bytes in 5.95278 seconds: 0.828668 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00056-of-00072.bin"
4932877665 bytes in 1.52038 seconds: 3.2445 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00033-of-00072.bin"
4932877665 bytes in 1.09718 seconds: 4.49596 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00041-of-00072.safetensors"
4932875573 bytes in 6.2845 seconds: 0.784927 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00063-of-00072.bin"
4932877665 bytes in 5.43954 seconds: 0.906855 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00045-of-00072.bin"
4932877665 bytes in 3.228 seconds: 1.52815 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00052-of-00072.safetensors"
4932875549 bytes in 1.97378 seconds: 2.4992 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00022-of-00072.safetensors"
4932875573 bytes in 1.10796 seconds: 4.45222 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00009-of-00072.safetensors"
4932875551 bytes in 5.92352 seconds: 0.832761 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00072-of-00072.bin"
58279 bytes in 0.0159825 seconds: 0.00364643 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00002-of-00072.bin"
4932877601 bytes in 1.0347 seconds: 4.76744 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00007-of-00072.bin"
4932877601 bytes in 6.40498 seconds: 0.770163 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00018-of-00072.bin"
4932877665 bytes in 6.25208 seconds: 0.788998 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00038-of-00072.bin"
4932877665 bytes in 1.50503 seconds: 3.27758 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00032-of-00072.safetensors"
4932875573 bytes in 1.03922 seconds: 4.74671 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00061-of-00072.safetensors"
4932875565 bytes in 6.46773 seconds: 0.76269 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00054-of-00072.bin"
4932877665 bytes in 6.37987 seconds: 0.773194 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00049-of-00072.bin"
4932877665 bytes in 1.06745 seconds: 4.62118 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00069-of-00072.bin"
4932877665 bytes in 1.04855 seconds: 4.70447 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00025-of-00072.safetensors"
4932875553 bytes in 5.97924 seconds: 0.825 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00003-of-00072.safetensors"
4932875551 bytes in 6.97898 seconds: 0.706819 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00031-of-00072.safetensors"
4932875555 bytes in 1.04063 seconds: 4.74029 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00014-of-00072.safetensors"
4932875551 bytes in 1.09654 seconds: 4.49859 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00028-of-00072.safetensors"
4932875573 bytes in 6.06694 seconds: 0.813074 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00068-of-00072.bin"
4932877665 bytes in 6.16835 seconds: 0.799708 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00047-of-00072.safetensors"
4932875573 bytes in 6.5344 seconds: 0.754909 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00031-of-00072.bin"
4932877665 bytes in 1.02919 seconds: 4.79295 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00029-of-00072.bin"
4932877665 bytes in 5.87793 seconds: 0.83922 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00035-of-00072.safetensors"
4932875573 bytes in 1.06242 seconds: 4.64306 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00065-of-00072.safetensors"
4932875571 bytes in 6.52125 seconds: 0.756431 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00065-of-00072.bin"
4932877665 bytes in 6.0777 seconds: 0.811636 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00026-of-00072.safetensors"
4932875563 bytes in 5.9842 seconds: 0.824316 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00042-of-00072.bin"
4932877665 bytes in 5.9264 seconds: 0.832357 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00019-of-00072.bin"
4932877665 bytes in 5.893 seconds: 0.837075 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00048-of-00072.bin"
4932877665 bytes in 4.13364 seconds: 1.19335 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00051-of-00072.bin"
4932877665 bytes in 3.35901 seconds: 1.46855 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00034-of-00072.bin"
4932877665 bytes in 6.14335 seconds: 0.802962 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00020-of-00072.bin"
4932877665 bytes in 1.04687 seconds: 4.71204 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00049-of-00072.safetensors"
4932875541 bytes in 6.37837 seconds: 0.773376 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00053-of-00072.safetensors"
4932875527 bytes in 6.32886 seconds: 0.779426 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00034-of-00072.safetensors"
4932875573 bytes in 6.32055 seconds: 0.78045 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00021-of-00072.safetensors"
4932875553 bytes in 6.25532 seconds: 0.788589 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00058-of-00072.safetensors"
4932875573 bytes in 5.91806 seconds: 0.83353 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00057-of-00072.safetensors"
4932875573 bytes in 5.93299 seconds: 0.831431 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00010-of-00072.safetensors"
4932875541 bytes in 6.10677 seconds: 0.807772 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00014-of-00072.bin"
4932877665 bytes in 6.45824 seconds: 0.763812 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00046-of-00072.bin"
4932877665 bytes in 6.69341 seconds: 0.736975 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00038-of-00072.safetensors"
4932875573 bytes in 1.47864 seconds: 3.3361 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00024-of-00072.bin"
4932877665 bytes in 1.05901 seconds: 4.65803 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00027-of-00072.bin"
4932877665 bytes in 6.40909 seconds: 0.769669 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00006-of-00072.bin"
4932877601 bytes in 5.52455 seconds: 0.892901 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00017-of-00072.bin"
4932877665 bytes in 5.88582 seconds: 0.838095 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00058-of-00072.bin"
4932877665 bytes in 6.09686 seconds: 0.809085 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00009-of-00072.bin"
4932877601 bytes in 6.5395 seconds: 0.75432 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00039-of-00072.safetensors"
4932875555 bytes in 8.07509 seconds: 0.610876 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00042-of-00072.safetensors"
4932875521 bytes in 6.39566 seconds: 0.771284 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00067-of-00072.bin"
4932877665 bytes in 1.92706 seconds: 2.55979 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00016-of-00072.safetensors"
4932875573 bytes in 1.08779 seconds: 4.53477 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00021-of-00072.bin"
4932877665 bytes in 6.08484 seconds: 0.810684 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00005-of-00072.safetensors"
4932875509 bytes in 6.18826 seconds: 0.797134 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00037-of-00072.bin"
4932877665 bytes in 1.07592 seconds: 4.58478 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00001-of-00072.bin"
7193290147 bytes in 8.89842 seconds: 0.808378 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00061-of-00072.bin"
4932877665 bytes in 6.05211 seconds: 0.815067 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00067-of-00072.safetensors"
4932875539 bytes in 1.92372 seconds: 2.56423 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00062-of-00072.safetensors"
4932875573 bytes in 6.68542 seconds: 0.737856 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00007-of-00072.safetensors"
4932875551 bytes in 1.09001 seconds: 4.52553 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00028-of-00072.bin"
4932877665 bytes in 5.91756 seconds: 0.8336 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00060-of-00072.safetensors"
4932875527 bytes in 6.2479 seconds: 0.789526 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00063-of-00072.safetensors"
4932875565 bytes in 6.11895 seconds: 0.806164 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00004-of-00072.bin"
4932877601 bytes in 1.91632 seconds: 2.57415 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00040-of-00072.safetensors"
4932875533 bytes in 1.1723 seconds: 4.20785 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00023-of-00072.safetensors"
4932875573 bytes in 6.29939 seconds: 0.783072 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00010-of-00072.bin"
4932877601 bytes in 6.46308 seconds: 0.763239 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00019-of-00072.safetensors"
4932875555 bytes in 6.28934 seconds: 0.784323 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00054-of-00072.safetensors"
4932875573 bytes in 6.13669 seconds: 0.803833 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00059-of-00072.bin"
4932877665 bytes in 6.26267 seconds: 0.787664 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00006-of-00072.safetensors"
4932875553 bytes in 6.55845 seconds: 0.752141 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00070-of-00072.safetensors"
4932875553 bytes in 1.0652 seconds: 4.63095 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00062-of-00072.bin"
4932877665 bytes in 6.25757 seconds: 0.788305 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00012-of-00072.bin"
4932877665 bytes in 5.91722 seconds: 0.833648 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00057-of-00072.bin"
4932877665 bytes in 6.42177 seconds: 0.76815 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00013-of-00072.bin"
4932877665 bytes in 6.10804 seconds: 0.807604 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00043-of-00072.safetensors"
4932875573 bytes in 6.22971 seconds: 0.791831 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00066-of-00072.bin"
4932877665 bytes in 6.08366 seconds: 0.81084 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00030-of-00072.bin"
4932877665 bytes in 6.34713 seconds: 0.777183 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00026-of-00072.bin"
4932877665 bytes in 6.26312 seconds: 0.787607 billion bytes per second.
ngynwnxp

ngynwnxp1#

这是一个部分答案,显示了类似(较小)的效果和跟踪问题的几个提示。
我无法在我的 NVMe SSD 三星 980 Pro(1 TB)上完全重现这个问题,但我发现了一个有趣的类似效果,或许可以解释 SSD 的这种行为。以下是在我的机器上读取前 30 GiB 文件的结果:

"/tmp/bigfiles/model_00005-of-00072.safetensors"
4932875509 bytes in 1.4701 seconds: 3.35546 billion bytes per second.
"/tmp/bigfiles/model_00002-of-00072.safetensors"
4932875549 bytes in 1.52471 seconds: 3.23528 billion bytes per second.
"/tmp/bigfiles/model_00004-of-00072.safetensors"
4932875557 bytes in 1.47729 seconds: 3.33913 billion bytes per second.
"/tmp/bigfiles/model_00003-of-00072.safetensors"
4932875551 bytes in 1.66287 seconds: 2.96649 billion bytes per second.
"/tmp/bigfiles/model_00001-of-00072.safetensors"
7193289031 bytes in 2.61768 seconds: 2.74796 billion bytes per second.
"/tmp/bigfiles/model_00006-of-00072.safetensors"
4932875553 bytes in 1.84153 seconds: 2.67868 billion bytes per second.

我们可以看到,速度似乎随着时间的推移而下降。这种效果在多次运行中是一致的。然而,即使在另一次运行后,第一个文件的速度仍然是~3.3 GB/s。这意味着我的SSD可能不是性能下降的原因(否则,在后续运行中速度会降低)。注意,数据不会被缓存(使用iotop和较低级别的分析工具检查)。
我没有下载更多数据,因为这个分区的空间快用完了(只剩约 4 GiB),而这其实是很重要的一点:最慢的文件正是我最后下载的那个!我的第一个假设是 * 当剩余空间不多时,文件系统会产生碎片 *,因此最后一个文件会比第一个文件被分割成更多的小块。如果是这样,操作系统(OS)对该文件发出的小块 IO 请求会比对第一个文件更多,从而导致吞吐量降低。
为了检查这一点,我只是删除了最后一个文件,以腾出一些空间,并复制了第二个文件(model_00002-of-00072.safetensors),下面是结果:

"/tmp/bigfiles/model_00005-of-00072.safetensors"
4932875509 bytes in 1.46369 seconds: 3.37017 billion bytes per second.
"/tmp/bigfiles/model_00002-of-00072.safetensors"
4932875549 bytes in 1.8347 seconds: 2.68866 billion bytes per second.   <----------
"/tmp/bigfiles/model_00004-of-00072.safetensors"
4932875557 bytes in 1.47645 seconds: 3.34104 billion bytes per second.
"/tmp/bigfiles/model_00003-of-00072.safetensors"
4932875551 bytes in 1.69516 seconds: 2.90997 billion bytes per second.
"/tmp/bigfiles/model_00001-of-00072.safetensors"
7193289031 bytes in 2.64308 seconds: 2.72155 billion bytes per second.

[missing: "/tmp/bigfiles/model_00006-of-00072.safetensors"]

正如我们所看到的,除了复制的文件之外,所有文件的速度都与上一次运行的速度一致。请注意,分区上的文件系统是Ext4。其他文件系统可能会导致不同的行为,特别是FAT,它往往会很快变得碎片化。
话虽如此,我尝试使用工具 e4defrag 来整理这些文件的碎片,但情况并没有明显改善。事实上,除了第一个(最快的)文件外,所有文件都被报告为“不需要整理碎片”的文件。使用 e4defrag -c /your/directory 可以显示一份报告。复制文件也是自动整理文件碎片的一个简单方法。
然后,我在同一 Linux 系统上一个更大的 NTFS 分区(800 GiB)上再次测试了同一程序,并没有重现这个效果。我又下载了一些文件,并为最后一个文件制作了许多副本,这样就不必等待更多下载。最终目录占用了 229 GiB。NTFS 分区上的结果实际上 * 更稳定 *,甚至令人惊讶地比 Ext4 分区 * 更好 *。我运行了 3 次,以确认结果可以重复(确实可以)。以下是最后一次运行的结果:

"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00007-of-00072.safetensors"
4932875551 bytes in 1.31361 seconds: 3.75521 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00008-of-00072.safetensors"
4932875519 bytes in 1.29188 seconds: 3.81837 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00009-of-00072.safetensors"
4932875551 bytes in 1.28907 seconds: 3.82669 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00010-of-00072.safetensors"
4932875541 bytes in 1.29115 seconds: 3.82053 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00011-of-00072.safetensors"
4932875551 bytes in 1.29246 seconds: 3.81666 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00012-of-00072.safetensors"
4932875573 bytes in 1.29473 seconds: 3.80995 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00013-of-00072.safetensors"
4932875573 bytes in 1.28939 seconds: 3.82575 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00014-of-00072.safetensors"
4932875551 bytes in 1.28572 seconds: 3.83668 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00015-of-00072.safetensors"
4932875531 bytes in 1.28118 seconds: 3.85025 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy1.safetensors"
4932875573 bytes in 1.2539 seconds: 3.93404 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy10.safetensors"
4932875573 bytes in 1.26357 seconds: 3.90393 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy11.safetensors"
4932875573 bytes in 1.25435 seconds: 3.93262 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy30.safetensors"
4932875573 bytes in 1.24447 seconds: 3.96383 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy4.safetensors"
4932875573 bytes in 1.26411 seconds: 3.90225 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy5.safetensors"
4932875573 bytes in 1.24267 seconds: 3.96959 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy6.safetensors"
4932875573 bytes in 1.24377 seconds: 3.96607 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy7.safetensors"
4932875573 bytes in 1.24523 seconds: 3.96141 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy8.safetensors"
4932875573 bytes in 1.24923 seconds: 3.94875 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy9.safetensors"
4932875573 bytes in 1.24158 seconds: 3.97305 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072.safetensors"
4932875573 bytes in 1.24082 seconds: 3.9755 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy13.safetensors"
4932875573 bytes in 1.24269 seconds: 3.96953 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy14.safetensors"
4932875573 bytes in 1.23559 seconds: 3.99231 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy15.safetensors"
4932875573 bytes in 1.23835 seconds: 3.98343 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy16.safetensors"
4932875573 bytes in 1.25322 seconds: 3.93617 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy17.safetensors"
4932875573 bytes in 1.23546 seconds: 3.99275 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy18.safetensors"
4932875573 bytes in 1.24692 seconds: 3.95606 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy19.safetensors"
4932875573 bytes in 1.23862 seconds: 3.98255 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy2.safetensors"
4932875573 bytes in 1.23723 seconds: 3.98703 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy20.safetensors"
4932875573 bytes in 1.243 seconds: 3.96852 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy21.safetensors"
4932875573 bytes in 1.2451 seconds: 3.96182 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy22.safetensors"
4932875573 bytes in 1.25585 seconds: 3.92793 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy23.safetensors"
4932875573 bytes in 1.26094 seconds: 3.91205 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy24.safetensors"
4932875573 bytes in 1.23445 seconds: 3.99602 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy25.safetensors"
4932875573 bytes in 1.2411 seconds: 3.97459 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy26.safetensors"
4932875573 bytes in 1.24181 seconds: 3.97233 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy27.safetensors"
4932875573 bytes in 1.23395 seconds: 3.99762 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy28.safetensors"
4932875573 bytes in 1.23269 seconds: 4.00172 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy29.safetensors"
4932875573 bytes in 1.25099 seconds: 3.94318 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00006-of-00072.safetensors"
4932875553 bytes in 1.27636 seconds: 3.86481 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy12.safetensors"
4932875573 bytes in 1.23839 seconds: 3.9833 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy3.safetensors"
4932875573 bytes in 1.23959 seconds: 3.97945 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00001-of-00072.safetensors"
7193289031 bytes in 1.86529 seconds: 3.8564 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00002-of-00072.safetensors"
4932875549 bytes in 1.27457 seconds: 3.87021 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00003-of-00072.safetensors"
4932875551 bytes in 1.29629 seconds: 3.80538 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00004-of-00072.safetensors"
4932875557 bytes in 1.2737 seconds: 3.87288 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00005-of-00072.safetensors"
4932875509 bytes in 1.28479 seconds: 3.83945 billion bytes per second.

我建议你尝试将文件存储在使用另一种文件系统的不同分区上,以便检查性能差距是否来源于此。我认为罪魁祸首是SSD驱动程序、固件或硬件本身。
到目前为止,我的SSD的SLC缓存并不影响读取时间,只影响写入,但尚不清楚读取是否会受到SLC缓存的影响。不同SSD的行为可能差别很大。在我的SSD上,固件会先将数据存储在SLC缓存中,而不是直接将其移动到TLC块。我猜这是为了延长SSD的寿命,避免把之后可能很快被删除的数据一律写入TLC(TLC单元的可写入次数有限,且往往低于SLC单元,即使是像你的SSD那样用TLC单元构成SLC缓存的动态伪SLC缓存也是如此)。当该缓存接近饱和时,它显然会开始将大部分SLC缓存内容移动到TLC块。我猜这是为了能够更好地承受下一次写入突发,并推迟因SLC→TLC块复制而导致的吞吐量变化。
请注意,一个好的固件应该把经常被修改的文件块保留在SLC缓存中,以减少SSD的磨损。这意味着,要比较SLC块与TLC块的速度,一种方法是对某个文件进行反复的高强度写入,然后测量读取它所需的时间,并与只写入过一次的文件进行比较。但是,这种做法是危险的:如果固件实际上并没有把被反复写入的文件内容存放在SLC缓存中,那么这样做会显著缩短SSD的寿命。因此,我没有在我的SSD上尝试过这个方法。
注意,一旦属于某个文件的SLC/TLC页集合写入完成,读取速度就没有理由发生变化(除非请求的连续性变差--对于未被改动的文件,且目标SSD上没有其他写入时,这不应该发生--或者固件意外地移动了被读取的页面--例如由于磨损均衡而同时进行其他写入时,这是可能的)。因此,以不同的顺序读取文件是很有意义的,这样可以检查糟糕的性能结果是与特定文件有关,还是仅仅取决于时间/热量。为了检查热量是否是问题所在,你可以在运行程序之前先密集使用SSD,并检查这是否影响基准测试的性能。注意,75°C对这款SSD来说相当高(建议不要超过70°C)。除此之外(以及文件系统的影响),我已经想不出其他原因了。希望这能有所帮助。

相关问题