If I have a numpy array with 10^8 to 10^9 elements, is it possible to compute its sum faster than np.sum?
I tried using multiprocessing with the fork start method, but no matter how many workers I use (1-4), it is slower than just calling np.sum. I'm on a Mac with a 2 GHz dual-core Intel Core i5, using Python 3.8. Not sure whether the results would be different with more CPUs.
My code:
import concurrent.futures
import multiprocessing as mp
import time
from concurrent.futures.process import ProcessPoolExecutor
import numpy as np
# based on: https://luis-sena.medium.com/sharing-big-numpy-arrays-across-python-processes-abf0dc2a0ab2
def np_sum_global(start, stop):
    return np.sum(data[start:stop])


def benchmark():
    st = time.time()
    ARRAY_SIZE = int(3e8)
    print("array size =", ARRAY_SIZE)
    global data
    data = np.random.random(ARRAY_SIZE)
    print("generated", time.time() - st)
    print("CPU Count =", mp.cpu_count())
    for trial in range(5):
        print("TRIAL =", trial)
        st = time.time()
        s = np.sum(data)
        print("method 1", time.time() - st, s)
        for NUM_WORKERS in range(1, 5):
            st = time.time()
            futures = []
            with ProcessPoolExecutor(max_workers=NUM_WORKERS) as executor:
                for i in range(0, NUM_WORKERS):
                    futures.append(
                        executor.submit(
                            np_sum_global,
                            ARRAY_SIZE * i // NUM_WORKERS,
                            ARRAY_SIZE * (i + 1) // NUM_WORKERS,
                        )
                    )
            futures, _ = concurrent.futures.wait(futures)
            s = sum(future.result() for future in futures)
            print("workers =", NUM_WORKERS, time.time() - st, s)
        print()


if __name__ == "__main__":
    mp.set_start_method("fork")
    benchmark()
Output:
array size = 300000000
generated 5.1455769538879395
CPU Count = 4
TRIAL = 0
method 1 0.29593801498413086 150004049.39847052
workers = 1 1.8904719352722168 150004049.39847052
workers = 2 1.2082111835479736 150004049.39847034
workers = 3 1.2650330066680908 150004049.39847082
workers = 4 1.233708143234253 150004049.39847046
TRIAL = 1
method 1 0.5861320495605469 150004049.39847052
workers = 1 1.801928997039795 150004049.39847052
workers = 2 1.165492057800293 150004049.39847034
workers = 3 1.2669389247894287 150004049.39847082
workers = 4 1.2941789627075195 150004049.39847043
TRIAL = 2
method 1 0.44912219047546387 150004049.39847052
workers = 1 1.8038971424102783 150004049.39847052
workers = 2 1.1491520404815674 150004049.39847034
workers = 3 1.3324410915374756 150004049.39847082
workers = 4 1.4198641777038574 150004049.39847046
TRIAL = 3
method 1 0.5163640975952148 150004049.39847052
workers = 1 3.248213052749634 150004049.39847052
workers = 2 2.5148861408233643 150004049.39847034
workers = 3 1.0224149227142334 150004049.39847082
workers = 4 1.20924711227417 150004049.39847046
TRIAL = 4
method 1 1.2363107204437256 150004049.39847052
workers = 1 1.8627309799194336 150004049.39847052
workers = 2 1.233341932296753 150004049.39847034
workers = 3 1.3235111236572266 150004049.39847082
workers = 4 1.344843864440918 150004049.39847046
Some links I've looked at:
- How to parallelize a sum calculation in python numpy?
- Combine Pool.map with shared memory Array in Python multiprocessing
1 Answer
Here is a test with numba. It has to compile the code first, which makes the first run much slower; after that, runs are 2 to 3 times faster than numpy. So whether numba is worth it depends on how often you run the code.