numpy 进一步优化ISING模型

我已经用Python实现了二维ISING模型，使用了NumPy和Numba的JIT：

from timeit import default_timer as timer
import matplotlib.pyplot as plt
import numba as nb
import numpy as np

# TODO for Dict optimization.
# from numba import types
# from numba.typed import Dict

@nb.njit(nogil=True)
def initialstate(N):   
    ''' 
    Generates a random spin configuration for initial condition
    '''
    state = np.empty((N,N),dtype=np.int8)
    for i in range(N):
        for j in range(N):
            state[i,j] = 2*np.random.randint(2)-1
    return state


@nb.njit(nogil=True)
def mcmove(lattice, beta, N):
    '''
    Monte Carlo move using Metropolis algorithm 
    '''
    
    # # TODO* Dict optimization 
    # dict_param = Dict.empty(
    # key_type=types.int64,
    # value_type=types.float64,
    # )
    # dict_param = {cost : np.exp(-cost*beta) for cost in [-8, -4, 0, 4, 8] }

    for _ in range(N):
        for __ in range(N):
                a = np.random.randint(0, N)
                b = np.random.randint(0, N)
                s =  lattice[a, b]
                dE = lattice[(a+1)%N,b] + lattice[a,(b+1)%N] + lattice[(a-1)%N,b] + lattice[a,(b-1)%N]
                cost = 2*s*dE

                if cost < 0:
                    s *= -1
                
                #TODO* elif np.random.rand() < dict_param[cost]:
                elif np.random.rand() < np.exp(-cost*beta):
                    s *= -1
                lattice[a, b] = s

    return lattice

@nb.njit(nogil=True)
def calcEnergy(lattice, N):
    '''
    Energy of a given configuration
    '''
    energy = 0 
    
    for i in range(len(lattice)):
        for j in range(len(lattice)):
            S = lattice[i,j]
            nb = lattice[(i+1)%N, j] + lattice[i,(j+1)%N] + lattice[(i-1)%N, j] + lattice[i,(j-1)%N]
            energy += -nb*S
    return energy/2

@nb.njit(nogil=True)
def calcMag(lattice):
    '''
    Magnetization of a given configuration
    '''
    mag = np.sum(lattice, dtype=np.int32)
    return mag

@nb.njit(nogil=True)
def ISING_model(nT, N, burnin, mcSteps):

    """ 
    nT      :         Number of temperature points.
    N       :         Size of the lattice, N x N.
    burnin  :         Number of MC sweeps for equilibration (Burn-in).
    mcSteps :         Number of MC sweeps for calculation.

    """

    T       = np.linspace(1.2, 3.8, nT); 
    E,M,C,X = np.zeros(nT), np.zeros(nT), np.zeros(nT), np.zeros(nT)
    n1, n2  = 1.0/(mcSteps*N*N), 1.0/(mcSteps*mcSteps*N*N) 

    for temperature in range(nT):
        lattice = initialstate(N)         # initialise

        E1 = M1 = E2 = M2 = 0
        iT = 1/T[temperature]
        iT2= iT*iT
        
        for _ in range(burnin):           # equilibrate
            mcmove(lattice, iT, N)        # Monte Carlo moves

        for _ in range(mcSteps):
            mcmove(lattice, iT, N)           
            Ene = calcEnergy(lattice, N)  # calculate the Energy
            Mag = calcMag(lattice,)       # calculate the Magnetisation

            E1 += Ene
            M1 += Mag
            M2 += Mag*Mag 
            E2 += Ene*Ene

        E[temperature] = n1*E1
        M[temperature] = n1*M1
        C[temperature] = (n1*E2 - n2*E1*E1)*iT2
        X[temperature] = (n1*M2 - n2*M1*M1)*iT

    return T,E,M,C,X

def main():
    
    N = 32
    start_time = timer()
    T,E,M,C,X = ISING_model(nT = 64, N = N, burnin = 8 * 10**4, mcSteps = 8 * 10**4)
    end_time = timer()

    print("Elapsed time: %g seconds" % (end_time - start_time))

    f = plt.figure(figsize=(18, 10)); #  

    # figure title
    f.suptitle(f"Ising Model: 2D Lattice\nSize: {N}x{N}", fontsize=20)

    _ =  f.add_subplot(2, 2, 1 )
    plt.plot(T, E, '-o', color='Blue') 
    plt.xlabel("Temperature (T)", fontsize=20)
    plt.ylabel("Energy ", fontsize=20)
    plt.axis('tight')

    _ =  f.add_subplot(2, 2, 2 )
    plt.plot(T, abs(M), '-o', color='Red')
    plt.xlabel("Temperature (T)", fontsize=20)
    plt.ylabel("Magnetization ", fontsize=20)
    plt.axis('tight')

    _ =  f.add_subplot(2, 2, 3 )
    plt.plot(T, C, '-o', color='Green')
    plt.xlabel("Temperature (T)", fontsize=20)
    plt.ylabel("Specific Heat ", fontsize=20)
    plt.axis('tight')

    _ =  f.add_subplot(2, 2, 4 )
    plt.plot(T, X, '-o', color='Black')
    plt.xlabel("Temperature (T)", fontsize=20)
    plt.ylabel("Susceptibility", fontsize=20)
    plt.axis('tight')

    plt.show()

if __name__ == '__main__':
    main()

当然，这是可行的：

我有两个主要问题：
1.我知道ISING模型很难模拟，但是看下表，我好像遗漏了什么...

lattice size : 32x32    
    burnin = 8 * 10**4
    mcSteps = 8 * 10**4
    Simulation time = 365.98 seconds

    lattice size : 64x64    
    burnin = 10**5
    mcSteps = 10**5
    Simulation time = 1869.58 seconds

1.我试着实现另一种优化，不使用字典一遍又一遍地计算指数，但在我的测试中，它似乎更慢。我做错了什么？

指数的计算并不是一个真正的问题。主要的问题是生成随机数是昂贵的并且生成了大量的随机值。另一个问题是当前的计算本质上是顺序的。
实际上，对于N=32，mcmove倾向于生成大约3000的随机值，并且该函数每次迭代被调用2 * 80_000次。这意味着，每次迭代生成2 * 80_000 * 3000 = 480_000_000随机数。假设生成一个随机数大约需要5纳秒（即在4 GHz CPU上只有20个周期），那么每次迭代只需要大约2. 5秒就能生成所有随机数。在我的4. 5 GHz i5- 9600 KF CPU上，每次迭代大约需要2. 5 - 3. 0秒。
首先要做的是尝试使用更快的方法来生成随机数。坏消息是，这在Numba以及更普遍的任何基于Python的代码中都很难做到。使用低级语言（如C或C++）进行微优化可以显著地帮助加快计算速度。这种低级微优化在高级语言/工具（如Python，包括Numba）中是不可能的。尽管如此，可以实现随机数生成器，（RNG）专门设计用于生成所需的随机值. xoshiro256**可用于快速生成随机数，尽管它可能不如Numpy/Numba生成的随机数那么随机（没有免费的午餐）。这个想法是生成64位整数并提取位的范围，以便产生2个16位整数和一个32位浮点值。这个RNG应该能够在一个现代的CPU上在大约10个周期内生成3个值！
一旦应用了这种优化，指数的计算就成了新的瓶颈。它可以像你做的那样使用查找表（LUT）来改进。但是，使用字典很慢。你可以使用基本数组来做。这要快得多。注意，索引需要是正数和小的。因此，需要添加最小的开销。
一旦实施了前面的优化，新的瓶颈就是条件if cost < 0和elif c < ...。这些条件很慢，因为它们是不可预测的（由于结果是随机的）。实际上，现代CPU试图预测条件语句的结果，以避免CPU流水线中代价高昂的停顿。这是一个复杂的主题。如果你想了解更多，那么请阅读this great post。在实践中，使用无分支计算可以避免这样的问题。这意味着您需要使用二元运算符和整数棒，以便s的符号根据条件的值进行更改。例如：s *= 1 - ((cost < 0) | (c < lut[cost])) * 2 .
请注意，modulus通常开销很大，除非编译器在编译时知道值。当值为2的幂时，它们甚至更快，因为编译器可以使用位技巧来计算模数对于calcEnergy，一种解决方案是单独计算边界以完全避免模数。此外，当编译器知道编译时的迭代次数时，循环可以更快（它可以展开循环并更好地对它们进行矢量化）。此外，当N不是2的幂时，RNG的实现可能会明显更慢、更复杂，因此我假设N是2的幂。
下面是最终代码：

# [...] Same as in the initial code

@nb.njit(inline="always")
def rol64(x, k):
    return (x << k) | (x >> (64 - k))

@nb.njit(inline="always")
def xoshiro256ss_init():
    state = np.empty(4, dtype=np.uint64)
    maxi = (np.uint64(1) << np.uint64(63)) - np.uint64(1)
    for i in range(4):
        state[i] = np.random.randint(0, maxi)
    return state

@nb.njit(inline="always")
def xoshiro256ss(state):
    result = rol64(state[1] * np.uint64(5), np.uint64(7)) * np.uint64(9)
    t = state[1] << np.uint64(17)
    state[2] ^= state[0]
    state[3] ^= state[1]
    state[1] ^= state[2]
    state[0] ^= state[3]
    state[2] ^= t
    state[3] = rol64(state[3], np.uint64(45))
    return result

@nb.njit(inline="always")
def xoshiro_gen_values(N, state):
    '''
    Produce 2 integers between 0 and N and a simple-precision floating-point number.
    N must be a power of two less than 65536. Otherwise results will be biased (ie. not random).
    N should be known at compile time so for this to be fast
    '''
    rand_bits = xoshiro256ss(state)
    a = (rand_bits >> np.uint64(32)) % N
    b = (rand_bits >> np.uint64(48)) % N
    c = np.uint32(rand_bits) * np.float32(2.3283064370807974e-10)
    return (a, b, c)

@nb.njit(nogil=True)
def mcmove_generic(lattice, beta, N):
    '''
    Monte Carlo move using Metropolis algorithm.
    N must be a small power of two and known at compile time
    '''

    state = xoshiro256ss_init()

    lut = np.full(16, np.nan)
    for cost in (0, 4, 8, 12, 16):
        lut[cost] = np.exp(-cost*beta)

    for _ in range(N):
        for __ in range(N):
            a, b, c = xoshiro_gen_values(N, state)
            s =  lattice[a, b]
            dE = lattice[(a+1)%N,b] + lattice[a,(b+1)%N] + lattice[(a-1)%N,b] + lattice[a,(b-1)%N]
            cost = 2*s*dE

            # Branchless computation of s
            tmp = (cost < 0) | (c < lut[cost])
            s *= 1 - tmp * 2

            lattice[a, b] = s

    return lattice

@nb.njit(nogil=True)
def mcmove(lattice, beta, N):
    assert N in [16, 32, 64, 128]
    if N == 16: return mcmove_generic(lattice, beta, 16)
    elif N == 32: return mcmove_generic(lattice, beta, 32)
    elif N == 64: return mcmove_generic(lattice, beta, 64)
    elif N == 128: return mcmove_generic(lattice, beta, 128)
    else: raise Exception('Not implemented')

@nb.njit(nogil=True)
def calcEnergy(lattice, N):
    '''
    Energy of a given configuration
    '''
    energy = 0 
    # Center
    for i in range(1, len(lattice)-1):
        for j in range(1, len(lattice)-1):
            S = lattice[i,j]
            nb = lattice[i+1, j] + lattice[i,j+1] + lattice[i-1, j] + lattice[i,j-1]
            energy -= nb*S
    # Border
    for i in (0, len(lattice)-1):
        for j in range(1, len(lattice)-1):
            S = lattice[i,j]
            nb = lattice[(i+1)%N, j] + lattice[i,(j+1)%N] + lattice[(i-1)%N, j] + lattice[i,(j-1)%N]
            energy -= nb*S
    for i in range(1, len(lattice)-1):
        for j in (0, len(lattice)-1):
            S = lattice[i,j]
            nb = lattice[(i+1)%N, j] + lattice[i,(j+1)%N] + lattice[(i-1)%N, j] + lattice[i,(j-1)%N]
            energy -= nb*S
    return energy/2

@nb.njit(nogil=True)
def calcMag(lattice):
    '''
    Magnetization of a given configuration
    '''
    mag = np.sum(lattice, dtype=np.int32)
    return mag

# [...] Same as in the initial code

我希望代码中没有错误。使用不同的RNG很难检查结果。
生成的代码在我的计算机上运行速度明显加快：它使用N=32在5.3秒内计算4次迭代，而不是24.1秒。因此计算速度快了4.5倍！
使用Python中的Numba很难进一步优化代码。由于mcmove中的长依赖链，计算无法高效并行化。

numpy 进一步优化ISING模型

1条答案

相关问题

热门标签

最新问答