C语言 为什么OpenMP会扰乱这个for循环?

yyyllmsg  于 2022-12-03  发布在  其他
关注(0)|答案(1)|浏览(133)

我尝试在我的音频代码中使用OpenMP。我使用Miniaudio作为音频后端,它是一个非常方便的单头文件跨平台库,可以在这里找到:https://github.com/mackron/miniaudio
下面是我的代码,我试图将它提炼成最小的工作形式:

/*** DEFINES */
    /* miniaudio configuration macros, defined ahead of the #include of miniaudio.h below.
       NOTE(review): MA_NO_DECODING / MA_NO_ENCODING presumably strip the decoder/encoder
       parts of the library, and MINIAUDIO_IMPLEMENTATION presumably makes the header emit
       its implementation in this translation unit -- confirm against the miniaudio docs. */
    #define MA_NO_DECODING
    #define MA_NO_ENCODING
    #define MINIAUDIO_IMPLEMENTATION

    /* Number of sine waves that are mixed together (size of the per-wave arrays). */
    #define WAVES_QTY 1000
    /* Frequency given to wave 0 (presumably Hz, since it is divided by the sample rate). */
    #define FREQ_INIT 50.f
    /* Frequency increment between consecutive waves. */
    #define FREQ_STEP 1.f

    /* Playback settings copied into the miniaudio device config in main(). */
    #define DEVICE_FORMAT ma_format_f32
    #define DEVICE_CHANNELS 1
    #define DEVICE_SAMPLE_RATE 8000
/* DEFINES end. */

/*** INCLUDES */
    /* single header library from https://github.com/mackron/miniaudio/blob/master/miniaudio.h */
    #include "miniaudio.h"

    #include <stdio.h>
    #include <stdint.h>
    #include <math.h>
    #include <omp.h>
/* INCLUDES end. */

/*** GLOBALS */
    /* Per-wave oscillator state, indexed 0..WAVES_QTY-1. Initialised in main() and
       then read/updated from data_callback(). */
    float phaseArray[WAVES_QTY];     /* current phase of each wave, wrapped with fmod(..., 1.f) */
    float amplitudeArray[WAVES_QTY]; /* gain of each wave; main() sets every entry to 1/WAVES_QTY */
    float freqArray[WAVES_QTY];      /* frequency of each wave: FREQ_INIT + index*FREQ_STEP */
/* GLOBALS end. */

/*** FUNCTION DECLARATIONS */
    /* Audio callback installed as deviceConfig.dataCallback in main(); it writes
       frameCount float samples to pOutput. pDevice and pInput are unused. */
    void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount);
/* FUNCTION DECLARATIONS end. */    

/*** MAIN */
    /*
     * Program entry point.
     *
     * Fills the per-wave tables, opens a miniaudio playback device whose data
     * callback is data_callback, starts it, blocks on getchar() and finally
     * releases the device.
     *
     * Returns 0 on success, -4 when the device cannot be initialised and -5
     * when it cannot be started.
     */
    int main(){
        ma_device playbackDevice;
        ma_device_config playbackConfig;
        uint32_t wave;

        fputs("== will now initialize arrays...", stdout);
        wave = 0;
        while(wave < WAVES_QTY){
            freqArray[wave] = FREQ_INIT + wave*FREQ_STEP;
            phaseArray[wave] = 0.f;
            amplitudeArray[wave] = 1.f;
            /* scale by the wave count so the summed mix stays within full volume */
            amplitudeArray[wave] /= ((float)WAVES_QTY);
            wave++;
        }
        fputs(" DONE!\n", stdout);

        /* describe the playback stream we want */
        playbackConfig = ma_device_config_init(ma_device_type_playback);
        playbackConfig.dataCallback      = data_callback;
        playbackConfig.sampleRate        = DEVICE_SAMPLE_RATE;
        playbackConfig.playback.channels = DEVICE_CHANNELS;
        playbackConfig.playback.format   = DEVICE_FORMAT;

        if(MA_SUCCESS != ma_device_init(NULL, &playbackConfig, &playbackDevice)){
            fputs("Failed to open playback device.\n", stdout);
            return -4;
        }

        printf("== Device Name: %s\n", playbackDevice.playback.name);

        /* starting the device is the point where sound actually begins */
        if(MA_SUCCESS != ma_device_start(&playbackDevice)){
            fputs("== Failed to start playback device.\n", stdout);
            ma_device_uninit(&playbackDevice);
            return -5;
        }

        fputs("~~~ You should hear sound now ~~~\n", stdout);
        fputs("== Press Enter to quit...", stdout);
        getchar();

        /* release the device before exiting */
        ma_device_uninit(&playbackDevice);

        return 0;
    }
/* MAIN end. */

/*** FUNCTION DEFINITIONS */
    /*
     * Audio callback (version from the question, kept as posted).
     *
     * For each of the frameCount output samples it advances the phase of all
     * WAVES_QTY oscillators and accumulates their sine values, scaled by
     * amplitudeArray, into the current output sample.
     *
     * pDevice and pInput are unused; pOutput is treated as an array of float.
     */
    void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount){
        float* Samples = pOutput;
        ma_uint32 SampleIndex;

        for(SampleIndex = 0; SampleIndex < frameCount; SampleIndex++){
            uint32_t ii;
            *Samples = 0.f;

            /**** HERE */
            /* NOTE(review): Samples is shared, so every thread performs an unsynchronised
               read-modify-write on the same float (*Samples += ...). That is a data race;
               it is the defect this question is about. */
            #pragma omp parallel for private(ii) shared(phaseArray, freqArray, amplitudeArray, Samples)
            for(ii=0;  ii<WAVES_QTY; ii++){
                /* advance the phase by freq/sampleRate and wrap it with fmod(..., 1.f) */
                phaseArray[ii] = fmod(phaseArray[ii] + (freqArray[ii] / (float)(DEVICE_SAMPLE_RATE)), 1.f);
                /* add this wave's contribution to the current output sample */
                *Samples += (float)sin((double)(phaseArray[ii] * (float)MA_TAU)) * amplitudeArray[ii];
            }

            Samples++;
        }
        /* silence unused-parameter warnings */
        (void)pDevice;
        (void)pInput;
    }
/* FUNCTION DEFINITIONS end. */

使用gcc -g0 thousandwaves.c -fopenmp -o thousandwaves.exe -Wall -Wextra -Wshadow -Werror=implicit-int -Werror=incompatible-pointer-types -Werror=int-conversion -Wvla -pedantic-errors -ansi在Win10、MinGW 32上编译
该程序非常简单,它可以生成1000个正弦波并将它们实时混合在一起。我希望使用OpenMP在所有可用线程上运行正弦波生成,因此我在data_callback函数中添加了#pragma omp parallel for(我还用注释/**** HERE */标记了确切的位置)。它编译和运行时没有错误/警告,但程序产生了非常嘈杂的声音(只需注释掉/删除#pragma,就能听到它本来应有的声音)。
我猜这与嵌套的for循环有关,但我不能确定问题所在。我的#pragma只针对内部循环,因此它应该只对该部分进行并行化。
让我知道你的想法,谢谢:))
编辑:非常感谢,reduction子句确实起到了作用,而且,在循环中使用atomic进一步加快了速度

/*
 * Audio callback (asker's edited version).
 *
 * Each output sample is accumulated in the local variable `sample`, which is
 * listed in a reduction(+) clause, and is stored to the output buffer after
 * the parallel loop.
 */
void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount){
        float* Samples = pOutput;
        ma_uint32 SampleIndex;

        for(SampleIndex = 0; SampleIndex < frameCount; SampleIndex++){
            uint32_t ii;
            float sample = 0.f;

            #pragma omp parallel for private(ii) shared(phaseArray, freqArray, amplitudeArray, Samples) reduction(+:sample)
            for(ii=0;  ii<WAVES_QTY; ii++){
                /* advance the phase by freq/sampleRate and wrap it with fmod(..., 1.f) */
                phaseArray[ii] = fmod(phaseArray[ii] + (freqArray[ii] / (float)(DEVICE_SAMPLE_RATE)), 1.f);
                /* NOTE(review): with reduction(+:sample) each thread should already be
                   working on its own private copy of `sample`, so this atomic looks
                   redundant -- confirm against the OpenMP specification. */
                #pragma omp atomic update
                sample += (float)sin((double)(phaseArray[ii] * (float)MA_TAU)) * amplitudeArray[ii];
                
            }

            /* publish the combined value for this frame */
            *Samples = sample;
            Samples++;
        }
        /* silence unused-parameter warnings */
        (void)pDevice;
        (void)pInput;
    }
fdbelqdn

fdbelqdn1#

所有线程都在同时写入*Samples,这是一个竞争条件。这个更新应该是原子操作,但这样做会使代码比串行版本更慢。

/* Make the accumulation into the shared output sample an atomic update. */
#pragma omp atomic
*Samples += (float)sin((double)(phaseArray[ii] * (float)MA_TAU)) * amplitudeArray[ii];

编辑:正如@Jérôme所指出的,使用reduction(归约)子句会更快

/*
 * Audio callback: synthesises frameCount output samples by mixing WAVES_QTY
 * sine oscillators.
 *
 * pDevice    - unused.
 * pOutput    - destination buffer, written as one float per frame
 *              (assumes a mono f32 stream, matching DEVICE_FORMAT /
 *              DEVICE_CHANNELS used when the device is configured).
 * pInput     - unused.
 * frameCount - number of frames to produce.
 *
 * The per-frame sum is accumulated in a local listed in an OpenMP
 * reduction(+) clause, so no thread writes to the shared output buffer
 * inside the parallel loop; the combined value is stored once afterwards.
 * Each iteration touches only its own phaseArray[ii] element.
 *
 * All declarations precede statements so the block also builds with the
 * -ansi -pedantic-errors flags used in the question.
 */
void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) {
    float* Samples = pOutput;
    ma_uint32 SampleIndex;

    /* silence unused-parameter warnings */
    (void)pDevice;
    (void)pInput;

    for (SampleIndex = 0; SampleIndex < frameCount; SampleIndex++) {
        int32_t ii;
        float sample = 0.f;

        #pragma omp parallel for private(ii) shared(phaseArray, freqArray, amplitudeArray) reduction(+:sample)
        for (ii = 0; ii < WAVES_QTY; ii++) {
            /* advance the phase by freq/sampleRate and wrap it with fmod(..., 1.f) */
            phaseArray[ii] = fmod(phaseArray[ii] + (freqArray[ii] / (float)(DEVICE_SAMPLE_RATE)), 1.f);
            sample += (float)sin((double)(phaseArray[ii] * (float)MA_TAU)) * amplitudeArray[ii];
        }

        /* single write of the reduced value; no prior zeroing of the slot is needed */
        Samples[SampleIndex] = sample;
    }
}

相关问题