C语言 寻找 neon 代码的性能改进,以匹配屏幕上的剪切区域

ifmq2ha2  于 2023-01-04  发布在  其他
关注(0)|答案(2)|浏览(170)

这是我的测试代码,找到屏幕上的第一个剪切区域。两个子例程和伪循环的代码,以比较他们的性能。
point_in_ neon (NEON版本)和point_in(常规版本)执行相同的操作:在给定列表中找到第一个裁剪区域(包含给定点),如果没有匹配区域,则返回-1。
我期望 neon 版本比普通版本快。不幸的是,它比普通版本慢。有没有别的方法可以加快它?
编译器命令为:

${CC} -O2 -ftree-vectorize -o vcomp vcomp.c

谢谢你,

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>
#include <math.h>
#include <sys/time.h>
#include <arm_neon.h>

#define WIDTH   (4096)
#define HEIGHT  (4096)
#define CLIPS   (32)

static inline uint64_t now(void) {
    struct timeval tv;

    gettimeofday(&tv,NULL);
    return tv.tv_sec*1000000+tv.tv_usec;
}

typedef struct _rect_t {
    int32_t x;
    int32_t y;
    uint32_t width;
    uint32_t height;
} rect_t;

typedef struct _point_t {
    int32_t x;
    int32_t y;
} point_t;

int32_t inline point_in_neon(const point_t *pt, const rect_t rs[4]) {
    const int32_t right[4]={
        rs[0].x+rs[0].width-1,
        rs[1].x+rs[1].width-1,
        rs[2].x+rs[2].width-1,
        rs[3].x+rs[3].width-1
    }, bottom[4]={
        rs[0].y+rs[0].height-1,
        rs[1].y+rs[1].height-1,
        rs[2].y+rs[2].height-1,
        rs[3].y+rs[3].height-1
    };
    int32x4_t p, r;
    uint32x4_t t;
    uint32_t res[4];

    //p = <Xp, Xp, Xp, Xp>
    p=vld1q_dup_s32(&pt->x);

    //r = <Left0, Left1, Left2, Left3>
    r=vld1q_lane_s32(&rs[0].x, r, 0);
    r=vld1q_lane_s32(&rs[1].x, r, 1);
    r=vld1q_lane_s32(&rs[2].x, r, 2);
    r=vld1q_lane_s32(&rs[3].x, r, 3);
    //t = (p >= r)
    t=vcgeq_s32(p, r);

    //r = <Right0, Right1, Right2, Right3>
    r=vld1q_s32(&right);
    //t = t & (r >= p)
    t=vandq_u32(t, vcgeq_s32(r, p));
    
    //p = <Yp, Yp, Yp, Yp>
    p=vld1q_dup_s32(&pt->y);
    
    //r = <Top0, Top1, Top2, Top3>
    r=vld1q_lane_s32(&rs[0].y, r, 0);
    r=vld1q_lane_s32(&rs[1].y, r, 1);
    r=vld1q_lane_s32(&rs[2].y, r, 2);
    r=vld1q_lane_s32(&rs[3].y, r, 3);
    //t = t & (p >= r)
    t=vandq_u32(t, vcgeq_s32(p, r));
   
    //r = <Bottom0, Bottom1, Bottom2, Bottom3>
    r=vld1q_s32(&bottom);
    //t = t & (r >= p)
    t=vandq_u32(t, vcgeq_s32(r, p));

    vst1q_u32(res, t);
    
    if(res[0])
        return 0;
    else if(res[1])
        return 1;
    else if(res[2])
        return 2;
    else if(res[3])
        return 3;

    return -1;
}

int32_t inline point_in(const point_t *pt, const rect_t *rs, uint32_t len) {
    int32_t i;
    
    for(i=0;i<len;i++) {
        int32_t right=rs[i].x+rs[i].width-1,
                bottom=rs[i].y+rs[i].height-1;
                
        if(pt->x>=rs[i].x && pt->x<=right &&
           pt->y>=rs[i].y && pt->y<=bottom)
            return i;
    }

    return -1;
}

int32_t main(int32_t argc, char *argv[]) {
    rect_t rs[CLIPS];
    
    int32_t i, j;
    uint64_t ts0, ts1;
    int32_t res[2][CLIPS];
    
    srand((unsigned int)time(NULL));
    for(i=0;i<CLIPS;i++) {
        rs[i].x=rand()%WIDTH;
        rs[i].y=rand()%HEIGHT;
        rs[i].width=rand()%WIDTH;
        rs[i].height=rand()%HEIGHT;
    }
    memset(res, 0, sizeof(res));

    ts0=now();
    for(i=0;i<HEIGHT;i++) {
        for(j=0;j<WIDTH;j++) {
            point_t p={i, j};
            int32_t idx=point_in(&p, rs, CLIPS);
            
            if(idx>=0)
                res[0][idx]=1;
        }
    }
    ts0=now()-ts0;

    ts1=now();
    for(i=0;i<HEIGHT;i++) {
        for(j=0;j<WIDTH;j++) {
            int32_t k, idx;
            point_t p={i, j};
            
            for(k=0, idx=-1;k<CLIPS/4;k++) {
                idx=point_in_neon(&p, &rs[k*4]);
                if(idx>=0)
                    break;
            }
            
            if(idx>=0)
                res[1][k*4+idx]=1;
        }
    }
    ts1=now()-ts1;

    /*
    for(i=0;i<CLIPS;i++) {
        if(res[0][i]!=res[1][i]) {
            printf("error.\n");
            return 1;
        }
    }
    */

    printf("regular = %lu\n", ts0);
    printf("neon    = %lu\n", ts1);
   
    return 0;
}
a1o7rhls

a1o7rhls1#

根据Peter Cordes的建议,我用vld4q_s32内部函数替换了point_in_ neon 子例程的数据洛丁部分,随后的右和底部计算可以矢量化,现在的代码比常规版本更短更快。

int32_t inline point_in_neon(const point_t *pt, const rect_t rs[4]) {
    int32x4x4_t r;
    int32x4_t right, bottom, p;
    uint32x4_t t;
    uint32_t res[4];

    /*
      r.val[0] = <X0, X1, X2, X3>
      r.val[1] = <Y0, Y1, Y2, Y3>
      r.val[2] = <Width0, Width1, Width2, Width3>
      r.val[3] = <Height0, Height1, Height2, Height3>
    */
    r=vld4q_s32(rs);
    //right = <Right0, Right1, Right2, Right3>
    right=vsubq_s32(vaddq_s32(r.val[0], r.val[2]), vdupq_n_s32(1));
    //bottom = <Bottom0, Bottom1, Bottom2, Bottom3>
    bottom=vsubq_s32(vaddq_s32(r.val[1], r.val[3]), vdupq_n_s32(1));

    //p = <Xp, Xp, Xp, Xp>
    p=vld1q_dup_s32(&pt->x);
    //t = (p >= left)
    t=vcgeq_s32(p, r.val[0]);
    //t = t & (right >= p)
    t=vandq_u32(t, vcgeq_s32(right, p));
    //p = <Yp, Yp, Yp, Yp>
    p=vld1q_dup_s32(&pt->y);
    //t = t & (p >= top)
    t=vandq_u32(t, vcgeq_s32(p, r.val[1]));
    //t = t & (r >= bottom)
    t=vandq_u32(t, vcgeq_s32(bottom, p));
    
    vst1q_u32(res, t);
    if(res[0])
        return 0;
    else if(res[1])
        return 1;
    else if(res[2])
        return 2;
    else if(res[3])
        return 3;

    return -1;
}
cu6pst1q

cu6pst1q2#

从原始的point_in方法开始,我们可以通过删除-1并将〈=改为〈来稍微清理一下。

int32_t inline point_in(const point_t *pt, const rect_t *rs, uint32_t len) {
    int32_t i;
    
    for(i=0; i < len; i++) 
    {
        // this is pointless - change your data structures so that
        // the rect stores minx/maxx, miny/maxy instead!
        int32_t right = rs[i].x + rs[i].width;
        int32_t bottom= rs[i].y + rs[i].height;

        bool cmp0 = pt->x >= rs[i].x;
        bool cmp1 = pt->y >= rs[i].y;
        bool cmp2 = pt->x < right;
        bool cmp3 = pt->y < bottom;
        if(cmp0 & cmp1 & cmp2 & cmp3)
            return i;
    }
    return -1;
}

接下来要指出的是:

// your screen size... 
#define WIDTH   (4096)
#define HEIGHT  (4096)

// yet your structures use uint32 as storage???
typedef struct _rect_t {
    int32_t x;
    int32_t y;
    uint32_t width;
    uint32_t height;
} rect_t;

typedef struct _point_t {
    int32_t x;
    int32_t y;
} point_t;

如果您可以使用16位整数,速度将提高一倍 (因为您可以在SIMD寄存器中装入8 x 16位数,而不是4 x 32位数)。既然如此,我们不妨同时将数据布局更改为数组结构。
我还将把无意义的p.x +宽度去掉,并将其存储为xmax/ymax (删除循环中的重复计算)

typedef struct rect_x8_t {
    int16x8_t x;
    int16x8_t y;
    int16x8_t xmax; //< x + width
    int16x8_t ymax; //< y + height
} rect_x8_t;

typedef struct point_x8_t {
    int16x8_t x;
    int16x8_t y;
} point_x8_t;

假设您没有可被8整除的剪辑数量,我们需要稍微填充该数量(不是什么大问题)

// assuming this has already been initialised
rect_t rs[CLIPS];

// how many batches of 8 do we need?
uint32_t CLIPS8 = (CLIPS / 8) + (CLIPS & 7 ? 1 : 0);

// allocate in batches of 8
rect_x8_t rs8[CLIPS8] = {};

// I'm going to do this rubbishly as an pre-process step. 
// I don't care too much about efficiency here... 
for(uint32_t i = 0; i < CLIPS; ++i) {
    rs8[i / 8].x[i & 7] = rs[i].x;
    rs8[i / 8].y[i & 7] = rs[I].y;
    rs8[i / 8].xmax[i & 7] = rs[i].x + rs[i].width;
    rs8[i / 8].ymax[i & 7] = rs[i].y + rs[i].height;
}

我有几点担心:

for(i=0;i<HEIGHT;i++) {
        for(j=0;j<WIDTH;j++) {
    
            // This seems wrong? Shouldn't it be p = {j, i} ?
            point_t p={i, j};
            int32_t idx=point_in(&p, rs, CLIPS);

            // I'm not quite sure what the result says about your 
            // image data and clip regions??? 
            //
            // This seems like a really silly way of asking 
            // a simple question about the clip regions. The pixels
            // don't have any effect here. 
            if(idx >= 0)
                res[0][idx] = 1;
        }
    }

无论如何,现在重构point_in方法以使用int16x8_t,我们得到:

inline int32_t point_in_x8(const point_x8_t pt, 
                           const rect_x8_t* rs,
                           uint32_t len) {
    for(int32_t i = 0; i < len; i++) {
        // perform comparisons on 8 rects at a time
        uint16x8_t cmp0 = vcgeq_s16(pt.x, rs[i].x);
        uint16x8_t cmp1 = vcgeq_s16(pt.y, rs[i].y);
        uint16x8_t cmp2 = vcltq_s16(pt.x, rs[i].xmax);
        uint16x8_t cmp3 = vcltq_s16(pt.y, rs[I].ymax);

        // combine to single comparison value
        uint16x8_t cmp01 = vandq_u16(cmp0, cmp1);
        uint16x8_t cmp23 = vandq_u16(cmp2, cmp3);
        uint16x8_t cmp0123 = vandq_u16(cmp01, cmp23);

        // use a horizontal max to see if any lanes are true
        if(vmaxvq_u16(cmp0123)) {
            for(int32_t j = 0; j < 8; ++j) {
                if(cmp0123[j])
                    return 8*i + j;
            }
        }
    }
    return -1;
}

rect_x8_t结构体中的任何其他填充元素最终都应该被忽略(因为它们应该是0/0、0/0,最终总是false)。
最后...

for(i = 0; i < HEIGHT; i++) {
        point_x8_t p;
        // splat the y value
        p.y = vld1q_dup_s16(i);
        for(j = 0; j < WIDTH; j++) {
            // splat the x value
            p.x = vld1q_dup_s16(j);
    
            int32_t idx = point_in_x8(p, rs8, CLIPS8);

            if(idx >= 0)
                res[1][idx] = 1;
        }
    }

vld4指令实际上有一个相当高的延迟。考虑到WIDTH * HEIGHT实际上是一个非常大的数字,这里的预混合(作为预处理步骤)更有意义。

然而

整个算法可以通过简单地忽略像素,直接处理CLIP区域来得到很大的改进。

如果剪辑区域完全包含在前面的剪辑区域中,则该剪辑区域为false

for(i = 0; i < CLIPS; i++) {

   // if region is empty, ignore. 
   if(rs[i].width == 0 || rs[i].height == 0) {
       res[0][i] = 0;
       continue;
   }

   // first region will always be true (unless it's of zero size)
   if(i == 0) {
      res[0][1] = 1;
      continue;
   }

   uint32_t how_many_intersect = 0;
   bool entirely_contained = false;
   uint32_t intersection_indices[CLIPS] = {};

   // do a lazy test first.
   for(j = i - 1; j >= 0; --j) {
       // if the last region is entirely contained by preceding
       // ones, it will be false. exit loop. 
       if(region_is_entirely_contained(rs[i], rs[j])) {
          res[0][i] = 0;
          entirely_contained = true;
          j = -1; ///< break out of loop
       }
       else
       // do the regions intersect?
       if(region_intersects(rs[i], rs[j])) {
          intersection_indices[how_many_intersect] = j;
          ++how_many_intersect;
       }
   }

   // if one region entirely contains this clip region, skip it.
   if(entirely_contained) {
      continue; 
   }

   // if you only intersect one or no regions, the result is true.
   if(how_many_intersect <= 1) {
       res[0][i] = 1;
       continue; 
   }

   // If you get here, the result is *probably* true, however 
   // you will need to split this clip region against the previous
   // ones to be fully sure. If all regions are fully contained, 
   // the answer is false. 

   // I won't implement it, but something like this:

 * split rs[i] against each rs[intersection_indices[]].
 * Throw away the rectangles that are entirely contained. 
 * Each bit that remains should be tested against each rs[intersection_indices[]] 
 * If you find any split rectangle that isn't contained, 
   set to true and move on. 

 

}

相关问题