程式師世界 >> 編程語言 >> C語言 >> C++ >> C++入門知識 >> opencl初探-sobel檢測，opencl初探-sobel

opencl初探-sobel檢測，opencl初探-sobel

編輯：C++入門知識

opencl初探-sobel檢測，opencl初探-sobel

sobel檢測的C版本、neon版本和GPU（OpenCL）版本的執行時間比較。

Platform: LG G3, Adreno 330, img size 3264x2448

sobel:

C code

neon

GPU

42+3.7+6.6

單位：ms。GPU時間 = memory time + Queued time + Run time

Sobel org

Sobel vector

Sobel vector + mem_fence

Queued time

4.6

7.2

2.8

Wait time

0.07

0.09

0.07

Run time

66.9

7.3

6.6

typedef unsigned char BYTE;

/* Use the NEON path only when the compiler targets a NEON-capable ARM CPU. */
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#define SOBEL_HAVE_NEON 1
#else
#define SOBEL_HAVE_NEON 0
#endif

/*
 * Scale a raw Sobel sum by 1/8 (rounding toward negative infinity) and keep
 * the low 8 bits, i.e. the two's-complement byte of the signed result.
 *
 * Written without right-shifting a negative value so the result does not
 * depend on implementation-defined shift behaviour; it yields the same byte
 * as an arithmetic `v >> 3`.
 */
static inline BYTE sobel_scale(int v)
{
    int q = (v >= 0) ? (v >> 3) : -((-v + 7) >> 3);
    return (BYTE)q; /* conversion to unsigned wraps modulo 256 */
}

/*
 * Scalar Sobel kernel for columns [x, x_end) of a single output row.
 *
 * top/mid/bot point at the starts of the three consecutive source rows;
 * dx/dy point at the starts of the matching output rows. The caller must
 * guarantee that x >= 1 and that x_end is at most (row width - 1), because
 * columns x-1 and x+1 are read.
 */
static void sobel_row_scalar(const BYTE *top, const BYTE *mid, const BYTE *bot,
                             BYTE *dx, BYTE *dy, int x, int x_end)
{
    for (; x < x_end; x++) {
        /* Horizontal derivative: right column minus left column, weights 1-2-1. */
        int gx = (top[x + 1] - top[x - 1])
               + 2 * (mid[x + 1] - mid[x - 1]) /* multiply: shifting a negative is UB */
               + (bot[x + 1] - bot[x - 1]);
        /* Vertical derivative: top row minus bottom row, weights 1-2-1. */
        int gy = (top[x - 1] + 2 * top[x] + top[x + 1])
               - (bot[x - 1] + 2 * bot[x] + bot[x + 1]);
        dx[x] = sobel_scale(gx);
        dy[x] = sobel_scale(gy);
    }
}

/**
 * 3x3 Sobel gradients of an 8-bit image, plain C version.
 *
 * @param src  source image, w*h bytes, row stride w
 * @param w    image width in pixels
 * @param h    image height in pixels
 * @param Ix   output, w*h bytes: horizontal gradient / 8 as a signed byte
 * @param Iy   output, w*h bytes: vertical gradient / 8 as a signed byte
 *
 * Only interior pixels (rows 1..h-2, columns 1..w-2) are written; the border
 * bytes of Ix and Iy are left untouched. Images smaller than 3x3 have no
 * interior, so nothing is read or written for them.
 */
void sobel(BYTE *src, int w, int h, BYTE *Ix, BYTE *Iy)
{
    if (w < 3 || h < 3) {
        return; /* no interior pixel; also avoids running the row loop with a negative count */
    }

    const BYTE *top = src;
    BYTE *dx = Ix + w; /* first output row is row 1 */
    BYTE *dy = Iy + w;

    for (int row = 0; row < h - 2; row++) {
        sobel_row_scalar(top, top + w, top + 2 * w, dx, dy, 1, w - 1);
        top += w;
        dx += w;
        dy += w;
    }
}

/**
 * 3x3 Sobel gradients of an 8-bit image, NEON version.
 *
 * Same contract and same output bytes as sobel(). Eight pixels are processed
 * per iteration with NEON intrinsics and the remaining columns of each row
 * are finished with the scalar kernel. When the build target has no NEON
 * support, this simply forwards to sobel().
 */
void sobel_neon(BYTE *src, int w, int h, BYTE *Ix, BYTE *Iy)
{
#if SOBEL_HAVE_NEON
    if (w < 3 || h < 3) {
        return;
    }

    const BYTE *top = src;
    BYTE *dx = Ix + w;
    BYTE *dy = Iy + w;

    for (int row = 0; row < h - 2; row++) {
        const BYTE *mid = top + w;
        const BYTE *bot = top + 2 * w;
        int x = 1;

        /* The widest load touches column x+8, so require x+8 <= w-1. */
        for (; x + 8 <= w - 1; x += 8) {
            uint8x8_t left   = vld1_u8(top + x - 1);
            uint8x8_t centre = vld1_u8(top + x);
            uint8x8_t right  = vld1_u8(top + x + 1);

            /* Widening u8 subtract wraps in u16; reinterpreting as s16 gives the signed difference. */
            int16x8_t gx = vreinterpretq_s16_u16(vsubl_u8(right, left));
            int16x8_t gy = vaddq_s16(vreinterpretq_s16_u16(vaddl_u8(left, right)),
                                     vreinterpretq_s16_u16(vshll_n_u8(centre, 1)));

            left  = vld1_u8(mid + x - 1);
            right = vld1_u8(mid + x + 1);
            int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(right, left));
            gx = vaddq_s16(gx, vshlq_n_s16(diff, 1));

            left   = vld1_u8(bot + x - 1);
            centre = vld1_u8(bot + x);
            right  = vld1_u8(bot + x + 1);
            gx = vaddq_s16(gx, vreinterpretq_s16_u16(vsubl_u8(right, left)));
            int16x8_t bottom_sum = vaddq_s16(vreinterpretq_s16_u16(vaddl_u8(left, right)),
                                             vreinterpretq_s16_u16(vshll_n_u8(centre, 1)));
            gy = vsubq_s16(gy, bottom_sum);

            /* Narrowing shift right by 3: divide by 8 and keep the low byte. */
            vst1_s8((int8_t *)dx + x, vshrn_n_s16(gx, 3));
            vst1_s8((int8_t *)dy + x, vshrn_n_s16(gy, 3));
        }

        /* Remaining (< 8) interior columns. */
        sobel_row_scalar(top, mid, bot, dx, dy, x, w - 1);

        top += w;
        dx += w;
        dy += w;
    }
#else
    sobel(src, w, h, Ix, Iy);
#endif
}

/* View Code */