網絡上有很多有關YUV4:2:2轉YUV4:2:0的描述,但大多數只講解原理,沒有給出實際的做法。本文把自己在TI DaVinci DM6446上測試過的代碼奉獻出來,供大家參考和學習,同時拋磚引玉,希望得到大家的指點。本方法適合TI DM642、DM643x、DM644x等DSP系列。前端圖像采集格式一般都是YCbCr 4:2:2(YUV4:2:2),但很多視頻應用(比如JPEG、MPEG4、H.264等)都需要把YUV4:2:2轉換成YUV4:2:0的格式。在DM643x、DM644x上,TI采用EDMA3的方式實現轉換,那是另外的方法;這裡專門介紹通用的做法,在DM6441(513MHz)上處理640x480只需要7.5ms,而且還可以再優化,這個大家可以試試。
/***********************************************/
以PAL制為例,這裡的YCbCr 4:2:2(YUV4:2:2)像素排列方式是:
U0,0 Y0,0 V0,0 Y0,1 U0,1 Y0,2 V0,1 Y0,3 ...... U0,359 Y0,718 V0,359 Y0,719
..............................
U575,0 Y575,0 V575,0 Y575,1 ...... U575,359 Y575,718 V575,359 Y575,719
/*
 * Split a packed YCbCr 4:2:2 frame (YCbCr_buf) into separate Y, U and V
 * planes (Y_buf, U_buf, V_buf), keeping chroma from every other source line
 * so that the result is 4:2:0.
 *
 * Each 32-bit word read from the packed buffer carries two pixels. The shifts
 * below take U from bits 0-7, the first Y from bits 8-15, V from bits 16-23
 * and the second Y from bits 24-31.
 *
 * NOTE(review): x, y and m0..m3 are not declared in this excerpt; they are
 * presumably declared by the enclosing code - confirm.
 * NOTE(review): addresses are held in unsigned int, which assumes pointers
 * fit in 32 bits on the target - confirm before reusing elsewhere.
 */
unsigned int tmp,tmp0,tmp1,tmp2;
tmp = (unsigned int)YCbCr_buf;/* For D1, CIF, QCIF, VGA and QVGA the buffer is certainly 4-byte aligned, so holding the address in an unsigned int is fine; you may of course use a pointer instead. */
tmp0 = (unsigned int)Y_buf;

/* Pass 1: luma. Every source line is visited; a packed line is ORG_IMG_WIDTH*2 bytes. */
for(y=0;y<ORG_IMG_HEIGHT;y++)
{
    /* x counts 32-bit source words; four words (eight pixels) are handled per iteration. */
    for(x=0;x<(ORG_IMG_WIDTH>>1);x+=4)
    {
        m0 = *(unsigned int*)(tmp+y*(ORG_IMG_WIDTH<<1) + (x<<2));
        m1 = *(unsigned int*)(tmp+y*(ORG_IMG_WIDTH<<1) + ((x+1)<<2));
        m2 = *(unsigned int*)(tmp+y*(ORG_IMG_WIDTH<<1) + ((x+2)<<2));
        m3 = *(unsigned int*)(tmp+y*(ORG_IMG_WIDTH<<1) + ((x+3)<<2));
        /* Pack the two Y bytes of each word into one 16-bit store. */
        *(unsigned short*)(tmp0+y*ORG_IMG_WIDTH + (x<<1))=(unsigned short)(((m0>>16)&0xFF00)|((m0>>8)&0x00FF));
        *(unsigned short*)(tmp0+y*ORG_IMG_WIDTH + ((x+1)<<1))=(unsigned short)(((m1>>16)&0xFF00)|((m1>>8)&0x00FF));
        *(unsigned short*)(tmp0+y*ORG_IMG_WIDTH + ((x+2)<<1))=(unsigned short)(((m2>>16)&0xFF00)|((m2>>8)&0x00FF));
        *(unsigned short*)(tmp0+y*ORG_IMG_WIDTH + ((x+3)<<1))=(unsigned short)(((m3>>16)&0xFF00)|((m3>>8)&0x00FF));
    }
}

tmp1=(unsigned int)U_buf;
tmp2=(unsigned int)V_buf;

/*
 * Pass 2: chroma. The source row step is ORG_IMG_WIDTH*4 bytes, i.e. two
 * packed lines, so only every other source line contributes U and V.
 */
for(y=0;y<(ORG_IMG_HEIGHT>>1);y++)
{
    for(x=0;x<(ORG_IMG_WIDTH>>1);x+=4)
    {
        m0 = *(unsigned int*)(tmp+y*(ORG_IMG_WIDTH<<2) + (x<<2));
        m1 = *(unsigned int*)(tmp+y*(ORG_IMG_WIDTH<<2) + ((x+1)<<2));
        m2 = *(unsigned int*)(tmp+y*(ORG_IMG_WIDTH<<2) + ((x+2)<<2));
        m3 = *(unsigned int*)(tmp+y*(ORG_IMG_WIDTH<<2) + ((x+3)<<2));
        /* One U byte (bits 0-7) and one V byte (bits 16-23) per source word. */
        *(unsigned char*)(tmp1+y*(ORG_IMG_WIDTH>>1) + x)=(unsigned char)m0;
        *(unsigned char*)(tmp2+y*(ORG_IMG_WIDTH>>1) + x)=(unsigned char)(m0>>16);
        *(unsigned char*)(tmp1+y*(ORG_IMG_WIDTH>>1) + x + 1)=(unsigned char)m1;
        *(unsigned char*)(tmp2+y*(ORG_IMG_WIDTH>>1) + x + 1)=(unsigned char)(m1>>16);
        *(unsigned char*)(tmp1+y*(ORG_IMG_WIDTH>>1) + x + 2)=(unsigned char)m2;
        *(unsigned char*)(tmp2+y*(ORG_IMG_WIDTH>>1) + x + 2)=(unsigned char)(m2>>16);
        *(unsigned char*)(tmp1+y*(ORG_IMG_WIDTH>>1) + x + 3)=(unsigned char)m3;
        *(unsigned char*)(tmp2+y*(ORG_IMG_WIDTH>>1) + x + 3)=(unsigned char)(m3>>16);
    }
}
}
對上面的代碼點評:DSP的優化原則是,能移位,就不要乘除;能用int讀內存,就不要用char讀內存,因為C64、C64+的DSP讀內存指令需要4個時鐘周期;循環次數若是4的倍數,最好在每次迭代中展開成4次操作,以利於形成軟件流水線。當然,循環內部不能有if、break等語句。
另外,DM642或DM643有自己的效率更高的程序,這裡也奉獻給大家。
#include <csl.h>
#include <csl_dat.h>
#include <csl_cache.h>
/*
 * 720-byte staging buffer: the DAT_copy calls in this file first copy a line
 * from the source plane into this buffer and then copy it out to the
 * destination plane. It is placed in the ".img_buf" section and aligned as
 * requested by DATA_ALIGN(…, 128).
 */
#pragma DATA_SECTION(int_mem_temp, ".img_buf");/* the .img_buf section can be mapped to L2 RAM */
#pragma DATA_ALIGN(int_mem_temp, 128);
unsigned char int_mem_temp[720];
/*
 * Convert a planar YUV 4:2:2 frame to planar YUV 4:2:0 using DAT (DMA)
 * copies staged through int_mem_temp.
 *
 * frameIn[0..2]  - source Y, U and V planes.
 * frm_out[0..2]  - destination Y, U and V planes.
 * width          - currently ignored: the line length is fixed at 720 luma
 *                  bytes / 360 chroma bytes, which is also the capacity of
 *                  int_mem_temp.
 * height         - number of luma lines; values <= 0 copy nothing.
 *
 * Luma lines are copied one-to-one. For chroma, (height >> 1) lines are
 * written; source line offsets advance by 720 bytes while only 360 bytes are
 * copied, so every other 360-byte source chroma line is kept.
 *
 * NOTE(review): within an iteration only the last DAT_copy id is waited on;
 * this relies on the DAT transfers completing in submission order - confirm
 * against the CSL version in use.
 */
void yuv422to420( char *frameIn[], char *frm_out[],
int width, int height)
{
    enum { LUMA_LINE_BYTES = 720, CHROMA_LINE_BYTES = 360 };

    char *pSrcY = frameIn[0];
    char *pSrcU = frameIn[1];
    char *pSrcV = frameIn[2];
    char *pDestY = frm_out[0];
    char *pDestU = frm_out[1];
    char *pDestV = frm_out[2];
    unsigned int id;
    int i; /* signed, so a negative height cannot turn into a huge unsigned loop bound */
    int chroma_lines;

    (void)width; /* kept in the signature for callers; see header comment */

    if (height <= 0)
    {
        return;
    }
    chroma_lines = height >> 1;

    /* Y plane: every line, source -> staging buffer -> destination. */
    for (i = 0; i < height; i++)
    {
        id = DAT_copy(pSrcY + (i * LUMA_LINE_BYTES), int_mem_temp, LUMA_LINE_BYTES);
        id = DAT_copy(int_mem_temp, pDestY + (i * LUMA_LINE_BYTES), LUMA_LINE_BYTES);
        DAT_wait(id);
    }

    /* U plane: source offset steps by two chroma lines per output line. */
    for (i = 0; i < chroma_lines; i++)
    {
        id = DAT_copy(pSrcU + (i * (2 * CHROMA_LINE_BYTES)), int_mem_temp, CHROMA_LINE_BYTES);
        id = DAT_copy(int_mem_temp, pDestU + (i * CHROMA_LINE_BYTES), CHROMA_LINE_BYTES);
        DAT_wait(id);
    }

    /* V plane: same pattern as U. */
    for (i = 0; i < chroma_lines; i++)
    {
        id = DAT_copy(pSrcV + (i * (2 * CHROMA_LINE_BYTES)), int_mem_temp, CHROMA_LINE_BYTES);
        id = DAT_copy(int_mem_temp, pDestV + (i * CHROMA_LINE_BYTES), CHROMA_LINE_BYTES);
        DAT_wait(id);
    }
    return ;
}
/*
 * Convert a planar YUV 4:2:0 frame to planar YUV 4:2:2 using DAT (DMA)
 * copies staged through int_mem_temp.
 *
 * frameIn[0..2]  - source Y, U and V planes.
 * frm_out[0..2]  - destination Y, U and V planes.
 * width          - currently ignored: the line length is fixed at 720 luma
 *                  bytes / 360 chroma bytes, which is also the capacity of
 *                  int_mem_temp.
 * height         - number of luma lines; values <= 0 copy nothing.
 *
 * Luma lines are copied one-to-one. Each of the (height >> 1) source chroma
 * lines is written twice, to destination chroma lines 2*i and 2*i + 1.
 *
 * NOTE(review): within an iteration only the last DAT_copy id is waited on;
 * this relies on the DAT transfers completing in submission order - confirm
 * against the CSL version in use.
 */
void yuv420to422( char *frameIn[], char *frm_out[],
int width, int height)
{
    enum { LUMA_LINE_BYTES = 720, CHROMA_LINE_BYTES = 360 };

    char *pSrcY = frameIn[0];
    char *pSrcU = frameIn[1];
    char *pSrcV = frameIn[2];
    char *pDestY = frm_out[0];
    char *pDestU = frm_out[1];
    char *pDestV = frm_out[2];
    unsigned int id;
    int i; /* signed, so a negative height cannot turn into a huge unsigned loop bound */
    int chroma_lines;

    (void)width; /* kept in the signature for callers; see header comment */

    if (height <= 0)
    {
        return;
    }
    chroma_lines = height >> 1;

    /* Y plane: every line, source -> staging buffer -> destination. */
    for (i = 0; i < height; i++)
    {
        id = DAT_copy(pSrcY + (i * LUMA_LINE_BYTES), int_mem_temp, LUMA_LINE_BYTES);
        id = DAT_copy(int_mem_temp, pDestY + (i * LUMA_LINE_BYTES), LUMA_LINE_BYTES);
        DAT_wait(id);
    }

    /* U plane: duplicate each source line into two consecutive output lines. */
    for (i = 0; i < chroma_lines; i++)
    {
        id = DAT_copy(pSrcU + (i * CHROMA_LINE_BYTES), int_mem_temp, CHROMA_LINE_BYTES);
        id = DAT_copy(int_mem_temp, pDestU + ((2 * i) * CHROMA_LINE_BYTES), CHROMA_LINE_BYTES);
        id = DAT_copy(int_mem_temp, pDestU + ((2 * i + 1) * CHROMA_LINE_BYTES), CHROMA_LINE_BYTES);
        DAT_wait(id);
    }

    /* V plane: same pattern as U. */
    for (i = 0; i < chroma_lines; i++)
    {
        id = DAT_copy(pSrcV + (i * CHROMA_LINE_BYTES), int_mem_temp, CHROMA_LINE_BYTES);
        id = DAT_copy(int_mem_temp, pDestV + ((2 * i) * CHROMA_LINE_BYTES), CHROMA_LINE_BYTES);
        id = DAT_copy(int_mem_temp, pDestV + ((2 * i + 1) * CHROMA_LINE_BYTES), CHROMA_LINE_BYTES);
        DAT_wait(id);
    }
    return ;
}