1.加速方法
(1)將少量常量數據,如模式串長度、文本長度等,保存在線程的private memory中。
(2)將模式串保存在GPU的local memory中,加速線程對模式串的訪問。
(3)將待查找的文本保存在global memory中,使用盡可能多線程訪問global memory,減小線程平均訪存時間。
(4)每個work-group中的線程操作文本中一段,多個work-group並行處理大文本。
2.同步
(1)work-group內,使用CLK_LOCAL_MEM_FENCE、CLK_GLOBAL_MEM_FENCE
(2)全局使用對__global int 的原子操作,來保證每個線程將結果寫到全局內存的正確位置。設備支持的操作可以通過查詢設備的擴展獲得,如下圖,可知核函數支持原子操作、printf操作:
3.代碼實例,大文本精確模式串搜索
3.1 核函數(string_search_kernel.cl):
int compare(__global const uchar* text, __local const uchar* pattern, uint length){
for(uint l=0; l
if (text[l] != pattern[l])
return 0;
}
return 1;
}
__kernel void
StringSearch (
__global uchar* text, //Input Text
const uint textLength, //Length of the text
__global const uchar* pattern, //Pattern string
const uint patternLength, //Pattern length
const uint maxSearchLength, //Maximum search positions for each work-group
__global int* resultCount, //Result counts (global)
__global int* resultBuffer, //Save the match result
__local uchar* localPattern) //local buffer for the search pattern
{
int localIdx = get_local_id(0);
int localSize = get_local_size(0);
int groupIdx = get_group_id(0);
uint lastSearchIdx = textLength - patternLength + 1;
uint beginSearchIdx = groupIdx * maxSearchLength;
uint endSearchIdx = beginSearchIdx + maxSearchLength;
if(beginSearchIdx > lastSearchIdx)
return;
if(endSearchIdx > lastSearchIdx)
endSearchIdx = lastSearchIdx;
for(int idx = localIdx; idx < patternLength; idx+=localSize)
localPattern[idx] = pattern[idx];
barrier(CLK_LOCAL_MEM_FENCE);
for(uint stringPos=beginSearchIdx+localIdx; stringPos
if (compare(text+stringPos, localPattern, patternLength) == 1){
int count = atomic_inc(resultCount);
resultBuffer[count] = stringPos;
//printf("%d ",stringPos);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}