字符串查找是信息安全、信息過濾領域的重要操作,在大文本的實時處理中尤其關鍵。本文以此為例,使用GPU上的OpenCL實現精確模式串查找。
1.加速方法
(1)將少量常量數據,如模式串長度、文本長度等,保存在線程的private memory中。
(2)將模式串保存在GPU的local memory中,加速線程對模式串的訪問。
(3)將待查找的文本保存在global memory中,使用盡可能多線程訪問global memory,減小線程平均訪存時間。
(4)每個work-group中的線程操作文本中一段,多個work-group並行處理大文本。
2.同步
(1)work-group內,使用barrier(CLK_LOCAL_MEM_FENCE)或barrier(CLK_GLOBAL_MEM_FENCE)同步組內線程;注意barrier必須被組內所有線程執行到,不能放在各線程執行次數不同的循環或分支中。
(2)全局使用對__global int 的原子操作,來保證每個線程將結果寫到全局內存的正確位置。設備支持的操作可以通過查詢設備的擴展獲得,如下圖,可知核函數支持原子操作、printf操作:
3.代碼實例,大文本精確模式串搜索
3.1 核函數(string_search_kernel.cl):
/*
 * Compare `length` bytes of the text against the pattern.
 *
 * text    - pointer (global memory) to the candidate position in the text
 * pattern - pattern bytes cached in local memory
 * length  - number of bytes to compare
 *
 * Returns 1 when every byte matches, 0 at the first mismatch.
 */
int compare(__global const uchar* text, __local const uchar* pattern, uint length)
{
    for (uint l = 0; l < length; ++l) {
        if (text[l] != pattern[l])
            return 0;
    }
    return 1;
}

/*
 * Exact pattern search.
 *
 * Each work-group scans a window of `maxSearchLength` start positions; the
 * work-items of the group walk that window with a stride of the local size.
 * Every match position is appended to `resultBuffer`; the slot is reserved
 * with an atomic increment of `resultCount`, so the order of the stored
 * positions is unspecified.
 */
__kernel void StringSearch(
    __global uchar* text,               // Input text
    const uint textLength,              // Length of the text
    __global const uchar* pattern,      // Pattern string
    const uint patternLength,           // Pattern length
    const uint maxSearchLength,         // Maximum search positions for each work-group
    __global int* resultCount,          // Number of matches found so far (global counter)
    __global int* resultBuffer,         // Receives the match positions
    __local uchar* localPattern)        // Local buffer for the search pattern
{
    const uint localIdx  = get_local_id(0);
    const uint localSize = get_local_size(0);
    const uint groupIdx  = get_group_id(0);

    // The subtraction below is unsigned and would wrap around if the pattern
    // were longer than the text. An empty pattern would "match" at
    // textLength + 1 positions, which is more than the caller's result buffer
    // is sized for. Both conditions depend only on kernel arguments, so every
    // work-item takes the same branch and returning before the barrier is safe.
    if (patternLength == 0 || patternLength > textLength)
        return;

    // One past the last valid start position in the text.
    const uint lastSearchIdx  = textLength - patternLength + 1;
    const uint beginSearchIdx = groupIdx * maxSearchLength;
    uint endSearchIdx         = beginSearchIdx + maxSearchLength;

    // Depends only on the group id, so the whole work-group leaves together.
    if (beginSearchIdx >= lastSearchIdx)
        return;
    if (endSearchIdx > lastSearchIdx)
        endSearchIdx = lastSearchIdx;

    // Cooperatively copy the pattern into local memory.
    for (uint idx = localIdx; idx < patternLength; idx += localSize)
        localPattern[idx] = pattern[idx];

    // Make the copied pattern visible to every work-item of the group.
    barrier(CLK_LOCAL_MEM_FENCE);

    // No barrier inside this loop: the number of iterations differs between
    // work-items whenever the window size is not a multiple of the local
    // size, and a barrier that is not reached by all work-items of the group
    // is undefined behaviour. Nothing in the loop needs group-wide
    // synchronisation anyway.
    for (uint stringPos = beginSearchIdx + localIdx;
         stringPos < endSearchIdx;
         stringPos += localSize) {
        if (compare(text + stringPos, localPattern, patternLength) == 1) {
            // atomic_inc returns the previous value: a unique output slot.
            int count = atomic_inc(resultCount);
            resultBuffer[count] = stringPos;
        }
    }
}
3.2 tool.h、tool.cpp
見:http://www.cnblogs.com/xudong-bupt/p/3582780.html
3.3 StringSearch.cpp
#include <CL/cl.h>
#include "tool.h"
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <string>
#include <fstream>
using namespace std;

/** Report a failed OpenCL call on stderr. Returns true when status is CL_SUCCESS. */
static bool clOk(cl_int status, const char *what)
{
    if (status == CL_SUCCESS)
        return true;
    cerr << what << " failed with OpenCL error " << status << endl;
    return false;
}

/**
 * Host program: searches StringSearch_Input.txt for the pattern "info" with
 * the StringSearch kernel and prints the number of matches.
 *
 * Returns 0 on success and 1 when any setup or OpenCL step fails.
 */
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    const size_t  localSize       = 256;        // work-items per work-group
    const cl_uint maxSearchLength = 256 * 64;   // start positions scanned by one work-group
    const char   *pattern         = "info";

    cl_int status   = CL_SUCCESS;
    cl_int count    = 0;
    int    exitCode = 1;

    // All handles start out NULL so the cleanup section can release exactly
    // what was created, whichever step failed.
    cl_device_id    *devices           = NULL;
    cl_context       context           = NULL;
    cl_command_queue commandQueue      = NULL;
    cl_program       program           = NULL;
    cl_kernel        kernel            = NULL;
    cl_mem           textBuffer        = NULL;
    cl_mem           patternBuffer     = NULL;
    cl_mem           resultCountBuffer = NULL;
    cl_mem           resultBuffer      = NULL;

    do {
        /**Step 1: Getting platforms and choose an available one(first).*/
        cl_platform_id platform = NULL;
        getPlatform(platform);

        /**Step 2:Query the platform and choose the first GPU device if has one.*/
        devices = getCl_device_id(platform);
        if (devices == NULL) {
            cerr << "No OpenCL device available" << endl;
            break;
        }

        /**Step 3: Create context.*/
        context = clCreateContext(NULL, 1, devices, NULL, NULL, &status);
        if (!clOk(status, "clCreateContext"))
            break;

        /**Step 4: Creating command queue associate with the context.*/
        commandQueue = clCreateCommandQueue(context, devices[0], 0, &status);
        if (!clOk(status, "clCreateCommandQueue"))
            break;

        /**Step 5: Create program object */
        const char *filename = "string_search_kernel.cl";
        string sourceStr;
        convertToString(filename, sourceStr);
        if (sourceStr.empty()) {
            cerr << "Could not read kernel source " << filename << endl;
            break;
        }
        const char *source = sourceStr.c_str();
        size_t sourceSize[] = { sourceStr.size() };
        program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status);
        if (!clOk(status, "clCreateProgramWithSource"))
            break;

        /**Step 6: Build program. */
        status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
        if (status != CL_SUCCESS) {
            // Show the compiler output; without it a kernel typo is invisible.
            size_t logSize = 0;
            clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG,
                                  0, NULL, &logSize);
            if (logSize > 1) {
                string buildLog(logSize, '\0');
                clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG,
                                      logSize, &buildLog[0], NULL);
                cerr << buildLog << endl;
            }
            clOk(status, "clBuildProgram");
            break;
        }

        /**Step 7: Initial input,output for the host and create memory objects for the kernel*/
        string textStr;
        convertToString("StringSearch_Input.txt", textStr);
        const char *text = textStr.c_str();
        const cl_uint textlen    = (cl_uint)strlen(text);
        const cl_uint patternlen = (cl_uint)strlen(pattern);

        if (textlen == 0)
            cerr << "Warning: StringSearch_Input.txt is empty or could not be read" << endl;

        // Nothing can match; also avoids creating zero-sized buffers.
        if (patternlen == 0 || textlen < patternlen) {
            cout<<"\nNumber of matches:"<<count<<endl;
            exitCode = 0;
            break;
        }

        // Number of start positions in the text, which is also the upper
        // bound on the number of matches the kernel can report.
        const cl_uint numPositions = textlen - patternlen + 1;

        textBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
                                    sizeof(char)*textlen, (void *)text, &status);
        if (!clOk(status, "clCreateBuffer(text)"))
            break;

        // The kernel only reads the pattern.
        patternBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
                                       sizeof(char)*patternlen, (void *)pattern, &status);
        if (!clOk(status, "clCreateBuffer(pattern)"))
            break;

        // Updated with atomic_inc (read-modify-write), so it must be READ_WRITE.
        resultCountBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
                                           sizeof(cl_int), (void *)&count, &status);
        if (!clOk(status, "clCreateBuffer(resultCount)"))
            break;

        resultBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                      sizeof(cl_int)*numPositions, NULL, &status);
        if (!clOk(status, "clCreateBuffer(result)"))
            break;

        /**Step 8: Create kernel object */
        kernel = clCreateKernel(program, "StringSearch", &status);
        if (!clOk(status, "clCreateKernel"))
            break;

        /**Step 9: Sets Kernel arguments.*/
        status  = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&textBuffer);        //global
        status |= clSetKernelArg(kernel, 1, sizeof(cl_uint), &textlen);                  //private
        status |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&patternBuffer);     //global
        status |= clSetKernelArg(kernel, 3, sizeof(cl_uint), &patternlen);               //private
        status |= clSetKernelArg(kernel, 4, sizeof(cl_uint), &maxSearchLength);          //private
        status |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&resultCountBuffer); //global
        status |= clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&resultBuffer);      //global
        status |= clSetKernelArg(kernel, 7, sizeof(char)*patternlen, NULL);              //local
        if (!clOk(status, "clSetKernelArg"))
            break;

        /**Step 10: Running the kernel.*/
        // One work-group per window of maxSearchLength start positions. The
        // global size is a whole number of work-groups, as the launch requires.
        const size_t numGroups =
            ((size_t)numPositions + maxSearchLength - 1) / maxSearchLength;
        size_t globalNum[1] = { numGroups * localSize };
        size_t localNum[1]  = { localSize };

        cl_event eventPoint = NULL;
        status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
                                        globalNum, localNum, 0, NULL, &eventPoint);
        if (!clOk(status, "clEnqueueNDRangeKernel"))
            break;
        status = clWaitForEvents(1, &eventPoint);   ///wait
        clReleaseEvent(eventPoint);
        if (!clOk(status, "clWaitForEvents"))
            break;

        /**Step 11: Read the number of matches back to the host.*/
        status = clEnqueueReadBuffer(commandQueue, resultCountBuffer, CL_TRUE, 0,
                                     sizeof(cl_int), &count, 0, NULL, NULL);
        if (!clOk(status, "clEnqueueReadBuffer"))
            break;

        cout<<"\nNumber of matches:"<<count<<endl;
        exitCode = 0;
    } while (false);

    /**Step 12: Clean the resources.*/
    if (kernel)            clReleaseKernel(kernel);               //Release kernel.
    if (program)           clReleaseProgram(program);             //Release the program object.
    if (resultBuffer)      clReleaseMemObject(resultBuffer);      //Release mem object.
    if (textBuffer)        clReleaseMemObject(textBuffer);        //Release mem object.
    if (resultCountBuffer) clReleaseMemObject(resultCountBuffer); //Release mem object.
    if (patternBuffer)     clReleaseMemObject(patternBuffer);     //Release mem object.
    if (commandQueue)      clReleaseCommandQueue(commandQueue);   //Release Command queue.
    if (context)           clReleaseContext(context);             //Release context.

    // NOTE(review): the device list comes from getCl_device_id() in tool.cpp;
    // free() is kept from the original code and assumes a malloc allocation
    // there -- confirm against tool.cpp.
    free(devices);

    getchar();
    return exitCode;
}
作者:cnblogs 旭東的博客