今天,有個同學向我咨詢大數據的一些面試題,其中一類比較有代表性比如判斷是否在集合內,比如10個url,判斷一個url是否在集合內,還比如有個1~100萬個連續無序數字,隨機取出裡面的N個,求這N個數字等等。這類問題都需要一個大的數據集合,而且每個數據單元都很小,比如一個int 。很大程度上,這類問題可以用Bitmap或者Bloomfilter來做,基本思想就是開辟一塊大內存,然後利用一個byte裡的8個bit來實現按位標記元素。因為地址空間都是連續的,所以查找都是O(1)的。這裡需要說的是,BloomFilter判斷屬不屬於集合,在理論上是存在誤判的,如果要求數據100%正確,則不要使用BloomFilter。
進入正題,Bitmap正如其名,就是一塊內存,內存是一個一個連續的位圖,每一個位通過0、1代表一個元素的有無。比如數字為N的數字對應到Bitmap就是第N/8個byte的字節,和第N%8個01位,這麼映射。所以通過檢測對應的bit位即可知道數據在不在集合內,而且能保證正確。直接上代碼 :
[cpp]
#include <cstdlib>
#include <iostream>
#include <algorithm>
#include <vector>
#include <stddef.h>
#include <memory.h>
#define BYTES 12500
int main()
{
srand((unsigned int)time(NULL));
size_t total_numbers = 100000;
typedef std::vector<int> SetContainer;
typedef std::vector<int>::iterator SetIterator;
SetContainer numbers;
numbers.reserve(total_numbers);
int r1 = rand() % total_numbers;
int r2 = r1 + 1000;
// generate total_numbers-2 numbers
for(int i=0;i!=total_numbers;++i) {
if (i!=r1 && i!= r2)
numbers.push_back(i);
}
std::cout<<"["<<numbers.size()<<"] insert ok";
std::cin.get();
// shuffle
std::random_shuffle(numbers.begin(),numbers.end());
unsigned char *bitmap = (unsigned char*)malloc(BYTES);
memset(bitmap,0,BYTES);
for (SetIterator itr=numbers.begin();itr!=numbers.end();++itr) {
ptrdiff_t forward = (*itr) / 8;
size_t offset = (*itr) % 8;
bitmap[forward] |= (0x80UL >> offset);
}
std::cout<<"Bitmap build ok";
std::cin.get();
for (int j=0;j!=BYTES;++j) {
if (bitmap[j]!=0xFF) {
std::cout<<"FIND ";
unsigned long num = j * 8;
unsigned char check = bitmap[j];
unsigned char bit = 0;
while(bit!=8) {
if (0 == (check&(0x80UL>>bit)))
std::cout<<"["<<(num+bit)<<"] ";
bit++;
}
std::cout<<std::endl;
}
}
std::cout<<"DONE";
std::cin.get();
free(bitmap);
return 0;
}
BloomFilter,是由 Howard Bloom 在 1970 年提出的二進制向量數據結構,適合與比Bitmap更多量的數據,通過圖片看一下方法流程 :
1、初始化一塊大內存用於存放01標志位:
2、通過使用N個hash函數(N==3),對同一個值Hash多次哈希,然後同Bitmap一樣映射到Bloomfilter中去,
3、檢測時,同樣通過N次哈希,在映射的位中去找,並要保持映射的每一位都是1的情況下,即檢測處包含關系。正如前面說的,BloomFilter可能有誤判,誤判的幾率取決於Hash函數的個數,Hash函數沖撞的概率,以及Bloomfilter開開辟的內存大小。Hash函數的個數要取個合適的值,大了會造成效率問題,少了可能誤判高,理論5~10個之間,工程裡用3~5個,具體多少可以視需求而定。
代碼 :
[cpp]
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <algorithm>
#include <vector>
#include <stddef.h>
#include <memory.h>
#define BLOOM (1024UL*1024UL*1024UL) // 1G
#define HASH_RESULT 3
typedef unsigned char BloomFilter;
typedef struct __hash_result {
size_t N; // how many result
size_t result[0];
}HashResult;
/* Brian Kernighan & Dennis Ritchie hashfunction , used in Java */
size_t BKDR_hash(const char* str)
{
register size_t hash = 0;
while (size_t ch = (size_t)*str++) {
hash = hash * 131 + ch;
}
return hash;
}
/* Unix System Hashfunction , also used in Microsoft's hash_map */
size_t FNV_hash(const char* str)
{
if(!*str)
return 0;
register size_t hash = 2166136261;
while (size_t ch = (size_t)*str++) {
hash *= 16777619;
hash ^= ch;
}
return hash;
}
/* Donald Knuth Hashfunction , presented in book <Art of Computer Programming> */
size_t DEK_hash(const char* str)
{
if(!*str)
return 0;
register size_t hash = 1315423911;
while (size_t ch = (size_t)*str++) {
hash = ((hash << 5) ^ (hash >> 27)) ^ ch;
}
return hash;
}
typedef size_t (*HASH_FUNC)(const char*);
HASH_FUNC HASH[] = {
BKDR_hash,FNV_hash,DEK_hash
};
void bloom_filter_mark(BloomFilter* bf, const char* v)
{
HashResult *hr = (HashResult*)calloc(1,sizeof(HashResult)+(sizeof(size_t)*HASH_RESULT));
for (int i=0;i!=HASH_RESULT;++i) {
hr->result[i] = (HASH[i](v)) % BLOOM;
// set the binary bit to 1
bf[hr->result[i]/8] |= 0x80UL >> (hr->result[i]%8);
//printf("**%lu|hash-%d[%lu]|offset[%X]\n",HASH[i](v),i,hr->result[i],bf[hr->result[i]/8]);
}
free(hr);
}
bool bloom_filter_check(BloomFilter* bf, const char* v)
{
HashResult *hr = (HashResult*)calloc(1,sizeof(HashResult)+(sizeof(size_t)*HASH_RESULT));
size_t in = HASH_RESULT;
for (int i=0;i!=HASH_RESULT;++i) {
hr->result[i] = HASH[i](v) % BLOOM;
//printf("**%lu|%X\n",hr->result[i],bf[hr->result[i]/8]);
// check this bit is "1" or not
if (bf[hr->result[i]/8] & (0x80UL >> (hr->result[i]%8)))
in--;
}
free(hr);
return in == 0;
}
int main()
{
// std::cout<<BKDR_hash("0")<<std::endl;
// std::cout<<DEK_hash("0")<<std::endl;
// std::cout<<FNV_hash("0")<<std::endl;
BloomFilter* bloom = new (std::nothrow) BloomFilter[BLOOM];
if (NULL == bloom)
printf("No Space to build BloomFilter\n"),exit(0);
printf("BloomFilter Calloc Memory Ok\n");
for(int i=0;i!=1000000;i++) {
char buf[16] = {0};
sprintf(buf,"%d",i);
bloom_filter_mark(bloom,buf);
}
printf("BloomFilter Build Ok\n");
for(int i=999995;i!=1000010;i++) {
char buf[16] = {0};
sprintf(buf,"%d",i);
if (bloom_filter_check(bloom,buf))
printf("[FOUND] %d\n",i);
}
delete bloom;
return 0;
}