一個簡單的布隆過濾器

布隆過濾器(Bloom Filter)是1970年由布隆提出的。它其實是一個很長的二進制向量和一系列隨機映射函數。布隆過濾器能夠用於檢索一個元素是否在一個集合中。它的優勢是空間效率和查詢時間都遠遠超過通常的算法,缺點是有必定的誤識別率和刪除困難。 算法

——摘自維基百科 ide

由於查找別的資料,偶爾發現這一利器。興趣所致,寫了一個簡單的C實現。固然不少問題沒有考慮在內,僅就核心概念給出了一個縮略版實現,作爲一個提醒,加深一下印象。 函數

simple_bf.h: 測試

#ifndef _SIMPLE_BF_H_
#define _SIMPLE_BF_H_

#include <stdint.h>
#include <stdbool.h>

#define VEC_LEN (1024 * 1024 * 64 + 1)

#ifdef __cplusplus
extern "C" {
#endif

typedef struct _bf_vec {
        uint64_t part[VEC_LEN];
} bf_vec_t;

extern bool bf_add(bf_vec_t *vec, const char *str);

extern bool bf_is_contains(bf_vec_t *vec, const char *str);

#ifndef likely
#define likely(x) __builtin_expect((x),1)
#endif

#ifndef unlikely
#define unlikely(x) __builtin_expect((x),0)
#endif

#ifdef __cplusplus
}
#endif

#endif
simple_bf.c:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "simple_bf.h"

/* return false means the arg bit is over the bit length of the arg vec,
 * otherwise return true means successfully set the right bit.
 * for simple, only consider about these two conditions */
static inline __attribute__((always_inline)) bool bf_set_bit(bf_vec_t *vec, uint64_t bit)
{
	uint64_t part_cnt = bit / 64;
	if (unlikely(part_cnt > VEC_LEN))
		return false;
	uint8_t mod = bit % 64;
	vec->part[part_cnt] |= (1ULL << mod);
	return true;	
}

static inline __attribute__((always_inline)) bool bf_test_bit(bf_vec_t *vec, uint64_t bit)
{
	uint64_t part_cnt = bit / 64;
	if (unlikely(part_cnt > VEC_LEN))
		return false;
	uint8_t mod = bit % 64;
	return ((vec->part[part_cnt] & (1ULL << mod)) != 0);	
}

/* the seed vector and the BKDR hash function */
static uint32_t seeds[8] = {31, 131, 1313, 13131, 131313, 1313131, 13131313, 131313131};
static uint32_t bkdr_hash_modified(const char *str, uint32_t seed)
{
	register uint32_t hash = 0;
	uint32_t ch;
	while ((ch = (uint32_t)*str++)) {
		hash = hash * seed + ch;
	}
	return hash;
}

bool bf_add(bf_vec_t *vec, const char *str)
{
	int i;
	for (i = 0; i < 8; ++i) {
		uint32_t val = bkdr_hash_modified(str, seeds[i]);
		if (!bf_set_bit(vec, val))
			return false;
	}
	return true;
}

bool bf_is_contains(bf_vec_t *vec, const char *str)
{
	int i;
	for (i = 0; i < 8; ++i) {
		uint32_t val = bkdr_hash_modified(str, seeds[i]);
		if (!bf_test_bit(vec, val))
			return false;
	}
	return true;
}

#ifndef NDEBUG 
int main1(int argc, char *argv[])
{
	if (argc < 3) {
		fprintf(stderr, "usage: bf as0 [as1 as2 ...] ts\n"
				"as means string to add to the bloom filter\n"
				"ts means the string to test if it is in the filter vector\n"
				);
		return 1;
	}

	bf_vec_t *vec = calloc(1, sizeof(bf_vec_t));
	if (vec == NULL)
		return 1;
		
	int i;
	for (i = 1; i < argc - 1; ++i) {
		if (bf_add(vec, argv[i]))
			printf("add to bloom filter successed, string %s\n", argv[i]);
		else
			printf("add to bloom filter FAILED, string %s\n", argv[i]);
	}
	printf("------------------------------------------------------------------\n");
	if (bf_is_contains(vec, argv[argc - 1]))
		printf("test string %s is in bloom filter\n", argv[argc - 1]);
	else
		printf("test string %s is NOT in bloom filter\n", argv[argc - 1]);

	free(vec);
	return 0;
}

int main2(int argc, char *argv[])
{
	if (argc != 3) {
		fprintf(stderr, "usage: bf filename ts\n"
			"file of filename contains the strings to be added\n"
			"ts means the string to test if it is in the filter vector\n"
			);
		return 1;
	}

	bf_vec_t *vec = calloc(1, sizeof(bf_vec_t));
	if (vec == NULL)
		return 1;
	
	char buf[128] = {0};	
	FILE *fp = fopen(argv[1], "rt");
	if (fp == NULL) {
		free(vec);
		return 1;
	}
	while (fgets(buf, sizeof(buf), fp) != NULL) {
		buf[strlen(buf) - 1] = '\0';
		if (bf_add(vec, buf))
			printf("add to bloom filter successed, string %s\n", buf);
		else
			printf("add to bloom filter FAILED, string %s\n", buf);
	}

	printf("------------------------------------------------------------------\n");
	if (bf_is_contains(vec, argv[2]))
		printf("test string %s is in bloom filter\n", argv[2]);
	else
		printf("test string %s is NOT in bloom filter\n", argv[2]);
	
	free(vec);
	return 0;
}

int main(int argc, char *argv[])
{
	return main2(argc, argv);
}
#endif
在Linux 2.6.39.4 SMP x86_64 gcc version 4.4.3下作了簡單測試,效率仍是比較可觀的。沒有作量化的效率測試,也沒有對誤判率作量化測試。

我在考慮是否是要加一個謂詞回調接口,相似於: ui

typedef int (*bf_contains_cb)(void *data);
int bf_oper_if_contains(bf_vec_t *vec, const char *str, bf_contains_cb callback, void *data);
這樣可用性應該會好一些。暫且放下,等到真正要用時,再作詳細的量化測試和代碼完善吧。
相關文章
相關標籤/搜索