布隆過濾器(Bloom Filter)是1970年由布隆提出的。它其實是一個很長的二進制向量和一系列隨機映射函數。布隆過濾器能夠用於檢索一個元素是否在一個集合中。它的優勢是空間效率和查詢時間都遠遠超過通常的算法,缺點是有一定的誤識別率和刪除困難。
——摘自維基百科
由於查找別的資料,偶然發現這一利器。興趣所致,寫了一個簡單的C實現。當然很多問題沒有考慮在內,僅就核心概念給出了一個縮略版實現,作為一個提醒,加深一下印象。
simple_bf.h:
/*
 * simple_bf.h - minimal Bloom filter over NUL-terminated strings.
 */
#ifndef SIMPLE_BF_H
#define SIMPLE_BF_H

#include <stdint.h>
#include <stdbool.h>

/*
 * Number of 64-bit words in the bit vector: 64Mi + 1 words, i.e. a little
 * over 512 MiB per filter.  The implementation addresses bits with 32-bit
 * hash values, so the highest word it can touch is (2^32 / 64) - 1.
 */
#define VEC_LEN (1024 * 1024 * 64 + 1)

/*
 * Branch-prediction hints.  "!!" normalises any scalar (including pointers)
 * to 0/1 before it reaches __builtin_expect; compilers without the builtin
 * simply get the plain truth value.
 */
#ifndef likely
#  if defined(__GNUC__) || defined(__clang__)
#    define likely(x) __builtin_expect(!!(x), 1)
#  else
#    define likely(x) (!!(x))
#  endif
#endif

#ifndef unlikely
#  if defined(__GNUC__) || defined(__clang__)
#    define unlikely(x) __builtin_expect(!!(x), 0)
#  else
#    define unlikely(x) (!!(x))
#  endif
#endif

#ifdef __cplusplus
extern "C" {
#endif

/*
 * The filter's bit vector.  An all-zero object is an empty filter.
 * The object is far too large for the stack; allocate it dynamically
 * (e.g. with calloc, which also zeroes it).
 */
typedef struct _bf_vec {
    uint64_t part[VEC_LEN];
} bf_vec_t;

/*
 * Add the string str to the filter vec.
 * Returns true on success, false if a bit position could not be set.
 */
extern bool bf_add(bf_vec_t *vec, const char *str);

/*
 * Test whether str may have been added to vec.
 * Returns false if str was definitely never added; true if it was
 * probably added (false positives are possible).
 */
extern bool bf_is_contains(bf_vec_t *vec, const char *str);

#ifdef __cplusplus
}
#endif

#endif /* SIMPLE_BF_H */
/* simple_bf.c: */
#include <stdio.h> #include <stdlib.h> #include <string.h> #include "simple_bf.h" /* return false means the arg bit is over the bit length of the arg vec, * otherwise return true means successfully set the right bit. * for simple, only consider about these two conditions */ static inline __attribute__((always_inline)) bool bf_set_bit(bf_vec_t *vec, uint64_t bit) { uint64_t part_cnt = bit / 64; if (unlikely(part_cnt > VEC_LEN)) return false; uint8_t mod = bit % 64; vec->part[part_cnt] |= (1ULL << mod); return true; } static inline __attribute__((always_inline)) bool bf_test_bit(bf_vec_t *vec, uint64_t bit) { uint64_t part_cnt = bit / 64; if (unlikely(part_cnt > VEC_LEN)) return false; uint8_t mod = bit % 64; return ((vec->part[part_cnt] & (1ULL << mod)) != 0); } /* the seed vector and the BKDR hash function */ static uint32_t seeds[8] = {31, 131, 1313, 13131, 131313, 1313131, 13131313, 131313131}; static uint32_t bkdr_hash_modified(const char *str, uint32_t seed) { register uint32_t hash = 0; uint32_t ch; while ((ch = (uint32_t)*str++)) { hash = hash * seed + ch; } return hash; } bool bf_add(bf_vec_t *vec, const char *str) { int i; for (i = 0; i < 8; ++i) { uint32_t val = bkdr_hash_modified(str, seeds[i]); if (!bf_set_bit(vec, val)) return false; } return true; } bool bf_is_contains(bf_vec_t *vec, const char *str) { int i; for (i = 0; i < 8; ++i) { uint32_t val = bkdr_hash_modified(str, seeds[i]); if (!bf_test_bit(vec, val)) return false; } return true; } #ifndef NDEBUG int main1(int argc, char *argv[]) { if (argc < 3) { fprintf(stderr, "usage: bf as0 [as1 as2 ...] 
ts\n" "as means string to add to the bloom filter\n" "ts means the string to test if it is in the filter vector\n" ); return 1; } bf_vec_t *vec = calloc(1, sizeof(bf_vec_t)); if (vec == NULL) return 1; int i; for (i = 1; i < argc - 1; ++i) { if (bf_add(vec, argv[i])) printf("add to bloom filter successed, string %s\n", argv[i]); else printf("add to bloom filter FAILED, string %s\n", argv[i]); } printf("------------------------------------------------------------------\n"); if (bf_is_contains(vec, argv[argc - 1])) printf("test string %s is in bloom filter\n", argv[argc - 1]); else printf("test string %s is NOT in bloom filter\n", argv[argc - 1]); free(vec); return 0; } int main2(int argc, char *argv[]) { if (argc != 3) { fprintf(stderr, "usage: bf filename ts\n" "file of filename contains the strings to be added\n" "ts means the string to test if it is in the filter vector\n" ); return 1; } bf_vec_t *vec = calloc(1, sizeof(bf_vec_t)); if (vec == NULL) return 1; char buf[128] = {0}; FILE *fp = fopen(argv[1], "rt"); if (fp == NULL) { free(vec); return 1; } while (fgets(buf, sizeof(buf), fp) != NULL) { buf[strlen(buf) - 1] = '\0'; if (bf_add(vec, buf)) printf("add to bloom filter successed, string %s\n", buf); else printf("add to bloom filter FAILED, string %s\n", buf); } printf("------------------------------------------------------------------\n"); if (bf_is_contains(vec, argv[2])) printf("test string %s is in bloom filter\n", argv[2]); else printf("test string %s is NOT in bloom filter\n", argv[2]); free(vec); return 0; } int main(int argc, char *argv[]) { return main2(argc, argv); } #endif在Linux 2.6.39.4 SMP x86_64 gcc version 4.4.3下作了簡單測試,效率仍是比較可觀的。沒有作量化的效率測試,也沒有對誤判率作量化測試。
我在考慮是不是要加一個謂詞回調接口,類似於:
typedef int (*bf_contains_cb)(void *data); int bf_oper_if_contains(bf_vec_t *vec, const char *str, bf_contains_cb callback, void *data);
這樣可用性應該會好一些。暫且放下,等到真正要用時,再做詳細的量化測試和代碼完善吧。