Skip to content

Commit 41a6430

Browse files
arnav-aggarwaltrekhleb
authored andcommitted
Add bloom filter (trekhleb#84)
1 parent b33f1d5 commit 41a6430

File tree

5 files changed

+358
-0
lines changed

5 files changed

+358
-0
lines changed

‎README.md‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ the data.
3838
*`A`[Fenwick Tree](src/data-structures/tree/fenwick-tree) (Binary Indexed Tree)
3939
*`A`[Graph](src/data-structures/graph) (both directed and undirected)
4040
*`A`[Disjoint Set](src/data-structures/disjoint-set)
41+
*`A`[Bloom Filter](src/data-structures/bloom-filter)
4142

4243
## Algorithms
4344

@@ -231,6 +232,7 @@ Below is the list of some of the most used Big O notations and their performance
231232
|**B-Tree**| log(n) | log(n) | log(n) | log(n) ||
232233
|**Red-Black Tree**| log(n) | log(n) | log(n) | log(n) ||
233234
|**AVL Tree**| log(n) | log(n) | log(n) | log(n) ||
235+
|**Bloom Filter**|| 1 | 1 |||
234236

235237
### Array Sorting Algorithms Complexity
236238

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
exportdefaultclassBloomFilter{
2+
/**
3+
* @param{number} size
4+
*/
5+
constructor(size=100){
6+
// Bloom filter size directly affects the likelihood of false positives.
7+
// The bigger the size the lower the likelihood of false positives.
8+
this.size=size;
9+
this.storage=this.createStore(size);
10+
}
11+
12+
/**
13+
* @param{string} item
14+
*/
15+
insert(item){
16+
consthashValues=this.getHashValues(item);
17+
18+
// Set each hashValue index to true
19+
hashValues.forEach(val=>this.storage.setValue(val));
20+
}
21+
22+
/**
23+
* @param{string} item
24+
* @return{boolean}
25+
*/
26+
mayContain(item){
27+
consthashValues=this.getHashValues(item);
28+
29+
for(leti=0;i<hashValues.length;i+=1){
30+
if(!this.storage.getValue(hashValues[i])){
31+
// We know that the item was definitely not inserted.
32+
returnfalse;
33+
}
34+
}
35+
36+
// The item may or may not have been inserted.
37+
returntrue;
38+
}
39+
40+
/**
41+
* Creates the data store for our filter.
42+
* We use this method to generate the store in order to
43+
* encapsulate the data itself and only provide access
44+
* to the necessary methods.
45+
*
46+
* @param{number} size
47+
* @return{Object}
48+
*/
49+
createStore(size){
50+
conststorage=[];
51+
52+
// Initialize all indexes to false
53+
for(leti=0;i<size;i+=1){
54+
storage.push(false);
55+
}
56+
57+
conststorageInterface={
58+
getValue(index){
59+
returnstorage[index];
60+
},
61+
setValue(index){
62+
storage[index]=true;
63+
},
64+
};
65+
66+
returnstorageInterface;
67+
}
68+
69+
/**
70+
* @param{string} str
71+
* @return{number}
72+
*/
73+
hash1(str){
74+
lethash=0;
75+
76+
for(leti=0;i<str.length;i+=1){
77+
constchar=str.charCodeAt(i);
78+
hash=(hash<<5)+hash+char;
79+
hash&=hash;// Convert to 32bit integer
80+
hash=Math.abs(hash);
81+
}
82+
83+
returnhash%this.size;
84+
}
85+
86+
/**
87+
* @param{string} str
88+
* @return{number}
89+
*/
90+
hash2(str){
91+
lethash=5381;
92+
93+
for(leti=0;i<str.length;i+=1){
94+
constchar=str.charCodeAt(i);
95+
hash=(hash<<5)+hash+char;/* hash * 33 + c */
96+
}
97+
98+
returnhash%this.size;
99+
}
100+
101+
/**
102+
* @param{string} str
103+
* @return{number}
104+
*/
105+
hash3(str){
106+
lethash=0;
107+
108+
for(leti=0;i<str.length;i+=1){
109+
constchar=str.charCodeAt(i);
110+
hash=(hash<<5)-hash;
111+
hash+=char;
112+
hash&=hash;// Convert to 32bit integer
113+
}
114+
115+
returnhash%this.size;
116+
}
117+
118+
/**
119+
* Runs all 3 hash functions on the input and returns an array of results
120+
*
121+
* @param{string} str
122+
* @return{number[]}
123+
*/
124+
getHashValues(item){
125+
return[this.hash1(item),Math.abs(this.hash2(item)),Math.abs(this.hash3(item))];
126+
}
127+
}
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
# Bloom Filter
2+
3+
A bloom filter is a data structure designed to
4+
test whether an element is present in a set. It
5+
is designed to be blazingly fast and use minimal
6+
memory at the cost of potential false positives.
7+
8+
![Bloom Filter](https://upload.wikimedia.org/wikipedia/commons/a/ac/Bloom_filter.svg)
9+
10+
## Operations
11+
12+
There are two main operations a bloom filter can
13+
perform: insertion and search. Search may result in
14+
false positives. Deletion is not possible.
15+
16+
In other words, the filter can take in items. When
17+
we go to check if an item has previously been
18+
inserted, it can tell us either "no" or "maybe".
19+
20+
Both insertion and search are O(1) operations.
21+
22+
## Making the filter
23+
24+
A bloom filter is created by allotting a certain size.
25+
In our example, we use 100 as a default length. All
26+
locations are initialized to `false`.
27+
28+
### Insertion
29+
30+
During insertion, a number of hash functions,
31+
in our case 3 hash functions, are used to create
32+
hashes of the input. These hash functions output
33+
indexes. At every index received, we simply change
34+
the value in our bloom filter to `true`.
35+
36+
### Search
37+
38+
During a search, the same hash functions are called
39+
and used to hash the input. We then check if the
40+
indexes received _all_ have a value of `true` inside
41+
our bloom filter. If they _all_ have a value of
42+
`true`, we know that the bloom filter may have had
43+
the value previously inserted.
44+
45+
However, it's not certain, because it's possible
46+
that other values previously inserted flipped the
47+
values to `true`. The values aren't necessarily
48+
`true` due to the item currently being searched for.
49+
Absolute certainty is impossible unless only a single
50+
item has previously been inserted.
51+
52+
While checking the bloom filter for the indexes
53+
returned by our hash functions, if even one of them
54+
has a value of `false`, we definitively know that the
55+
item was not previously inserted.
56+
57+
## False Positives
58+
59+
The probability of false positives is determined by
60+
three factors: the size of the bloom filter, the
61+
number of hash functions we use, and the number
62+
of items that have been inserted into the filter.
63+
64+
The formula to calculate probablity of a false positive is:
65+
66+
( 1 - e <sup>-kn/m</sup> ) <sup>k</sup>
67+
68+
k = # hash functions
69+
70+
m = size
71+
72+
n = # items inserted
73+
74+
These variables, k, m, and n, should be picked based
75+
on how acceptable false positives are. If the values
76+
are picked and the resulting probability is too high,
77+
the values should be tweaked and the probability
78+
re-calculated.
79+
80+
## Applications
81+
82+
A bloom filter can be used on a blogging website. If
83+
the goal is to show readers only articles that they
84+
have never seen before, a bloom filter is perfect.
85+
It can store hashed values based on the articles. After
86+
a user reads a few articles, they can be inserted into
87+
the filter. The next time the user visits the site,
88+
those articles can be filtered out of the results.
89+
90+
Some articles will inevitably be filtered out by mistake,
91+
but the cost is acceptable. It's ok if a user never sees
92+
a few articles as long as they have other, brand new ones
93+
to see every time they visit the site.
94+
95+
The popular blog site Medium does a version of this.
96+
Feel free to read [their article](https://blog.medium.com/what-are-bloom-filters-1ec2a50c68ff).
97+
98+
## References
99+
100+
-[Wikipedia](https://en.wikipedia.org/wiki/Bloom_filter)
101+
-[Tutorial](http://llimllib.github.io/bloomfilter-tutorial/)
102+
-[Calculating false positive probability](https://hur.st/bloomfilter/?n=4&p=&m=18&k=3)
103+
-[Medium blog](https://blog.medium.com/what-are-bloom-filters-1ec2a50c68ff)
104+
-[YouTube](https://www.youtube.com/watch?v=bEmBh1HtYrw)
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
importBloomFilterfrom'../BloomFilter';
2+
3+
describe('Bloom Filter',()=>{
4+
letbloomFilter;
5+
constpeople=['Bruce Wayne','Clark Kent','Barry Allen'];
6+
7+
beforeEach(()=>{
8+
bloomFilter=newBloomFilter();
9+
});
10+
11+
it('Should have methods named "insert" and "mayContain"',()=>{
12+
expect(typeofbloomFilter.insert).toBe('function');
13+
expect(typeofbloomFilter.mayContain).toBe('function');
14+
});
15+
16+
it('Should create a new filter store with the appropriate methods',()=>{
17+
conststore=bloomFilter.createStore(18);
18+
expect(typeofstore.getValue).toBe('function');
19+
expect(typeofstore.setValue).toBe('function');
20+
});
21+
22+
it('Should hash deterministically with all 3 hash functions',()=>{
23+
conststr='abc';
24+
expect(bloomFilter.hash1(str)).toEqual(bloomFilter.hash1(str));
25+
expect(bloomFilter.hash2(str)).toEqual(bloomFilter.hash2(str));
26+
expect(bloomFilter.hash3(str)).toEqual(bloomFilter.hash3(str));
27+
});
28+
29+
it('Should create an array with 3 hash values',()=>{
30+
expect(bloomFilter.getHashValues('abc').length).toEqual(3);
31+
});
32+
33+
it('Should insert strings correctly and return true when checking for inserted values',()=>{
34+
people.forEach(person=>bloomFilter.insert(person));
35+
expect(bloomFilter.mayContain('Bruce Wayne')).toBe(true);
36+
expect(bloomFilter.mayContain('Clark Kent')).toBe(true);
37+
expect(bloomFilter.mayContain('Barry Allen')).toBe(true);
38+
});
39+
});
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
importBloomFilterfrom'../BloomFilter';
2+
3+
// Adapted from http://stackoverflow.com/questions/1349404/generate-random-string-characters-in-javascript
4+
functionmakeID(){
5+
constpossible='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
6+
letid='';
7+
8+
for(leti=0;i<10;i+=1){
9+
constrandomLength=Math.random()*possible.length;
10+
constrandomIndex=Math.floor(randomLength);
11+
id+=possible.charAt(randomIndex);
12+
}
13+
14+
returnid;
15+
}
16+
17+
functionrun10kTrials(numRandomTests=1000){
18+
constbloomFilter=newBloomFilter();
19+
constmockPeopleIDs=[];
20+
21+
for(leti=0;i<10;i+=1){
22+
mockPeopleIDs.push(makeID());
23+
}
24+
25+
mockPeopleIDs.forEach(id=>bloomFilter.insert(id));
26+
letnumFalsePositives=0;
27+
28+
for(letindex=0;index<numRandomTests;index+=1){
29+
constrandomID=makeID();
30+
if(bloomFilter.mayContain(randomID)){
31+
numFalsePositives+=1;
32+
}
33+
}
34+
35+
returnnumFalsePositives;
36+
}
37+
38+
functiontestFilter(numTrials=100){
39+
constresults=[];
40+
41+
for(leti=0;i<numTrials;i+=1){
42+
results.push(run10kTrials());
43+
}
44+
45+
constsum=results.reduce((cumulative,next)=>cumulative+next,0);
46+
returnsum/numTrials;
47+
}
48+
49+
describe('Bloom filter false positives',()=>{
50+
constfalsePositiveProbability=0.0174;
51+
constexpectedFalsePositives=falsePositiveProbability*1000;
52+
constavgFalsePositives=testFilter();
53+
54+
it(`Should keep false positives close to an expected value:
55+
56+
# trials = 1000
57+
k = 3 (hash functions)
58+
m = 100 (size)
59+
n = 10 (items inserted)
60+
61+
Using k, m, and n, plugged into https://hur.st/bloomfilter/?n=3&p=&m=18&k=3
62+
Chance of false positive = 0.017
63+
64+
Expected false positives = # trials * chance of false positive
65+
Expected false positives => 1000 * ${falsePositiveProbability}
66+
Expected false positives => ${expectedFalsePositives}
67+
68+
**************************
69+
EXPECTED = ${expectedFalsePositives}
70+
ACTUAL AVG = ${avgFalsePositives}
71+
**************************
72+
73+
If the expected and actual numbers are far off, something is wrong.
74+
Inspect manually.`,()=>{
75+
// We give it a large range to avoid unnecessary failures.
76+
// If it's working correctly, the value should definitely
77+
// fall within this range.
78+
79+
// In over 1,000 test runs, none of them ever come close
80+
// to falling outside of this range.
81+
constupperLimit=expectedFalsePositives+5;
82+
constlowerLimit=expectedFalsePositives-5;
83+
expect(avgFalsePositives).toBeGreaterThan(lowerLimit);
84+
expect(avgFalsePositives).toBeLessThan(upperLimit);
85+
});
86+
});

0 commit comments

Comments
(0)