Add bloom filter (trekhleb#84)

arnav-aggarwal · trekhleb · commit 41a6430532fd · 2018-06-30T20:07:19.000+03:00
diff --git a/README.md b/README.md
@@ -38,6 +38,7 @@ the data.
 * `A` [Fenwick Tree](src/data-structures/tree/fenwick-tree) (Binary Indexed Tree)
 * `A` [Graph](src/data-structures/graph) (both directed and undirected)
 * `A` [Disjoint Set](src/data-structures/disjoint-set)
+* `A` [Bloom Filter](src/data-structures/bloom-filter)
 
 ## Algorithms
 
@@ -231,6 +232,7 @@ Below is the list of some of the most used Big O notations and their performance
 | **B-Tree** | log(n) | log(n) | log(n) | log(n) | |
 | **Red-Black Tree** | log(n) | log(n) | log(n) | log(n) | |
 | **AVL Tree** | log(n) | log(n) | log(n) | log(n) | |
+| **Bloom Filter** | | 1 | 1 | | |
 
 ### Array Sorting Algorithms Complexity
 
diff --git a/src/data-structures/bloom-filter/BloomFilter.js b/src/data-structures/bloom-filter/BloomFilter.js
@@ -0,0 +1,127 @@
/**
 * Probabilistic set-membership structure backed by a fixed number of
 * boolean slots and three string hash functions.
 *
 * A lookup answers either "definitely not inserted" or "possibly inserted";
 * items cannot be removed once inserted.
 */
export default class BloomFilter {
  /**
   * @param {number} [size=100] - number of slots in the filter; must be a
   *   positive integer.
   * @throws {RangeError} when `size` is not a positive integer, because the
   *   hash functions reduce their result modulo `size` and would otherwise
   *   produce NaN or fractional indexes.
   */
  constructor(size = 100) {
    if (!Number.isInteger(size) || size <= 0) {
      throw new RangeError(`BloomFilter size must be a positive integer, got ${String(size)}`);
    }

    // Bloom filter size directly affects the likelihood of false positives.
    // The bigger the size the lower the likelihood of false positives.
    this.size = size;
    this.storage = this.createStore(size);
  }

  /**
   * Marks every slot selected by the hash functions for `item`.
   *
   * @param {string} item
   */
  insert(item) {
    this.getHashValues(item).forEach((index) => this.storage.setValue(index));
  }

  /**
   * Checks whether `item` could have been inserted.
   *
   * @param {string} item
   * @return {boolean} `false` when at least one of the item's slots is unset
   *   (the item was definitely not inserted); `true` when all of them are set
   *   (the item may or may not have been inserted).
   */
  mayContain(item) {
    return this.getHashValues(item).every((index) => this.storage.getValue(index));
  }

  /**
   * Creates the data store for our filter.
   * The backing array lives in a closure so that callers can only reach it
   * through the two returned accessor methods.
   *
   * @param {number} size - number of slots, all initialised to `false`.
   * @return {{getValue: function(number): boolean, setValue: function(number): void}}
   */
  createStore(size) {
    const storage = Array.from({ length: size }, () => false);

    return {
      getValue(index) {
        return storage[index];
      },
      setValue(index) {
        storage[index] = true;
      },
    };
  }

  /**
   * Multiply-by-33 hash starting from 0, kept non-negative after every step.
   *
   * @param {string} str
   * @return {number} slot index in the range [0, size).
   */
  hash1(str) {
    let hash = 0;

    for (let i = 0; i < str.length; i += 1) {
      const char = str.charCodeAt(i);
      hash = (hash << 5) + hash + char;
      hash &= hash; // Convert to 32bit integer
      hash = Math.abs(hash);
    }

    return hash % this.size;
  }

  /**
   * Multiply-by-33 hash starting from 5381.
   *
   * @param {string} str
   * @return {number} slot index in the range [0, size).
   */
  hash2(str) {
    let hash = 5381;

    for (let i = 0; i < str.length; i += 1) {
      const char = str.charCodeAt(i);
      // hash * 33 + c; only the shifted term is truncated to 32 bits here.
      hash = (hash << 5) + hash + char;
    }

    // `%` keeps the sign of the dividend, so a negative accumulator would
    // otherwise produce a negative (invalid) slot index.
    return Math.abs(hash % this.size);
  }

  /**
   * Multiply-by-31 hash starting from 0, truncated to 32 bits on every step.
   *
   * @param {string} str
   * @return {number} slot index in the range [0, size).
   */
  hash3(str) {
    let hash = 0;

    for (let i = 0; i < str.length; i += 1) {
      const char = str.charCodeAt(i);
      hash = (hash << 5) - hash;
      hash += char;
      hash &= hash; // Convert to 32bit integer
    }

    // The 32-bit accumulator can be negative; fold it back into [0, size).
    return Math.abs(hash % this.size);
  }

  /**
   * Runs all 3 hash functions on the input and returns an array of results.
   *
   * @param {string} item
   * @return {number[]} three slot indexes, in the order hash1, hash2, hash3.
   */
  getHashValues(item) {
    return [this.hash1(item), this.hash2(item), this.hash3(item)];
  }
}
diff --git a/src/data-structures/bloom-filter/README.md b/src/data-structures/bloom-filter/README.md
@@ -0,0 +1,104 @@
+# Bloom Filter
+
+A bloom filter is a data structure designed to
+test whether an element is present in a set. It
+is designed to be blazingly fast and use minimal
+memory at the cost of potential false positives.
+
+![Bloom Filter](https://upload.wikimedia.org/wikipedia/commons/a/ac/Bloom_filter.svg)
+
+## Operations
+
+There are two main operations a bloom filter can
+perform: insertion and search. Search may result in
+false positives. Deletion is not possible.
+
+In other words, the filter can take in items. When
+we go to check if an item has previously been
+inserted, it can tell us either "no" or "maybe".
+
+Both insertion and search are O(1) operations.
+
+## Making the filter
+
+A bloom filter is created by allotting a certain size.
+In our example, we use 100 as a default length. All
+locations are initialized to `false`.
+
+### Insertion
+
+During insertion, a number of hash functions,
+in our case 3 hash functions, are used to create
+hashes of the input. These hash functions output
+indexes. At every index received, we simply change
+the value in our bloom filter to `true`.
+
+### Search
+
+During a search, the same hash functions are called
+and used to hash the input. We then check if the
+indexes received _all_ have a value of `true` inside
+our bloom filter. If they _all_ have a value of
+`true`, we know that the bloom filter may have had
+the value previously inserted.
+
+However, it's not certain, because it's possible
+that other values previously inserted flipped the
+values to `true`. The values aren't necessarily
+`true` due to the item currently being searched for.
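+Absolute certainty is impossible: even when only a single
+item has previously been inserted, a different item that
+hashes to the same indexes would still be reported as
+possibly present.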
+
+While checking the bloom filter for the indexes
+returned by our hash functions, if even one of them
+has a value of `false`, we definitively know that the
+item was not previously inserted.
+
+## False Positives
+
+The probability of false positives is determined by
+three factors: the size of the bloom filter, the
+number of hash functions we use, and the number
+of items that have been inserted into the filter.
+
+The formula to calculate the probability of a false positive is:
+
+( 1 - e <sup>-kn/m</sup> ) <sup>k</sup>
+
+k = # hash functions
+
+m = size
+
+n = # items inserted
+
+These variables, k, m, and n, should be picked based
+on how acceptable false positives are. If the values
+are picked and the resulting probability is too high,
+the values should be tweaked and the probability
+re-calculated.
+
+## Applications
+
+A bloom filter can be used on a blogging website. If
+the goal is to show readers only articles that they
+have never seen before, a bloom filter is perfect.
+It can store hashed values based on the articles. After
+a user reads a few articles, they can be inserted into
+the filter. The next time the user visits the site,
+those articles can be filtered out of the results.
+
+Some articles will inevitably be filtered out by mistake,
+but the cost is acceptable. It's ok if a user never sees
+a few articles as long as they have other, brand new ones
+to see every time they visit the site.
+
+The popular blog site Medium does a version of this.
+Feel free to read [their article](https://blog.medium.com/what-are-bloom-filters-1ec2a50c68ff).
+
+## References
+
+- [Wikipedia](https://en.wikipedia.org/wiki/Bloom_filter)
+- [Tutorial](http://llimllib.github.io/bloomfilter-tutorial/)
+- [Calculating false positive probability](https://hur.st/bloomfilter/?n=4&p=&m=18&k=3)
+- [Medium blog](https://blog.medium.com/what-are-bloom-filters-1ec2a50c68ff)
+- [YouTube](https://www.youtube.com/watch?v=bEmBh1HtYrw)
diff --git a/src/data-structures/bloom-filter/__test__/BloomFilter.test.js b/src/data-structures/bloom-filter/__test__/BloomFilter.test.js
@@ -0,0 +1,39 @@
+import BloomFilter from '../BloomFilter';
+
// Unit tests for the public surface of BloomFilter: method presence,
// store creation, hash determinism and the "no false negatives" guarantee.
describe('Bloom Filter', () => {
  const people = ['Bruce Wayne', 'Clark Kent', 'Barry Allen'];
  let bloomFilter;

  // Every test starts from a fresh, empty filter of the default size.
  beforeEach(() => {
    bloomFilter = new BloomFilter();
  });

  it('Should have methods named "insert" and "mayContain"', () => {
    ['insert', 'mayContain'].forEach((methodName) => {
      expect(typeof bloomFilter[methodName]).toBe('function');
    });
  });

  it('Should create a new filter store with the appropriate methods', () => {
    const store = bloomFilter.createStore(18);

    ['getValue', 'setValue'].forEach((methodName) => {
      expect(typeof store[methodName]).toBe('function');
    });
  });

  it('Should hash deterministically with all 3 hash functions', () => {
    const str = 'abc';

    // Hashing the same input twice must give the same slot index.
    ['hash1', 'hash2', 'hash3'].forEach((hashName) => {
      expect(bloomFilter[hashName](str)).toEqual(bloomFilter[hashName](str));
    });
  });

  it('Should create an array with 3 hash values', () => {
    const hashValues = bloomFilter.getHashValues('abc');
    expect(hashValues.length).toEqual(3);
  });

  it('Should insert strings correctly and return true when checking for inserted values', () => {
    people.forEach((person) => bloomFilter.insert(person));

    // An inserted item must never be reported as absent.
    people.forEach((person) => {
      expect(bloomFilter.mayContain(person)).toBe(true);
    });
  });
});
diff --git a/src/data-structures/bloom-filter/__test__/BloomFilterFalsePositive.test.js b/src/data-structures/bloom-filter/__test__/BloomFilterFalsePositive.test.js
@@ -0,0 +1,86 @@
+import BloomFilter from '../BloomFilter';
+
+// Adapted from http://stackoverflow.com/questions/1349404/generate-random-string-characters-in-javascript
/**
 * Builds a random identifier made only of ASCII letters (A-Z, a-z).
 * Uses Math.random, so it is suitable for test data only.
 *
 * @param {number} [length=10] - number of characters to generate.
 * @return {string} a string of `length` random letters.
 */
function makeID(length = 10) {
  const possible = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
  const pickLetter = () => possible.charAt(Math.floor(Math.random() * possible.length));

  return Array.from({ length }, pickLetter).join('');
}
+
/**
 * Runs one false-positive experiment: inserts 10 random IDs into a fresh
 * default-sized filter, then looks up `numRandomTests` further random IDs
 * and counts how many the filter reports as possibly present.
 *
 * Note that, despite the function name, the default is 1000 lookups.
 *
 * @param {number} [numRandomTests=1000] - number of random lookups to perform.
 * @return {number} how many lookups returned `true`.
 */
function run10kTrials(numRandomTests = 1000) {
  const bloomFilter = new BloomFilter();

  // Generate all ten IDs first, then insert them.
  const insertedIDs = Array.from({ length: 10 }, () => makeID());
  insertedIDs.forEach((id) => bloomFilter.insert(id));

  let positives = 0;
  for (let trial = 0; trial < numRandomTests; trial += 1) {
    positives += bloomFilter.mayContain(makeID()) ? 1 : 0;
  }

  return positives;
}
+
/**
 * Repeats the false-positive experiment and averages its outcome.
 *
 * @param {number} [numTrials=100] - how many times to call run10kTrials.
 * @return {number} mean number of false positives per experiment.
 */
function testFilter(numTrials = 100) {
  let totalFalsePositives = 0;

  for (let trial = 0; trial < numTrials; trial += 1) {
    totalFalsePositives += run10kTrials();
  }

  return totalFalsePositives / numTrials;
}
+
// Statistical check: the measured false-positive count should sit near the
// theoretical value (1 - e^(-kn/m))^k for k = 3, m = 100, n = 10.
describe('Bloom filter false positives', () => {
  const falsePositiveProbability = 0.0174;
  const expectedFalsePositives = falsePositiveProbability * 1000;
  // Computed while the suite is being defined because the value is
  // interpolated into the test title below.
  const avgFalsePositives = testFilter();

  it(`Should keep false positives close to an expected value:

    # trials = 1000
    k = 3 (hash functions)
    m = 100 (size)
    n = 10 (items inserted)

    Using k, m, and n, plugged into https://hur.st/bloomfilter/?n=10&p=&m=100&k=3
    Chance of false positive = 0.017

    Expected false positives = # trials * chance of false positive
    Expected false positives => 1000 * ${falsePositiveProbability}
    Expected false positives => ${expectedFalsePositives}

    **************************
    EXPECTED = ${expectedFalsePositives}
    ACTUAL AVG = ${avgFalsePositives}
    **************************

    If the expected and actual numbers are far off, something is wrong.
    Inspect manually.`, () => {
    // We give it a large range to avoid unnecessary failures.
    // If it's working correctly, the value should definitely
    // fall within this range.

    // In over 1,000 test runs, none of them ever come close
    // to falling outside of this range.
    const upperLimit = expectedFalsePositives + 5;
    const lowerLimit = expectedFalsePositives - 5;
    expect(avgFalsePositives).toBeGreaterThan(lowerLimit);
    expect(avgFalsePositives).toBeLessThan(upperLimit);
  });
});

-Original file line number
+Diff line change
 *`A`[Fenwick Tree](src/data-structures/tree/fenwick-tree) (Binary Indexed Tree)
 *`A`[Graph](src/data-structures/graph) (both directed and undirected)
 *`A`[Disjoint Set](src/data-structures/disjoint-set)
 +*`A`[Bloom Filter](src/data-structures/bloom-filter)
 ## Algorithms
 |**B-Tree**| log(n) | log(n) | log(n) | log(n) ||
 |**Red-Black Tree**| log(n) | log(n) | log(n) | log(n) ||
 |**AVL Tree**| log(n) | log(n) | log(n) | log(n) ||
 +|**Bloom Filter**|| 1 | 1 |||
 ### Array Sorting Algorithms Complexity
-Original file line number
+Diff line change
@@ @@ -0,0 +1,39 @@ @@
 +importBloomFilterfrom'../BloomFilter';
++
 +describe('Bloom Filter',()=>{
 +letbloomFilter;
 +constpeople=['Bruce Wayne','Clark Kent','Barry Allen'];
++
 +beforeEach(()=>{
 +bloomFilter=newBloomFilter();
 +});
++
 +it('Should have methods named "insert" and "mayContain"',()=>{
 +expect(typeofbloomFilter.insert).toBe('function');
 +expect(typeofbloomFilter.mayContain).toBe('function');
 +});
++
 +it('Should create a new filter store with the appropriate methods',()=>{
 +conststore=bloomFilter.createStore(18);
 +expect(typeofstore.getValue).toBe('function');
 +expect(typeofstore.setValue).toBe('function');
 +});
++
 +it('Should hash deterministically with all 3 hash functions',()=>{
 +conststr='abc';
 +expect(bloomFilter.hash1(str)).toEqual(bloomFilter.hash1(str));
 +expect(bloomFilter.hash2(str)).toEqual(bloomFilter.hash2(str));
 +expect(bloomFilter.hash3(str)).toEqual(bloomFilter.hash3(str));
 +});
++
 +it('Should create an array with 3 hash values',()=>{
 +expect(bloomFilter.getHashValues('abc').length).toEqual(3);
 +});
++
 +it('Should insert strings correctly and return true when checking for inserted values',()=>{
 +people.forEach(person=>bloomFilter.insert(person));
 +expect(bloomFilter.mayContain('Bruce Wayne')).toBe(true);
 +expect(bloomFilter.mayContain('Clark Kent')).toBe(true);
 +expect(bloomFilter.mayContain('Barry Allen')).toBe(true);
 +});
 +});
-Original file line number
+Diff line change
@@ @@ -0,0 +1,86 @@ @@
 +importBloomFilterfrom'../BloomFilter';
++
 +// Adapted from http://stackoverflow.com/questions/1349404/generate-random-string-characters-in-javascript
 +functionmakeID(){
 +constpossible='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
 +letid='';
++
 +for(leti=0;i<10;i+=1){
 +constrandomLength=Math.random()*possible.length;
 +constrandomIndex=Math.floor(randomLength);
 +id+=possible.charAt(randomIndex);
 +}
++
 +returnid;
 +}
++
 +functionrun10kTrials(numRandomTests=1000){
 +constbloomFilter=newBloomFilter();
 +constmockPeopleIDs=[];
++
 +for(leti=0;i<10;i+=1){
 +mockPeopleIDs.push(makeID());
 +}
++
 +mockPeopleIDs.forEach(id=>bloomFilter.insert(id));
 +letnumFalsePositives=0;
++
 +for(letindex=0;index<numRandomTests;index+=1){
 +constrandomID=makeID();
 +if(bloomFilter.mayContain(randomID)){
 +numFalsePositives+=1;
 +}
 +}
++
 +returnnumFalsePositives;
 +}
++
 +functiontestFilter(numTrials=100){
 +constresults=[];
++
 +for(leti=0;i<numTrials;i+=1){
 +results.push(run10kTrials());
 +}
++
 +constsum=results.reduce((cumulative,next)=>cumulative+next,0);
 +returnsum/numTrials;
 +}
++
 +describe('Bloom filter false positives',()=>{
 +constfalsePositiveProbability=0.0174;
 +constexpectedFalsePositives=falsePositiveProbability*1000;
 +constavgFalsePositives=testFilter();
++
 +it(`Should keep false positives close to an expected value:
++
 + # trials = 1000
 + k = 3 (hash functions)
 + m = 100 (size)
 + n = 10 (items inserted)
++
 + Using k, m, and n, plugged into https://hur.st/bloomfilter/?n=3&p=&m=18&k=3
 + Chance of false positive = 0.017
++
 + Expected false positives = # trials * chance of false positive
 + Expected false positives => 1000 * ${falsePositiveProbability}
 + Expected false positives => ${expectedFalsePositives}
++
 + **************************
 + EXPECTED = ${expectedFalsePositives}
 + ACTUAL AVG = ${avgFalsePositives}
 + **************************
++
 + If the expected and actual numbers are far off, something is wrong.
 + Inspect manually.`,()=>{
 +// We give it a large range to avoid unnecessary failures.
 +// If it's working correctly, the value should definitely
 +// fall within this range.
++
 +// In over 1,000 test runs, none of them ever come close
 +// to falling outside of this range.
 +constupperLimit=expectedFalsePositives+5;
 +constlowerLimit=expectedFalsePositives-5;
 +expect(avgFalsePositives).toBeGreaterThan(lowerLimit);
 +expect(avgFalsePositives).toBeLessThan(upperLimit);
 +});
 +});