From 469c04b14c393a9725cb10e9d22429870bb508a3 Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Fri, 24 Jan 2020 23:10:46 +0000 Subject: [PATCH 1/2] Initial bloom filter code contribution (#83) * Added initial bloom filter code. Added changed lang3 dependency from test to compile in pom.xml * added tests + made recommended changes. * Updated documentation * refactored ProtoBloomFilter added tests. * Cleaned up code and added tests * Added CountingBloomFilter * Fixed CountingBloomFilter issues Fixed checkstyle and bug report issues * Initial bloom filter collections checkin * Added unit tests * fixed test cases * Extract BloomFilter as an interface * added missing license info * fixed Jacoco errors * fixed names so build picks up tests * cleaned up Jacoco report for BloomNestedCollection * removed unused code * cleaned up and reformatted * added javadoc fixed issue with BloomNestedCollection detecting duplicates in an edge case. * fixed candidate testing bug * Cleaned up niggling report issues. * fixed javadoc errors * fixed javadoc for java 13 issue * Second set of fixes. * "package private for testing" for methods and properties. * In "Builder": ** Field "hashes" made "final" * removes some "Serializable" implementations. * "StandardBloomFilter" made non "final" fields final and changed "final protected" to "final private". * removed transient fields * made Package name singular * added javadocs for private and protected fields and methods. * Occurrences of "bloom" replaced with "Bloom" * removed checkstyle and findbugs exclusions * Fixed method and class names * Documentation updates * Fixed checkstyle issues Added BloomFilterConfiguration functions for estimation. * added .checkstyle to eclipse ignore section. * renamed test classes to match main class names * Updated the documentation. * Implemented requested changes. Part of COLLECTIONS-728 Changed remaining "get" comments to "gets" etc. Added final where possible and reasonable. 
renamed enum Change to CHANGE fixed missing javadoc links and missed name changes. fixed ProtoBloomFilter hashCode renamed CollectionStatistics to BloomCollectionStatistics renamed CollectionConfiguration to BloomCollectionConfiguration renamed BloomCollectionStatistics.getTxnCount() to getTransactionCount() * Added final set of constructors and tests for them. Cleaned up issues from Gilles Sadowski review * fixes for Gilles Sadowski issues in BloomCollectionStatistics * Update javadoc * renamed match() -> matches() and inverseMatch() -> inverseMatches() This follows the pattern set with the Object.equals() method name. * added isFull() method to check if a bloom filter is full. * Changed gate from StandardBloomFilter to BloomFilter * renamed BloomCollectionX -> BloomFilterGatedX specifically: BloomCollectionConfiguration -> BloomFilterGatedConfiguration BloomCollectionStatistics -> BloomFilterGatedStatistics * Made the StandardBloomFilter(BitSet) constructor public * removed extraneous build() methods from ProtoBloomFilter.Factory * Added Use cases * Initial cut * changes for interface * Changed to Hasher implementation * Added missing files and removed Shape from some BloomFilter calls * Added @since 4.5 tags * fixed javadoc * fixed PMD errors * Added tests and fixed sign extension issues * changed to Byte constant * made BloomFilter.verify*() non final * Added remove(Hasher) for completeness * Replaced private implementation of MurmurHash3 with commons-codec * fixed typo * Removed Hasher.Factory added HashFunction interface * removed Usage.md * made commons-codec dependency optional * Improved performance of Iterator. * renamed instance variable "md" as messageDigest. * updated javadoc * renamed Iter to Iterator and removed unused imports * removed unused imports * Made instance variables final. Also fixed MD5 constructor to throw IllegalStateException if MD5 algo cannot be found. * removed unused imports * Updated javadoc. 
* Added HashFunctionIdentity to replace HashFunctionName Added test cases, updated java doc. Renamed function implementations to reflect actual function. Added comparators for HashFunctionIdentity * fixed naming issues * Updated javadoc * fixed checkstyle issue * Removed link that was causing problems in java 11+ javadoc * changed HashFunctionIdentity.getProcess() to getProcessType() * changed HashFunctionIdentity.getProcess() to getProcessType() * Added package documentation * Added BloomFilter interface and removed unnecessary methods * updated tests and fixed issues * Moved set operations to separate class and updated tests * fixed FindBugs, PMD and Checkstyle errors * fixed javadocs * Added SetOperations and tests * Added javadocs indicating optional commons-codec required * Added another cosine test * Updated to commons-codec 1.14 * fixed typos * moved Hasher to o.a.c.c.b.hasher package * extracted Shape.java and moved to o.a.c.c.b.hasher package * Added javadoc and removed unused imports in testing code * Added isEmpty() method to Hasher --- .gitignore | 3 +- pom.xml | 9 + .../bloomfilter/AbstractBloomFilter.java | 287 ++++++++++ .../bloomfilter/BitSetBloomFilter.java | 142 +++++ .../collections4/bloomfilter/BloomFilter.java | 128 +++++ .../bloomfilter/CountingBloomFilter.java | 271 +++++++++ .../bloomfilter/HasherBloomFilter.java | 143 +++++ .../bloomfilter/SetOperations.java | 165 ++++++ .../bloomfilter/hasher/DynamicHasher.java | 178 ++++++ .../bloomfilter/hasher/HashFunction.java | 35 ++ .../hasher/HashFunctionIdentity.java | 161 ++++++ .../hasher/HashFunctionIdentityImpl.java | 87 +++ .../bloomfilter/hasher/Hasher.java | 104 ++++ .../bloomfilter/hasher/Shape.java | 358 ++++++++++++ .../bloomfilter/hasher/StaticHasher.java | 146 +++++ .../hasher/function/MD5Cyclic.java | 112 ++++ .../hasher/function/Murmur128x86Cyclic.java | 91 +++ .../hasher/function/Murmur32x86Iterative.java | 79 +++ .../hasher/function/ObjectsHashIterative.java | 93 ++++ 
.../hasher/function/package-info.java | 23 + .../bloomfilter/hasher/package-info.java | 24 + .../bloomfilter/package-info.java | 126 +++++ .../bloomfilter/AbstractBloomFilterTest.java | 523 ++++++++++++++++++ .../bloomfilter/BitSetBloomFilterTest.java | 126 +++++ .../bloomfilter/CountingBloomFilterTest.java | 439 +++++++++++++++ .../DefaultBloomFilterMethodsTest.java | 100 ++++ .../bloomfilter/HasherBloomFilterTest.java | 62 +++ .../bloomfilter/SetOperationsTest.java | 348 ++++++++++++ .../hasher/CommonComparatorTest.java | 164 ++++++ .../hasher/DeepComparatorTest.java | 191 +++++++ .../hasher/DynamicHasherBuilderTest.java | 108 ++++ .../bloomfilter/hasher/DynamicHasherTest.java | 144 +++++ .../hasher/HashFunctionIdentityImplTest.java | 92 +++ .../bloomfilter/hasher/ShapeTest.java | 497 +++++++++++++++++ .../bloomfilter/hasher/StaticHasherTest.java | 338 +++++++++++ .../hasher/function/MD5CyclicTest.java | 64 +++ .../function/Murmur128x86CyclicTest.java | 66 +++ .../function/Murmur32x86IterativeTest.java | 64 +++ .../function/ObjectsHashIterativeTest.java | 69 +++ 39 files changed, 6159 insertions(+), 1 deletion(-) create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilter.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilter.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilter.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasher.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunction.java create mode 100644 
src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentity.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImpl.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Shape.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasher.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5Cyclic.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x86Cyclic.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86Iterative.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterative.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/package-info.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilterTest.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilterTest.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterMethodsTest.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilterTest.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java create mode 100644 
src/test/java/org/apache/commons/collections4/bloomfilter/hasher/CommonComparatorTest.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DeepComparatorTest.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherBuilderTest.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherTest.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImplTest.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/ShapeTest.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasherTest.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5CyclicTest.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x86CyclicTest.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86IterativeTest.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterativeTest.java diff --git a/.gitignore b/.gitignore index 6ff3feb943..50cf2cd0f0 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ site-content .settings .classpath .project +.checkstyle # Mac files -.DS_Store \ No newline at end of file +.DS_Store diff --git a/pom.xml b/pom.xml index 9edbb25d7e..72973b6028 100644 --- a/pom.xml +++ b/pom.xml @@ -438,6 +438,9 @@ Vamsi Kavuri + + Claude Warren + @@ -459,6 +462,12 @@ 3.9 test + + commons-codec + commons-codec + 1.14 + true + diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilter.java new file mode 100644 index 0000000000..6949f97313 --- /dev/null +++ 
b/src/main/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilter.java @@ -0,0 +1,287 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.BitSet; +import java.util.PrimitiveIterator.OfInt; + +import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; +import org.apache.commons.collections4.bloomfilter.hasher.Shape; +import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; + +/** + * An abstract Bloom filter providing default implementations for most Bloom filter + * functions. Specific implementations are encouraged to override the methods that can be + * more efficiently implemented. + *

+ * This abstract class provides additional functionality not declared in the interface. + * Specifically: + *

+ * + * @since 4.5 + */ +public abstract class AbstractBloomFilter implements BloomFilter { + + /** + * The shape used by this BloomFilter + */ + private final Shape shape; + + /** + * Gets an array of little-endian long values representing the on bits of this filter. + * bits 0-63 are in the first long. + * + * @return the LongBuffer representation of this filter. + */ + @Override + public abstract long[] getBits(); + + /** + * Creates a StaticHasher that contains the indexes of the bits that are on in this + * filter. + * + * @return a StaticHasher for that produces this Bloom filter. + */ + @Override + public abstract StaticHasher getHasher(); + + /** + * Construct a Bloom filter with the specified shape. + * + * @param shape The shape. + */ + protected AbstractBloomFilter(Shape shape) { + this.shape = shape; + } + + /** + * Verify the other Bloom filter has the same shape as this Bloom filter. + * + * @param other the other filter to check. + * @throws IllegalArgumentException if the shapes are not the same. + */ + protected void verifyShape(BloomFilter other) { + verifyShape(other.getShape()); + } + + /** + * Verify the specified shape has the same shape as this Bloom filter. + * + * @param shape the other shape to check. + * @throws IllegalArgumentException if the shapes are not the same. + */ + protected void verifyShape(Shape shape) { + if (!this.shape.equals(shape)) { + throw new IllegalArgumentException(String.format("Shape %s is not the same as %s", shape, this.shape)); + } + } + + /** + * Verifies that the hasher has the same name as the shape. 
+ * + * @param hasher the Hasher to check + */ + protected void verifyHasher(Hasher hasher) { + if (shape.getHashFunctionIdentity().getSignature() != hasher.getHashFunctionIdentity().getSignature()) { + throw new IllegalArgumentException( + String.format("Hasher (%s) is not the hasher for shape (%s)", + HashFunctionIdentity.asCommonString(hasher.getHashFunctionIdentity()), + shape.toString())); + } + } + + /** + * Gets the shape of this filter. + * + * @return The shape of this filter. + */ + @Override + public final Shape getShape() { + return shape; + } + + /** + * Merge the other Bloom filter into this one. + * + * @param other the other Bloom filter. + */ + @Override + abstract public void merge(BloomFilter other); + + /** + * Merge the decomposed Bloom filter defined by the hasher into this Bloom + * filter. The hasher provides an iterator of bit indexes to enable. + * + * @param hasher the hasher to provide the indexes. + * @throws IllegalArgumentException if the shape argument does not match the shape of + * this filter, or if the hasher is not the specified one + */ + @Override + abstract public void merge(Hasher hasher); + + /** + * Gets the cardinality of this Bloom filter. + * + * @return the cardinality (number of enabled bits) in this filter. + */ + @Override + public int cardinality() { + return BitSet.valueOf(getBits()).cardinality(); + } + + /** + * Performs a logical "AND" with the other Bloom filter and returns the cardinality of + * the result. + * + * @param other the other Bloom filter. + * @return the cardinality of the result of {@code ( this AND other )}. 
+ */ + @Override + public int andCardinality(BloomFilter other) { + verifyShape(other); + long[] mine = getBits(); + long[] theirs = other.getBits(); + int limit = Integer.min(mine.length, theirs.length); + long[] result = new long[limit]; + for (int i = 0; i < limit; i++) { + result[i] = mine[i] & theirs[i]; + } + return BitSet.valueOf(result).cardinality(); + } + + @Override + public int orCardinality(BloomFilter other) { + verifyShape(other); + long[] mine = getBits(); + long[] theirs = other.getBits(); + long[] remainder = null; + long[] result = null; + if (mine.length > theirs.length) { + result = new long[mine.length]; + remainder = mine; + } else { + result = new long[theirs.length]; + remainder = theirs; + + } + int limit = Integer.min(mine.length, theirs.length); + for (int i = 0; i < limit; i++) { + result[i] = mine[i] | theirs[i]; + } + if (limit theirs.length) { + result = new long[mine.length]; + remainder = mine; + } else { + result = new long[theirs.length]; + remainder = theirs; + + } + int limit = Integer.min(mine.length, theirs.length); + for (int i = 0; i < limit; i++) { + result[i] = mine[i] ^ theirs[i]; + } + if (limit This method takes + * advantage of internal structures of BitSetBloomFilter.

+ * + * @param other the other BitSetBloomFilter. + * @return the cardinality of the result of {@code ( this AND other )}. + * @see #andCardinality(BloomFilter) + */ + @Override + public int andCardinality(BloomFilter other) { + if (other instanceof BitSetBloomFilter) { + verifyShape(other); + BitSet result = (BitSet) bitSet.clone(); + result.and(((BitSetBloomFilter)other).bitSet); + return result.cardinality(); + } + return super.andCardinality(other); + } + + + @Override + public int xorCardinality(BloomFilter other) { + if (other instanceof BitSetBloomFilter) { + verifyShape(other); + BitSet result = (BitSet) bitSet.clone(); + result.xor(((BitSetBloomFilter)other).bitSet); + return result.cardinality(); + } + return super.xorCardinality(other); + } + +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java new file mode 100644 index 0000000000..717771d293 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter; + +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; +import org.apache.commons.collections4.bloomfilter.hasher.Shape; +import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; + +/** + * The interface that describes a Bloom filter. + * @since 4.5 + */ +public interface BloomFilter { + + /** + * Gets an array of little-endian long values representing the on bits of this filter. + * bits 0-63 are in the first long. + * + * @return the LongBuffer representation of this filter. + */ + long[] getBits(); + + /** + * Creates a StaticHasher that contains the indexes of the bits that are on in this + * filter. + * + * @return a StaticHasher for that produces this Bloom filter. + */ + StaticHasher getHasher(); + + /** + * Gets the shape of this filter. + * + * @return The shape of this filter. + */ + Shape getShape(); + + /** + * Merge the other Bloom filter into this one. + * + * @param other the other Bloom filter. + */ + void merge(BloomFilter other); + + /** + * Merge the decomposed Bloom filter defined by the hasher into this Bloom + * filter. The hasher provides an iterator of bit indexes to enable. + * + * @param hasher the hasher to provide the indexes. + * @throws IllegalArgumentException if the shape argument does not match the shape of + * this filter, or if the hasher is not the specified one + */ + void merge(Hasher hasher); + + /** + * Gets the cardinality of this Bloom filter. + *

This is also known as the Hamming value.

+ * + * @return the cardinality (number of enabled bits) in this filter. + */ + int cardinality(); + + /** + * Performs a logical "AND" with the other Bloom filter and returns the cardinality of + * the result. + * + * @param other the other Bloom filter. + * @return the cardinality of the result of {@code ( this AND other )}. + */ + int andCardinality(BloomFilter other); + + /** + * Performs a logical "OR" with the other Bloom filter and returns the cardinality of + * the result. + * + * @param other the other Bloom filter. + * @return the cardinality of the result of {@code ( this OR other )}. + */ + int orCardinality(BloomFilter other); + + /** + * Performs a logical "XOR" with the other Bloom filter and returns the cardinality of + * the result. + * + * @param other the other Bloom filter. + * @return the cardinality of the result of {@code( this XOR other )} + */ + int xorCardinality(BloomFilter other); + + /** + * Performs a contains check. Effectively this AND other == other. + * + * @param other the Other Bloom filter. + * @return true if this filter matches the other. + */ + boolean contains(BloomFilter other); + + /** + * Performs a contains check against a decomposed Bloom filter. The shape must match + * the shape of this filter. The hasher provides bit indexes to check for. Effectively + * decomposed AND this == decomposed. + * + * @param hasher The hasher containing the bits to check. + * @return true if this filter contains the other. 
+ * @throws IllegalArgumentException if the shape argument does not match the shape of + * this filter, or if the hasher is not the specified one + */ + boolean contains(Hasher hasher); + + + + +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java new file mode 100644 index 0000000000..e8dc80db3a --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java @@ -0,0 +1,271 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.AbstractMap; +import java.util.BitSet; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Map; +import java.util.PrimitiveIterator.OfInt; +import java.util.function.Consumer; +import java.util.function.IntConsumer; +import java.util.Set; +import java.util.TreeMap; +import java.util.stream.Stream; + +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; +import org.apache.commons.collections4.bloomfilter.hasher.Shape; +import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; + + +/** + * A counting Bloom filter. + * This Bloom filter maintains a count of the number of times a bit has been + * turned on. This allows for removal of Bloom filters from the filter. + *

+ * This implementation uses a map to track enabled bit counts + *

+ * + * @since 4.5 + */ +public class CountingBloomFilter extends AbstractBloomFilter { + + /** + * the count of entries. Each enabled bit is a key with the count for that bit + * being the value. Entries with a value of zero are removed. + */ + private final TreeMap counts; + + /** + * Constructs a counting Bloom filter from a hasher and a shape. + * + * @param hasher The hasher to build the filter from. + * @param shape The shape of the resulting filter. + */ + public CountingBloomFilter(Hasher hasher, Shape shape) { + super(shape); + verifyHasher(hasher); + counts = new TreeMap(); + Set idxs = new HashSet(); + hasher.getBits(shape).forEachRemaining((IntConsumer) idxs::add); + idxs.stream().forEach(idx -> counts.put(idx, 1)); + } + + /** + * Constructs an empty Counting filter with the specified shape. + * + * @param shape The shape of the resulting filter. + */ + public CountingBloomFilter(Shape shape) { + super(shape); + this.counts = new TreeMap(); + } + + /** + * Constructs a counting Bloom filter with the provided counts and shape + * + * @param counts A map of data counts. + * @param shape The shape of the resulting filter. + */ + public CountingBloomFilter(Map counts, Shape shape) { + this(shape); + counts.entrySet().stream().forEach( e -> { + if (e.getKey() >= shape.getNumberOfBits()) + { + throw new IllegalArgumentException( "dataMap has an item with an index larger than "+ + (shape.getNumberOfBits()-1) ); + } + else if (e.getKey() < 0) + { + throw new IllegalArgumentException( "dataMap has an item with an index less than 0" ); + } + if (e.getValue() < 0) { + throw new IllegalArgumentException( "dataMap has an item with an value less than 0" ); + } else if (e.getValue() > 0) + { + this.counts.put( e.getKey(), e.getValue() ); + }}); + } + + /** + * Gets the count for each enabled bit. + * + * @return an immutable map of enabled bits (key) to counts for that bit + * (value). 
+ */ + public Stream> getCounts() { + return counts.entrySet().stream() + .map(e -> new AbstractMap.SimpleEntry(e.getKey(), e.getValue())); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder("{ "); + for (Map.Entry e : counts.entrySet()) { + sb.append(String.format("(%s,%s) ", e.getKey(), e.getValue())); + } + return sb.append("}").toString(); + } + + /** + * Merge this Bloom filter with the other creating a new filter. The counts for + * bits that are on in the other filter are incremented. + *

+ * For each bit that is turned on in the other filter; if the other filter is + * also a CountingBloomFilter the count is added to this filter, otherwise the + * count is incremented by one. + *

+ * + * @param other the other filter. + */ + @Override + public void merge(BloomFilter other) { + verifyShape(other); + if (other instanceof CountingBloomFilter) + { + merge(((CountingBloomFilter)other).counts.keySet().iterator()); + } else { + merge(BitSet.valueOf(other.getBits()).stream().iterator()); + } + } + + @Override + public void merge(Hasher hasher) { + verifyHasher( hasher ); + merge( hasher.getBits(getShape()) ); + } + + /** + * Merge an iterator of set bits into this filter. + * @param iter the iterator of bits to set. + */ + private void merge(Iterator iter) { + iter.forEachRemaining(idx -> { + Integer val = counts.get(idx); + if (val == null) { + counts.put(idx, 1 ); + } else if (val == Integer.MAX_VALUE) { + throw new IllegalStateException( "Overflow on index "+idx); + } else { + counts.put( idx, val+1 ); + } + }); + } + + /** + * Decrement the counts for the bits that are on in the other BloomFilter from this + * one. + * + *

+ * For each bit that is turned on in the other filter the count is decremented by 1. + *

+ * + * @param other the other filter. + */ + public void remove(BloomFilter other) { + verifyShape(other); + if (other instanceof CountingBloomFilter) + { + remove(((CountingBloomFilter)other).counts.keySet().stream()); + } else { + remove(BitSet.valueOf(other.getBits()).stream().boxed()); + } + } + + /** + * Decrement the counts for the bits that are on in the hasher from this + * Bloom filter. + * + *

+ * For each bit that is turned on in the other filter the count is decremented by 1. + *

+ * + * @param hasher the hasher to generate bits. + */ + public void remove(Hasher hasher) { + verifyHasher( hasher ); + Set lst = new HashSet(); + hasher.getBits(getShape()).forEachRemaining( (Consumer)lst::add ); + remove(lst.stream()); + } + + /** + * Decrements the counts for the bits specified in the Integer stream. + * + * @param idxStream The stream of bit counts to decrement. + */ + private void remove(Stream idxStream) { + idxStream.forEach(idx -> { + Integer val = counts.get(idx); + if (val != null) { + if (val - 1 == 0) { + counts.remove(idx); + } else { + counts.put(idx, val - 1); + } + } + if (val == null || val == 0) { + throw new IllegalStateException( "Underflow on index "+idx); + } else if (val - 1 == 0) { + counts.remove(idx); + } else { + counts.put(idx, val - 1); + } + }); + } + + @Override + public long[] getBits() { + BitSet bs = new BitSet(); + counts.keySet().stream().forEach(bs::set); + return bs.toLongArray(); + } + + @Override + public StaticHasher getHasher() { + return new StaticHasher(counts.keySet().iterator(), getShape()); + } + + @Override + public boolean contains(Hasher hasher) { + verifyHasher(hasher); + OfInt iter = hasher.getBits(getShape()); + while (iter.hasNext()) { + if (counts.get(iter.nextInt()) == null) { + return false; + } + } + return true; + } + + @Override + public int cardinality() { + return counts.size(); + } + + @Override + public int andCardinality(BloomFilter other) { + if (other instanceof CountingBloomFilter) { + Set result = new HashSet( counts.keySet()); + result.retainAll( ((CountingBloomFilter)other).counts.keySet() ); + return result.size(); + } + return super.andCardinality(other); + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilter.java new file mode 100644 index 0000000000..fb2722ada8 --- /dev/null +++ 
b/src/main/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilter.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Arrays; +import java.util.Set; +import java.util.TreeSet; +import java.util.PrimitiveIterator.OfInt; +import java.util.function.IntConsumer; + +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; +import org.apache.commons.collections4.bloomfilter.hasher.Shape; +import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; +import org.apache.commons.collections4.iterators.EmptyIterator; +import org.apache.commons.collections4.iterators.IteratorChain; + +/** + * A Bloom filter built on a single hasher. This filter type should only be used for small + * filters (few on bits). While this implementation correctly supports the merge() methods + * it is recommended that if merges are expected that one of the other Bloom filter + * implementations be used. + * @since 4.5 + */ +public class HasherBloomFilter extends AbstractBloomFilter { + + /** + * The internal hasher representation. 
+     */
+    private StaticHasher hasher;
+
+    /**
+     * Constructs a HasherBloomFilter from a hasher and a shape.
+     *
+     * @param hasher the hasher to use.
+     * @param shape the shape of the Bloom filter.
+     */
+    public HasherBloomFilter(Hasher hasher, Shape shape) {
+        super(shape);
+        verifyHasher(hasher);
+        if (hasher instanceof StaticHasher) {
+            // a static hasher is stored as-is, so its own shape is verified against this filter
+            this.hasher = (StaticHasher) hasher;
+            verifyShape(this.hasher.getShape());
+        } else {
+            this.hasher = new StaticHasher(hasher, shape);
+        }
+    }
+
+    /**
+     * Constructs an empty HasherBloomFilter from a shape.
+     *
+     * @param shape the shape of the Bloom filter.
+     */
+    public HasherBloomFilter(Shape shape) {
+        super(shape);
+        this.hasher = new StaticHasher(EmptyIterator.emptyIterator(), shape);
+    }
+
+    @Override
+    public long[] getBits() {
+        if (hasher.size() == 0) {
+            return new long[0];
+        }
+        int n = (int) Math.ceil(hasher.getShape().getNumberOfBits() * 1.0 / Long.SIZE);
+        long[] result = new long[n];
+        OfInt iter = hasher.getBits(hasher.getShape());
+        iter.forEachRemaining((IntConsumer) idx -> {
+            // set bit (idx mod 64) in the long that holds this index
+            result[idx / Long.SIZE] |= 1L << Math.floorMod(idx, Long.SIZE);
+        });
+
+        // trim trailing longs that have no bits enabled
+        int limit = result.length;
+        while (limit > 0 && result[limit - 1] == 0) {
+            limit--;
+        }
+        if (limit == 0) {
+            return new long[0];
+        }
+        if (limit < result.length) {
+            return Arrays.copyOf(result, limit);
+        }
+        return result;
+    }
+
+    @Override
+    public StaticHasher getHasher() {
+        return hasher;
+    }
+
+    @Override
+    public void merge(BloomFilter other) {
+        merge(other.getHasher());
+    }
+
+    @Override
+    public void merge(Hasher hasher) {
+        verifyHasher(hasher);
+        // rebuild the internal hasher from the current bits followed by the new bits
+        IteratorChain<Integer> iter = new IteratorChain<>(this.hasher.getBits(getShape()),
+            hasher.getBits(getShape()));
+        this.hasher = new StaticHasher(iter, getShape());
+    }
+
+    @Override
+    public int cardinality() {
+        return hasher.size();
+    }
+
+    @Override
+    public boolean contains(Hasher hasher) {
+        verifyHasher(hasher);
+        Set<Integer> set = new TreeSet<>();
+        hasher.getBits(getShape()).forEachRemaining((IntConsumer) set::add);
+        if (set.isEmpty()) {
+            // A hasher that selects no bits is contained in every filter, including an
+            // empty one (the loop below would never run and would report false).
+            return true;
+        }
+        // strike off every requested index this filter has enabled; all struck off == contained
+        OfInt iter = this.hasher.getBits(getShape());
+        while (iter.hasNext()) {
+            int idx = iter.nextInt();
+            set.remove(idx);
+            if (set.isEmpty()) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+}
diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java new file mode 100644 index 0000000000..a31bbb16fd --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import org.apache.commons.collections4.bloomfilter.hasher.Shape; + +/** + * Implementations of set operations on Bloom filters. + * + */ +public final class SetOperations { + + /** + * Do not instantiate. + */ + private SetOperations() {} + + /** + * Verifies the Bloom filters have the same shape. + * + * @param first the first filter to check. + * @param second the second filter to check. 
+ * @throws IllegalArgumentException if the shapes are not the same. + */ + private static void verifyShape(BloomFilter first, BloomFilter second) { + if (!first.getShape().equals(second.getShape())) { + throw new IllegalArgumentException(String.format("Shape %s is not the same as %s", + first.getShape(), second.getShape())); + } + } + + /** + * Calculates the Hamming distance between two Bloom filters. + * + * @param first the first Bloom filter. + * @param second the second Bloom filter. + * @return the Hamming distance. + */ + public static int hammingDistance(BloomFilter first, BloomFilter second) { + verifyShape(first,second); + return first.xorCardinality(second); + } + + + /** + * Calculates the Jaccard similarity between two Bloom filters. + * + *

Also known as Jaccard index, Intersection over Union, and Jaccard similarity coefficient

+     *
+     * @param first the first Bloom filter.
+     * @param second the second Bloom filter.
+     * @return the Jaccard similarity: the number of bits enabled in both filters divided by
+     * the number of bits enabled in either filter, or 0 if neither filter has any bit enabled.
+     * @throws IllegalArgumentException if the shapes are not the same.
+     */
+    public static double jaccardSimilarity(BloomFilter first, BloomFilter second) {
+        verifyShape(first, second);
+        int orCard = first.orCardinality(second);
+        // Similarity is intersection over union. Dividing the Hamming distance (xor
+        // cardinality) by the union would yield the Jaccard distance instead.
+        // A zero union means both filters are empty; return 0 rather than divide by zero.
+        return orCard == 0 ? 0 : first.andCardinality(second) / (double) orCard;
+    }
+
+    /**
+     * Calculates the Jaccard distance between two Bloom filters.
+     *
+     *

Jaccard distance is defined as {@code 1 - Jaccard similarity}

+ * + * @param first the first Bloom filter. + * @param second the second Bloom filter. + * @return the Jaccard distance. + */ + public static double jaccardDistance(BloomFilter first, BloomFilter second) { + return 1.0 - jaccardSimilarity(first,second); + } + + /** + * Calculates the Cosine similarity between two Bloom filters. + *

Also known as Orchini similarity and the Tucker coefficient of congruence or + * Ochiai similarity.

+ * + *

If either filter is empty (no enabled bits) the result is 0 (zero)

+ * + * @param first the first Bloom filter. + * @param second the second Bloom filter. + * @return the Cosine similarity. + */ + public static double cosineSimilarity(BloomFilter first, BloomFilter second) { + verifyShape(first,second); + int numerator = first.andCardinality(second); + + return numerator==0?0:numerator / (Math.sqrt(first.cardinality()) * Math.sqrt(second.cardinality())); + } + + /** + * Calculates the Cosine distance between two Bloom filters. + * + *

Cosine distance is defined as {@code 1 - Cosine similarity}

+ * + * @param first the first Bloom filter. + * @param second the second Bloom filter. + * @return the Cosine distance. + */ + public static double cosineDistance(BloomFilter first, BloomFilter second) { + return 1.0 - cosineSimilarity(first,second); + } + + /** + * Estimates the number of items in the Bloom filter based on the shape and the number + * of bits that are enabled. If every bit is enabled (cardinality equal to the number of + * bits) the estimate is infinite and {@code Long.MAX_VALUE} is returned. + * + * @param filter the Bloom filter to estimate size for. + * @return an estimate of the number of items that were placed in the Bloom filter. + */ + public static long estimateSize(BloomFilter filter) { + Shape shape = filter.getShape(); + double estimate = -(shape.getNumberOfBits() * + Math.log(1.0 - filter.cardinality() * 1.0 / shape.getNumberOfBits())) / + shape.getNumberOfHashFunctions(); + return Math.round(estimate); + } + + /** + * Estimates the number of items in the union of the sets represented by two + * Bloom filters. + * + * @param first the first Bloom filter. + * @param second the second Bloom filter. + * @return an estimate of the size of the union between the two filters. + */ + public static long estimateUnionSize(BloomFilter first, BloomFilter second) { + verifyShape(first,second); + Shape shape = first.getShape(); + double estimate = -(shape.getNumberOfBits() * + Math.log(1.0 - first.orCardinality(second) * 1.0 / shape.getNumberOfBits())) / + shape.getNumberOfHashFunctions(); + return Math.round(estimate); + } + + /** + * Estimates the number of items in the intersection of the sets represented by two + * Bloom filters. + * + * @param first the first Bloom filter. + * @param second the second Bloom filter. + * @return an estimate of the size of the intersection between the two filters. + */ + public static long estimateIntersectionSize(BloomFilter first, BloomFilter second) { + verifyShape(first,second); + // do subtraction early to avoid Long overflow. 
+ return estimateSize(first) - estimateUnionSize(first,second) + estimateSize(second); + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasher.java new file mode 100644 index 0000000000..aa607791e9 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasher.java @@ -0,0 +1,178 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.NoSuchElementException; +import java.util.PrimitiveIterator; + +/** + * The class that performs hashing on demand. + * @since 4.5 + */ +public class DynamicHasher implements Hasher { + + /** + * The list of byte arrays that are to be hashed. + */ + private final List buffers; + + /** + * The function to hash the buffers. + */ + private final HashFunction function; + + /** + * Constructs a DynamicHasher. + * + * @param function the function to use. + * @param buffers the byte buffers that will be hashed. 
+ */ + public DynamicHasher(HashFunction function, List buffers) { + this.buffers = new ArrayList(buffers); + this.function = function; + } + + @Override + public HashFunctionIdentity getHashFunctionIdentity() { + return function; + } + + @Override + public boolean isEmpty() { + return buffers.isEmpty(); + } + + /** + * Return an iterator of integers that are the bits to enable in the Bloom filter + * based on the shape. The iterator may return the same value multiple times. There is + * no guarantee made as to the order of the integers. + * + * @param shape the shape of the desired Bloom filter. + * @return the Iterator of integers. + * @throws IllegalArgumentException if the hash function identity of the shape does not + * match {@code getHashFunctionIdentity()} when compared with + * {@code HashFunctionIdentity.COMMON_COMPARATOR}. + */ + @Override + public PrimitiveIterator.OfInt getBits(Shape shape) { + if (HashFunctionIdentity.COMMON_COMPARATOR.compare(getHashFunctionIdentity(), + shape.getHashFunctionIdentity()) != 0) { + throw new IllegalArgumentException( + String.format("Shape hasher %s is not %s", + HashFunctionIdentity.asCommonString(shape.getHashFunctionIdentity()), + HashFunctionIdentity.asCommonString(getHashFunctionIdentity()))); + } + return new Iterator(shape); + } + + /** + * The iterator of integers. + */ + private class Iterator implements PrimitiveIterator.OfInt { + private int buffer = 0; + private int funcCount = 0; + private final Shape shape; + + /** + * Creates iterator with the specified shape. 
+ * + * @param shape + */ + private Iterator(Shape shape) { + this.shape = shape; + } + + @Override + public boolean hasNext() { + if (buffers.isEmpty()) { + return false; + } + return buffer < buffers.size() - 1 || funcCount < shape.getNumberOfHashFunctions(); + } + + @Override + public int nextInt() { + if (hasNext()) { + if (funcCount >= shape.getNumberOfHashFunctions()) { + funcCount = 0; + buffer++; + } + return (int) Math.floorMod(function.apply(buffers.get(buffer), funcCount++), + (long) shape.getNumberOfBits()); + } + throw new NoSuchElementException(); + } + } + + /** + * The builder for DynamicHashers. + * @since 4.5 + */ + public static class Builder implements Hasher.Builder { + /** + * The list of byte[] that are to be hashed. + */ + private final List buffers; + + /** + * The function that the resulting DynamicHasher will use. + */ + private final HashFunction function; + + /** + * Constructs a DynamicHasher builder. + * + * @param function the function implementation. + */ + public Builder(HashFunction function) { + this.function = function; + this.buffers = new ArrayList(); + + } + + /** + * Builds the hasher. + * + * @return A DynamicHasher with the specified name, function and buffers. 
+ */ + @Override + public DynamicHasher build() throws IllegalArgumentException { + return new DynamicHasher(function, buffers); + } + + @Override + public final Builder with(byte property) { + return with(new byte[] {property}); + } + + @Override + public final Builder with(byte[] property) { + buffers.add(property); + return this; + } + + @Override + public final Builder with(String property) { + return with(property.getBytes(StandardCharsets.UTF_8)); + } + + } + +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunction.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunction.java new file mode 100644 index 0000000000..24d2af3a6c --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunction.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.hasher; + +/** + * Defines a Hash Function used by Hashers. + * @since 4.5 + */ +public interface HashFunction extends HashFunctionIdentity { + + + /** + * Apply the hash function to the buffer. + * @param buffer the buffer to apply the hash function to. + * @param seed the seed for the hashing. 
+ * @return the long value of the hash. + */ + long apply( byte[] buffer, int seed ); + +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentity.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentity.java new file mode 100644 index 0000000000..b33671cab2 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentity.java @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import java.nio.charset.StandardCharsets; +import java.util.Comparator; +import java.util.Locale; + +/** + * Defines the identity of a hash function used by Hashers. + * + * @since 4.5 + */ +public interface HashFunctionIdentity { + + /** + * A comparator implementation that performs the most common comparison using the + * HashFunctionIdentity name, signedness, and process. 
+ */ + Comparator COMMON_COMPARATOR = new Comparator() { + + @Override + public int compare(HashFunctionIdentity identity1, HashFunctionIdentity identity2) { + int result = identity1.getName().compareToIgnoreCase(identity2.getName()); + if (result == 0) { + result = identity1.getSignedness().compareTo(identity2.getSignedness()); + } + if (result == 0) { + result = identity1.getProcessType().compareTo(identity2.getProcessType()); + } + return result; + } + }; + + /** + * A comparator implementation that performs a deep comparison using the + * HashFunctionIdentity name, signedness, process type, and provider. The provider is + * only compared (ignoring case) when the common comparison finds no difference. + */ + Comparator DEEP_COMPARATOR = new Comparator() { + + @Override + public int compare(HashFunctionIdentity identity1, HashFunctionIdentity identity2) { + int result = COMMON_COMPARATOR.compare(identity1, identity2); + if (result == 0) { + result = identity1.getProvider().compareToIgnoreCase(identity2.getProvider()); + } + return result; + } + }; + + /** + * Get a common formatted string for general display. + * + * @param identity the identity to format. + * @return the String representing the identity. + */ + static String asCommonString(HashFunctionIdentity identity) { + return String.format("%s-%s-%s", identity.getName(), identity.getSignedness(), identity.getProcessType()); + } + + /** + * Get the signature buffer for a HashFunctionIdentity. + *

+ * The signature of this function is calculated as: + * {@code + * apply( String.format( "%s-%s-%s", getName().toUpperCase( Locale.ROOT ), getSignedness(), getProcessType() ) + * .getBytes( "UTF-8" ), 0 ); + * } + *

+ * @param identity The HashFunctionIdentity to create the buffer for. + * @return the signature buffer for the identity + */ + static byte[] prepareSignatureBuffer(HashFunctionIdentity identity) { + + return String.format( "%s-%s-%s", + identity.getName().toUpperCase(Locale.ROOT), identity.getSignedness(), + identity.getProcessType() ).getBytes(StandardCharsets.UTF_8); + + } + + /** + * An enum that identifies the Signedness of the calculations for this function. + */ + enum Signedness { + SIGNED, UNSIGNED + }; + + /** + * An enum that identifies the process type of this function.
Iterative + * processes
Call the underlying algorithm for each buffer, seed pair call to + * {@code apply}.
Cyclic processes
Call the underlying algorithm to + * generate two values for each buffer. It returns the first value on the call with + * seed 0, and increments the result with the second value before returning it on all + * subsequent calls.
+ */ + enum ProcessType { + CYCLIC, ITERATIVE + }; + + /** + * Gets the name of this hash function. + *

Hash function should be the common name + * for the hash. This may include indications as to hash length + *

+ * Names are not case specific. Thus, "MD5" and "md5" should be considered as the same. + *

+ * @return the Hash name + */ + String getName(); + + /** + * Gets the name of the provider of this hash function implementation. + *

+ * Provider names are not case specific. Thus, "Apache Commons Collection" and + * "apache commons collection" should be considered as the same. + *

+ * @return the name of the provider of this hash implementation. + */ + String getProvider(); + + /** + * Gets the signedness of this function. + * + * @return signedness of this function. + */ + Signedness getSignedness(); + + /** + * Gets the process of this function. + * + * @return process of this function. + */ + ProcessType getProcessType(); + + /** + * Get the signature of this function.

The signature of this function is + * calculated as: {@code + * apply( String.format( "%s-%s-%s", getName().toUpperCase( Locale.ROOT ), getSignedness(), getProcessType() ) + * .getBytes( "UTF-8" ), 0 ); + * }

+ * + * @return the signature of this function. + */ + long getSignature(); + +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImpl.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImpl.java new file mode 100644 index 0000000000..1d2124b7e3 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImpl.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.hasher; + +/** + * An instance of HashFunctionIdentity that is suitable for deserializing + * HashFunctionIdentity data from a stream or any other situation where the + * hash function is not available but the identify of the function is required. + * + * @since 4.5 + */ +public final class HashFunctionIdentityImpl implements HashFunctionIdentity { + private final String name; + private final String provider; + private final Signedness signedness; + private final ProcessType process; + private final long signature; + + /** + * Creates a copy of the HashFunctionIdentity. + * @param identity the identity to copy. 
+ */ + public HashFunctionIdentityImpl( HashFunctionIdentity identity) { + this.name = identity.getName(); + this.provider = identity.getProvider(); + this.signedness = identity.getSignedness(); + this.process = identity.getProcessType(); + this.signature = identity.getSignature(); + } + + /** + * Creates a HashFunctionIdentity from component values. + * @param provider the name of the provider. + * @param name the name of the hash function. + * @param signedness the signedness of the hash function. + * @param process the processes of the hash function. + * @param signature the signature for the hash function. + */ + public HashFunctionIdentityImpl( String provider, String name, Signedness signedness, ProcessType process, + long signature) { + this.name = name; + this.provider = provider; + this.signedness = signedness; + this.process = process; + this.signature = signature; + } + @Override + public String getName() { + return name; + } + + @Override + public String getProvider() { + return provider; + } + + @Override + public Signedness getSignedness() { + return signedness; + } + + @Override + public ProcessType getProcessType() { + return process; + } + + @Override + public long getSignature() { + return signature; + } + +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java new file mode 100644 index 0000000000..38e2e46c39 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import java.util.PrimitiveIterator; + + +/** + * The class that performs hashing. + *

+ * Hashers have a Unique name based on the hashing algorithm used. + *

+ * Implementations of {@code getBits()} may return duplicate values and may return + * values in a random order. See implementation javadoc notes as to the guarantees + * provided by the specific implementation. + *

+ * @since 4.5 + */ +public interface Hasher { + + /** + * Gets HashFunctionIdentity of the hash function this Hasher uses. + * + * @return HashFunctionIdentity of the hash function this Hasher uses. + */ + HashFunctionIdentity getHashFunctionIdentity(); + + /** + * Returns true if the hasher specifies no bits. + * @return true if the hasher does not specify any bits. + */ + boolean isEmpty(); + + /** + * Return an iterator of integers that are the bits to enable in the Bloom + * filter based on the shape. No guarantee is made as to order + * or duplication of values. + * + * @param shape the shape of the desired Bloom filter. + * @return the Iterator of integers; + * @throws IllegalArgumentException if {@code shape.getHasherName()} does not + * equal {@code getName()} + */ + PrimitiveIterator.OfInt getBits(Shape shape); + + /** + * A builder to build a hasher. + * @since 4.5 + */ + interface Builder { + /** + * Build the hasher. + * @return the fully constructed hasher. + */ + Hasher build(); + + /** + * Adds a byte to the hasher. + * + * @param property the byte to add + * @return {@code this} for chaining. + * @throws IllegalStateException if the Hasher is locked. + * @see #getBits(Shape) + */ + Builder with(byte property); + + /** + * Adds an array of bytes to the hasher. + * + * @param property the array of bytes to add. + * @return {@code this} for chaining. + * @throws IllegalStateException if the Hasher is locked. + * @see #getBits(Shape) + */ + Builder with(byte[] property); + + /** + * Adds a string to the hasher. The string is converted to a byte array using + * the UTF-8 Character set. + * + * @param property the string to add. + * @return {@code this} for chaining. + * @throws IllegalStateException if the Hasher is locked. 
+ * @see #getBits(Shape) + */ + Builder with(String property); + + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Shape.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Shape.java new file mode 100644 index 0000000000..041078ed85 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Shape.java @@ -0,0 +1,358 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import java.util.Objects; + +/** + * The definition of a Bloom filter shape. + * + *

This class contains the values for the filter configuration and is used to + * convert a Hasher into a BloomFilter as well as verify that two Bloom filters are + * compatible. (i.e. can be compared or merged)

+ * + *

Interrelatedness of values

+ * + *
Number of Items (AKA: {@code n})
+ *
{@code n = ceil(m / (-k / log(1 - exp(log(p) / k))))}
Probability of + * Collision (AKA: {@code p})
{@code p = (1 - exp(-kn/m))^k}
Number + * of Bits (AKA: {@code m})
+ *
{@code m = ceil((n * log(p)) / log(1 / pow(2, log(2))))}
Number of + * Functions (AKA: {@code k})
{@code k = round((m / n) * log(2))}
+ * + *

Comparisons

For purposes of equality checking and hashCode + * calculations a {@code Shape} is defined by the hashing function identity, the number of + * bits ({@code m}), and the number of functions ({@code k}).

+ * + * @see Bloom Filter calculator + * @see Bloom filter + * [Wikipedia] + * @since 4.5 + */ +public class Shape { + + /** + * The natural logarithm of 2. Used in several calculations. approx 0.693147180 + */ + private static final double LOG_OF_2 = Math.log(2.0); + + /** + * log(1 / 2^log(2)), which equals -(log(2)^2); approx -0.480453014. Used in calculating the number of bits. + */ + private static final double DENOMINATOR = Math.log(1.0 / (Math.pow(2.0, LOG_OF_2))); + /** + * number of items in the filter. (AKA: {@code n}) + */ + private final int numberOfItems; + /** + * number of bits in the filter. (AKA: {@code m}) + */ + private final int numberOfBits; + /** + * number of hash functions. (AKA: {@code k}) + */ + private final int numberOfHashFunctions; + + /** + * The hash code for this filter. + */ + private final int hashCode; + + /** + * The identity of the hasher function. + */ + private final HashFunctionIdentity hashFunctionIdentity; + + /** + * Create a filter configuration with the specified number of items and + * probability.

The actual probability will be approximately equal to the + * desired probability but will be dependent upon the calculated bloom filter size + * and function count.

+ * + * @param hashFunctionIdentity The HashFunctionIdentity of the hash function this shape uses. + * @param numberOfItems Number of items to be placed in the filter. + * @param probability The desired probability of duplicates. Must be in the range + * (0.0,1.0). + */ + public Shape(HashFunctionIdentity hashFunctionIdentity, final int numberOfItems, final double probability) { + if (hashFunctionIdentity == null) { + throw new IllegalArgumentException("Hash function identity may not be null"); + } + if (numberOfItems < 1) { + throw new IllegalArgumentException("Number of Items must be greater than 0"); + } + if (probability <= 0.0) { + throw new IllegalArgumentException("Probability must be greater than 0.0"); + } + if (probability >= 1.0) { + throw new IllegalArgumentException("Probability must be less than 1.0"); + } + this.hashFunctionIdentity = hashFunctionIdentity; + this.numberOfItems = numberOfItems; + /* + * number of bits is called "m" in most mathematical statement describing + * bloom filters so we use it here. + */ + final double m = Math.ceil(numberOfItems * Math.log(probability) / DENOMINATOR); + if (m > Integer.MAX_VALUE) { + throw new IllegalArgumentException("Resulting filter has more than " + Integer.MAX_VALUE + " bits"); + } + this.numberOfBits = (int) m; + numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); + hashCode = generateHashCode(); + // check that probability is within range + getProbability(); + + } + + /** + * Create a filter configuration with the specified number of items and + * probability. + * + * @param hashFunctionIdentity The HashFunctionIdentity of the hash function this shape uses. + * @param numberOfItems Number of items to be placed in the filter. + * @param numberOfBits The number of bits in the filter. 
+ */ + public Shape(final HashFunctionIdentity hashFunctionIdentity, final int numberOfItems, final int numberOfBits) { + if (hashFunctionIdentity == null) { + throw new IllegalArgumentException("Hash function name may not be null"); + } + if (numberOfItems < 1) { + throw new IllegalArgumentException("Number of Items must be greater than 0"); + } + if (numberOfBits < 8) { + throw new IllegalArgumentException("Number of Bits must be greater than or equal to 8"); + } + this.hashFunctionIdentity = hashFunctionIdentity; + this.numberOfItems = numberOfItems; + this.numberOfBits = numberOfBits; + this.numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); + hashCode = generateHashCode(); + // getProbability() throws if the calculated probability is >= 1.0 + getProbability(); + + } + + /** + * Create a filter configuration with the specified number of items, + * number of bits and number of hash functions. + * + * @param hashFunctionIdentity The HashFunctionIdentity of the hash function this shape uses. + * @param numberOfItems Number of items to be placed in the filter. + * @param numberOfBits The number of bits in the filter. + * @param numberOfHashFunctions The number of hash functions in the filter. 
+ */ + public Shape(final HashFunctionIdentity hashFunctionIdentity, final int numberOfItems, final int numberOfBits, + final int numberOfHashFunctions) { + if (hashFunctionIdentity == null) { + throw new IllegalArgumentException("Hash function name may not be null"); + } + if (numberOfItems < 1) { + throw new IllegalArgumentException("Number of Items must be greater than 0"); + } + if (numberOfBits < 8) { + throw new IllegalArgumentException("Number of Bits must be greater than or equal to 8"); + } + if (numberOfHashFunctions < 1) { + throw new IllegalArgumentException("Number of Hash Functions must be greater than or equal to 1"); + } + this.hashFunctionIdentity = hashFunctionIdentity; + this.numberOfItems = numberOfItems; + this.numberOfBits = numberOfBits; + this.numberOfHashFunctions = numberOfHashFunctions; + hashCode = generateHashCode(); + // getProbability() throws if the calculated probability is >= 1.0 + getProbability(); + + } + + /** + * Create a filter configuration with the specified probability, number of bits + * and number of hash functions. The number of items is calculated from them. + * + * @param hashFunctionIdentity The HashFunctionIdentity of the hash function this shape uses. + * @param probability The probability of duplicates. Must be in the range + * (0.0,1.0). + * @param numberOfBits The number of bits in the filter. + * @param numberOfHashFunctions The number of hash functions in the filter. 
+ */ + public Shape(final HashFunctionIdentity hashFunctionIdentity, final double probability, final int numberOfBits, + final int numberOfHashFunctions) { + if (hashFunctionIdentity == null) { + throw new IllegalArgumentException("Hash function name may not be null"); + } + if (probability <= 0.0) { + throw new IllegalArgumentException("Probability must be greater than 0.0"); + } + if (probability >= 1.0) { + throw new IllegalArgumentException("Probability must be less than 1.0"); + } + if (numberOfBits < 8) { + throw new IllegalArgumentException("Number of bits must be greater than or equal to 8"); + } + if (numberOfHashFunctions < 1) { + throw new IllegalArgumentException("Number of hash functions must be greater than or equal to 1"); + } + this.hashFunctionIdentity = hashFunctionIdentity; + this.numberOfBits = numberOfBits; + this.numberOfHashFunctions = numberOfHashFunctions; + + // n = ceil(m / (-k / log(1 - exp(log(p) / k)))) + double n = Math.ceil(numberOfBits / + (-numberOfHashFunctions / Math.log(1 - Math.exp(Math.log(probability) / numberOfHashFunctions)))); + + // log of probability is always < 0 + // number of hash functions is >= 1 + // e^x where x < 0 = [0,1) + // log 1-e^x = [log1, log0) = <0 with an effective lower limit of -53 + // numberOfBits/ (-numberOfHashFunctions / [-53,0) ) >0 + // ceil( >0 ) >= 1 + // so we can not produce a negative value thus we don't check for it. + // + // n can however exceed numberOfBits (e.g. k = 1 and p = 0.9 give n of about 2.3 * m), + // so it must be range checked before it is narrowed to an int. 
+ if (n > Integer.MAX_VALUE) { + throw new IllegalArgumentException("Resulting filter has more than " + Integer.MAX_VALUE + " items"); + } + this.numberOfItems = (int) n; + hashCode = generateHashCode(); + // getProbability() throws if the calculated probability is >= 1.0 + getProbability(); + } + + private int generateHashCode() { + // NOTE(review): equals() compares identities with HashFunctionIdentity.COMMON_COMPARATOR while + // Objects.hash uses hashFunctionIdentity.hashCode(); confirm the two agree so equal shapes hash equally. + return Objects.hash(hashFunctionIdentity, numberOfBits, numberOfHashFunctions); + } + + @Override + public String toString() { + return String.format("Shape[ %s n=%s m=%s k=%s ]", + HashFunctionIdentity.asCommonString(hashFunctionIdentity), + numberOfItems, numberOfBits, numberOfHashFunctions); + } + + /** + * Calculates the number of hash functions given numberOfItems and numberOfBits. + * This is a method so that the calculation is consistent across all constructors. + * + * @param numberOfItems the number of items in the filter. + * @param numberOfBits the number of bits in the filter. + * @return the optimal number of hash functions. + */ + private int calculateNumberOfHashFunctions(int numberOfItems, int numberOfBits) { + /* + * k = round((m / n) * log(2)) We change order so that we use real math rather + * than integer math. + */ + long k = Math.round(LOG_OF_2 * numberOfBits / numberOfItems); + if (k < 1) { + throw new IllegalArgumentException( + String.format("Filter to small: Calculated number of hash functions (%s) was less than 1", k)); + } + /* + * normally we would check that numberOfHashFunctions <= Integer.MAX_VALUE but + * since numberOfBits is at most Integer.MAX_VALUE and numberOfItems is at least 1 + * the largest possible value is ln(2) * Integer.MAX_VALUE, approx 1.4885e9, so the + * value of k can not be above Integer.MAX_VALUE. 
+ */ + return (int) k; + } + + /** + * Calculates the probability of false positives (AKA: {@code p}) given + * numberOfItems, numberOfBits and numberOfHashFunctions. This is a method so that + * the calculation is consistent across all constructors. + * + * @return the probability of collision. 
+ */ + public final double getProbability() { + // (1 - exp(-kn/m))^k + double p = Math.pow(1.0 - Math.exp(-1.0 * numberOfHashFunctions * numberOfItems / numberOfBits), + numberOfHashFunctions); + /* + * We do not need to check for p <= 0 since we only allow positive values for + * parameters and the closest we can come to exp(-kn/m) == 1 is + * exp(-1/Integer.MAX_VALUE) approx 0.9999999995343387 so Math.pow( x, y ) will + * always be called with 0 < x <= 1 and y > 0 + */ + if (p >= 1.0) { + throw new IllegalArgumentException( + String.format("Calculated probability (%s) is greater than or equal to 1.0", p)); + } + return p; + } + + /** + * Gets the number of items that are expected in the filter. AKA: {@code n} + * + * @return the number of items. + */ + public int getNumberOfItems() { + return numberOfItems; + } + + /** + * Gets the number of bits in the Bloom filter. AKA: {@code m} + * + * @return the number of bits in the Bloom filter. + */ + public int getNumberOfBits() { + return numberOfBits; + } + + /** + * Gets the number of hash functions used to construct the filter. AKA: {@code k} + * + * @return the number of hash functions used to construct the filter. + */ + public int getNumberOfHashFunctions() { + return numberOfHashFunctions; + } + + /** + * Gets the number of bytes in the Bloom filter, i.e. the number of bits divided by Byte.SIZE and rounded up. + * + * @return the number of bytes in the Bloom filter. + */ + public int getNumberOfBytes() { + return (int) Math.ceil(numberOfBits / (double) Byte.SIZE); + } + + @Override + public boolean equals(Object o) { + if (o instanceof Shape) { + Shape other = (Shape) o; + return + other.getNumberOfBits() == getNumberOfBits() && + other.getNumberOfHashFunctions() == getNumberOfHashFunctions() && + HashFunctionIdentity.COMMON_COMPARATOR.compare( getHashFunctionIdentity(), + other.getHashFunctionIdentity()) == 0; + } + return false; + } + + @Override + public int hashCode() { + return hashCode; + } + + /** + * Gets the HashFunctionIdentity of the hash function this shape uses. 
+ * @return the HashFunctionIdentity of the hash function this shape uses. + */ + public HashFunctionIdentity getHashFunctionIdentity() { + return hashFunctionIdentity; + } +} \ No newline at end of file diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasher.java new file mode 100644 index 0000000000..ccc6c83de4 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasher.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import java.util.Arrays; +import java.util.Iterator; +import java.util.PrimitiveIterator.OfInt; +import java.util.Set; +import java.util.TreeSet; + +/** + * A Hasher implementation that contains the index for all enabled bits for a specific + * Shape. + * @since 4.5 + */ +public final class StaticHasher implements Hasher { + + /** + * The shape of this hasher. + */ + private final Shape shape; + /** + * The values that this hasher will return, unique and in ascending order. 
+ */ + private final int[] values; + + /** + * Constructs the StaticHasher from a StaticHasher and a Shape. The values array is shared with the source hasher. + * @param hasher the StaticHasher to read. + * @param shape the Shape for the resulting values. + * @throws IllegalArgumentException if the shape of the hasher and the shape parameter are not the same. + */ + public StaticHasher(StaticHasher hasher, Shape shape) { + if (!hasher.shape.equals(shape)) { + throw new IllegalArgumentException(String.format("Hasher shape (%s) is not the same as shape (%s)", + hasher.getShape().toString(), shape.toString())); + } + this.shape = shape; + this.values = hasher.values; + } + + /** + * Constructs the StaticHasher from a Hasher and a Shape. + * @param hasher the Hasher to read. + * @param shape the Shape for the resulting values. + * @throws IllegalArgumentException if the hasher function and the shape function are not the same. + */ + public StaticHasher(Hasher hasher, Shape shape) { + this( hasher.getBits(shape), shape); + if ( + HashFunctionIdentity.COMMON_COMPARATOR.compare( + hasher.getHashFunctionIdentity(), shape.getHashFunctionIdentity()) != 0) { + throw new IllegalArgumentException(String.format("Hasher (%s) is not the same as for shape (%s)", + HashFunctionIdentity.asCommonString( hasher.getHashFunctionIdentity()), + shape.toString())); + } + } + + /** + * Constructs a StaticHasher from an Iterator of Integers and a Shape. + * @param iter the Iterator of Integers. + * @param shape the Shape that the integers were generated for. 
+ * @throws IllegalArgumentException if any Integer is outside the range [0,shape.getNumberOfBits()) + */ + public StaticHasher(Iterator<Integer> iter, Shape shape) { + this.shape = shape; + Set<Integer> workingValues = new TreeSet<>(); + iter.forEachRemaining( idx -> { + if (idx >= this.shape.getNumberOfBits()) + { + throw new IllegalArgumentException( String.format( "Bit index (%s) is too big for %s", idx, shape )); + } + if (idx < 0 ) { + throw new IllegalArgumentException( String.format( "Bit index (%s) may not be less than zero", idx )); + } + workingValues.add( idx ); + }); + this.values = new int[workingValues.size()]; + int i=0; + for (Integer value : workingValues) + { + values[i++] = value.intValue(); + } + } + + /** + * Gets the shape this static hasher was created with. + * + * @return the Shape of this hasher. + */ + public Shape getShape() { + return shape; + } + + @Override + public boolean isEmpty() { + return values.length == 0; + } + + @Override + public HashFunctionIdentity getHashFunctionIdentity() { + return shape.getHashFunctionIdentity(); + } + + /** + * Gets the number of unique values in this hasher. + * @return the number of unique values. + */ + public int size() { + return values.length; + } + + /** + * Returns an iterator of integers that are the bits to enable in the Bloom + * filter based on the shape. The iterator will not return the same value multiple + * times. Values will be returned in ascending order. + * + * @param shape the shape of the desired Bloom filter. 
+ * @return the Iterator of integers; + * @throws IllegalArgumentException if {@code shape} is not equal to the + * shape this hasher was created with. + */ + @Override + public OfInt getBits(Shape shape) { + if (!this.shape.equals(shape)) { + throw new IllegalArgumentException( + String.format("shape (%s) does not match internal shape (%s)", shape, this.shape)); + } + return Arrays.stream( values ).iterator(); + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5Cyclic.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5Cyclic.java new file mode 100644 index 0000000000..58ca68eb36 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5Cyclic.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter.hasher.function; + +import java.nio.ByteBuffer; + +import java.nio.LongBuffer; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; +import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; + +/** + * An implementation of HashFunction that + * performs MD5 hashing using a signed cyclic method. + * @since 4.5 + */ +public final class MD5Cyclic implements HashFunction { + + /** + * The MD5 digest implementation. + */ + private final MessageDigest messageDigest; + + /** + * The signature for this hash function. + */ + private final long signature; + + /** + * The two longs read from the seed 0 digest; result[0] is advanced by result[1] on every call with a non-zero seed. + */ + private final long[] result = new long[2]; + + /** + * The name of this hash function. + */ + public static final String NAME = "MD5"; + + /** + * Constructs the MD5 hashing function. Throws IllegalStateException, with the original exception as its cause, if no MD5 digest is available. + */ + public MD5Cyclic() { + try { + messageDigest = MessageDigest.getInstance(NAME); + } catch (NoSuchAlgorithmException e) { + throw new IllegalStateException( e.getMessage(), e ); + } + signature = apply( HashFunctionIdentity.prepareSignatureBuffer(this), 0); + } + + @Override + public long apply(byte[] buffer, int seed) { + + if (seed == 0) { + byte[] hash; + synchronized (messageDigest) { + messageDigest.update(buffer); + hash = messageDigest.digest(); + messageDigest.reset(); + } + + LongBuffer lb = ByteBuffer.wrap(hash).asLongBuffer(); + result[0] = lb.get(0); + result[1] = lb.get(1); + } else { + result[0] += result[1]; + } + return result[0]; + } + + @Override + public String getName() { + return NAME; + } + + @Override + public String getProvider() { + return "Apache Commons Collections"; + } + + @Override + public Signedness getSignedness() { + return Signedness.SIGNED; + } + + @Override + public ProcessType getProcessType() { + return ProcessType.CYCLIC; + } + + @Override + public long getSignature() { + 
return signature; + } + +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x86Cyclic.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x86Cyclic.java new file mode 100644 index 0000000000..300ba48699 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x86Cyclic.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.hasher.function; + +import org.apache.commons.codec.digest.MurmurHash3; +import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; +import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; + +/** + * An implementation of HashFunction that + * performs Murmur128 hashing using a signed cyclic method. + * + *

Requires the optional commons-codec library.

+ * + * @since 4.5 + */ +public final class Murmur128x86Cyclic implements HashFunction { + /** + * The two longs from the most recent hash128x64 call; parts[0] is advanced by parts[1] on every call with a non-zero seed. + */ + private long[] parts = null; + + /** + * The signature for this hash function. + */ + private final long signature; + + /** + * The name of this hash method. + */ + public static final String NAME = "Murmur3_x64_128"; + + /** + * Constructs a Murmur3 x64 128 hash. The signature is computed by applying this function to the signature buffer with seed 0. + */ + public Murmur128x86Cyclic() { + signature = apply( HashFunctionIdentity.prepareSignatureBuffer(this), 0); + } + + + @Override + public long apply(byte[] buffer, int seed) { + if (parts == null || seed == 0) { + parts = MurmurHash3.hash128x64(buffer, 0, buffer.length, 0); + } else { + parts[0] += parts[1]; + } + return parts[0]; + } + + @Override + public String getName() { + return NAME; + } + + @Override + public String getProvider() { + return "Apache Commons Collections"; + } + + @Override + public Signedness getSignedness() { + return Signedness.SIGNED; + } + + @Override + public ProcessType getProcessType() { + return ProcessType.CYCLIC; + } + + @Override + public long getSignature() { + return signature; + } + +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86Iterative.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86Iterative.java new file mode 100644 index 0000000000..886fd08506 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86Iterative.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.hasher.function; + +import org.apache.commons.codec.digest.MurmurHash3; +import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; +import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; + +/** + * An implementation of HashFunction that + * performs Murmur32 hashing using a signed iterative method. + * + *

Requires the optional commons-codec library.

+ * + * @since 4.5 + */ +public final class Murmur32x86Iterative implements HashFunction { + + /** + * The signature for this hash function. + */ + private final long signature; + + /** + * The name of this hash function. + */ + public static final String NAME = "Murmur3_x86_32"; + + /** + * Constructs a Murmur3 x86 32 hash. The signature is computed by applying this function to the signature buffer with seed 0. + */ + public Murmur32x86Iterative() { + signature = apply( HashFunctionIdentity.prepareSignatureBuffer(this), 0); + } + + @Override + public long apply(byte[] buffer, int seed) { + return MurmurHash3.hash32x86(buffer, 0, buffer.length, seed); + } + + @Override + public String getName() { + return NAME; + } + @Override + public String getProvider() { + return "Apache Commons Collections"; + } + + @Override + public Signedness getSignedness() { + return Signedness.SIGNED; + } + + @Override + public ProcessType getProcessType() { + return ProcessType.ITERATIVE; + } + + @Override + public long getSignature() { + return signature; + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterative.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterative.java new file mode 100644 index 0000000000..fe756ee4dc --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterative.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.hasher.function; + +import java.util.Arrays; +import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; +import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; + + +/** + * An implementation of HashFunction that + * performs {@code Objects.hash} hashing using a signed iterative method. + *

+ * Except in the case of seed 0, the value of the previous hash is + * used as a seed for the next hash. Hashes are seeded by calling + * {@code Arrays.deepHashCode( new Object[]{seed, buffer} )}. + *

+ * @since 4.5 + */ +public final class ObjectsHashIterative implements HashFunction { + + /** + * The name of the hash function. + */ + public static final String NAME = "Objects32"; + + /** + * The signature for this hash function. + */ + private final long signature; + + /** + * The running sum of the hash values returned since the last call with seed 0; it is hashed together with the buffer on the next call. + */ + private long last = 0; + + /** + * Constructs a hash that uses the Arrays.deepHashCode method to hash values. + */ + public ObjectsHashIterative() { + signature = apply( HashFunctionIdentity.prepareSignatureBuffer(this), 0); + } + + @Override + public long apply(byte[] buffer, int seed) { + if (seed == 0) { + last = 0; + } + long result = Arrays.deepHashCode( new Object[] {last, buffer}); + last += result; + return result; + } + + @Override + public String getName() { + return NAME; + } + + @Override + public String getProvider() { + return "Apache Commons Collections"; + } + + @Override + public Signedness getSignedness() { + return Signedness.SIGNED; + } + + @Override + public ProcessType getProcessType() { + return ProcessType.ITERATIVE; + } + + @Override + public long getSignature() { + return signature; + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/package-info.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/package-info.java new file mode 100644 index 0000000000..4edb95864e --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Implementations of the org.apache.commons.collections4.bloomfilter.hasher.HashFunction + * interface. + * @since 4.5 + */ +package org.apache.commons.collections4.bloomfilter.hasher.function; diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java new file mode 100644 index 0000000000..43888b7b48 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Implementations of the org.apache.commons.collections4.bloomfilter.hasher.Hasher + * interface. 
+ * @since 4.5 + */ +package org.apache.commons.collections4.bloomfilter.hasher; + diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java b/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java new file mode 100644 index 0000000000..0c71933f85 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * A collection of extensible Bloom filter classes and interfaces. + * + *

+ * Background:

+ *

+ * A Bloom filter is conceptually a bit vector. It is used to + * tell you where things are not. Basically, you create a Bloom filter by creating hashes + * and converting those to enabled bits in a vector. You can merge the Bloom filters + * together with logical "or" (call this filter "B"). You can then check to see if filter + * "A" was "or"ed into "B" by testing A & B == A. If the statement is false then "A" was + * not merged into "B", otherwise it _might_ have been. They are generally used where hash + * tables would be too large or as a filter front end for longer processes. For example, + * most browsers have a Bloom filter that is built from all known bad URLs (ones that + * serve up malware). When you enter a URL the browser builds a Bloom filter and checks to + * see if it is "in" the bad URL filter. If it is not, the URL is good; if it matches, then the + * expensive lookup on a remote system is made to see if it actually is in the list. There + * are lots of other uses, and in most cases the reason is to perform a fast check as a + * gateway for a longer operation.

+ *

+ * BloomFilter

+ *

+ * The bloom filter code is + * an abstract class that requires implementation of 4 methods:

    + *
  • + * getBits() which + * returns the set bits as a buffer encoded into an array of long.
  • + *
  • + * getHasher() + * which returns a list of integers that are indexes of the bits that are enabled. These + * are returned in a Hasher construct.
  • + *
  • + * merge( BloomFilter ) to merge another + * Bloom filter into this one.
  • + *
  • + * merge( Hasher ) to merge the values in a hasher + * into this Bloom filter.
  • + *
+ * There are 3 implementations of Bloom filter + * provided:
    + *
  • + * BitSetBloomFilter - based on the Java BitSet class.
  • + *
  • + * + * CountingBloomFilter - uses a sparse array of integers (Map) to implement a counting + * Bloom filter. This filter also implements remove() methods as that is the great + * advantage of a counting Bloom filter.
  • + *
  • + * HasherBloomFilter - implements bloom + * filter on a Hasher. A rather slow implementation but convenient in some + * situations.
  • + *
+ * + *

+ * Shape

+ *

+ * Describes the Bloom filter using the + * standard number of bits, number of hash functions and number of items along with a + * description of the HashFunction. It is this description that has caused the most issues + * of late.

+ *

+ * Hasher

+ *

+ * Converts byte buffers into an iterator of int based + * on a Shape. There are 2 implementations of Hasher provided:

    + *
  • + * Dynamic - calls + * the HashFunction for each value required in the Bloom filter.
  • + *
  • + * Static - based + * on a pre-calculated list of Bloom filter index values. It is also limited to generating + * values for a specific Shape.
  • + *
+ * + *

+ * Hash Functions

+ *

+ * Hash + * functions generate individual index values for the filter from a byte buffer. There are + * four implementations provided.

+ *

+ * HashFunctionIdentity

+ *

+ * The + * HashFunctionIdentity is the base interface for the HashFunction. It tracks three (3) + * properties:

    + *
  • + * The Hashing algorithm
  • + *
  • + * Whether the contents of the + * resulting hash buffer are read as signed or unsigned values.
  • + *
  • + * Whether the hash + * function uses an iterative or cyclic method. In traditional iterative methods this is + * done by calling the selected hash function with a different seed for each hash + * required. The second (cyclic) method, described by Adam Kirsch and Michael Mitzenmacher[1], has + * become more common and is used in applications like Cassandra[2].
  • + *
+ * + *

References

+ * + *
    + *
  1. https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
  2. + *
  3. https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/utils/BloomFilter.java#L60
  4. + *
+ * + * @since 4.5 + */ +package org.apache.commons.collections4.bloomfilter; diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java new file mode 100644 index 0000000000..a1d10af004 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java @@ -0,0 +1,523 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.util.List; +import java.util.PrimitiveIterator.OfInt; +import java.util.ArrayList; +import java.util.Arrays; +import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; +import org.apache.commons.collections4.bloomfilter.hasher.Shape; +import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; +import org.junit.Test; + +/** + * Test standard methods. 
+ * + */ +public abstract class AbstractBloomFilterTest { + + /** + * Create the BloomFilter implementation we are testing. + * + * @param hasher the hasher to use to create the filter.. + * @param shape the shape of the filter. + * @return a BloomFilter implementation. + */ + protected abstract AbstractBloomFilter createFilter(Hasher hasher, Shape shape); + + /** + * A HashFunctionIdentity for testing. + */ + protected HashFunctionIdentity testFunction = new HashFunctionIdentity() { + + @Override + public String getName() { + return "Test Function"; + } + + @Override + public String getProvider() { + return "Apache Commons Collection Tests"; + } + + @Override + public Signedness getSignedness() { + return Signedness.SIGNED; + } + + @Override + public ProcessType getProcessType() { + return ProcessType.CYCLIC; + } + + @Override + public long getSignature() { + return 0; + } + }; + + /** + * A second HashFunctionIdentity for testing. + */ + protected HashFunctionIdentity testFunctionX = new HashFunctionIdentity() { + + @Override + public String getName() { + return "Test FunctionX"; + } + + @Override + public String getProvider() { + return "Apache Commons Collection Tests"; + } + + @Override + public Signedness getSignedness() { + return Signedness.SIGNED; + } + + @Override + public ProcessType getProcessType() { + return ProcessType.CYCLIC; + } + + @Override + public long getSignature() { + return 1; + } + }; + + /** + * Create an empty version of the BloomFilter implementation we are testing. + * + * @param shape the shape of the filter. + * @return a BloomFilter implementation. + */ + protected abstract AbstractBloomFilter createEmptyFilter(Shape shape); + + /** + * The shape of the Bloom filters for testing + */ + protected Shape shape = new Shape(testFunction, 3, 72, 17); + + /** + * Tests that creating a filter with a hasher works as expected. 
+ */ + @Test + public final void constructorTest_Hasher() { + List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + Hasher hasher = new StaticHasher(lst.iterator(), shape); + + BloomFilter bf = createFilter(hasher, shape); + long[] lb = bf.getBits(); + assertEquals(0x1FFFF, lb[0]); + assertEquals(1, lb.length); + } + + /** + * Tests that creating an empty Bloom filter works as expected. + */ + @Test + public final void constructorTest_Empty() { + + BloomFilter bf = createEmptyFilter(shape); + long[] lb = bf.getBits(); + assertEquals(0, lb.length); + } + + /** + * Tests that creating a Bloom filter with a Static hasher that has one shape and a + * different specified shape fails. + */ + @Test + public final void constructorTest_WrongShape() { + Shape anotherShape = new Shape(testFunctionX, 3, 72, 17); + + List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + Hasher hasher = new StaticHasher(lst.iterator(), anotherShape); + try { + createFilter(hasher, shape); + fail("Should throw IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing. + } + } + + /** + * Tests that cardinality is correct. + */ + @Test + public final void cardinalityTest() { + + List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); + Hasher hasher = new StaticHasher(lst.iterator(), shape); + + BloomFilter bf = createFilter(hasher, shape); + assertEquals(17, bf.cardinality()); + } + + /** + * Tests that the orCardinality calculations are correct.
+ */ + @Test + public final void orCardinalityTest() { + List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); + Hasher hasher = new StaticHasher(lst.iterator(), shape); + + AbstractBloomFilter bf = createFilter(hasher, shape); + + List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); + Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + + BloomFilter bf2 = createFilter(hasher2, shape); + + assertEquals(27, bf.orCardinality(bf2)); + } + + /** + * Tests that the orCardinality calculations are correct when there are more than Long.SIZE bits. + */ + @Test + public final void orCardinalityTest_ExtraLongs() { + List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); + Hasher hasher = new StaticHasher(lst.iterator(), shape); + + AbstractBloomFilter bf = createFilter(hasher, shape); + + List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69); + Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + + AbstractBloomFilter bf2 = createFilter(hasher2, shape); + + assertEquals(27, bf.orCardinality(bf2)); + assertEquals(27, bf2.orCardinality(bf)); + } + + /** + * Tests that the andCardinality calculations are correct. + */ + @Test + public final void andCardinalityTest() { + List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); + Hasher hasher = new StaticHasher(lst.iterator(), shape); + + BloomFilter bf = createFilter(hasher, shape); + + List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); + Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + + BloomFilter bf2 = createFilter(hasher2, shape); + + assertEquals(7, bf.andCardinality(bf2)); + } + + /** + * Tests that the andCardinality calculations are correct when there are more than Long.SIZE bits.
+ */ + @Test + public final void andCardinalityTest_ExtraLongs() { + List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); + Hasher hasher = new StaticHasher(lst.iterator(), shape); + + BloomFilter bf = createFilter(hasher, shape); + + List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69); + Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + + BloomFilter bf2 = createFilter(hasher2, shape); + + assertEquals(7, bf.andCardinality(bf2)); + assertEquals(7, bf2.andCardinality(bf)); + } + + /** + * Tests that the xorCardinality calculations are correct. + */ + @Test + public final void xorCardinalityTest() { + List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); + Hasher hasher = new StaticHasher(lst.iterator(), shape); + + BloomFilter bf = createFilter(hasher, shape); + + List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); + Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + BloomFilter bf2 = createFilter(hasher2, shape); + + assertEquals(20, bf.xorCardinality(bf2)); + } + + /** + * Tests that the xorCardinality calculations are correct when there are more than Long.SIZE bits. + */ + @Test + public final void xorCardinalityTest_ExtraLongs() { + List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); + Hasher hasher = new StaticHasher(lst.iterator(), shape); + + BloomFilter bf = createFilter(hasher, shape); + + List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69); + Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + BloomFilter bf2 = createFilter(hasher2, shape); + + assertEquals(20, bf.xorCardinality(bf2)); + assertEquals(20, bf2.xorCardinality(bf)); + } + + /** + * Tests that merging Bloom filters works as expected.
+ */ + @Test + public final void mergeTest_BloomFilter() { + List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); + Hasher hasher = new StaticHasher(lst.iterator(), shape); + + BloomFilter bf = createFilter(hasher, shape); + + List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); + Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + BloomFilter bf2 = createFilter(hasher2, shape); + + bf.merge(bf2); + assertEquals(27, bf.cardinality()); + } + + /** + * Tests that merging bloom filters with different shapes fails properly + */ + @Test + public final void mergeTest_BloomFilter_WrongShape() { + List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); + Hasher hasher = new StaticHasher(lst.iterator(), shape); + + BloomFilter bf = createFilter(hasher, shape); + + Shape anotherShape = new Shape(testFunctionX, 3, 72, 17); + List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); + Hasher hasher2 = new StaticHasher(lst2.iterator(), anotherShape); + BloomFilter bf2 = createFilter(hasher2, anotherShape); + + try { + bf.merge(bf2); + fail("Should throw IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing. 
+ } + } + + /** + * Tests that merging a hasher into a Bloom filter works as expected + */ + @Test + public final void mergeTest_Hasher() { + List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); + Hasher hasher = new StaticHasher(lst.iterator(), shape); + + BloomFilter bf = createFilter(hasher, shape); + + List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); + Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + + bf.merge(hasher2); + assertEquals(27, bf.cardinality()); + } + + /** + * Tests that merging a static hasher with the wrong shape into a Bloom filter fails as expected + */ + @Test + public final void mergeTest_Hasher_WrongShape() { + List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); + Hasher hasher = new StaticHasher(lst.iterator(), shape); + + BloomFilter bf = createFilter(hasher, shape); + + Shape anotherShape = new Shape(testFunctionX, 3, 72, 17); + List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); + Hasher hasher2 = new StaticHasher(lst2.iterator(), anotherShape); + + try { + bf.merge(hasher2); + fail("Should throw IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing. + } + } + + /** + * Tests that isFull() returns the proper values. 
+ */ + @Test + public final void isFullTest() { + + // create empty filter + AbstractBloomFilter filter = createEmptyFilter(shape); + assertFalse(filter.isFull()); + + List values = new ArrayList(shape.getNumberOfBits()); + for (int i = 0; i < shape.getNumberOfBits(); i++) { + values.add(i); + } + + StaticHasher hasher2 = new StaticHasher(values.iterator(), shape); + filter = createFilter(hasher2, shape); + + assertTrue(filter.isFull()); + + int mid = shape.getNumberOfBits() / 2; + values.remove(Integer.valueOf(mid)); + hasher2 = new StaticHasher(values.iterator(), shape); + filter = createFilter(hasher2, shape); + assertFalse(filter.isFull()); + + } + + /** + * Tests that contains() fails properly if the other Bloom filter is not of the proper shape. + */ + @Test + public final void containsTest_BloomFilter_WrongShape() { + List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + Hasher hasher = new StaticHasher(lst.iterator(), shape); + BloomFilter bf = createFilter(hasher, shape); + + Shape anotherShape = new Shape(testFunctionX, 3, 72, 17); + Hasher hasher2 = new StaticHasher(lst.iterator(), anotherShape); + BloomFilter bf2 = createFilter(hasher2, anotherShape); + try { + bf.contains(bf2); + fail("Should throw IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing. + } + } + + /** + * Tests that contains() with a Bloom filter argument returns the proper results. 
+ */ + @Test + public final void containsTest_BloomFilter() { + List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + Hasher hasher = new StaticHasher(lst.iterator(), shape); + BloomFilter bf = createFilter(hasher, shape); + + List lst2 = Arrays.asList(4, 5, 6, 7, 8, 9, 10); + Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + BloomFilter bf2 = createFilter(hasher2, shape); + assertTrue(bf.contains(bf2)); + assertFalse(bf2.contains(bf)); + } + + /** + * Tests that contains() with a Hasher argument returns the proper results. + */ + @Test + public final void containsTest_Hasher() { + List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + Hasher hasher = new StaticHasher(lst.iterator(), shape); + BloomFilter bf = createFilter(hasher, shape); + + List lst2 = Arrays.asList(4, 5, 6, 7, 8, 9, 10); + Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + assertTrue(bf.contains(hasher2)); + + lst2 = Arrays.asList(17, 18, 19, 20); + hasher2 = new StaticHasher(lst2.iterator(), shape); + assertFalse(bf.contains(hasher2)); + + lst2 = Arrays.asList(10, 11, 12, 17, 18, 19, 20); + hasher2 = new StaticHasher(lst2.iterator(), shape); + assertFalse(bf.contains(hasher2)); + } + + /** + * Tests that contains() fails properly if the hasher is not of the proper shape. 
+ */ + @Test + public final void containsTest_Hasher_WrongShape() { + List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + Hasher hasher = new StaticHasher(lst.iterator(), shape); + BloomFilter bf = createFilter(hasher, shape); + + Shape anotherShape = new Shape(testFunctionX, 3, 72, 17); + + List lst2 = Arrays.asList(4, 5, 6, 7, 8, 9, 10); + Hasher hasher2 = new StaticHasher(lst2.iterator(), anotherShape); + try { + bf.contains(hasher2); + fail("Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing + } + } + + /** + * Compare 2 static hashers to verify they have the same bits enabled. + * + * @param hasher1 the first static hasher. + * @param hasher2 the second static hasher. + */ + private void assertSameBits(StaticHasher hasher1, StaticHasher hasher2) { + OfInt iter1 = hasher1.getBits(shape); + OfInt iter2 = hasher2.getBits(shape); + + while (iter1.hasNext()) { + assertTrue("Not enough data in second hasher", iter2.hasNext()); + assertEquals(iter1.nextInt(), iter2.nextInt()); + } + assertFalse("Too much data in second hasher", iter2.hasNext()); + } + + /** + * Tests that the hasher returned from getHasher() works correctly. + */ + @Test + public final void getHasherTest() { + List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + StaticHasher hasher = new StaticHasher(lst.iterator(), shape); + BloomFilter bf = createFilter(hasher, shape); + + StaticHasher hasher2 = bf.getHasher(); + + assertEquals(shape, hasher2.getShape()); + assertSameBits(hasher, hasher2); + } + + /** + * Tests that getBits() works correctly when multiple long values are returned.
+ */ + @Test + public final void getBitsTest_SpanLong() { + List lst = Arrays.asList(63, 64); + StaticHasher hasher = new StaticHasher(lst.iterator(), shape); + BloomFilter bf = createFilter(hasher, shape); + long[] lb = bf.getBits(); + assertEquals(2, lb.length); + assertEquals(0x8000000000000000L, lb[0]); + assertEquals(0x1, lb[1]); + } + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilterTest.java new file mode 100644 index 0000000000..fd5f15f57e --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilterTest.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.Assert.assertEquals; +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; +import org.apache.commons.collections4.bloomfilter.hasher.Shape; +import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; +import org.junit.Test; + +/** + * Tests for the BitSetBloomFilter implementation. 
+ * + */ +public class BitSetBloomFilterTest extends AbstractBloomFilterTest { + + @Override + protected BitSetBloomFilter createFilter(Hasher hasher, Shape shape) { + return new BitSetBloomFilter( hasher, shape ); + } + + @Override + protected BitSetBloomFilter createEmptyFilter(Shape shape) { + return new BitSetBloomFilter( shape ); + } + + /** + * Test that andCardinality works for BitSetBloomFilter arguments. + */ + @Test + public void andCardinalityTest_BitSetBloomFilter() { + Hasher hasher = new StaticHasher( Arrays.asList( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ).iterator(), shape ); + + BitSetBloomFilter bf = createFilter(hasher, shape); + + Hasher hasher2 = new StaticHasher( Arrays.asList( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ).iterator(), shape ); + BitSetBloomFilter bf2 = createFilter(hasher2, shape); + + assertEquals( 10, bf.andCardinality(bf2)); + assertEquals( 10, bf2.andCardinality(bf)); + + hasher2 = new StaticHasher( Arrays.asList( 1, 2, 3, 4, 5 ).iterator(), shape ); + bf2 = createFilter(hasher2, shape); + + assertEquals( 5, bf.andCardinality(bf2)); + assertEquals( 5, bf2.andCardinality(bf)); + + hasher2 = new StaticHasher( Arrays.asList( 11, 12, 13, 14, 15 ).iterator(), shape ); + bf2 = createFilter(hasher2, shape); + assertEquals( 0, bf.andCardinality(bf2)); + assertEquals( 0, bf2.andCardinality(bf)); + + + } + + /** + * Test that xorCardinality works for BitSetBloomFilter arguments. 
+ */ + @Test + public void xorCardinalityTest_BitSetBloomFilter() { + Hasher hasher = new StaticHasher( Arrays.asList( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ).iterator(), shape ); + + BitSetBloomFilter bf = createFilter(hasher, shape); + + Hasher hasher2 = new StaticHasher( Arrays.asList( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ).iterator(), shape ); + BitSetBloomFilter bf2 = createFilter(hasher2, shape); + + assertEquals( 0, bf.xorCardinality(bf2)); + assertEquals( 0, bf2.xorCardinality(bf)); + + hasher2 = new StaticHasher( Arrays.asList( 1, 2, 3, 4, 5 ).iterator(), shape ); + bf2 = createFilter(hasher2, shape); + + assertEquals( 5, bf.xorCardinality(bf2)); + assertEquals( 5, bf2.xorCardinality(bf)); + + hasher2 = new StaticHasher( Arrays.asList( 11, 12, 13, 14, 15 ).iterator(), shape ); + bf2 = createFilter(hasher2, shape); + assertEquals( 15, bf.xorCardinality(bf2)); + assertEquals( 15, bf2.xorCardinality(bf)); + + + } + + /** + * Test that merge() works for BitSetBloomFilter arguments. + */ + @Test + public void mergeTest_BitSetBloomFilter() { + + List lst = Arrays.asList( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ,16 ,17 ); + Hasher hasher = new StaticHasher( lst.iterator(), shape ); + + BitSetBloomFilter bf = createFilter(hasher, shape); + + List lst2 = Arrays.asList( 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 ,26 ,27 ); + Hasher hasher2 = new StaticHasher( lst2.iterator(), shape ); + BloomFilter bf2 = new BitSetBloomFilter(hasher2, shape); + + bf.merge(bf2); + + assertEquals(27, bf.cardinality()); + + + } + + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilterTest.java new file mode 100644 index 0000000000..c74dab9246 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilterTest.java @@ -0,0 +1,439 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.fail; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; +import org.apache.commons.collections4.bloomfilter.hasher.Shape; +import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; +import org.junit.Test; + +/** + * Tests for the Counting Bloom filter implementation. + * + */ +public class CountingBloomFilterTest extends AbstractBloomFilterTest { + + + @Override + protected CountingBloomFilter createFilter(Hasher hasher, Shape shape) { + return new CountingBloomFilter( hasher, shape ); + } + + @Override + protected CountingBloomFilter createEmptyFilter(Shape shape) { + return new CountingBloomFilter( shape ); + } + + /** + * Tests that counts are correct when a hasher is used. 
+ */ + @Test + public void ConstructorTest_HasherValues_CountsTest() { + List lst = Arrays.asList( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ); + Hasher hasher = new StaticHasher( lst.iterator(), shape ); + + CountingBloomFilter bf = createFilter(hasher, shape); + long[] lb = bf.getBits(); + assertEquals(0x1FFFF, lb[0]); + assertEquals(1, lb.length); + + + assertEquals(17, bf.getCounts().count()); + assertEquals(Integer.valueOf(1), bf.getCounts().map(Map.Entry::getValue).max(Integer::compare).get()); + assertEquals(Integer.valueOf(1), bf.getCounts().map(Map.Entry::getValue).min(Integer::compare).get()); + } + + /** + * Tests that counts are correct when a map of counts is used. + */ + @Test + public void ConstructorTest_Map_CountsTest() { + Map map = new HashMap(); + for (int i =0;i<17;i++) + { + map.put( i, 1 ); + } + + CountingBloomFilter bf = new CountingBloomFilter( map, shape); + assertEquals(17, bf.getCounts().count()); + + map.put( shape.getNumberOfBits(), 1 ); + try { + bf = new CountingBloomFilter( map, shape); + fail( "Should have thrown IllegalArgumentExceptionW"); + } catch (IllegalArgumentException exprected) + { + // expected + } + + map.clear(); + map.put( -1, 1 ); + try { + bf = new CountingBloomFilter( map, shape); + fail( "Should have thrown IllegalArgumentExceptionW"); + } catch (IllegalArgumentException exprected) + { + // expected + } + + map.clear(); + map.put( 1, -1 ); + try { + bf = new CountingBloomFilter( map, shape); + fail( "Should have thrown IllegalArgumentExceptionW"); + } catch (IllegalArgumentException exprected) + { + // expected + } + + } + + /** + * Tests that merge correctly updates the counts when a CountingBloomFilter is passed + */ + @Test + public void mergeTest_Counts() { + int[] expected = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 0 + }; + List lst = Arrays.asList( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ,16 ,17 ); + Hasher hasher = new 
StaticHasher( lst.iterator(), shape ); + + CountingBloomFilter bf = createFilter(hasher, shape); + + List lst2 = Arrays.asList( 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 ,26 ,27 ); + Hasher hasher2 = new StaticHasher( lst2.iterator(), shape ); + BloomFilter bf2 = createFilter(hasher2, shape); + + bf.merge(bf2); + + assertEquals(27, bf.getCounts().count()); + assertEquals(Integer.valueOf(2), bf.getCounts().map(Map.Entry::getValue).max(Integer::compare).get()); + assertEquals(Integer.valueOf(1), bf.getCounts().map(Map.Entry::getValue).min(Integer::compare).get()); + + Map m = new HashMap(); + bf.getCounts().forEach(e -> m.put(e.getKey(), e.getValue())); + for (int i=0;i<29;i++) + { + if (m.get(i) == null) + { + assertEquals( "Wrong value for "+i, expected[i], 0 ); + } else + { + assertEquals( "Wrong value for "+i, expected[i], m.get(i).intValue()); + } + } + } + + + /** + * Test that merge correctly updates the counts when a BitSetBloomFilter is passed + */ + @Test + public void mergeTest_Counts_BitSetFilter() { + int[] expected = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 0 + }; + List lst = Arrays.asList( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ,16 ,17 ); + Hasher hasher = new StaticHasher( lst.iterator(), shape ); + + CountingBloomFilter bf = createFilter(hasher, shape); + + List lst2 = Arrays.asList( 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 ,26 ,27 ); + Hasher hasher2 = new StaticHasher( lst2.iterator(), shape ); + BloomFilter bf2 = new BitSetBloomFilter(hasher2, shape); + + bf.merge(bf2); + + assertEquals(27, bf.getCounts().count()); + assertEquals(Integer.valueOf(2), bf.getCounts().map(Map.Entry::getValue).max(Integer::compare).get()); + assertEquals(Integer.valueOf(1), bf.getCounts().map(Map.Entry::getValue).min(Integer::compare).get()); + + Map m = new HashMap(); + bf.getCounts().forEach(e -> m.put(e.getKey(), e.getValue())); + for (int i=0;i<29;i++) + { + if 
(m.get(i) == null) + { + assertEquals( "Wrong value for "+i, expected[i], 0 ); + } else + { + assertEquals( "Wrong value for "+i, expected[i], m.get(i).intValue()); + } + } + + } + + /** + * Test that merge correctly updates the counts when a Hasher is passed + */ + @Test + public void mergeTest_Shape_Hasher_Count() { + int[] expected = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 0 + }; + + List lst = Arrays.asList( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ,16 ,17 ); + Hasher hasher = new StaticHasher( lst.iterator(), shape ); + + CountingBloomFilter bf = createFilter(hasher, shape); + + List lst2 = Arrays.asList( 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 ,26 ,27 ); + Hasher hasher2 = new StaticHasher( lst2.iterator(), shape ); + + bf.merge(hasher2); + + assertEquals(27, bf.getCounts().count()); + assertEquals(Integer.valueOf(2), bf.getCounts().map(Map.Entry::getValue).max(Integer::compare).get()); + assertEquals(Integer.valueOf(1), bf.getCounts().map(Map.Entry::getValue).min(Integer::compare).get()); + + Map m = new HashMap(); + bf.getCounts().forEach(e -> m.put(e.getKey(), e.getValue())); + for (int i=0;i<29;i++) + { + if (m.get(i) == null) + { + assertEquals( "Wrong value for "+i, expected[i], 0 ); + } else + { + assertEquals( "Wrong value for "+i, expected[i], m.get(i).intValue()); + } + } + } + + /** + * Test that merge correctly updates the counts when a CountingBloomFilter is passed and an integer overflow occurs. 
+ */ + @Test + public void mergeTest_Overflow() { + + List lst = Arrays.asList( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ,16 ,17 ); + Hasher hasher = new StaticHasher( lst.iterator(), shape ); + + CountingBloomFilter bf = createFilter(hasher, shape); + + + Map map = new HashMap(); + bf.getCounts().forEach( e -> map.put( e.getKey(), e.getValue())); + map.put(1, Integer.MAX_VALUE ); + + CountingBloomFilter bf2 = new CountingBloomFilter(map, shape); + + // should not fail + bf.merge(bf2); + + // try max int on other side of merge. + bf2 = createFilter(hasher, shape); + bf = new CountingBloomFilter(map, shape); + + try { + bf.merge(bf2); + fail( "Should have thrown IllegalStateException"); + } + catch (IllegalStateException expected) + { + // do nothing + } + } + + /** + * Tests that when removing a standard Bloom filter the counts are correctly updated. + */ + @Test + public void removeTest_Standard() { + int[] values = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1 + }; + Map map = new HashMap(); + for (int i=1;i lst = Arrays.asList( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ,16 ,17 ); + Hasher hasher = new StaticHasher( lst.iterator(), shape ); + BitSetBloomFilter bf2 = new BitSetBloomFilter( hasher, shape ); + + bf.remove( bf2 ); + assertEquals( 17, bf.cardinality() ); + Map map2 = new HashMap(); + bf.getCounts().forEach( e -> map2.put( e.getKey(), e.getValue())); + + for (int i = 11; i map = new HashMap(); + for (int i=1;i lst = Arrays.asList( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ,16 ,17 ); + Hasher hasher = new StaticHasher( lst.iterator(), shape ); + BloomFilter bf2 = new CountingBloomFilter( hasher, shape ); + + bf.remove( bf2 ); + assertEquals( 17, bf.cardinality() ); + Map map2 = new HashMap(); + bf.getCounts().forEach( e -> map2.put( e.getKey(), e.getValue())); + + for (int i = 11; i lst = Arrays.asList( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ,16 ,17 ); + Hasher hasher = new 
StaticHasher( lst.iterator(), shape ); + + CountingBloomFilter bf = createFilter(hasher, shape); + + + Map map = new HashMap(); + bf.getCounts().forEach( e -> map.put( e.getKey(), e.getValue())); + map.remove(1); + + CountingBloomFilter bf2 = new CountingBloomFilter(map, shape); + + // should not fail + bf.remove(bf2); + + // try max int on other side of remove. + bf2 = createFilter(hasher, shape); + bf = new CountingBloomFilter(map, shape); + + try { + bf.remove(bf2); + fail( "Should have thrown IllegalStateException"); + } + catch (IllegalStateException expected) + { + // do nothing + } + } + + /** + * Tests that removing a hasher update the counts properly. + */ + @Test + public void removeTest_Hasher() { + int[] values = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1 + }; + Map map = new HashMap(); + for (int i=1;i lst = Arrays.asList( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ,16 ,17 ); + Hasher hasher = new StaticHasher( lst.iterator(), shape ); + + + bf.remove( hasher ); + assertEquals( 17, bf.cardinality() ); + Map map2 = new HashMap(); + bf.getCounts().forEach( e -> map2.put( e.getKey(), e.getValue())); + + for (int i = 11; i lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); + Hasher hasher = new StaticHasher(lst.iterator(), shape); + BloomFilter filter1 = new HasherBloomFilter(hasher, shape); + assertEquals(1, SetOperations.estimateSize(filter1)); + + // the data provided above do not generate an estimate that is equivalent to the + // actual. 
+ lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20); + hasher = new StaticHasher(lst.iterator(), shape); + filter1 = new HasherBloomFilter(hasher, shape); + assertEquals(1, SetOperations.estimateSize(filter1)); + + lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33); + Hasher hasher2 = new StaticHasher(lst.iterator(), shape); + BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); + + assertEquals(3, SetOperations.estimateSize(filter2)); + + } + + /** + * Tests that the union size estimate is correctly calculated. + */ + @Test + public final void estimateUnionSizeTest() { + // build a filter + List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); + Hasher hasher = new StaticHasher(lst.iterator(), shape); + BloomFilter filter1 = new HasherBloomFilter(hasher, shape); + + lst = Arrays.asList(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40); + Hasher hasher2 = new StaticHasher(lst.iterator(), shape); + BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); + + long estimate = SetOperations.estimateUnionSize(filter1, filter2); + assertEquals(3, estimate); + + } + + /** + * Tests that the intersection size estimate is correctly calculated. 
+ */ + @Test + public final void estimateIntersectionSizeTest() { + // build a filter + List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); + Hasher hasher = new StaticHasher(lst.iterator(), shape); + BloomFilter filter1 = new HasherBloomFilter(hasher, shape); + + lst = Arrays.asList(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40); + Hasher hasher2 = new StaticHasher(lst.iterator(), shape); + BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); + + long estimate = SetOperations.estimateIntersectionSize(filter1, filter2); + assertEquals(1, estimate); + + } + + /** + * Tests that the Hamming distance is correctly calculated. + */ + @Test + public final void hammingDistanceTest() { + List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); + Hasher hasher = new StaticHasher(lst.iterator(), shape); + BloomFilter filter1 = new HasherBloomFilter(hasher, shape); + + List lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); + Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); + + assertEquals(0, SetOperations.hammingDistance(filter1, filter2)); + assertEquals(0, SetOperations.hammingDistance(filter2, filter1)); + + lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); + hasher2 = new StaticHasher(lst2.iterator(), shape); + filter2 = new HasherBloomFilter(hasher2, shape); + + assertEquals(17, SetOperations.hammingDistance(filter1, filter2)); + assertEquals(17, SetOperations.hammingDistance(filter2, filter1)); + + } + + /** + * Tests that the Jaccard similarity is correctly calculated. 
+     */
+    @Test
+    public final void jaccardSimilarityTest() {
+        List<Integer> lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
+        Hasher hasher = new StaticHasher(lst.iterator(), shape);
+        BloomFilter filter1 = new HasherBloomFilter(hasher, shape);
+
+        List<Integer> lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
+        Hasher hasher2 = new StaticHasher(lst2.iterator(), shape);
+        BloomFilter filter2 = new HasherBloomFilter(hasher2, shape);
+        // NOTE(review): identical filters are expected to score 0.0, which reads like a distance - confirm SetOperations.
+        assertEquals(0.0, SetOperations.jaccardSimilarity(filter1, filter2), 0.0001);
+        assertEquals(0.0, SetOperations.jaccardSimilarity(filter2, filter1), 0.0001);
+
+        lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25);
+        hasher2 = new StaticHasher(lst2.iterator(), shape);
+        filter2 = new HasherBloomFilter(hasher2, shape);
+
+        assertEquals(0.68, SetOperations.jaccardSimilarity(filter1, filter2), 0.001);
+        assertEquals(0.68, SetOperations.jaccardSimilarity(filter2, filter1), 0.001);
+    }
+
+    /**
+     * Tests the value reported by jaccardSimilarity when one or
+     * both filters are empty (0.0 for empty/empty, 1.0 for empty/populated).
+     */
+    @Test
+    public final void jaccardSimilarityTest_NoValues() {
+        BloomFilter filter1 = new HasherBloomFilter(shape);
+        BloomFilter filter2 = new HasherBloomFilter(shape);
+        // build a filter
+        List<Integer> lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
+        Hasher hasher = new StaticHasher(lst.iterator(), shape);
+        BloomFilter filter3 = new HasherBloomFilter( hasher, shape );
+
+        assertEquals(0.0, SetOperations.jaccardSimilarity(filter1, filter2), 0.0001);
+        assertEquals(0.0, SetOperations.jaccardSimilarity(filter2, filter1), 0.0001);
+        assertEquals(1.0, SetOperations.jaccardSimilarity(filter1, filter3), 0.0001);
+        assertEquals(1.0, SetOperations.jaccardSimilarity(filter3, filter1), 0.0001);
+
+    }
+
+
+    /**
+     * Tests that the Jaccard distance is correctly calculated.
+     */
+    @Test
+    public final void jaccardDistanceTest() {
+        List<Integer> lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
+        Hasher hasher = new StaticHasher(lst.iterator(), shape);
+        BloomFilter filter1 = new HasherBloomFilter(hasher, shape);
+
+        List<Integer> lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
+        Hasher hasher2 = new StaticHasher(lst2.iterator(), shape);
+        BloomFilter filter2 = new HasherBloomFilter(hasher2, shape);
+        // NOTE(review): identical filters are expected at distance 1.0, which reads like a similarity - confirm.
+        assertEquals(1.0, SetOperations.jaccardDistance(filter1, filter2), 0.0001);
+        assertEquals(1.0, SetOperations.jaccardDistance(filter2, filter1), 0.0001);
+
+        lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25);
+        hasher2 = new StaticHasher(lst2.iterator(), shape);
+        filter2 = new HasherBloomFilter(hasher2, shape);
+
+        assertEquals(0.32, SetOperations.jaccardDistance(filter1, filter2), 0.001);
+        assertEquals(0.32, SetOperations.jaccardDistance(filter2, filter1), 0.001);
+
+    }
+
+    /**
+     * Tests the value reported by jaccardDistance when one or
+     * both filters are empty (1.0 for empty/empty, 0.0 for empty/populated).
+     */
+    @Test
+    public final void jaccardDistanceTest_NoValues() {
+        BloomFilter filter1 = new HasherBloomFilter(shape);
+        BloomFilter filter2 = new HasherBloomFilter(shape);
+        // build a filter
+        List<Integer> lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
+        Hasher hasher = new StaticHasher(lst.iterator(), shape);
+        BloomFilter filter3 = new HasherBloomFilter( hasher, shape );
+
+        assertEquals(1.0, SetOperations.jaccardDistance(filter1, filter2), 0.0001);
+        assertEquals(1.0, SetOperations.jaccardDistance(filter2, filter1), 0.0001);
+        assertEquals(0.0, SetOperations.jaccardDistance(filter1, filter3), 0.0001);
+        assertEquals(0.0, SetOperations.jaccardDistance(filter3, filter1), 0.0001);
+
+    }
+
+
+    /**
+     * Tests that the Cosine similarity is correctly calculated.
+     */
+    @Test
+    public final void cosineSimilarityTest() {
+        List<Integer> lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
+        Hasher hasher = new StaticHasher(lst.iterator(), shape);
+        BloomFilter filter1 = new HasherBloomFilter(hasher, shape);
+
+        List<Integer> lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
+        Hasher hasher2 = new StaticHasher(lst2.iterator(), shape);
+        BloomFilter filter2 = new HasherBloomFilter(hasher2, shape);
+
+        assertEquals(1.0, SetOperations.cosineSimilarity(filter1, filter2), 0.0001);
+        assertEquals(1.0, SetOperations.cosineSimilarity(filter2, filter1), 0.0001);
+
+        lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25);
+        hasher2 = new StaticHasher(lst2.iterator(), shape);
+        filter2 = new HasherBloomFilter(hasher2, shape);
+
+        assertEquals(0.485071250072666, SetOperations.cosineSimilarity(filter1, filter2), 0.000000000000001);
+        assertEquals(0.485071250072666, SetOperations.cosineSimilarity(filter2, filter1), 0.000000000000001);
+
+    }
+
+    /**
+     * Tests that the Cosine similarity is correctly calculated when one or
+     * both filters are empty (0.0 is expected in every combination).
+     */
+    @Test
+    public final void cosineSimilarityTest_NoValues() {
+        BloomFilter filter1 = new HasherBloomFilter(shape);
+        BloomFilter filter2 = new HasherBloomFilter(shape);
+        // build a filter
+        List<Integer> lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
+        Hasher hasher = new StaticHasher(lst.iterator(), shape);
+        BloomFilter filter3 = new HasherBloomFilter( hasher, shape );
+
+        assertEquals(0.0, SetOperations.cosineSimilarity(filter1, filter2), 0.0001);
+        assertEquals(0.0, SetOperations.cosineSimilarity(filter2, filter1), 0.0001);
+        assertEquals(0.0, SetOperations.cosineSimilarity(filter1, filter3), 0.0001);
+        assertEquals(0.0, SetOperations.cosineSimilarity(filter3, filter1), 0.0001);
+
+    }
+
+    /**
+     * Tests that the Cosine distance is correctly calculated.
+ */ + @Test + public final void cosineDistanceTest() { + List lst = Arrays.asList(1, 2); + Hasher hasher = new StaticHasher(lst.iterator(), shape); + BloomFilter filter1 = new HasherBloomFilter(hasher, shape); + + List lst2 = Arrays.asList(2, 3); + Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); + + assertEquals(0.5, SetOperations.cosineDistance(filter1, filter2), 0.0001); + assertEquals(0.5, SetOperations.cosineDistance(filter2, filter1), 0.0001); + + lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); + hasher = new StaticHasher(lst.iterator(), shape); + filter1 = new HasherBloomFilter(hasher, shape); + + lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); + hasher2 = new StaticHasher(lst2.iterator(), shape); + filter2 = new HasherBloomFilter(hasher2, shape); + + assertEquals(0.0, SetOperations.cosineDistance(filter1, filter2), 0.0001); + assertEquals(0.0, SetOperations.cosineDistance(filter2, filter1), 0.0001); + + lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); + hasher2 = new StaticHasher(lst2.iterator(), shape); + filter2 = new HasherBloomFilter(hasher2, shape); + + assertEquals(0.514928749927334, SetOperations.cosineDistance(filter1, filter2), 0.000000000000001); + assertEquals(0.514928749927334, SetOperations.cosineDistance(filter2, filter1), 0.000000000000001); + + } + + /** + * Tests that the Cosine distance is correctly calculated when one or + * both filters are empty + */ + @Test + public final void cosineDistanceTest_NoValues() { + BloomFilter filter1 = new HasherBloomFilter(shape); + BloomFilter filter2 = new HasherBloomFilter(shape); + // build a filter + List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); + Hasher hasher = new StaticHasher(lst.iterator(), shape); + BloomFilter filter3 = new HasherBloomFilter( hasher, shape ); + + assertEquals(1.0, 
SetOperations.cosineDistance(filter1, filter2), 0.0001); + assertEquals(1.0, SetOperations.cosineDistance(filter2, filter1), 0.0001); + assertEquals(1.0, SetOperations.cosineDistance(filter1, filter3), 0.0001); + assertEquals(1.0, SetOperations.cosineDistance(filter3, filter1), 0.0001); + + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/CommonComparatorTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/CommonComparatorTest.java new file mode 100644 index 0000000000..8594599bfe --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/CommonComparatorTest.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.TreeSet; + +import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.ProcessType; +import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.Signedness; +import org.junit.Test; + +/** + * Tests of the HashFunctionIdentity.COMMON_COMPARATOR + * + */ +public class CommonComparatorTest { + + private void assertBefore(HashFunctionIdentity identity1, HashFunctionIdentity identity2) { + assertTrue(0 > HashFunctionIdentity.COMMON_COMPARATOR.compare(identity1, identity2)); + } + + private void assertAfter(HashFunctionIdentity identity1, HashFunctionIdentity identity2) { + assertTrue(0 < HashFunctionIdentity.COMMON_COMPARATOR.compare(identity1, identity2)); + } + + /** + * Tests the name ordering. + */ + @Test + public void nameOrderTestDifferentNames() { + HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, + ProcessType.CYCLIC, 300L); + HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite", "impl2", Signedness.SIGNED, + ProcessType.CYCLIC, 300L); + + assertBefore(impl1, impl2); + assertEquals(0, HashFunctionIdentity.COMMON_COMPARATOR.compare(impl1, impl1)); + assertEquals(0, HashFunctionIdentity.COMMON_COMPARATOR.compare(impl2, impl2)); + assertAfter(impl2, impl1); + } + + /** + * Tests the name ordering is not affected by case. 
+     */
+    @Test
+    public void nameOrderTestDifferentCapitalization() {
+        HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED,
+            ProcessType.CYCLIC, 300L);
+        HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite", "IMPL1", Signedness.SIGNED,
+            ProcessType.CYCLIC, 300L);
+
+        assertEquals(0, HashFunctionIdentity.COMMON_COMPARATOR.compare(impl1, impl2));
+
+    }
+
+    /**
+     * Tests that signedness ordering is correct: SIGNED sorts before UNSIGNED.
+     */
+    @Test
+    public void signednessOrder() {
+        HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED,
+            ProcessType.CYCLIC, 300L);
+        HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.UNSIGNED,
+            ProcessType.CYCLIC, 300L);
+
+        assertBefore(impl1, impl2);
+        assertEquals(0, HashFunctionIdentity.COMMON_COMPARATOR.compare(impl1, impl1));
+        assertEquals(0, HashFunctionIdentity.COMMON_COMPARATOR.compare(impl2, impl2));
+        assertAfter(impl2, impl1);
+    }
+
+    /**
+     * Tests that the process type ordering is correct: CYCLIC sorts before ITERATIVE.
+     */
+    @Test
+    public void processTypeOrder() {
+        HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED,
+            ProcessType.CYCLIC, 300L);
+        HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED,
+            ProcessType.ITERATIVE, 300L);
+
+        assertBefore(impl1, impl2);
+        assertEquals(0, HashFunctionIdentity.COMMON_COMPARATOR.compare(impl1, impl1));
+        assertEquals(0, HashFunctionIdentity.COMMON_COMPARATOR.compare(impl2, impl2));
+        assertAfter(impl2, impl1);
+    }
+
+    /**
+     * Tests that a change in producer does not change the order.
+     */
+    @Test
+    public void producerDoesNotChangeOrder() {
+        HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED,
+            ProcessType.CYCLIC, 300L);
+        HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite2", "impl1", Signedness.SIGNED,
+            ProcessType.CYCLIC, 300L);
+
+        assertEquals(0, HashFunctionIdentity.COMMON_COMPARATOR.compare(impl1, impl2));
+    }
+
+    /**
+     * Tests that the ordering is correct when applied to a collection.
+     */
+    @Test
+    public void testSortOrder() {
+        // in this test the signature is the position in the final collection for the ID
+        TreeSet<HashFunctionIdentity> result = new TreeSet<>(
+            HashFunctionIdentity.COMMON_COMPARATOR);
+        List<HashFunctionIdentity> collection = new ArrayList<>();
+
+        collection
+            .add(new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, ProcessType.CYCLIC, 0));
+
+        collection
+            .add(new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, ProcessType.ITERATIVE, 1));
+
+        collection
+            .add(new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.UNSIGNED, ProcessType.CYCLIC, 2));
+
+        collection
+            .add(new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.UNSIGNED, ProcessType.ITERATIVE, 3));
+
+        collection
+            .add(new HashFunctionIdentityImpl("Testing Suite", "impl2", Signedness.SIGNED, ProcessType.CYCLIC, 4));
+
+        collection
+            .add(new HashFunctionIdentityImpl("Testing Suite", "impl2", Signedness.SIGNED, ProcessType.ITERATIVE, 5));
+
+        collection
+            .add(new HashFunctionIdentityImpl("Testing Suite", "impl2", Signedness.UNSIGNED, ProcessType.CYCLIC, 6));
+
+        collection
+            .add(new HashFunctionIdentityImpl("Testing Suite", "impl2", Signedness.UNSIGNED, ProcessType.ITERATIVE, 7));
+
+        Collections.shuffle(collection);
+
+        result.addAll(collection);
+        long idx = 0;
+        for (HashFunctionIdentity id : result) {
+            assertEquals("Unexpected order for " + HashFunctionIdentity.asCommonString(id), idx++, id.getSignature());
+        }
+    }
+
+}
diff --git
a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DeepComparatorTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DeepComparatorTest.java new file mode 100644 index 0000000000..d25adede08 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DeepComparatorTest.java @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.TreeSet; + +import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.ProcessType; +import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.Signedness; +import org.junit.Test; + +/** + * Tests of the HashFunctionIdentity.DEEP_COMPARATOR + * + */ +public class DeepComparatorTest { + + private void assertBefore(HashFunctionIdentity identity1, HashFunctionIdentity identity2) { + assertTrue(0 > HashFunctionIdentity.DEEP_COMPARATOR.compare(identity1, identity2)); + } + + private void assertAfter(HashFunctionIdentity identity1, HashFunctionIdentity identity2) { + assertTrue(0 < HashFunctionIdentity.DEEP_COMPARATOR.compare(identity1, identity2)); + } + + /** + * Tests that name order is correct. + */ + @Test + public void nameOrderTestDifferentNames() { + HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, + ProcessType.CYCLIC, 300L); + HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite", "impl2", Signedness.SIGNED, + ProcessType.CYCLIC, 300L); + + assertBefore(impl1, impl2); + assertEquals(0, HashFunctionIdentity.DEEP_COMPARATOR.compare(impl1, impl1)); + assertEquals(0, HashFunctionIdentity.DEEP_COMPARATOR.compare(impl2, impl2)); + assertAfter(impl2, impl1); + } + + /** + * Tests that name order is not affected by case. 
+ */ + @Test + public void nameOrderTestDifferentCapitalization() { + HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, + ProcessType.CYCLIC, 300L); + HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite", "IMPL1", Signedness.SIGNED, + ProcessType.CYCLIC, 300L); + + assertEquals(0, HashFunctionIdentity.DEEP_COMPARATOR.compare(impl1, impl2)); + + } + + /** + * Tests that signedness order is correct. + */ + @Test + public void signednessOrder() { + HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, + ProcessType.CYCLIC, 300L); + HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.UNSIGNED, + ProcessType.CYCLIC, 300L); + + assertBefore(impl1, impl2); + assertEquals(0, HashFunctionIdentity.DEEP_COMPARATOR.compare(impl1, impl1)); + assertEquals(0, HashFunctionIdentity.DEEP_COMPARATOR.compare(impl2, impl2)); + assertAfter(impl2, impl1); + } + + /** + * Tests that process type order is correct. + */ + @Test + public void processTypeOrder() { + HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, + ProcessType.CYCLIC, 300L); + HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, + ProcessType.ITERATIVE, 300L); + + assertBefore(impl1, impl2); + assertEquals(0, HashFunctionIdentity.DEEP_COMPARATOR.compare(impl1, impl1)); + assertEquals(0, HashFunctionIdentity.DEEP_COMPARATOR.compare(impl2, impl2)); + assertAfter(impl2, impl1); + } + + /** + * Tests that producer order is correct. 
+     */
+    @Test
+    public void producerOrder() {
+        HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED,
+            ProcessType.CYCLIC, 300L);
+        HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite2", "impl1", Signedness.SIGNED,
+            ProcessType.CYCLIC, 300L);
+
+        assertBefore(impl1, impl2);
+        assertEquals(0, HashFunctionIdentity.DEEP_COMPARATOR.compare(impl1, impl1));
+        assertEquals(0, HashFunctionIdentity.DEEP_COMPARATOR.compare(impl2, impl2));
+        assertAfter(impl2, impl1);
+    }
+
+    /**
+     * Tests that the ordering is correct when applied to a collection.
+     */
+    @Test
+    public void testSortOrder() {
+        // in this test the signature is the position in the final collection for the ID
+        TreeSet<HashFunctionIdentity> result = new TreeSet<>(HashFunctionIdentity.DEEP_COMPARATOR);
+        List<HashFunctionIdentity> collection = new ArrayList<>();
+
+        collection
+            .add(new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, ProcessType.CYCLIC, 0));
+
+        collection
+            .add(new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, ProcessType.ITERATIVE, 2));
+
+        collection
+            .add(new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.UNSIGNED, ProcessType.CYCLIC, 4));
+
+        collection
+            .add(new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.UNSIGNED, ProcessType.ITERATIVE, 6));
+
+        collection
+            .add(new HashFunctionIdentityImpl("Testing Suite", "impl2", Signedness.SIGNED, ProcessType.CYCLIC, 8));
+
+        collection
+            .add(new HashFunctionIdentityImpl("Testing Suite", "impl2", Signedness.SIGNED, ProcessType.ITERATIVE, 10));
+
+        collection
+            .add(new HashFunctionIdentityImpl("Testing Suite", "impl2", Signedness.UNSIGNED, ProcessType.CYCLIC, 12));
+
+        collection.add(
+            new HashFunctionIdentityImpl("Testing Suite", "impl2", Signedness.UNSIGNED, ProcessType.ITERATIVE, 14));
+
+        collection
+            .add(new HashFunctionIdentityImpl("Testing Suite2", "impl1", Signedness.SIGNED, ProcessType.CYCLIC, 1));
+
+        collection
+            .add(new HashFunctionIdentityImpl("Testing Suite2", "impl1", Signedness.SIGNED, ProcessType.ITERATIVE, 3));
+
+        collection
+            .add(new HashFunctionIdentityImpl("Testing Suite2", "impl1", Signedness.UNSIGNED, ProcessType.CYCLIC, 5));
+
+        collection.add(
+            new HashFunctionIdentityImpl("Testing Suite2", "impl1", Signedness.UNSIGNED, ProcessType.ITERATIVE, 7));
+
+        collection
+            .add(new HashFunctionIdentityImpl("Testing Suite2", "impl2", Signedness.SIGNED, ProcessType.CYCLIC, 9));
+
+        collection
+            .add(new HashFunctionIdentityImpl("Testing Suite2", "impl2", Signedness.SIGNED, ProcessType.ITERATIVE, 11));
+
+        collection
+            .add(new HashFunctionIdentityImpl("Testing Suite2", "impl2", Signedness.UNSIGNED, ProcessType.CYCLIC, 13));
+
+        collection.add(
+            new HashFunctionIdentityImpl("Testing Suite2", "impl2", Signedness.UNSIGNED, ProcessType.ITERATIVE, 15));
+
+        Collections.shuffle(collection);
+
+        result.addAll(collection);
+        long idx = 0;
+        for (HashFunctionIdentity id : result) {
+            assertEquals("Unexpected order for " + id.getProvider() + ":" + HashFunctionIdentity.asCommonString(id),
+                idx++, id.getSignature());
+        }
+    }
+
+}
diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherBuilderTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherBuilderTest.java
new file mode 100644
index 0000000000..148e79246c
--- /dev/null
+++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherBuilderTest.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.security.NoSuchAlgorithmException; +import java.util.PrimitiveIterator.OfInt; + +import org.apache.commons.collections4.bloomfilter.hasher.function.MD5Cyclic; +import org.junit.Before; +import org.junit.Test; + +/** + * DynamicHasher Builder tests. + * + */ +public class DynamicHasherBuilderTest { + + private DynamicHasher.Builder builder; + private Shape shape = new Shape( new MD5Cyclic(), 1, Integer.MAX_VALUE, 1 ); + + /** + * Sets up the builder for testing. + * @throws NoSuchAlgorithmException if MD5 is not available. + */ + @Before + public void setup() throws NoSuchAlgorithmException + { + builder = new DynamicHasher.Builder( new MD5Cyclic()); + } + + /** + * Tests that hashing a byte works as expected. + */ + @Test + public void buildTest_byte() { + DynamicHasher hasher = builder.with((byte) 0x1).build(); + + int expected = 1483089307; + + OfInt iter = hasher.getBits(shape); + + assertTrue(iter.hasNext()); + assertEquals( expected, iter.nextInt() ); + assertFalse( iter.hasNext()); + } + + /** + * Tests that hashing a byte array works as expected. 
+     */
+    @Test
+    public void buildTest_byteArray() {
+        DynamicHasher hasher = builder.with("Hello".getBytes(java.nio.charset.StandardCharsets.UTF_8)).build();
+        int expected = 1519797563;
+
+        OfInt iter = hasher.getBits(shape);
+
+        assertTrue(iter.hasNext());
+        assertEquals( expected, iter.nextInt() );
+        assertFalse( iter.hasNext());
+
+    }
+
+    /**
+     * Tests that hashing a string works as expected.
+     */
+    @Test
+    public void buildTest_String() {
+        DynamicHasher hasher = builder.with("Hello").build();
+        int expected = 1519797563;
+
+        OfInt iter = hasher.getBits(shape);
+
+        assertTrue(iter.hasNext());
+        assertEquals( expected, iter.nextInt() );
+        assertFalse( iter.hasNext());
+    }
+
+    /**
+     * Tests that an empty hasher works as expected.
+     */
+    @Test
+    public void buildTest_Empty() {
+        DynamicHasher hasher = builder.build();
+
+        OfInt iter = hasher.getBits(shape);
+
+        assertFalse(iter.hasNext());
+    }
+}
diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherTest.java
new file mode 100644
index 0000000000..19cdc1d8cb
--- /dev/null
+++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherTest.java
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter.hasher;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.security.NoSuchAlgorithmException;
+import java.util.PrimitiveIterator.OfInt;
+
+import org.apache.commons.collections4.bloomfilter.hasher.function.MD5Cyclic;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests the Dynamic Hasher.
+ *
+ */
+public class DynamicHasherTest {
+    private DynamicHasher.Builder builder;
+    private Shape shape;
+    // Stand-in identity used to build a Shape that does not match the builder's MD5Cyclic (see testGetBits_WongShape).
+    private HashFunctionIdentity testFunction = new HashFunctionIdentity() {
+
+        @Override
+        public String getName() {
+            return "Test Function";
+        }
+
+        @Override
+        public String getProvider() {
+            return "Apache Commons Collection Tests";
+        }
+
+        @Override
+        public Signedness getSignedness() {
+            return Signedness.SIGNED;
+        }
+
+        @Override
+        public ProcessType getProcessType() {
+            return ProcessType.CYCLIC;
+        }
+
+        @Override
+        public long getSignature() {
+            return 0;
+        }
+    };
+
+    /**
+     * Sets up the DynamicHasher builder and the shape (MD5Cyclic, 3, 72, 17) used by the tests.
+     *
+     * @throws NoSuchAlgorithmException if MD5 is not available.
+     */
+    @Before
+    public void setup() throws NoSuchAlgorithmException {
+        builder = new DynamicHasher.Builder(new MD5Cyclic());
+        shape = new Shape(new MD5Cyclic(), 3, 72, 17);
+    }
+
+    /**
+     * Tests that the expected bits are returned from hashing.
+     */
+    @Test
+    public void testGetBits() {
+
+        int[] expected = {6, 69, 44, 19, 10, 57, 48, 23, 70, 61, 36, 11, 2, 49, 24, 15, 62};
+
+        Hasher hasher = builder.with("Hello").build();
+
+        OfInt iter = hasher.getBits(shape);
+
+        for (final int bit : expected) {
+            assertTrue(iter.hasNext());
+            assertEquals(bit, iter.nextInt());
+        }
+        assertFalse(iter.hasNext());
+
+    }
+
+    /**
+     * Tests that bits from multiple hashes are returned correctly.
+     */
+    @Test
+    public void testGetBits_MultipleHashes() {
+        int[] expected = {6, 69, 44, 19, 10, 57, 48, 23, 70, 61, 36, 11, 2, 49, 24, 15, 62, 1, 63, 53, 43, 17, 7, 69,
+            59, 49, 39, 13, 3, 65, 55, 45, 35, 25};
+
+        Hasher hasher = builder.with("Hello").with("World").build();
+
+        OfInt iter = hasher.getBits(shape);
+
+        for (final int bit : expected) {
+            assertTrue(iter.hasNext());
+            assertEquals(bit, iter.nextInt());
+        }
+        assertFalse(iter.hasNext());
+
+    }
+
+    /**
+     * Tests that retrieving bits for the wrong shape throws an exception.
+     */
+    @Test
+    public void testGetBits_WongShape() {
+
+        Hasher hasher = builder.with("Hello").build();
+
+        try {
+            hasher.getBits(new Shape(testFunction, 3, 72, 17));
+            fail("Should have thrown IllegalArgumentException");
+        } catch (IllegalArgumentException expected) {
+            // expected: the shape is built on testFunction rather than the builder's MD5Cyclic
+        }
+    }
+
+    /**
+     * Tests if isEmpty() reports correctly.
+ */ + @Test + public void testIsEmpty() { + assertTrue( builder.build().isEmpty() ); + assertFalse( builder.with("Hello").build().isEmpty() ); + } + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImplTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImplTest.java new file mode 100644 index 0000000000..365c73527f --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImplTest.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import static org.junit.Assert.assertEquals; + +import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.Signedness; +import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.ProcessType; +import org.junit.Test; + +/** + * Tests the HashFunctionIdentity implementation. + * + */ +public class HashFunctionIdentityImplTest { + + /** + * Tests a copy constructor of the HashFunctionIdentity. 
+ */ + @Test + public void copyConstructorTest() { + HashFunctionIdentity identity = new HashFunctionIdentity() { + + @Override + public String getName() { + return "NAME"; + } + + @Override + public String getProvider() { + return "Provider"; + } + + @Override + public Signedness getSignedness() { + return Signedness.SIGNED; + } + + @Override + public ProcessType getProcessType() { + return ProcessType.CYCLIC; + } + + @Override + public long getSignature() { + return -1L; + } + + }; + HashFunctionIdentityImpl impl = new HashFunctionIdentityImpl( identity ); + assertEquals( "NAME", impl.getName()); + assertEquals( "Provider", impl.getProvider()); + assertEquals( Signedness.SIGNED, impl.getSignedness()); + assertEquals( ProcessType.CYCLIC, impl.getProcessType()); + assertEquals( -1L, impl.getSignature()); + } + + /** + * Test the constructor from component values. + * The values supplied are the name of the provider, + * the name of the hash function, + * the signedness of the hash function, + * the process type of the hash function, + * and the signature for the hash function.
+ */ + @Test + public void valuesConstructorTest() { + HashFunctionIdentityImpl impl = new HashFunctionIdentityImpl( "Provider", "NAME", + Signedness.UNSIGNED, ProcessType.ITERATIVE, -2L); + assertEquals( "NAME", impl.getName()); + assertEquals( "Provider", impl.getProvider()); + assertEquals( Signedness.UNSIGNED, impl.getSignedness()); + assertEquals( ProcessType.ITERATIVE, impl.getProcessType()); + assertEquals( -2L, impl.getSignature()); + } + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/ShapeTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/ShapeTest.java new file mode 100644 index 0000000000..6ed068e2dc --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/ShapeTest.java @@ -0,0 +1,497 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.fail; + +import java.util.Objects; + +import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; +import org.apache.commons.collections4.bloomfilter.hasher.Shape; +import org.junit.Test; + +/** + * Tests the Shape class. + * + */ +public class ShapeTest { + + + private HashFunctionIdentity testFunction = new HashFunctionIdentity() { + + @Override + public String getName() { + return "Test Function"; + } + + @Override + public String getProvider() { + return "Apache Commons Collection Tests"; + } + + @Override + public Signedness getSignedness() { + return Signedness.SIGNED; + } + + @Override + public ProcessType getProcessType() { + return ProcessType.CYCLIC; + } + + @Override + public long getSignature() { + return 0; + }}; + + /* + * values from https://hur.st/bloomfilter/?n=5&p=.1&m=&k= + * + * n = 5 + * + * p = 0.100375138 (1 in 10) + * + * m = 24 (3B) + * + * k = 3 + */ + + + private Shape shape = new Shape(testFunction, 5, 0.1); + + /** + * Tests that the constructor with a null name, number of items, and probability fails. + */ + @Test + public void constructor_np_noName() { + + try { + new Shape(null, 5, 0.1); + fail( "Should throw IllegalArgumentException"); + } + catch (IllegalArgumentException expected) + { + // do nothing + } + } + + /** + * Tests that the constructor with a null name, number of items and size of filter fails. + */ + @Test + public void constructor_nm_noName() { + + try { + new Shape(null, 5, 72); + fail( "Should throw IllegalArgumentException"); + } + catch (IllegalArgumentException expected) + { + // do nothing + } + } + + /** + * Tests that the constructor with a null name, number of items, size of filter, + * and number of functions fails.
+ */ + @Test + public void constructor_nmk_noName() { + + try { + new Shape(null, 5, 72, 17); + fail( "Should throw IllegalArgumentException"); + } + catch (IllegalArgumentException expected) + { + // do nothing + } + } + + /** + * Tests that the constructor with a null name, probability, size of filter, + * and number of functions fails. + */ + @Test + public void constructor_pmk_noName() { + + try { + new Shape(null, 0.1, 72, 17); + fail( "Should throw IllegalArgumentException"); + } + catch (IllegalArgumentException expected) + { + // do nothing + } + } + + /** + * Tests the the probability is calculated correctly. + */ + @Test + public void constructor_items_probability_Test() { + + assertEquals(24, shape.getNumberOfBits()); + assertEquals(3, shape.getNumberOfBytes()); + assertEquals(3, shape.getNumberOfHashFunctions()); + assertEquals(5, shape.getNumberOfItems()); + assertEquals(0.100375138, shape.getProbability(), 0.000001); + + } + + /** + * Tests that if calculated number of bits is greater than Integer.MAX_VALUE an + * IllegalArgumentException is thrown. + */ + @Test + public void constructor_items_probability_NumberOfBitsOverflowTest() { + try { + new Shape( testFunction, Integer.MAX_VALUE, 1.0 / 10); + fail("Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing. + } + } + + /** + * Tests that if the number of items is less than 1 an IllegalArgumentException is + * thrown. + */ + @Test + public void constructor_items_probability_BadNumberOfItemsTest() { + try { + new Shape( testFunction, 0, 1.0 / 10); + fail("Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing. + } + } + + /** + * Tests that if the probability is less than or equal to 0 an IllegalArgumentException + * is thrown. 
+ */ + @Test + public void constructor_items_probability_BadProbabilityTest() { + try { + new Shape(testFunction, 10, 0.0); + fail("Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing. + } + + try { + new Shape(testFunction, 10, 1.0); + fail("Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing. + } + } + + /** + * Tests that the number of items and number of bits is passed the other values are + * calculated correctly. + */ + @Test + public void constructor_items_bitsTest() { + /* + * values from https://hur.st/bloomfilter/?n=5&m=24 + */ + Shape filterConfig = new Shape(testFunction, 5, 24); + + assertEquals(24, filterConfig.getNumberOfBits()); + assertEquals(3, filterConfig.getNumberOfBytes()); + assertEquals(3, filterConfig.getNumberOfHashFunctions()); + assertEquals(5, filterConfig.getNumberOfItems()); + assertEquals(0.100375138, filterConfig.getProbability(), 0.000001); + + } + + /** + * Tests that if the number of items less than 1 an IllegalArgumentException + * is thrown. + */ + @Test + public void constructor_items_bits_BadNumberOfItemsTest() { + try { + new Shape(testFunction, 0, 24); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) + { + //expected + } + } + + /** + * Tests that if the number of bits less than 8 an IllegalArgumentException + * is thrown. + */ + @Test + public void constructor_items_bits_BadNumberOfBitsTest() { + try { + new Shape(testFunction, 5, 6); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) + { + //expected + } + } + + /** + * Tests that if the number of hash functions is less than 1 an exception is thrown. 
+ */ + @Test + public void constructor_items_bits_BadNumberOfHashFunctionsTest() { + try { + new Shape(testFunction, 16,8); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) + { + //expected + } + } + + /** + * Tests that when the number of items, number of bits and number of hash functions + * is passed the values are calculated correctly. + */ + @Test + public void constructor_items_bits_hashTest() { + /* + * values from https://hur.st/bloomfilter/?n=5&m=24&k=4 + */ + Shape filterConfig = new Shape(testFunction, 5, 24, 4); + + assertEquals(24, filterConfig.getNumberOfBits()); + assertEquals(3, filterConfig.getNumberOfBytes()); + assertEquals(4, filterConfig.getNumberOfHashFunctions()); + assertEquals(5, filterConfig.getNumberOfItems()); + assertEquals(0.102194782, filterConfig.getProbability(), 0.000001); + + } + + /** + * Tests that if the number of items is less than 1 an exception is thrown. + */ + @Test + public void constructor_items_bits_hash_BadNumberOfItemsTest() { + try { + new Shape(testFunction, 0, 24, 1); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) + { + //expected + } + } + + /** + * Tests that if the number of bits is less than 8 an exception is thrown + */ + @Test + public void constructor_items_bits_hash_BadNumberOfBitsTest() { + try { + new Shape(testFunction, 5, 6, 1); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) + { + //expected + } + } + + /** + * Tests that if the number of hash functions is less than 1 an exception is + * thrown. 
+ */ + @Test + public void constructor_items_bits_hash_BadNumberOfHashFunctionsTest() { + try { + new Shape(testFunction, 5, 24, 0); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) + { + //expected + } + } + + /** + * Tests that if the calculated probability is greater than or equal to 1 an + * IllegalArgumentException is thrown + */ + @Test + public void constructor_items_bits_hash_BadProbabilityTest() { + try { + new Shape(testFunction, 4000,8,1); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) + { + //expected + } + } + + /** + * Tests the calculated values of calling the constructor with the + * probability, number of bits and number of hash functions. + */ + @Test + public void constructor_probability_bits_hashTest() { + /* + * values from https://hur.st/bloomfilter/?n=5&p=.1&m=&k= + */ + Shape filterConfig = new Shape(testFunction, 0.1, 24, 3); + + assertEquals(24, filterConfig.getNumberOfBits()); + assertEquals(3, filterConfig.getNumberOfBytes()); + assertEquals(3, filterConfig.getNumberOfHashFunctions()); + assertEquals(5, filterConfig.getNumberOfItems()); + assertEquals(0.100375138, filterConfig.getProbability(), 0.000001); + } + + /** + * Tests that invalid probability values cause and IllegalArgumentException to + * be thrown. 
+ */ + @Test + public void constructor__probability_bits_hash_BadProbabilityTest() { + // probability should not be 0 + try { + new Shape(testFunction, 0.0, 24, 1); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) + { + //expected + } + + // probability should not be = -1 + try { + new Shape(testFunction, -1.0, 24, 1); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) + { + //expected + } + + // probability should not be < -1 + try { + new Shape(testFunction, -1.5, 24, 1); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) + { + //expected + } + + // probability should not be = 1 + try { + new Shape(testFunction, 1.0, 24, 1); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) + { + //expected + } + + // probability should not be > 1 + try { + new Shape(testFunction, 2.0, 24, 1); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) + { + //expected + } + } + + /** + * Tests that if the number of bits is less than 8 an exception is thrown + */ + @Test + public void constructor__probability_bits_hash__BadNumberOfBitsTest() { + try { + new Shape(testFunction, 0.5, 6, 1); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) + { + //expected + } + } + + /** + * Tests that if the number of functions is less than 1 an exception is thrown + */ + @Test + public void constructor_probability_bits_hash_BadNumberOfHashFunctionsTest() { + try { + new Shape(testFunction, 0.5, 24, 0); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) + { + //expected + } + } + + /** + * Test equality of shape. 
+ */ + @Test + public void equalsTest() { + + assertEquals(new Shape(testFunction, 5, 1.0 / 10), shape); + assertNotEquals(new Shape(testFunction, 5, 1.0 / 11), shape); + assertNotEquals(new Shape(testFunction, 4, 1.0 / 10), shape); + + HashFunctionIdentity testFunction2 = new HashFunctionIdentity() { + + @Override + public String getName() { + return "Test Function2"; + } + + @Override + public String getProvider() { + return "Apache Commons Collection Tests"; + } + + @Override + public Signedness getSignedness() { + return Signedness.SIGNED; + } + + @Override + public ProcessType getProcessType() { + return ProcessType.CYCLIC; + } + + @Override + public long getSignature() { + return 0; + }}; + /* Same n and p as the shape under test, so only the hash function identity differs. */ + assertNotEquals(new Shape(testFunction2, 5, 1.0 / 10), shape); + + } + + /** + * Tests that hashCode matches Objects.hash of the hash function identity, the number of bits and the number of hash functions. + */ + @Test + public void hashCodeTest() { + int hashCode = Objects.hash(testFunction, 24, 3 ); + assertEquals(hashCode, shape.hashCode()); + } + + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasherTest.java new file mode 100644 index 0000000000..29e038de7d --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasherTest.java @@ -0,0 +1,338 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.PrimitiveIterator.OfInt; + +import org.junit.Test; + +/** + * Tests the static hasher. + */ +public class StaticHasherTest { + + private HashFunctionIdentity testFunction = new HashFunctionIdentity() { + + @Override + public String getName() { + return "Test Function"; + } + + @Override + public String getProvider() { + return "Apache Commons Collection Tests"; + } + + @Override + public Signedness getSignedness() { + return Signedness.SIGNED; + } + + @Override + public ProcessType getProcessType() { + return ProcessType.CYCLIC; + } + + @Override + public long getSignature() { + return 0; + } + }; + + private HashFunctionIdentity testFunctionX = new HashFunctionIdentity() { + + @Override + public String getName() { + return "Test FunctionX"; + } + + @Override + public String getProvider() { + return "Apache Commons Collection Tests"; + } + + @Override + public Signedness getSignedness() { + return Signedness.SIGNED; + } + + @Override + public ProcessType getProcessType() { + return ProcessType.CYCLIC; + } + + @Override + public long getSignature() { + return 0; + } + }; + + private Shape shape = new Shape(testFunction, 3, 72, 17); + + /** + * Tests that getBits returns the proper values. 
+ */ + @Test + public void testGetBits() { + List<Integer> lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + + StaticHasher hasher = new StaticHasher(lst.iterator(), shape); + assertEquals(17, hasher.size()); + OfInt iter = hasher.getBits(shape); + for (int i = 0; i < 17; i++) { + assertTrue(iter.hasNext()); + assertEquals(i, iter.nextInt()); + } + assertFalse(iter.hasNext()); + + } + + /** + * Tests that getBits does not return duplicates and orders the indices. + */ + @Test + public void testGetBits_DuplicateValues() { + int[] input = {6, 69, 44, 19, 10, 57, 48, 23, 70, 61, 36, 11, 2, 49, 24, 15, 62, 1, 63, 53, 43, 17, 7, 69, 59, + 49, 39, 13, 3, 65, 55, 45, 35, 25}; + int[] expected = {1, 2, 3, 6, 7, 10, 11, 13, 15, 17, 19, 23, 24, 25, 35, 36, 39, 43, 44, 45, 48, 49, 53, 55, 57, + 59, 61, 62, 63, 65, 69, 70}; + + StaticHasher hasher = new StaticHasher(Arrays.stream(input).iterator(), shape); + + OfInt iter = hasher.getBits(shape); + for (int i = 0; i < expected.length; i++) { + assertTrue(iter.hasNext()); + assertEquals(expected[i], iter.nextInt()); + } + assertFalse(iter.hasNext()); + } + + /** + * Tests that if getBits is called with the wrong shape an exception is thrown. + */ + @Test + public void testGetBits_WrongShape() { + List<Integer> lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + StaticHasher hasher = new StaticHasher(lst.iterator(), shape); + + try { + hasher.getBits(new Shape(testFunctionX, 3, 72, 17)); + fail("Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing + } + + } + + /** + * Test that the iterator based constructor works correctly and removes duplicates.
+ */ + @Test + public void testConstructor_Iterator() { + + int[] values = {1, 3, 5, 7, 9, 3, 5, 1}; + Iterator iter = Arrays.stream(values).iterator(); + StaticHasher hasher = new StaticHasher(iter, shape); + + assertEquals(5, hasher.size()); + assertEquals(shape, hasher.getShape()); + assertEquals(0, HashFunctionIdentity.DEEP_COMPARATOR.compare(testFunction, hasher.getHashFunctionIdentity())); + + iter = hasher.getBits(shape); + int idx = 0; + while (iter.hasNext()) { + assertEquals("Error at idx " + idx, Integer.valueOf(values[idx]), iter.next()); + idx++; + } + assertEquals(5, idx); + } + + /** + * Tests that if the iterator passed to the constructor contains a value greater than + * or equal to Shape.numerOfBits() an exception is thrown. + */ + @Test + public void testConstructor_Iterator_ValueTooBig() { + + int[] values = {shape.getNumberOfBits(), 3, 5, 7, 9, 3, 5, 1}; + Iterator iter = Arrays.stream(values).iterator(); + try { + new StaticHasher(iter, shape); + fail("Should have thown IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing + } + } + + /** + * Tests that if the iterator passed to the constructor contains a value less than 0 + * (zero) an exception is thrown. + */ + @Test + public void testConstructor_Iterator_ValueTooSmall() { + + int[] values = {-1, 3, 5, 7, 9, 3, 5, 1}; + Iterator iter = Arrays.stream(values).iterator(); + try { + new StaticHasher(iter, shape); + fail("Should have thown IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing + } + } + + /** + * Compare 2 static hashers to verify they have the same bits enabled. + * + * @param hasher1 the first static hasher. + * @param hasher2 the second static hasher. 
+ */ + private void assertSameBits(StaticHasher hasher1, StaticHasher hasher2) { + OfInt iter1 = hasher1.getBits(shape); + OfInt iter2 = hasher2.getBits(shape); + + while (iter1.hasNext()) { + assertTrue("Not enough data in second hasher", iter2.hasNext()); + assertEquals(iter1.nextInt(), iter2.nextInt()); + } + assertFalse("Too much data in second hasher", iter2.hasNext()); + } + + /** + * Tests that the constructor that accepts a static hasher properly builds the hasher. + */ + @Test + public void testConstructor_StaticHasher() { + int[] values = {1, 3, 5, 7, 9, 3, 5, 1}; + Iterator<Integer> iter = Arrays.stream(values).iterator(); + StaticHasher hasher = new StaticHasher(iter, shape); + + StaticHasher hasher2 = new StaticHasher(hasher, shape); + assertEquals(shape, hasher2.getShape()); + assertSameBits(hasher, hasher2); + + } + + /** + * Tests that calling the constructor with a hasher and the wrong shape throws an + * IllegalArgumentException. + */ + @Test + public void testConstructor_StaticHasher_WrongShape() { + int[] values = {1, 3, 5, 7, 9, 3, 5, 1}; + Iterator<Integer> iter = Arrays.stream(values).iterator(); + StaticHasher hasher = new StaticHasher(iter, new Shape(testFunctionX, 3, 72, 17)); + + try { + new StaticHasher(hasher, shape); + fail("Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing + } + } + + /** + * Tests that passing a hasher other than a StaticHasher to the constructor works as + * expected.
+ */ + @Test + public void testConstructor_Hasher() { + int[] expected = {1, 3, 5, 7, 9}; + + Hasher testHasher = new Hasher() { + + @Override + public boolean isEmpty() { return false; } + + @Override + public HashFunctionIdentity getHashFunctionIdentity() { + return testFunction; + } + + @Override + public OfInt getBits(Shape shape) { + int[] values = {1, 3, 5, 7, 9, 3, 5, 1}; + return Arrays.stream(values).iterator(); + } + }; + + StaticHasher hasher = new StaticHasher(testHasher, shape); + OfInt iter = hasher.getBits(shape); + for (int i = 0; i < expected.length; i++) { + assertTrue(iter.hasNext()); + assertEquals(expected[i], iter.nextInt()); + } + assertFalse(iter.hasNext()); + } + + /** + * Tests that passing a hasher other than a StaticHahser and the wrong Shape to the + * constructor throws an IllegalArgumentException. + */ + @Test + public void testConstructor_Hasher_WrongShape() { + Hasher testHasher = new Hasher() { + + @Override + public boolean isEmpty() { return false; } + + @Override + public HashFunctionIdentity getHashFunctionIdentity() { + return testFunctionX; + } + + @Override + public OfInt getBits(Shape shape) { + int[] values = {1, 3, 5, 7, 9, 3, 5, 1}; + return Arrays.stream(values).iterator(); + } + }; + + try { + new StaticHasher(testHasher, shape); + fail("Should have thown IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing + } + } + + /** + * Tests if isEmpty() reports correctly. 
+ */ + @Test + public void testIsEmpty() { + List<Integer> lst = new ArrayList<>(); + StaticHasher hasher = new StaticHasher(lst.iterator(), shape); + + + assertTrue( hasher.isEmpty() ); + + lst.add( Integer.valueOf( 1 )); + hasher = new StaticHasher(lst.iterator(), shape); + assertFalse( hasher.isEmpty() ); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5CyclicTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5CyclicTest.java new file mode 100644 index 0000000000..348fc3b20a --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5CyclicTest.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.hasher.function; + +import static org.junit.Assert.assertEquals; + +import java.nio.charset.StandardCharsets; +import java.util.Locale; + +import org.apache.commons.collections4.bloomfilter.hasher.function.MD5Cyclic; +import org.junit.Test; + +/** + * Tests the MD5 cyclic hash function. + * + */ +public class MD5CyclicTest { + + /** + * Test that the apply function returns the proper values.
+ */ + @Test + public void applyTest() { + MD5Cyclic md5 = new MD5Cyclic(); + long l1 = 0x8b1a9953c4611296L; + long l2 = 0xa827abf8c47804d7L; + byte[] buffer = "Hello".getBytes(); + + long l = md5.apply(buffer, 0); + assertEquals(l1, l); + l = md5.apply(buffer, 1); + assertEquals(l1 + l2, l); + l = md5.apply(buffer, 2); + assertEquals(l1 + l2 + l2, l); + } + + /** + * Test that the signature is properly generated. + */ + @Test + public void signatureTest() { + MD5Cyclic md5 = new MD5Cyclic(); + String arg = String.format("%s-%s-%s", md5.getName().toUpperCase(Locale.ROOT), md5.getSignedness(), + md5.getProcessType()); + long expected = md5.apply(arg.getBytes(StandardCharsets.UTF_8), 0); + assertEquals(expected, md5.getSignature()); + } + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x86CyclicTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x86CyclicTest.java new file mode 100644 index 0000000000..b3de2d1b6d --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x86CyclicTest.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter.hasher.function; + +import static org.junit.Assert.assertEquals; + +import java.nio.charset.StandardCharsets; +import java.util.Locale; + +import org.apache.commons.collections4.bloomfilter.hasher.function.Murmur128x86Cyclic; +import org.junit.Test; + +/** + * Test that the Murmur3 128 x86 hash function works correctly. + * + */ +public class Murmur128x86CyclicTest { + + /** + * Test that the apply function returns the proper values. + */ + @Test + public void applyTest() { + Murmur128x86Cyclic murmur = new Murmur128x86Cyclic(); + + long l1 = 0xe7eb60dabb386407L; + long l2 = 0xc3ca49f691f73056L; + byte[] buffer = "Now is the time for all good men to come to the aid of their country" + .getBytes(StandardCharsets.UTF_8); + + long l = murmur.apply(buffer, 0); + assertEquals(l1, l); + l = murmur.apply(buffer, 1); + assertEquals(l1 + l2, l); + l = murmur.apply(buffer, 2); + assertEquals(l1 + l2 + l2, l); + } + + /** + * Test that the signature is properly generated. + */ + @Test + public void signatureTest() { + Murmur128x86Cyclic murmur = new Murmur128x86Cyclic(); + String arg = String.format("%s-%s-%s", murmur.getName().toUpperCase(Locale.ROOT), murmur.getSignedness(), + murmur.getProcessType()); + long expected = murmur.apply(arg.getBytes(StandardCharsets.UTF_8), 0); + assertEquals(expected, murmur.getSignature()); + } + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86IterativeTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86IterativeTest.java new file mode 100644 index 0000000000..a12a743e12 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86IterativeTest.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.hasher.function; + +import static org.junit.Assert.assertEquals; + +import java.nio.charset.StandardCharsets; +import java.util.Locale; + +import org.apache.commons.collections4.bloomfilter.hasher.function.Murmur32x86Iterative; +import org.junit.Test; + +/** + * Tests that the Murmur3 32 x86 iterative hash function works correctly. + * Covers {@code apply} for seeds 0, 1 and 2 and the value returned by {@code getSignature}. + */ +public class Murmur32x86IterativeTest { + + /** + * Tests that {@code apply} returns 82674681, -1475490736 and -1561435247 for seeds 0, 1 and 2 on a fixed buffer. + */ + @Test + public void applyTest() { + Murmur32x86Iterative murmur = new Murmur32x86Iterative(); + + byte[] buffer = "Now is the time for all good men to come to the aid of their country" + .getBytes(StandardCharsets.UTF_8); + + long l = murmur.apply(buffer, 0); + assertEquals(82674681, l); + l = murmur.apply(buffer, 1); + assertEquals(-1475490736, l); + l = murmur.apply(buffer, 2); + assertEquals(-1561435247, l); + } + + /** + * Tests that {@code getSignature} equals {@code apply} with seed 0 over the UTF-8 bytes of the upper-cased name, signedness and process type joined by hyphens.
+ */ + @Test + public void signatureTest() { + Murmur32x86Iterative murmur = new Murmur32x86Iterative(); + String arg = String.format("%s-%s-%s", murmur.getName().toUpperCase(Locale.ROOT), murmur.getSignedness(), + murmur.getProcessType()); + long expected = murmur.apply(arg.getBytes(StandardCharsets.UTF_8), 0); + assertEquals(expected, murmur.getSignature()); + } + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterativeTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterativeTest.java new file mode 100644 index 0000000000..fec1f4160c --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterativeTest.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.hasher.function; + +import static org.junit.Assert.assertEquals; + +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Locale; +import org.apache.commons.collections4.bloomfilter.hasher.function.ObjectsHashIterative; +import org.junit.Test; + +/** + * Tests that the Objects hash works correctly.
+ * + */ +public class ObjectsHashIterativeTest { + + /** + * Tests that each {@code apply} result equals {@code Arrays.deepHashCode} of the running sum of earlier results (0 at first) and the buffer. + */ + @Test + public void applyTest() { + ObjectsHashIterative obj = new ObjectsHashIterative(); + + byte[] buffer = "Now is the time for all good men to come to the aid of their country" + .getBytes(StandardCharsets.UTF_8); + + long l = obj.apply(buffer, 0); + long prev = 0; + assertEquals(Arrays.deepHashCode(new Object[] {prev, buffer}), l); + prev += l; + l = obj.apply(buffer, 1); + assertEquals(Arrays.deepHashCode(new Object[] {prev, buffer}), l); + prev += l; + l = obj.apply(buffer, 2); + assertEquals(Arrays.deepHashCode(new Object[] {prev, buffer}), l); + } + + /** + * Tests that two seed-0 {@code apply} calls over the signature string bytes return the same value and that this value equals {@code getSignature}. + */ + @Test + public void signatureTest() { + ObjectsHashIterative obj = new ObjectsHashIterative(); + String arg = String.format("%s-%s-%s", obj.getName().toUpperCase(Locale.ROOT), obj.getSignedness(), + obj.getProcessType()); + long expected = obj.apply(arg.getBytes(StandardCharsets.UTF_8), 0); + long expected2 = obj.apply(arg.getBytes(StandardCharsets.UTF_8), 0); + assertEquals(expected, expected2); + assertEquals(expected, obj.getSignature()); + } + +} From d61b83be16d27fd37b3a45d7443ae202b99fcd66 Mon Sep 17 00:00:00 2001 From: Gary Gregory Date: Fri, 24 Jan 2020 18:15:22 -0500 Subject: [PATCH 2/2] [COLLECTIONS-728] BloomFilter contribution. --- src/changes/changes.xml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/changes/changes.xml b/src/changes/changes.xml index f6bf7d5373..d5a290d74d 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -111,6 +111,9 @@ Add org.apache.commons.collections4.EnumerationUtils.asIterable(Enumeration). + + BloomFilter contribution. +