A CompactHashSet Implementation

There are two commonly used Set implementations in Java by default: HashSet and CopyOnWriteArraySet. The main attributes or each implementation are:

HashSet:

  • Not Thread-Safe
  • Seemingly random ordering of items
  • Very very fast
  • Sparse by default (much more space used than the number of elements)
  • Implemented on top of HashMap

CopyOnWriteArraySet

  • Thread-Safe
  • Items are ordered according to their insertion order
  • Relatively fast for small data-sets
  • Compact (it’s size is exactly the number of elements)
  • Implemented on top of CopyOnWriteArrayList

What I wanted was an implementation with a mixture of these attributes. Basically a CopyOnWriteArraySet with the same Thread-Safety and size considerations, but that is nice and fast for medium and large data-sets. What I came up with I call the CompactHashMap. It’s a relatively simple implementation that stores it’s elements ordered by their hash-codes. This allows the implementation to binary-search the array and jump to where the element is likely to be before doing a normal Object.equals() search.

The implementation is below:

import java.util.Set;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Collection;
import java.util.Comparator;
import java.util.Collections;
import java.util.AbstractSet;
import java.util.NoSuchElementException;

/**
*
* A Compact {@code Set} implementation similar to {@link java.util.concurrent.CopyOnWriteArraySet}.
* This class is thread-safe in that all operations that can cause change will be run inside
* a synchronized block. Any time a write occures the underlying array is copied for the change
* to be made. This is useful when the number of reads vastly outnumbers the number of writes
*

* Like {@code CopyOnWriteArraySet} this class’s {@code Iterator} will be un-affected by
* subsequent changes to the {@code Set} object. This is because they have two separate
* arrays.
*

* The bulk {@link #addAll(java.util.Collection)} method (unlike {@code CopyOnWriteArraySet})
* acts atomically. The changes will not be seen until the method has completed. Also unlike
* {@code CopyOnWriteArraySet} this class yeilds excelent time performance on non-modifying
* method invokations ({@link #add(java.lang.Object) add(existingValue)},
* {@link #contains(java.lang.Object)}, etc.). This is because this class keeps it’s array
* sorted according to the objects {@link Object#hashCode() hashCodes}. This means the
* array can be binary-searched instead of searched from begining to end.
*

* It is strongly recommended that the objects added to a {@code CompactHashSet} cache their
* hashCodes rather than computing them with each invokation to {@link Object#hashCode()}. If
* the class is immutable the hashCode can be computed in the constructor for additional performace.
*

* This class operates best with medium to large data-sets. With a small data-set a
* {@code CopyOnWriteArraySet} may yeild better time performance. However this class will
* always yeild much better space performance than a normal {@link java.util.HashSet}.
*
* @author Jason Morris
*/
public class CompactHashSet extends AbstractSet {
private static final Comparator HASH_COMPARATOR = new Comparator() {
public int compare(final Object o1, final Object o2) {
return o1.hashCode() – o2.hashCode();
}

};

private volatile Object[] entries = null;

public CompactHashSet() {
}

public CompactHashSet(final Collection collection) {
addAll(collection);
}

private static int search(final Object[] array, final int hash) {
int min = 0;
int max = array.length – 1;

while(min <= max) { final int midPoint = (min + max) >>> 1;
@SuppressWarnings(“unchecked”)
final int midHash = (array[midPoint]).hashCode();

if(midHash < hash) { min = midPoint + 1; } else if(midHash > hash) {
max = midPoint – 1;
} else {
return midPoint;
}
}

return -min – 1; // this is the same as Arrays.binarySearch
}

/**
* When doing a {@link #addAll(java.util.Collection)} operation, we use this method
* since the array is in an unsorted state for the duration of that method.
*/
private static boolean slowSearch(final Object[] array, final Object value) {
final int len = array.length;
final int hash = value.hashCode();

for(int i = 0; i < len; i++) { final Object other = array[i]; if(other != null) { if(other.hashCode() == hash && value.equals(other)) { return true; } } else { break; } } return false; } private static int getEntry(final Object value, final Object[] array, final int hash, final int index) { for(int i = index; i >= 0; i–) {
if(array[i].hashCode() != hash) {
break;
} else if(array[i].equals(value)) {
return i;
}
}

final int len = array.length;

for(int i = index + 1; i < len; i++) { if(array[i].hashCode() != hash) { break; } else if(array[i].equals(value)) { return i; } } return -1; } private void removeEntry(final int index) { synchronized(this) { final int len = entries.length - index - 1; final Object[] tmp = new Object[entries.length - 1]; System.arraycopy(entries, 0, tmp, 0, index); System.arraycopy(entries, index + 1, tmp, index, len); entries = tmp; } } @SuppressWarnings("unchecked") private void add0(final E e, final int index) { final Object[] tmp = new Object[entries.length + 1]; final int len = entries.length - index; System.arraycopy(entries, 0, tmp, 0, index); if(index < entries.length) { System.arraycopy(entries, index, tmp, index + 1, len); } tmp[index] = e; entries = tmp; } @SuppressWarnings("unchecked") private boolean remove0(final Object o, final int hash, final int probableIndex) { final int realIndex = getEntry(o, entries, hash, probableIndex); if(realIndex != -1) { removeEntry(realIndex); return true; } else { return false; } } @Override public boolean add(final E e) { if(e == null) { throw new NullPointerException(); } final int hash = e.hashCode(); synchronized(this) { if(entries == null) { entries = new Object[]{e}; return true; } else { final int index = search(entries, hash); if(index < 0) { add0(e, -(index + 1)); return true; } else if(getEntry(e, entries, hash, index) == -1) { add0(e, index); return true; } else { return false; } } } } @Override public boolean addAll(final Collection c) {
if(c.isEmpty()) {
return false;
} else {
synchronized(this) {
boolean modified = false;

final Object[] newValues = entries != null ?
Arrays.copyOf(entries, entries.length + c.size()) :
new Object[c.size()];

int index = entries != null ? entries.length : 0;

final Iterator it = c.iterator();

while(it.hasNext()) {
final E obj = it.next();

if(obj != null && !slowSearch(newValues, obj)) {
newValues[index++] = obj;
modified |= true;
}
}

if(newValues.length > index) {
entries = Arrays.copyOf(newValues, index);
} else {
entries = newValues;
}

Arrays.sort(entries, HASH_COMPARATOR);

return modified;
}
}
}

@Override
public boolean remove(final Object o) {
if(o == null || isEmpty()) {
return false;
}

synchronized(this) {
final int hash = o.hashCode();

if(size() == 1) {
final Object ent = entries[0];
if(ent.hashCode() == hash && ent.equals(o)) {
entries = null;
return true;
}
} else {
final int index = search(entries, hash);

if(index >= 0) {
return remove0(o, hash, index);
}
}
}

return false;
}

@Override
public boolean contains(final Object o) {
if(o == null || isEmpty()) {
return false;
}

final Object[] array = entries;

final int hash = o.hashCode();
final int index = search(array, hash);

if(index >= 0) {
return getEntry(o, array, hash, index) != -1;
}

return false;
}

@Override
public Iterator iterator() {
if(isEmpty()) {
final Set set = Collections.emptySet();
return set.iterator();
}

return new EntryIterator();
}

@Override
public void clear() {
synchronized(this) {
entries = null;
}
}

@Override
public int size() {
return entries != null ? entries.length : 0;
}

private class EntryIterator implements Iterator {
private final Object[] values = entries;

private int index = -1;

public boolean hasNext() {
return index < values.length; } @SuppressWarnings("unchecked") public E next() { if(index < values.length) { return (E)values[++index]; } else { throw new NoSuchElementException("No more elements in CompactHashSet"); } } public void remove() { if(values[index] != null) { CompactHashSet.this.remove(values[index]); this.values[index] = null; } } } } [/sourcecode]

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s