Fix uniquify to handle multiple successive duplicates (#126889) (#126951)

CollectionUtils.uniquify is based on C++ std::unique. However, C++
iterators are not quite the same as Java iterators. In particular,
advancing them only allows grabbing the value once. This commit reworks
uniquify to be based on list indices instead of iterators.

closes #126883

Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
This commit is contained in:
Ryan Ernst 2025-04-17 14:05:21 -07:00 committed by GitHub
parent b111bf6d5e
commit 24b4966b2c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 36 additions and 10 deletions

View file

@ -0,0 +1,6 @@
pr: 126889
summary: Rework uniquify to not use iterators
area: Infra/Core
type: bug
issues:
- 126883

View file

@ -51,20 +51,24 @@ public class CollectionUtils {
return;
}
ListIterator<T> uniqueItr = list.listIterator();
ListIterator<T> existingItr = list.listIterator();
T uniqueValue = uniqueItr.next(); // get first element to compare with
existingItr.next(); // advance the existing iterator to the second element, where we will begin comparing
ListIterator<T> resultItr = list.listIterator();
ListIterator<T> currentItr = list.listIterator(1); // start at second element to compare
T lastValue = resultItr.next(); // result always includes first element, so advance it and grab first
do {
T existingValue = existingItr.next();
if (cmp.compare(existingValue, uniqueValue) != 0 && (uniqueValue = uniqueItr.next()) != existingValue) {
uniqueItr.set(existingValue);
T currentValue = currentItr.next(); // each iter we check if the next element is different from the last we put in the result
if (cmp.compare(lastValue, currentValue) != 0) {
lastValue = currentValue;
resultItr.next(); // advance result so the current position is where we want to set
if (resultItr.previousIndex() != currentItr.previousIndex()) {
// optimization: only need to set if different
resultItr.set(currentValue);
}
}
} while (existingItr.hasNext());
} while (currentItr.hasNext());
// Lop off the rest of the list. Note with LinkedList this requires advancing back to this index,
// but Java provides no way to efficiently remove from the end of a non random-access list.
list.subList(uniqueItr.nextIndex(), list.size()).clear();
list.subList(resultItr.nextIndex(), list.size()).clear();
}
/**

View file

@ -65,7 +65,7 @@ public class CollectionUtilsTests extends ESTestCase {
for (List<T> listCopy : List.of(new ArrayList<T>(list), new LinkedList<T>(list))) {
CollectionUtils.uniquify(listCopy, cmp);
for (int i = 0; i < listCopy.size() - 1; ++i) {
assertThat(cmp.compare(listCopy.get(i), listCopy.get(i + 1)), lessThan(0));
assertThat(listCopy.get(i) + " < " + listCopy.get(i + 1), cmp.compare(listCopy.get(i), listCopy.get(i + 1)), lessThan(0));
}
assertThat(listCopy.size(), equalTo(size));
}
@ -75,9 +75,25 @@ public class CollectionUtilsTests extends ESTestCase {
assertUniquify(List.<Integer>of(), Comparator.naturalOrder(), 0);
assertUniquify(List.of(1), Comparator.naturalOrder(), 1);
assertUniquify(List.of(1, 2, 3), Comparator.naturalOrder(), 3);
assertUniquify(List.of(1, 1, 3), Comparator.naturalOrder(), 2);
assertUniquify(List.of(1, 1, 1), Comparator.naturalOrder(), 1);
assertUniquify(List.of(1, 2, 2, 3), Comparator.naturalOrder(), 3);
assertUniquify(List.of(1, 2, 2, 2), Comparator.naturalOrder(), 2);
assertUniquify(List.of(1, 2, 2, 3, 3, 5), Comparator.naturalOrder(), 4);
for (int i = 0; i < 10; ++i) {
int uniqueItems = randomIntBetween(1, 10);
var list = new ArrayList<Integer>();
int next = 1;
for (int j = 0; j < uniqueItems; ++j) {
int occurences = randomIntBetween(1, 10);
while (occurences-- > 0) {
list.add(next);
}
next++;
}
assertUniquify(list, Comparator.naturalOrder(), uniqueItems);
}
}
public void testEmptyPartition() {