Fix uniquify to handle multiple successive duplicates (#126889) (#126951)

CollectionUtils.uniquify is based on C++ std::unique. However, C++ iterators are not quite the same as Java iterators. In particular, advancing them only allows grabbing the value once. This commit reworks uniquify to be based on list indices instead of iterators. closes #126883 Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
2025-04-19 04:45:07 -04:00 · 2025-04-17 14:05:21 -07:00 · 2025-04-17 14:05:21 -07:00 · 24b4966b2c
commit 24b4966b2c
parent b111bf6d5e
3 changed files with 36 additions and 10 deletions
--- a/docs/changelog/126889.yaml
+++ b/docs/changelog/126889.yaml
@ -0,0 +1,6 @@
+pr: 126889
+summary: Rework uniquify to not use iterators
+area: Infra/Core
+type: bug
+issues:
+ - 126883
--- a/server/src/main/java/org/elasticsearch/common/util/CollectionUtils.java
+++ b/server/src/main/java/org/elasticsearch/common/util/CollectionUtils.java
@ -51,20 +51,24 @@ public class CollectionUtils {
            return;
        }

-        ListIterator<T> uniqueItr = list.listIterator();
-        ListIterator<T> existingItr = list.listIterator();
-        T uniqueValue = uniqueItr.next(); // get first element to compare with
-        existingItr.next(); // advance the existing iterator to the second element, where we will begin comparing
+        ListIterator<T> resultItr = list.listIterator();
+        ListIterator<T> currentItr = list.listIterator(1); // start at second element to compare
+        T lastValue = resultItr.next(); // result always includes first element, so advance it and grab first
        do {
-            T existingValue = existingItr.next();
-            if (cmp.compare(existingValue, uniqueValue) != 0 && (uniqueValue = uniqueItr.next()) != existingValue) {
-                uniqueItr.set(existingValue);
+            T currentValue = currentItr.next(); // each iter we check if the next element is different from the last we put in the result
+            if (cmp.compare(lastValue, currentValue) != 0) {
+                lastValue = currentValue;
+                resultItr.next(); // advance result so the current position is where we want to set
+                if (resultItr.previousIndex() != currentItr.previousIndex()) {
+                    // optimization: only need to set if different
+                    resultItr.set(currentValue);
+                }
            }
-        } while (existingItr.hasNext());
+        } while (currentItr.hasNext());

        // Lop off the rest of the list. Note with LinkedList this requires advancing back to this index,
        // but Java provides no way to efficiently remove from the end of a non random-access list.
-        list.subList(uniqueItr.nextIndex(), list.size()).clear();
+        list.subList(resultItr.nextIndex(), list.size()).clear();
    }

    /**
--- a/server/src/test/java/org/elasticsearch/common/util/CollectionUtilsTests.java
+++ b/server/src/test/java/org/elasticsearch/common/util/CollectionUtilsTests.java
@ -65,7 +65,7 @@ public class CollectionUtilsTests extends ESTestCase {
        for (List<T> listCopy : List.of(new ArrayList<T>(list), new LinkedList<T>(list))) {
            CollectionUtils.uniquify(listCopy, cmp);
            for (int i = 0; i < listCopy.size() - 1; ++i) {
-                assertThat(cmp.compare(listCopy.get(i), listCopy.get(i + 1)), lessThan(0));
+                assertThat(listCopy.get(i) + " < " + listCopy.get(i + 1), cmp.compare(listCopy.get(i), listCopy.get(i + 1)), lessThan(0));
            }
            assertThat(listCopy.size(), equalTo(size));
        }
@ -75,9 +75,25 @@ public class CollectionUtilsTests extends ESTestCase {
        assertUniquify(List.<Integer>of(), Comparator.naturalOrder(), 0);
        assertUniquify(List.of(1), Comparator.naturalOrder(), 1);
        assertUniquify(List.of(1, 2, 3), Comparator.naturalOrder(), 3);
+        assertUniquify(List.of(1, 1, 3), Comparator.naturalOrder(), 2);
        assertUniquify(List.of(1, 1, 1), Comparator.naturalOrder(), 1);
        assertUniquify(List.of(1, 2, 2, 3), Comparator.naturalOrder(), 3);
        assertUniquify(List.of(1, 2, 2, 2), Comparator.naturalOrder(), 2);
+        assertUniquify(List.of(1, 2, 2, 3, 3, 5), Comparator.naturalOrder(), 4);
+
+        for (int i = 0; i < 10; ++i) {
+            int uniqueItems = randomIntBetween(1, 10);
+            var list = new ArrayList<Integer>();
+            int next = 1;
+            for (int j = 0; j < uniqueItems; ++j) {
+                int occurences = randomIntBetween(1, 10);
+                while (occurences-- > 0) {
+                    list.add(next);
+                }
+                next++;
+            }
+            assertUniquify(list, Comparator.naturalOrder(), uniqueItems);
+        }
    }

    public void testEmptyPartition() {