Optimized text for full unicode and some escape sequences (#129169)

Follow-up to #126492 to apply the JSON parsing optimization to strings
containing Unicode characters and some backslash-escaped characters.

Supporting backslash-escaped strings is tricky as it requires modifying the
string. There are two types of modification: some just remove the backslash
(e.g. \", \\, \/), and some replace the whole escape sequence with a new
character (e.g. \n, \r, \u00e5). In this implementation, the optimization
only supports the first case: removing the backslash. This is done by
making a copy of the data that skips each backslash. This should still be
faster than full String decoding, but it won't be as fast as for
non-backslashed strings, where we can directly reference the input bytes.

Relates to #129072.
This commit is contained in:
Jordan Powers 2025-06-12 09:55:07 -07:00 committed by GitHub
parent 03ba5b12e5
commit 96300a9d80
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 169 additions and 44 deletions

View file

@ -21,9 +21,14 @@ import org.elasticsearch.xcontent.XContentString;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
public class ESUTF8StreamJsonParser extends UTF8StreamJsonParser {
protected int stringEnd = -1;
protected int stringLength;
private final List<Integer> backslashes = new ArrayList<>();
public ESUTF8StreamJsonParser(
IOContext ctxt,
@ -43,15 +48,12 @@ public class ESUTF8StreamJsonParser extends UTF8StreamJsonParser {
/**
* Method that will try to get underlying UTF-8 encoded bytes of the current string token.
* This is only a best-effort attempt; if there is some reason the bytes cannot be retrieved, this method will return null.
* Currently, this is only implemented for ascii-only strings that do not contain escaped characters.
*/
public Text getValueAsText() throws IOException {
if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete) {
if (stringEnd > 0) {
final int len = stringEnd - 1 - _inputPtr;
// For now, we can use `len` for `stringLength` because we only support ascii-encoded unescaped strings,
// which means each character uses exactly 1 byte.
return new Text(new XContentString.UTF8Bytes(_inputBuffer, _inputPtr, len), len);
return new Text(new XContentString.UTF8Bytes(_inputBuffer, _inputPtr, len), stringLength);
}
return _finishAndReturnText();
}
@ -69,21 +71,71 @@ public class ESUTF8StreamJsonParser extends UTF8StreamJsonParser {
final int[] codes = INPUT_CODES_UTF8;
final int max = _inputEnd;
final byte[] inputBuffer = _inputBuffer;
while (ptr < max) {
int c = inputBuffer[ptr] & 0xFF;
if (codes[c] != 0) {
if (c == INT_QUOTE) {
stringEnd = ptr + 1;
final int len = ptr - startPtr;
// For now, we can use `len` for `stringLength` because we only support ascii-encoded unescaped strings,
// which means each character uses exactly 1 byte.
return new Text(new XContentString.UTF8Bytes(inputBuffer, startPtr, len), len);
}
stringLength = 0;
backslashes.clear();
loop: while (true) {
if (ptr >= max) {
return null;
}
++ptr;
int c = inputBuffer[ptr] & 0xFF;
switch (codes[c]) {
case 0 -> {
++ptr;
++stringLength;
}
case 1 -> {
if (c == INT_QUOTE) {
// End of the string
break loop;
}
assert c == INT_BACKSLASH;
backslashes.add(ptr);
++ptr;
if (ptr >= max) {
// Backslash at end of file
return null;
}
c = inputBuffer[ptr] & 0xFF;
if (c == '"' || c == '/' || c == '\\') {
ptr += 1;
stringLength += 1;
} else {
// Any other escaped sequence requires replacing the sequence with
// a new character, which we don't support in the optimized path
return null;
}
}
case 2, 3, 4 -> {
int bytesToSkip = codes[c];
if (ptr + bytesToSkip > max) {
return null;
}
ptr += bytesToSkip;
++stringLength;
}
default -> {
return null;
}
}
}
stringEnd = ptr + 1;
if (backslashes.isEmpty()) {
return new Text(new XContentString.UTF8Bytes(inputBuffer, startPtr, ptr - startPtr), stringLength);
} else {
byte[] buff = new byte[ptr - startPtr - backslashes.size()];
int copyPtr = startPtr;
int destPtr = 0;
for (Integer backslash : backslashes) {
int length = backslash - copyPtr;
System.arraycopy(inputBuffer, copyPtr, buff, destPtr, length);
destPtr += length;
copyPtr = backslash + 1;
}
System.arraycopy(inputBuffer, copyPtr, buff, destPtr, ptr - copyPtr);
return new Text(new XContentString.UTF8Bytes(buff), stringLength);
}
return null;
}
@Override

View file

@ -13,16 +13,14 @@ import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import org.elasticsearch.common.Strings;
import org.elasticsearch.core.CheckedConsumer;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentString;
import org.elasticsearch.xcontent.json.JsonXContent;
import org.hamcrest.Matchers;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
public class ESUTF8StreamJsonParserTests extends ESTestCase {
@ -45,11 +43,13 @@ public class ESUTF8StreamJsonParserTests extends ESTestCase {
assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
var textRef = parser.getValueAsText().bytes();
assertThat(textRef, Matchers.notNullValue());
assertThat(textRef.offset(), Matchers.equalTo(9));
assertThat(textRef.offset() + textRef.length(), Matchers.equalTo(12));
assertTextRef(textRef, "bar");
var text = parser.getValueAsText();
assertThat(text, Matchers.notNullValue());
var bytes = text.bytes();
assertThat(bytes.offset(), Matchers.equalTo(9));
assertThat(bytes.offset() + bytes.length(), Matchers.equalTo(12));
assertTextRef(bytes, "bar");
assertThat(parser.getValueAsString(), Matchers.equalTo("bar"));
assertThat(parser.getValueAsText(), Matchers.nullValue());
@ -62,8 +62,18 @@ public class ESUTF8StreamJsonParserTests extends ESTestCase {
assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
var text = parser.getValueAsText();
assertThat(text, Matchers.notNullValue());
assertTextRef(text.bytes(), "bar\"baz\"");
});
testParseJson("{\"foo\": \"b\\u00e5r\"}", parser -> {
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
assertThat(parser.getValueAsText(), Matchers.nullValue());
assertThat(parser.getValueAsString(), Matchers.equalTo("bar\"baz\""));
assertThat(parser.getValueAsString(), Matchers.equalTo("bår"));
});
testParseJson("{\"foo\": \"bår\"}", parser -> {
@ -71,8 +81,17 @@ public class ESUTF8StreamJsonParserTests extends ESTestCase {
assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
assertThat(parser.getValueAsText(), Matchers.nullValue());
var text = parser.getValueAsText();
assertThat(text, Matchers.notNullValue());
var bytes = text.bytes();
assertThat(bytes.offset(), Matchers.equalTo(9));
assertThat(bytes.offset() + bytes.length(), Matchers.equalTo(13));
assertTextRef(bytes, "bår");
assertThat(parser.getValueAsString(), Matchers.equalTo("bår"));
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.END_OBJECT));
});
testParseJson("{\"foo\": [\"lorem\", \"ipsum\", \"dolor\"]}", parser -> {
@ -112,43 +131,97 @@ public class ESUTF8StreamJsonParserTests extends ESTestCase {
});
}
private boolean validForTextRef(String value) {
for (char c : value.toCharArray()) {
if (c == '"') {
return false;
/**
 * One generated fragment of a JSON string value used by the randomized test.
 *
 * @param input             the raw text written between the quotes of the JSON document (escape sequences are still escaped)
 * @param result            the decoded string the parser is expected to produce for {@code input}
 * @param supportsOptimized whether {@code getValueAsText()} is expected to return a non-null value for this input
 */
private record TestInput(String input, String result, boolean supportsOptimized) {}
/**
 * Two-character JSON escape sequences paired with the character each one decodes to.
 * Only the escapes whose decoded form is the escaped character itself (quote, slash, backslash)
 * are flagged as supporting the optimized path; the remaining escapes decode to a different character.
 */
private static final TestInput[] ESCAPE_SEQUENCES = {
new TestInput("\\b", "\b", false),
new TestInput("\\t", "\t", false),
new TestInput("\\n", "\n", false),
new TestInput("\\f", "\f", false),
new TestInput("\\r", "\r", false),
new TestInput("\\\"", "\"", true),
new TestInput("\\/", "/", true),
new TestInput("\\\\", "\\", true) };
private int randomCodepoint(boolean includeAscii) {
while (true) {
char val = Character.toChars(randomInt(0xFFFF))[0];
if (val <= 0x7f && includeAscii == false) {
continue;
}
if (c == '\\') {
return false;
if (val >= Character.MIN_SURROGATE && val <= Character.MAX_SURROGATE) {
continue;
}
if ((int) c < 32 || (int) c >= 128) {
return false;
return val;
}
}
private TestInput buildRandomInput(int length) {
StringBuilder input = new StringBuilder(length);
StringBuilder result = new StringBuilder(length);
boolean forceSupportOptimized = randomBoolean();
boolean doesSupportOptimized = true;
for (int i = 0; i < length; ++i) {
if (forceSupportOptimized == false && randomBoolean()) {
switch (randomInt(9)) {
case 0 -> {
var escape = randomFrom(ESCAPE_SEQUENCES);
input.append(escape.input());
result.append(escape.result());
doesSupportOptimized = doesSupportOptimized && escape.supportsOptimized();
}
case 1 -> {
int value = randomCodepoint(true);
input.append(String.format(Locale.ENGLISH, "\\u%04x", value));
result.append(Character.toChars(value));
doesSupportOptimized = false;
}
default -> {
var value = Character.toChars(randomCodepoint(false));
input.append(value);
result.append(value);
}
}
} else {
var value = randomAlphanumericOfLength(1);
input.append(value);
result.append(value);
}
}
return true;
return new TestInput(input.toString(), result.toString(), doesSupportOptimized);
}
public void testGetValueRandomized() throws IOException {
XContentBuilder jsonBuilder = JsonXContent.contentBuilder().startObject();
StringBuilder inputBuilder = new StringBuilder();
inputBuilder.append('{');
final int numKeys = 128;
String[] keys = new String[numKeys];
String[] values = new String[numKeys];
TestInput[] inputs = new TestInput[numKeys];
for (int i = 0; i < numKeys; i++) {
String currKey = randomAlphanumericOfLength(6);
String currVal = randomUnicodeOfLengthBetween(0, 512);
jsonBuilder.field(currKey, currVal);
var currVal = buildRandomInput(randomInt(512));
inputBuilder.append('"');
inputBuilder.append(currKey);
inputBuilder.append("\":\"");
inputBuilder.append(currVal.input());
inputBuilder.append('"');
if (i < numKeys - 1) {
inputBuilder.append(',');
}
keys[i] = currKey;
values[i] = currVal;
inputs[i] = currVal;
}
jsonBuilder.endObject();
testParseJson(Strings.toString(jsonBuilder), parser -> {
inputBuilder.append('}');
testParseJson(inputBuilder.toString(), parser -> {
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
for (int i = 0; i < numKeys; i++) {
assertThat(parser.nextFieldName(), Matchers.equalTo(keys[i]));
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
String currVal = values[i];
if (validForTextRef(currVal)) {
String currVal = inputs[i].result();
if (inputs[i].supportsOptimized()) {
assertTextRef(parser.getValueAsText().bytes(), currVal);
} else {
assertThat(parser.getValueAsText(), Matchers.nullValue());