mirror of
https://github.com/elastic/elasticsearch.git
synced 2025-06-27 17:10:22 -04:00
Optimized text for full unicode and some escape sequences (#129169)
Follow-up to #126492 to apply the JSON parsing optimization to strings containing Unicode characters and some backslash-escaped characters. Supporting backslash-escaped strings is tricky because it requires modifying the string. There are two types of modification: some just remove the backslash (e.g. \", \\), and some replace the whole escape sequence with a new character (e.g. \n, \r, \u00e5). In this implementation, the optimization supports only the first case: removing the backslash. This is done by making a copy of the data that skips each backslash. It should still be faster than full String decoding, but it won't be as fast as for strings without backslashes, where we can directly reference the input bytes. Relates to #129072.
This commit is contained in:
parent
03ba5b12e5
commit
96300a9d80
2 changed files with 169 additions and 44 deletions
|
@ -21,9 +21,14 @@ import org.elasticsearch.xcontent.XContentString;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class ESUTF8StreamJsonParser extends UTF8StreamJsonParser {
|
||||
protected int stringEnd = -1;
|
||||
protected int stringLength;
|
||||
|
||||
private final List<Integer> backslashes = new ArrayList<>();
|
||||
|
||||
public ESUTF8StreamJsonParser(
|
||||
IOContext ctxt,
|
||||
|
@ -43,15 +48,12 @@ public class ESUTF8StreamJsonParser extends UTF8StreamJsonParser {
|
|||
/**
|
||||
* Method that will try to get underlying UTF-8 encoded bytes of the current string token.
|
||||
* This is only a best-effort attempt; if there is some reason the bytes cannot be retrieved, this method will return null.
|
||||
* Currently, this is only implemented for ascii-only strings that do not contain escaped characters.
|
||||
*/
|
||||
public Text getValueAsText() throws IOException {
|
||||
if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete) {
|
||||
if (stringEnd > 0) {
|
||||
final int len = stringEnd - 1 - _inputPtr;
|
||||
// For now, we can use `len` for `stringLength` because we only support ascii-encoded unescaped strings,
|
||||
// which means each character uses exactly 1 byte.
|
||||
return new Text(new XContentString.UTF8Bytes(_inputBuffer, _inputPtr, len), len);
|
||||
return new Text(new XContentString.UTF8Bytes(_inputBuffer, _inputPtr, len), stringLength);
|
||||
}
|
||||
return _finishAndReturnText();
|
||||
}
|
||||
|
@ -69,21 +71,71 @@ public class ESUTF8StreamJsonParser extends UTF8StreamJsonParser {
|
|||
final int[] codes = INPUT_CODES_UTF8;
|
||||
final int max = _inputEnd;
|
||||
final byte[] inputBuffer = _inputBuffer;
|
||||
while (ptr < max) {
|
||||
int c = inputBuffer[ptr] & 0xFF;
|
||||
if (codes[c] != 0) {
|
||||
if (c == INT_QUOTE) {
|
||||
stringEnd = ptr + 1;
|
||||
final int len = ptr - startPtr;
|
||||
// For now, we can use `len` for `stringLength` because we only support ascii-encoded unescaped strings,
|
||||
// which means each character uses exactly 1 byte.
|
||||
return new Text(new XContentString.UTF8Bytes(inputBuffer, startPtr, len), len);
|
||||
}
|
||||
stringLength = 0;
|
||||
backslashes.clear();
|
||||
|
||||
loop: while (true) {
|
||||
if (ptr >= max) {
|
||||
return null;
|
||||
}
|
||||
++ptr;
|
||||
int c = inputBuffer[ptr] & 0xFF;
|
||||
switch (codes[c]) {
|
||||
case 0 -> {
|
||||
++ptr;
|
||||
++stringLength;
|
||||
}
|
||||
case 1 -> {
|
||||
if (c == INT_QUOTE) {
|
||||
// End of the string
|
||||
break loop;
|
||||
}
|
||||
assert c == INT_BACKSLASH;
|
||||
backslashes.add(ptr);
|
||||
++ptr;
|
||||
if (ptr >= max) {
|
||||
// Backslash at end of file
|
||||
return null;
|
||||
}
|
||||
c = inputBuffer[ptr] & 0xFF;
|
||||
if (c == '"' || c == '/' || c == '\\') {
|
||||
ptr += 1;
|
||||
stringLength += 1;
|
||||
} else {
|
||||
// Any other escaped sequence requires replacing the sequence with
|
||||
// a new character, which we don't support in the optimized path
|
||||
return null;
|
||||
}
|
||||
}
|
||||
case 2, 3, 4 -> {
|
||||
int bytesToSkip = codes[c];
|
||||
if (ptr + bytesToSkip > max) {
|
||||
return null;
|
||||
}
|
||||
ptr += bytesToSkip;
|
||||
++stringLength;
|
||||
}
|
||||
default -> {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
stringEnd = ptr + 1;
|
||||
if (backslashes.isEmpty()) {
|
||||
return new Text(new XContentString.UTF8Bytes(inputBuffer, startPtr, ptr - startPtr), stringLength);
|
||||
} else {
|
||||
byte[] buff = new byte[ptr - startPtr - backslashes.size()];
|
||||
int copyPtr = startPtr;
|
||||
int destPtr = 0;
|
||||
for (Integer backslash : backslashes) {
|
||||
int length = backslash - copyPtr;
|
||||
System.arraycopy(inputBuffer, copyPtr, buff, destPtr, length);
|
||||
destPtr += length;
|
||||
copyPtr = backslash + 1;
|
||||
}
|
||||
System.arraycopy(inputBuffer, copyPtr, buff, destPtr, ptr - copyPtr);
|
||||
return new Text(new XContentString.UTF8Bytes(buff), stringLength);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -13,16 +13,14 @@ import com.fasterxml.jackson.core.JsonFactory;
|
|||
import com.fasterxml.jackson.core.JsonParser;
|
||||
import com.fasterxml.jackson.core.JsonToken;
|
||||
|
||||
import org.elasticsearch.common.Strings;
|
||||
import org.elasticsearch.core.CheckedConsumer;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
import org.elasticsearch.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.xcontent.XContentString;
|
||||
import org.elasticsearch.xcontent.json.JsonXContent;
|
||||
import org.hamcrest.Matchers;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Locale;
|
||||
|
||||
public class ESUTF8StreamJsonParserTests extends ESTestCase {
|
||||
|
||||
|
@ -45,11 +43,13 @@ public class ESUTF8StreamJsonParserTests extends ESTestCase {
|
|||
assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
|
||||
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
|
||||
|
||||
var textRef = parser.getValueAsText().bytes();
|
||||
assertThat(textRef, Matchers.notNullValue());
|
||||
assertThat(textRef.offset(), Matchers.equalTo(9));
|
||||
assertThat(textRef.offset() + textRef.length(), Matchers.equalTo(12));
|
||||
assertTextRef(textRef, "bar");
|
||||
var text = parser.getValueAsText();
|
||||
assertThat(text, Matchers.notNullValue());
|
||||
|
||||
var bytes = text.bytes();
|
||||
assertThat(bytes.offset(), Matchers.equalTo(9));
|
||||
assertThat(bytes.offset() + bytes.length(), Matchers.equalTo(12));
|
||||
assertTextRef(bytes, "bar");
|
||||
|
||||
assertThat(parser.getValueAsString(), Matchers.equalTo("bar"));
|
||||
assertThat(parser.getValueAsText(), Matchers.nullValue());
|
||||
|
@ -62,8 +62,18 @@ public class ESUTF8StreamJsonParserTests extends ESTestCase {
|
|||
assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
|
||||
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
|
||||
|
||||
var text = parser.getValueAsText();
|
||||
assertThat(text, Matchers.notNullValue());
|
||||
assertTextRef(text.bytes(), "bar\"baz\"");
|
||||
});
|
||||
|
||||
testParseJson("{\"foo\": \"b\\u00e5r\"}", parser -> {
|
||||
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
|
||||
assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
|
||||
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
|
||||
|
||||
assertThat(parser.getValueAsText(), Matchers.nullValue());
|
||||
assertThat(parser.getValueAsString(), Matchers.equalTo("bar\"baz\""));
|
||||
assertThat(parser.getValueAsString(), Matchers.equalTo("bår"));
|
||||
});
|
||||
|
||||
testParseJson("{\"foo\": \"bår\"}", parser -> {
|
||||
|
@ -71,8 +81,17 @@ public class ESUTF8StreamJsonParserTests extends ESTestCase {
|
|||
assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
|
||||
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
|
||||
|
||||
assertThat(parser.getValueAsText(), Matchers.nullValue());
|
||||
var text = parser.getValueAsText();
|
||||
assertThat(text, Matchers.notNullValue());
|
||||
|
||||
var bytes = text.bytes();
|
||||
assertThat(bytes.offset(), Matchers.equalTo(9));
|
||||
assertThat(bytes.offset() + bytes.length(), Matchers.equalTo(13));
|
||||
assertTextRef(bytes, "bår");
|
||||
|
||||
assertThat(parser.getValueAsString(), Matchers.equalTo("bår"));
|
||||
|
||||
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.END_OBJECT));
|
||||
});
|
||||
|
||||
testParseJson("{\"foo\": [\"lorem\", \"ipsum\", \"dolor\"]}", parser -> {
|
||||
|
@ -112,43 +131,97 @@ public class ESUTF8StreamJsonParserTests extends ESTestCase {
|
|||
});
|
||||
}
|
||||
|
||||
private boolean validForTextRef(String value) {
|
||||
for (char c : value.toCharArray()) {
|
||||
if (c == '"') {
|
||||
return false;
|
||||
private record TestInput(String input, String result, boolean supportsOptimized) {}
|
||||
|
||||
private static final TestInput[] ESCAPE_SEQUENCES = {
|
||||
new TestInput("\\b", "\b", false),
|
||||
new TestInput("\\t", "\t", false),
|
||||
new TestInput("\\n", "\n", false),
|
||||
new TestInput("\\f", "\f", false),
|
||||
new TestInput("\\r", "\r", false),
|
||||
new TestInput("\\\"", "\"", true),
|
||||
new TestInput("\\/", "/", true),
|
||||
new TestInput("\\\\", "\\", true) };
|
||||
|
||||
private int randomCodepoint(boolean includeAscii) {
|
||||
while (true) {
|
||||
char val = Character.toChars(randomInt(0xFFFF))[0];
|
||||
if (val <= 0x7f && includeAscii == false) {
|
||||
continue;
|
||||
}
|
||||
if (c == '\\') {
|
||||
return false;
|
||||
if (val >= Character.MIN_SURROGATE && val <= Character.MAX_SURROGATE) {
|
||||
continue;
|
||||
}
|
||||
if ((int) c < 32 || (int) c >= 128) {
|
||||
return false;
|
||||
return val;
|
||||
}
|
||||
}
|
||||
|
||||
private TestInput buildRandomInput(int length) {
|
||||
StringBuilder input = new StringBuilder(length);
|
||||
StringBuilder result = new StringBuilder(length);
|
||||
boolean forceSupportOptimized = randomBoolean();
|
||||
boolean doesSupportOptimized = true;
|
||||
for (int i = 0; i < length; ++i) {
|
||||
if (forceSupportOptimized == false && randomBoolean()) {
|
||||
switch (randomInt(9)) {
|
||||
case 0 -> {
|
||||
var escape = randomFrom(ESCAPE_SEQUENCES);
|
||||
input.append(escape.input());
|
||||
result.append(escape.result());
|
||||
doesSupportOptimized = doesSupportOptimized && escape.supportsOptimized();
|
||||
}
|
||||
case 1 -> {
|
||||
int value = randomCodepoint(true);
|
||||
input.append(String.format(Locale.ENGLISH, "\\u%04x", value));
|
||||
result.append(Character.toChars(value));
|
||||
doesSupportOptimized = false;
|
||||
}
|
||||
default -> {
|
||||
var value = Character.toChars(randomCodepoint(false));
|
||||
input.append(value);
|
||||
result.append(value);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
var value = randomAlphanumericOfLength(1);
|
||||
input.append(value);
|
||||
result.append(value);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
return new TestInput(input.toString(), result.toString(), doesSupportOptimized);
|
||||
}
|
||||
|
||||
public void testGetValueRandomized() throws IOException {
|
||||
XContentBuilder jsonBuilder = JsonXContent.contentBuilder().startObject();
|
||||
StringBuilder inputBuilder = new StringBuilder();
|
||||
inputBuilder.append('{');
|
||||
|
||||
final int numKeys = 128;
|
||||
String[] keys = new String[numKeys];
|
||||
String[] values = new String[numKeys];
|
||||
TestInput[] inputs = new TestInput[numKeys];
|
||||
for (int i = 0; i < numKeys; i++) {
|
||||
String currKey = randomAlphanumericOfLength(6);
|
||||
String currVal = randomUnicodeOfLengthBetween(0, 512);
|
||||
jsonBuilder.field(currKey, currVal);
|
||||
var currVal = buildRandomInput(randomInt(512));
|
||||
inputBuilder.append('"');
|
||||
inputBuilder.append(currKey);
|
||||
inputBuilder.append("\":\"");
|
||||
inputBuilder.append(currVal.input());
|
||||
inputBuilder.append('"');
|
||||
if (i < numKeys - 1) {
|
||||
inputBuilder.append(',');
|
||||
}
|
||||
keys[i] = currKey;
|
||||
values[i] = currVal;
|
||||
inputs[i] = currVal;
|
||||
}
|
||||
|
||||
jsonBuilder.endObject();
|
||||
testParseJson(Strings.toString(jsonBuilder), parser -> {
|
||||
inputBuilder.append('}');
|
||||
testParseJson(inputBuilder.toString(), parser -> {
|
||||
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
|
||||
for (int i = 0; i < numKeys; i++) {
|
||||
assertThat(parser.nextFieldName(), Matchers.equalTo(keys[i]));
|
||||
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
|
||||
|
||||
String currVal = values[i];
|
||||
if (validForTextRef(currVal)) {
|
||||
String currVal = inputs[i].result();
|
||||
if (inputs[i].supportsOptimized()) {
|
||||
assertTextRef(parser.getValueAsText().bytes(), currVal);
|
||||
} else {
|
||||
assertThat(parser.getValueAsText(), Matchers.nullValue());
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue