|
@@ -44,36 +44,31 @@ import static java.lang.Character.toCodePoint;
|
|
|
import java.nio.ByteBuffer;
|
|
import java.nio.ByteBuffer;
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
|
- * A set of low-level, high-performance static utility methods related
|
|
|
|
|
- * to the UTF-8 character encoding. This class has no dependencies
|
|
|
|
|
- * outside of the core JDK libraries.
|
|
|
|
|
|
|
+ * A set of low-level, high-performance static utility methods related to the UTF-8 character
|
|
|
|
|
+ * encoding. This class has no dependencies outside of the core JDK libraries.
|
|
|
*
|
|
*
|
|
|
- * <p>There are several variants of UTF-8. The one implemented by
|
|
|
|
|
- * this class is the restricted definition of UTF-8 introduced in
|
|
|
|
|
- * Unicode 3.1, which mandates the rejection of "overlong" byte
|
|
|
|
|
- * sequences as well as rejection of 3-byte surrogate codepoint byte
|
|
|
|
|
- * sequences. Note that the UTF-8 decoder included in Oracle's JDK
|
|
|
|
|
- * has been modified to also reject "overlong" byte sequences, but (as
|
|
|
|
|
- * of 2011) still accepts 3-byte surrogate codepoint byte sequences.
|
|
|
|
|
|
|
+ * <p>There are several variants of UTF-8. The one implemented by this class is the restricted
|
|
|
|
|
+ * definition of UTF-8 introduced in Unicode 3.1, which mandates the rejection of "overlong" byte
|
|
|
|
|
+ * sequences as well as rejection of 3-byte surrogate codepoint byte sequences. Note that the UTF-8
|
|
|
|
|
+ * decoder included in Oracle's JDK has been modified to also reject "overlong" byte sequences, but
|
|
|
|
|
+ * (as of 2011) still accepts 3-byte surrogate codepoint byte sequences.
|
|
|
*
|
|
*
|
|
|
- * <p>The byte sequences considered valid by this class are exactly
|
|
|
|
|
- * those that can be roundtrip converted to Strings and back to bytes
|
|
|
|
|
- * using the UTF-8 charset, without loss: <pre> {@code
|
|
|
|
|
|
|
+ * <p>The byte sequences considered valid by this class are exactly those that can be roundtrip
|
|
|
|
|
+ * converted to Strings and back to bytes using the UTF-8 charset, without loss:
|
|
|
|
|
+ *
|
|
|
|
|
+ * <pre>{@code
|
|
|
* Arrays.equals(bytes, new String(bytes, Internal.UTF_8).getBytes(Internal.UTF_8))
|
|
* Arrays.equals(bytes, new String(bytes, Internal.UTF_8).getBytes(Internal.UTF_8))
|
|
|
* }</pre>
|
|
* }</pre>
|
|
|
*
|
|
*
|
|
|
- * <p>See the Unicode Standard,</br>
|
|
|
|
|
- * Table 3-6. <em>UTF-8 Bit Distribution</em>,</br>
|
|
|
|
|
- * Table 3-7. <em>Well Formed UTF-8 Byte Sequences</em>.
|
|
|
|
|
|
|
+ * <p>See the Unicode Standard,<br> Table 3-6. <em>UTF-8 Bit Distribution</em>,<br> Table 3-7.
|
|
|
|
|
+ * <em>Well Formed UTF-8 Byte Sequences</em>.
|
|
|
*
|
|
*
|
|
|
- * <p>This class supports decoding of partial byte sequences, so that the
|
|
|
|
|
- * bytes in a complete UTF-8 byte sequences can be stored in multiple
|
|
|
|
|
- * segments. Methods typically return {@link #MALFORMED} if the partial
|
|
|
|
|
- * byte sequence is definitely not well-formed, {@link #COMPLETE} if it is
|
|
|
|
|
- * well-formed in the absence of additional input, or if the byte sequence
|
|
|
|
|
- * apparently terminated in the middle of a character, an opaque integer
|
|
|
|
|
- * "state" value containing enough information to decode the character when
|
|
|
|
|
- * passed to a subsequent invocation of a partial decoding method.
|
|
|
|
|
|
|
+ * <p>This class supports decoding of partial byte sequences, so that the bytes in a complete UTF-8
|
|
|
|
|
+ * byte sequence can be stored in multiple segments. Methods typically return {@link #MALFORMED} if
|
|
|
|
|
+ * the partial byte sequence is definitely not well-formed, {@link #COMPLETE} if it is well-formed
|
|
|
|
|
+ * in the absence of additional input, or if the byte sequence apparently terminated in the middle
|
|
|
|
|
+ * of a character, an opaque integer "state" value containing enough information to decode the
|
|
|
|
|
+ * character when passed to a subsequent invocation of a partial decoding method.
|
|
|
*
|
|
*
|
|
|
* @author martinrb@google.com (Martin Buchholz)
|
|
* @author martinrb@google.com (Martin Buchholz)
|
|
|
*/
|
|
*/
|
|
@@ -98,31 +93,28 @@ final class Utf8 {
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
|
* Maximum number of bytes per Java UTF-16 char in UTF-8.
|
|
* Maximum number of bytes per Java UTF-16 char in UTF-8.
|
|
|
|
|
+ *
|
|
|
* @see java.nio.charset.CharsetEncoder#maxBytesPerChar()
|
|
* @see java.nio.charset.CharsetEncoder#maxBytesPerChar()
|
|
|
*/
|
|
*/
|
|
|
static final int MAX_BYTES_PER_CHAR = 3;
|
|
static final int MAX_BYTES_PER_CHAR = 3;
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
|
- * State value indicating that the byte sequence is well-formed and
|
|
|
|
|
- * complete (no further bytes are needed to complete a character).
|
|
|
|
|
|
|
+ * State value indicating that the byte sequence is well-formed and complete (no further bytes are
|
|
|
|
|
+ * needed to complete a character).
|
|
|
*/
|
|
*/
|
|
|
public static final int COMPLETE = 0;
|
|
public static final int COMPLETE = 0;
|
|
|
|
|
|
|
|
- /**
|
|
|
|
|
- * State value indicating that the byte sequence is definitely not
|
|
|
|
|
- * well-formed.
|
|
|
|
|
- */
|
|
|
|
|
|
|
+ /** State value indicating that the byte sequence is definitely not well-formed. */
|
|
|
public static final int MALFORMED = -1;
|
|
public static final int MALFORMED = -1;
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
|
* Used by {@code Unsafe} UTF-8 string validation logic to determine the minimum string length
|
|
* Used by {@code Unsafe} UTF-8 string validation logic to determine the minimum string length
|
|
|
* above which to employ an optimized algorithm for counting ASCII characters. The reason for this
|
|
* above which to employ an optimized algorithm for counting ASCII characters. The reason for this
|
|
|
* threshold is that for small strings, the optimization may not be beneficial or may even
|
|
* threshold is that for small strings, the optimization may not be beneficial or may even
|
|
|
- * negatively impact performance since it requires additional logic to avoid unaligned reads
|
|
|
|
|
- * (when calling {@code Unsafe.getLong}). This threshold guarantees that even if the initial
|
|
|
|
|
- * offset is unaligned, we're guaranteed to make at least one call to {@code Unsafe.getLong()}
|
|
|
|
|
- * which provides a performance improvement that entirely subsumes the cost of the additional
|
|
|
|
|
- * logic.
|
|
|
|
|
|
|
+ * negatively impact performance since it requires additional logic to avoid unaligned reads (when
|
|
|
|
|
+ * calling {@code Unsafe.getLong}). This threshold guarantees that even if the initial offset is
|
|
|
|
|
+ * unaligned, we're guaranteed to make at least one call to {@code Unsafe.getLong()} which
|
|
|
|
|
+ * provides a performance improvement that entirely subsumes the cost of the additional logic.
|
|
|
*/
|
|
*/
|
|
|
private static final int UNSAFE_COUNT_ASCII_THRESHOLD = 16;
|
|
private static final int UNSAFE_COUNT_ASCII_THRESHOLD = 16;
|
|
|
|
|
|
|
@@ -146,76 +138,69 @@ final class Utf8 {
|
|
|
// are valid trailing bytes.
|
|
// are valid trailing bytes.
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
|
- * Returns {@code true} if the given byte array is a well-formed
|
|
|
|
|
- * UTF-8 byte sequence.
|
|
|
|
|
|
|
+ * Returns {@code true} if the given byte array is a well-formed UTF-8 byte sequence.
|
|
|
*
|
|
*
|
|
|
- * <p>This is a convenience method, equivalent to a call to {@code
|
|
|
|
|
- * isValidUtf8(bytes, 0, bytes.length)}.
|
|
|
|
|
|
|
+ * <p>This is a convenience method, equivalent to a call to {@code isValidUtf8(bytes, 0,
|
|
|
|
|
+ * bytes.length)}.
|
|
|
*/
|
|
*/
|
|
|
public static boolean isValidUtf8(byte[] bytes) {
|
|
public static boolean isValidUtf8(byte[] bytes) {
|
|
|
return processor.isValidUtf8(bytes, 0, bytes.length);
|
|
return processor.isValidUtf8(bytes, 0, bytes.length);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
|
- * Returns {@code true} if the given byte array slice is a
|
|
|
|
|
- * well-formed UTF-8 byte sequence. The range of bytes to be
|
|
|
|
|
- * checked extends from index {@code index}, inclusive, to {@code
|
|
|
|
|
- * limit}, exclusive.
|
|
|
|
|
|
|
+ * Returns {@code true} if the given byte array slice is a well-formed UTF-8 byte sequence. The
|
|
|
|
|
+ * range of bytes to be checked extends from index {@code index}, inclusive, to {@code limit},
|
|
|
|
|
+ * exclusive.
|
|
|
*
|
|
*
|
|
|
- * <p>This is a convenience method, equivalent to {@code
|
|
|
|
|
- * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}.
|
|
|
|
|
|
|
+ * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(Utf8.COMPLETE,
|
|
|
|
|
+ * bytes, index, limit) == Utf8.COMPLETE}.
|
|
|
*/
|
|
*/
|
|
|
public static boolean isValidUtf8(byte[] bytes, int index, int limit) {
|
|
public static boolean isValidUtf8(byte[] bytes, int index, int limit) {
|
|
|
return processor.isValidUtf8(bytes, index, limit);
|
|
return processor.isValidUtf8(bytes, index, limit);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
|
- * Tells whether the given byte array slice is a well-formed,
|
|
|
|
|
- * malformed, or incomplete UTF-8 byte sequence. The range of bytes
|
|
|
|
|
- * to be checked extends from index {@code index}, inclusive, to
|
|
|
|
|
|
|
+ * Tells whether the given byte array slice is a well-formed, malformed, or incomplete UTF-8 byte
|
|
|
|
|
+ * sequence. The range of bytes to be checked extends from index {@code index}, inclusive, to
|
|
|
* {@code limit}, exclusive.
|
|
* {@code limit}, exclusive.
|
|
|
*
|
|
*
|
|
|
- * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding
|
|
|
|
|
- * operation) or the value returned from a call to a partial decoding method
|
|
|
|
|
- * for the previous bytes
|
|
|
|
|
- *
|
|
|
|
|
- * @return {@link #MALFORMED} if the partial byte sequence is
|
|
|
|
|
- * definitely not well-formed, {@link #COMPLETE} if it is well-formed
|
|
|
|
|
- * (no additional input needed), or if the byte sequence is
|
|
|
|
|
- * "incomplete", i.e. apparently terminated in the middle of a character,
|
|
|
|
|
- * an opaque integer "state" value containing enough information to
|
|
|
|
|
- * decode the character when passed to a subsequent invocation of a
|
|
|
|
|
- * partial decoding method.
|
|
|
|
|
|
|
+ * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding operation) or the
|
|
|
|
|
+ * value returned from a call to a partial decoding method for the previous bytes
|
|
|
|
|
+ * @return {@link #MALFORMED} if the partial byte sequence is definitely not well-formed, {@link
|
|
|
|
|
+ * #COMPLETE} if it is well-formed (no additional input needed), or if the byte sequence is
|
|
|
|
|
+ * "incomplete", i.e. apparently terminated in the middle of a character, an opaque integer
|
|
|
|
|
+ * "state" value containing enough information to decode the character when passed to a
|
|
|
|
|
+ * subsequent invocation of a partial decoding method.
|
|
|
*/
|
|
*/
|
|
|
public static int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
|
|
public static int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
|
|
|
return processor.partialIsValidUtf8(state, bytes, index, limit);
|
|
return processor.partialIsValidUtf8(state, bytes, index, limit);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
private static int incompleteStateFor(int byte1) {
|
|
private static int incompleteStateFor(int byte1) {
|
|
|
- return (byte1 > (byte) 0xF4) ?
|
|
|
|
|
- MALFORMED : byte1;
|
|
|
|
|
|
|
+ return (byte1 > (byte) 0xF4) ? MALFORMED : byte1;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
private static int incompleteStateFor(int byte1, int byte2) {
|
|
private static int incompleteStateFor(int byte1, int byte2) {
|
|
|
- return (byte1 > (byte) 0xF4 ||
|
|
|
|
|
- byte2 > (byte) 0xBF) ?
|
|
|
|
|
- MALFORMED : byte1 ^ (byte2 << 8);
|
|
|
|
|
|
|
+ return (byte1 > (byte) 0xF4 || byte2 > (byte) 0xBF) ? MALFORMED : byte1 ^ (byte2 << 8);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
private static int incompleteStateFor(int byte1, int byte2, int byte3) {
|
|
private static int incompleteStateFor(int byte1, int byte2, int byte3) {
|
|
|
- return (byte1 > (byte) 0xF4 ||
|
|
|
|
|
- byte2 > (byte) 0xBF ||
|
|
|
|
|
- byte3 > (byte) 0xBF) ?
|
|
|
|
|
- MALFORMED : byte1 ^ (byte2 << 8) ^ (byte3 << 16);
|
|
|
|
|
|
|
+ return (byte1 > (byte) 0xF4 || byte2 > (byte) 0xBF || byte3 > (byte) 0xBF)
|
|
|
|
|
+ ? MALFORMED
|
|
|
|
|
+ : byte1 ^ (byte2 << 8) ^ (byte3 << 16);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
private static int incompleteStateFor(byte[] bytes, int index, int limit) {
|
|
private static int incompleteStateFor(byte[] bytes, int index, int limit) {
|
|
|
int byte1 = bytes[index - 1];
|
|
int byte1 = bytes[index - 1];
|
|
|
switch (limit - index) {
|
|
switch (limit - index) {
|
|
|
- case 0: return incompleteStateFor(byte1);
|
|
|
|
|
- case 1: return incompleteStateFor(byte1, bytes[index]);
|
|
|
|
|
- case 2: return incompleteStateFor(byte1, bytes[index], bytes[index + 1]);
|
|
|
|
|
- default: throw new AssertionError();
|
|
|
|
|
|
|
+ case 0:
|
|
|
|
|
+ return incompleteStateFor(byte1);
|
|
|
|
|
+ case 1:
|
|
|
|
|
+ return incompleteStateFor(byte1, bytes[index]);
|
|
|
|
|
+ case 2:
|
|
|
|
|
+ return incompleteStateFor(byte1, bytes[index], bytes[index + 1]);
|
|
|
|
|
+ default:
|
|
|
|
|
+ throw new AssertionError();
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -236,7 +221,7 @@ final class Utf8 {
|
|
|
// These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw
|
|
// These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw
|
|
|
// a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can
|
|
// a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can
|
|
|
// fallback to more lenient behavior.
|
|
// fallback to more lenient behavior.
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
static class UnpairedSurrogateException extends IllegalArgumentException {
|
|
static class UnpairedSurrogateException extends IllegalArgumentException {
|
|
|
UnpairedSurrogateException(int index, int length) {
|
|
UnpairedSurrogateException(int index, int length) {
|
|
|
super("Unpaired surrogate at index " + index + " of " + length);
|
|
super("Unpaired surrogate at index " + index + " of " + length);
|
|
@@ -244,9 +229,9 @@ final class Utf8 {
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
|
- * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,
|
|
|
|
|
- * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in
|
|
|
|
|
- * both time and space.
|
|
|
|
|
|
|
+ * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, this
|
|
|
|
|
+ * method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in both
|
|
|
|
|
+ * time and space.
|
|
|
*
|
|
*
|
|
|
* @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
|
|
* @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
|
|
|
* surrogates)
|
|
* surrogates)
|
|
@@ -266,7 +251,7 @@ final class Utf8 {
|
|
|
for (; i < utf16Length; i++) {
|
|
for (; i < utf16Length; i++) {
|
|
|
char c = sequence.charAt(i);
|
|
char c = sequence.charAt(i);
|
|
|
if (c < 0x800) {
|
|
if (c < 0x800) {
|
|
|
- utf8Length += ((0x7f - c) >>> 31); // branch free!
|
|
|
|
|
|
|
+ utf8Length += ((0x7f - c) >>> 31); // branch free!
|
|
|
} else {
|
|
} else {
|
|
|
utf8Length += encodedLengthGeneral(sequence, i);
|
|
utf8Length += encodedLengthGeneral(sequence, i);
|
|
|
break;
|
|
break;
|
|
@@ -275,8 +260,8 @@ final class Utf8 {
|
|
|
|
|
|
|
|
if (utf8Length < utf16Length) {
|
|
if (utf8Length < utf16Length) {
|
|
|
// Necessary and sufficient condition for overflow because of maximum 3x expansion
|
|
// Necessary and sufficient condition for overflow because of maximum 3x expansion
|
|
|
- throw new IllegalArgumentException("UTF-8 length does not fit in int: "
|
|
|
|
|
- + (utf8Length + (1L << 32)));
|
|
|
|
|
|
|
+ throw new IllegalArgumentException(
|
|
|
|
|
+ "UTF-8 length does not fit in int: " + (utf8Length + (1L << 32)));
|
|
|
}
|
|
}
|
|
|
return utf8Length;
|
|
return utf8Length;
|
|
|
}
|
|
}
|
|
@@ -370,15 +355,15 @@ final class Utf8 {
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
|
- * Counts (approximately) the number of consecutive ASCII characters in the given buffer.
|
|
|
|
|
- * The byte order of the {@link ByteBuffer} does not matter, so performance can be improved if
|
|
|
|
|
- * native byte order is used (i.e. no byte-swapping in {@link ByteBuffer#getLong(int)}).
|
|
|
|
|
|
|
+ * Counts (approximately) the number of consecutive ASCII characters in the given buffer. The byte
|
|
|
|
|
+ * order of the {@link ByteBuffer} does not matter, so performance can be improved if native byte
|
|
|
|
|
+ * order is used (i.e. no byte-swapping in {@link ByteBuffer#getLong(int)}).
|
|
|
*
|
|
*
|
|
|
* @param buffer the buffer to be scanned for ASCII chars
|
|
* @param buffer the buffer to be scanned for ASCII chars
|
|
|
* @param index the starting index of the scan
|
|
* @param index the starting index of the scan
|
|
|
* @param limit the limit within buffer for the scan
|
|
* @param limit the limit within buffer for the scan
|
|
|
- * @return the number of ASCII characters found. The stopping position will be at or
|
|
|
|
|
- * before the first non-ASCII byte.
|
|
|
|
|
|
|
+ * @return the number of ASCII characters found. The stopping position will be at or before the
|
|
|
|
|
+ * first non-ASCII byte.
|
|
|
*/
|
|
*/
|
|
|
private static int estimateConsecutiveAscii(ByteBuffer buffer, int index, int limit) {
|
|
private static int estimateConsecutiveAscii(ByteBuffer buffer, int index, int limit) {
|
|
|
int i = index;
|
|
int i = index;
|
|
@@ -390,52 +375,43 @@ final class Utf8 {
|
|
|
return i - index;
|
|
return i - index;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- /**
|
|
|
|
|
- * A processor of UTF-8 strings, providing methods for checking validity and encoding.
|
|
|
|
|
- */
|
|
|
|
|
|
|
+ /** A processor of UTF-8 strings, providing methods for checking validity and encoding. */
|
|
|
// TODO(nathanmittler): Add support for Memory/MemoryBlock on Android.
|
|
// TODO(nathanmittler): Add support for Memory/MemoryBlock on Android.
|
|
|
abstract static class Processor {
|
|
abstract static class Processor {
|
|
|
/**
|
|
/**
|
|
|
- * Returns {@code true} if the given byte array slice is a
|
|
|
|
|
- * well-formed UTF-8 byte sequence. The range of bytes to be
|
|
|
|
|
- * checked extends from index {@code index}, inclusive, to {@code
|
|
|
|
|
- * limit}, exclusive.
|
|
|
|
|
|
|
+ * Returns {@code true} if the given byte array slice is a well-formed UTF-8 byte sequence. The
|
|
|
|
|
+ * range of bytes to be checked extends from index {@code index}, inclusive, to {@code limit},
|
|
|
|
|
+ * exclusive.
|
|
|
*
|
|
*
|
|
|
- * <p>This is a convenience method, equivalent to {@code
|
|
|
|
|
- * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}.
|
|
|
|
|
|
|
+ * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(Utf8.COMPLETE,
|
|
|
|
|
+ * bytes, index, limit) == Utf8.COMPLETE}.
|
|
|
*/
|
|
*/
|
|
|
final boolean isValidUtf8(byte[] bytes, int index, int limit) {
|
|
final boolean isValidUtf8(byte[] bytes, int index, int limit) {
|
|
|
return partialIsValidUtf8(COMPLETE, bytes, index, limit) == COMPLETE;
|
|
return partialIsValidUtf8(COMPLETE, bytes, index, limit) == COMPLETE;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
|
- * Tells whether the given byte array slice is a well-formed,
|
|
|
|
|
- * malformed, or incomplete UTF-8 byte sequence. The range of bytes
|
|
|
|
|
- * to be checked extends from index {@code index}, inclusive, to
|
|
|
|
|
- * {@code limit}, exclusive.
|
|
|
|
|
|
|
+ * Tells whether the given byte array slice is a well-formed, malformed, or incomplete UTF-8
|
|
|
|
|
+ * byte sequence. The range of bytes to be checked extends from index {@code index}, inclusive,
|
|
|
|
|
+ * to {@code limit}, exclusive.
|
|
|
*
|
|
*
|
|
|
- * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding
|
|
|
|
|
- * operation) or the value returned from a call to a partial decoding method
|
|
|
|
|
- * for the previous bytes
|
|
|
|
|
- *
|
|
|
|
|
- * @return {@link #MALFORMED} if the partial byte sequence is
|
|
|
|
|
- * definitely not well-formed, {@link #COMPLETE} if it is well-formed
|
|
|
|
|
- * (no additional input needed), or if the byte sequence is
|
|
|
|
|
- * "incomplete", i.e. apparently terminated in the middle of a character,
|
|
|
|
|
- * an opaque integer "state" value containing enough information to
|
|
|
|
|
- * decode the character when passed to a subsequent invocation of a
|
|
|
|
|
- * partial decoding method.
|
|
|
|
|
|
|
+ * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding operation) or the
|
|
|
|
|
+ * value returned from a call to a partial decoding method for the previous bytes
|
|
|
|
|
+ * @return {@link #MALFORMED} if the partial byte sequence is definitely not well-formed, {@link
|
|
|
|
|
+ * #COMPLETE} if it is well-formed (no additional input needed), or if the byte sequence is
|
|
|
|
|
+ * "incomplete", i.e. apparently terminated in the middle of a character, an opaque integer
|
|
|
|
|
+ * "state" value containing enough information to decode the character when passed to a
|
|
|
|
|
+ * subsequent invocation of a partial decoding method.
|
|
|
*/
|
|
*/
|
|
|
abstract int partialIsValidUtf8(int state, byte[] bytes, int index, int limit);
|
|
abstract int partialIsValidUtf8(int state, byte[] bytes, int index, int limit);
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
|
- * Returns {@code true} if the given portion of the {@link ByteBuffer} is a
|
|
|
|
|
- * well-formed UTF-8 byte sequence. The range of bytes to be
|
|
|
|
|
- * checked extends from index {@code index}, inclusive, to {@code
|
|
|
|
|
- * limit}, exclusive.
|
|
|
|
|
|
|
+ * Returns {@code true} if the given portion of the {@link ByteBuffer} is a well-formed UTF-8
|
|
|
|
|
+ * byte sequence. The range of bytes to be checked extends from index {@code index}, inclusive,
|
|
|
|
|
+ * to {@code limit}, exclusive.
|
|
|
*
|
|
*
|
|
|
- * <p>This is a convenience method, equivalent to {@code
|
|
|
|
|
- * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}.
|
|
|
|
|
|
|
+ * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(Utf8.COMPLETE,
|
|
|
|
|
+ * buffer, index, limit) == Utf8.COMPLETE}.
|
|
|
*/
|
|
*/
|
|
|
final boolean isValidUtf8(ByteBuffer buffer, int index, int limit) {
|
|
final boolean isValidUtf8(ByteBuffer buffer, int index, int limit) {
|
|
|
return partialIsValidUtf8(COMPLETE, buffer, index, limit) == COMPLETE;
|
|
return partialIsValidUtf8(COMPLETE, buffer, index, limit) == COMPLETE;
|
|
@@ -452,22 +428,20 @@ final class Utf8 {
|
|
|
if (buffer.hasArray()) {
|
|
if (buffer.hasArray()) {
|
|
|
final int offset = buffer.arrayOffset();
|
|
final int offset = buffer.arrayOffset();
|
|
|
return partialIsValidUtf8(state, buffer.array(), offset + index, offset + limit);
|
|
return partialIsValidUtf8(state, buffer.array(), offset + index, offset + limit);
|
|
|
- } else if (buffer.isDirect()){
|
|
|
|
|
|
|
+ } else if (buffer.isDirect()) {
|
|
|
return partialIsValidUtf8Direct(state, buffer, index, limit);
|
|
return partialIsValidUtf8Direct(state, buffer, index, limit);
|
|
|
}
|
|
}
|
|
|
return partialIsValidUtf8Default(state, buffer, index, limit);
|
|
return partialIsValidUtf8Default(state, buffer, index, limit);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- /**
|
|
|
|
|
- * Performs validation for direct {@link ByteBuffer} instances.
|
|
|
|
|
- */
|
|
|
|
|
|
|
+ /** Performs validation for direct {@link ByteBuffer} instances. */
|
|
|
abstract int partialIsValidUtf8Direct(
|
|
abstract int partialIsValidUtf8Direct(
|
|
|
final int state, final ByteBuffer buffer, int index, final int limit);
|
|
final int state, final ByteBuffer buffer, int index, final int limit);
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
|
* Performs validation for {@link ByteBuffer} instances using the {@link ByteBuffer} API rather
|
|
* Performs validation for {@link ByteBuffer} instances using the {@link ByteBuffer} API rather
|
|
|
- * than potentially faster approaches. This first completes validation for the current
|
|
|
|
|
- * character (provided by {@code state}) and then finishes validation for the sequence.
|
|
|
|
|
|
|
+ * than potentially faster approaches. This first completes validation for the current character
|
|
|
|
|
+ * (provided by {@code state}) and then finishes validation for the sequence.
|
|
|
*/
|
|
*/
|
|
|
final int partialIsValidUtf8Default(
|
|
final int partialIsValidUtf8Default(
|
|
|
final int state, final ByteBuffer buffer, int index, final int limit) {
|
|
final int state, final ByteBuffer buffer, int index, final int limit) {
|
|
@@ -566,7 +540,7 @@ final class Utf8 {
|
|
|
private static int partialIsValidUtf8(final ByteBuffer buffer, int index, final int limit) {
|
|
private static int partialIsValidUtf8(final ByteBuffer buffer, int index, final int limit) {
|
|
|
index += estimateConsecutiveAscii(buffer, index, limit);
|
|
index += estimateConsecutiveAscii(buffer, index, limit);
|
|
|
|
|
|
|
|
- for (;;) {
|
|
|
|
|
|
|
+ for (; ; ) {
|
|
|
// Optimize for interior runs of ASCII bytes.
|
|
// Optimize for interior runs of ASCII bytes.
|
|
|
// TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
|
|
// TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
|
|
|
// Maybe after seeing a few in a row that are ASCII, go back to fast mode?
|
|
// Maybe after seeing a few in a row that are ASCII, go back to fast mode?
|
|
@@ -658,15 +632,13 @@ final class Utf8 {
|
|
|
return decodeUtf8Default(buffer, index, size);
|
|
return decodeUtf8Default(buffer, index, size);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- /**
|
|
|
|
|
- * Decodes direct {@link ByteBuffer} instances into {@link String}.
|
|
|
|
|
- */
|
|
|
|
|
|
|
+ /** Decodes direct {@link ByteBuffer} instances into {@link String}. */
|
|
|
abstract String decodeUtf8Direct(ByteBuffer buffer, int index, int size)
|
|
abstract String decodeUtf8Direct(ByteBuffer buffer, int index, int size)
|
|
|
throws InvalidProtocolBufferException;
|
|
throws InvalidProtocolBufferException;
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
|
- * Decodes {@link ByteBuffer} instances using the {@link ByteBuffer} API rather than
|
|
|
|
|
- * potentially faster approaches.
|
|
|
|
|
|
|
+ * Decodes {@link ByteBuffer} instances using the {@link ByteBuffer} API rather than potentially
|
|
|
|
|
+ * faster approaches.
|
|
|
*/
|
|
*/
|
|
|
final String decodeUtf8Default(ByteBuffer buffer, int index, int size)
|
|
final String decodeUtf8Default(ByteBuffer buffer, int index, int size)
|
|
|
throws InvalidProtocolBufferException {
|
|
throws InvalidProtocolBufferException {
|
|
@@ -747,21 +719,22 @@ final class Utf8 {
|
|
|
/**
|
|
/**
|
|
|
* Encodes an input character sequence ({@code in}) to UTF-8 in the target array ({@code out}).
|
|
* Encodes an input character sequence ({@code in}) to UTF-8 in the target array ({@code out}).
|
|
|
* For a string, this method is similar to
|
|
* For a string, this method is similar to
|
|
|
|
|
+ *
|
|
|
* <pre>{@code
|
|
* <pre>{@code
|
|
|
* byte[] a = string.getBytes(UTF_8);
|
|
* byte[] a = string.getBytes(UTF_8);
|
|
|
* System.arraycopy(a, 0, bytes, offset, a.length);
|
|
* System.arraycopy(a, 0, bytes, offset, a.length);
|
|
|
* return offset + a.length;
|
|
* return offset + a.length;
|
|
|
* }</pre>
|
|
* }</pre>
|
|
|
*
|
|
*
|
|
|
- * but is more efficient in both time and space. One key difference is that this method
|
|
|
|
|
- * requires paired surrogates, and therefore does not support chunking.
|
|
|
|
|
- * While {@code String.getBytes(UTF_8)} replaces unpaired surrogates with the default
|
|
|
|
|
- * replacement character, this method throws {@link UnpairedSurrogateException}.
|
|
|
|
|
|
|
+ * but is more efficient in both time and space. One key difference is that this method requires
|
|
|
|
|
+ * paired surrogates, and therefore does not support chunking. While {@code
|
|
|
|
|
+ * String.getBytes(UTF_8)} replaces unpaired surrogates with the default replacement character,
|
|
|
|
|
+ * this method throws {@link UnpairedSurrogateException}.
|
|
|
*
|
|
*
|
|
|
* <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to
|
|
* <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to
|
|
|
- * compute the exact amount needed, or leave room for
|
|
|
|
|
- * {@code Utf8.MAX_BYTES_PER_CHAR * sequence.length()}, which is the largest possible number
|
|
|
|
|
- * of bytes that any input can be encoded to.
|
|
|
|
|
|
|
+ * compute the exact amount needed, or leave room for {@code Utf8.MAX_BYTES_PER_CHAR *
|
|
|
|
|
+ * sequence.length()}, which is the largest possible number of bytes that any input can be
|
|
|
|
|
+ * encoded to.
|
|
|
*
|
|
*
|
|
|
* @param in the input character sequence to be encoded
|
|
* @param in the input character sequence to be encoded
|
|
|
* @param out the target array
|
|
* @param out the target array
|
|
@@ -778,26 +751,24 @@ final class Utf8 {
|
|
|
/**
|
|
/**
|
|
|
* Encodes an input character sequence ({@code in}) to UTF-8 in the target buffer ({@code out}).
|
|
* Encodes an input character sequence ({@code in}) to UTF-8 in the target buffer ({@code out}).
|
|
|
* Upon returning from this method, the {@code out} position will point to the position after
|
|
* Upon returning from this method, the {@code out} position will point to the position after
|
|
|
- * the last encoded byte. This method requires paired surrogates, and therefore does not
|
|
|
|
|
- * support chunking.
|
|
|
|
|
|
|
+ * the last encoded byte. This method requires paired surrogates, and therefore does not support
|
|
|
|
|
+ * chunking.
|
|
|
*
|
|
*
|
|
|
* <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to
|
|
* <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to
|
|
|
- * compute the exact amount needed, or leave room for
|
|
|
|
|
- * {@code Utf8.MAX_BYTES_PER_CHAR * in.length()}, which is the largest possible number
|
|
|
|
|
- * of bytes that any input can be encoded to.
|
|
|
|
|
|
|
+ * compute the exact amount needed, or leave room for {@code Utf8.MAX_BYTES_PER_CHAR *
|
|
|
|
|
+ * in.length()}, which is the largest possible number of bytes that any input can be encoded to.
|
|
|
*
|
|
*
|
|
|
* @param in the source character sequence to be encoded
|
|
* @param in the source character sequence to be encoded
|
|
|
* @param out the target buffer
|
|
* @param out the target buffer
|
|
|
* @throws UnpairedSurrogateException if {@code in} contains ill-formed UTF-16 (unpaired
|
|
* @throws UnpairedSurrogateException if {@code in} contains ill-formed UTF-16 (unpaired
|
|
|
* surrogates)
|
|
* surrogates)
|
|
|
- * @throws ArrayIndexOutOfBoundsException if {@code in} encoded in UTF-8 is longer than
|
|
|
|
|
- * {@code out.remaining()}
|
|
|
|
|
|
|
+ * @throws ArrayIndexOutOfBoundsException if {@code in} encoded in UTF-8 is longer than {@code
|
|
|
|
|
+ * out.remaining()}
|
|
|
*/
|
|
*/
|
|
|
final void encodeUtf8(CharSequence in, ByteBuffer out) {
|
|
final void encodeUtf8(CharSequence in, ByteBuffer out) {
|
|
|
if (out.hasArray()) {
|
|
if (out.hasArray()) {
|
|
|
final int offset = out.arrayOffset();
|
|
final int offset = out.arrayOffset();
|
|
|
- int endIndex =
|
|
|
|
|
- Utf8.encode(in, out.array(), offset + out.position(), out.remaining());
|
|
|
|
|
|
|
+ int endIndex = Utf8.encode(in, out.array(), offset + out.position(), out.remaining());
|
|
|
out.position(endIndex - offset);
|
|
out.position(endIndex - offset);
|
|
|
} else if (out.isDirect()) {
|
|
} else if (out.isDirect()) {
|
|
|
encodeUtf8Direct(in, out);
|
|
encodeUtf8Direct(in, out);
|
|
@@ -806,9 +777,7 @@ final class Utf8 {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- /**
|
|
|
|
|
- * Encodes the input character sequence to a direct {@link ByteBuffer} instance.
|
|
|
|
|
- */
|
|
|
|
|
|
|
+ /** Encodes the input character sequence to a direct {@link ByteBuffer} instance. */
|
|
|
abstract void encodeUtf8Direct(CharSequence in, ByteBuffer out);
|
|
abstract void encodeUtf8Direct(CharSequence in, ByteBuffer out);
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
@@ -887,9 +856,7 @@ final class Utf8 {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- /**
|
|
|
|
|
- * {@link Processor} implementation that does not use any {@code sun.misc.Unsafe} methods.
|
|
|
|
|
- */
|
|
|
|
|
|
|
+ /** {@link Processor} implementation that does not use any {@code sun.misc.Unsafe} methods. */
|
|
|
static final class SafeProcessor extends Processor {
|
|
static final class SafeProcessor extends Processor {
|
|
|
@Override
|
|
@Override
|
|
|
int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
|
|
int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
|
|
@@ -901,7 +868,7 @@ final class Utf8 {
|
|
|
//
|
|
//
|
|
|
// We expect such "straddler characters" to be rare.
|
|
// We expect such "straddler characters" to be rare.
|
|
|
|
|
|
|
|
- if (index >= limit) { // No bytes? No progress.
|
|
|
|
|
|
|
+ if (index >= limit) { // No bytes? No progress.
|
|
|
return state;
|
|
return state;
|
|
|
}
|
|
}
|
|
|
int byte1 = (byte) state;
|
|
int byte1 = (byte) state;
|
|
@@ -1098,8 +1065,7 @@ final class Utf8 {
|
|
|
// Minimum code point represented by a surrogate pair is 0x10000, 17 bits,
|
|
// Minimum code point represented by a surrogate pair is 0x10000, 17 bits,
|
|
|
// four UTF-8 bytes
|
|
// four UTF-8 bytes
|
|
|
final char low;
|
|
final char low;
|
|
|
- if (i + 1 == in.length()
|
|
|
|
|
- || !Character.isSurrogatePair(c, (low = in.charAt(++i)))) {
|
|
|
|
|
|
|
+ if (i + 1 == in.length() || !Character.isSurrogatePair(c, (low = in.charAt(++i)))) {
|
|
|
throw new UnpairedSurrogateException((i - 1), utf16Length);
|
|
throw new UnpairedSurrogateException((i - 1), utf16Length);
|
|
|
}
|
|
}
|
|
|
int codePoint = Character.toCodePoint(c, low);
|
|
int codePoint = Character.toCodePoint(c, low);
|
|
@@ -1111,8 +1077,7 @@ final class Utf8 {
|
|
|
// If we are surrogates and we're not a surrogate pair, always throw an
|
|
// If we are surrogates and we're not a surrogate pair, always throw an
|
|
|
// UnpairedSurrogateException instead of an ArrayOutOfBoundsException.
|
|
// UnpairedSurrogateException instead of an ArrayOutOfBoundsException.
|
|
|
if ((Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE)
|
|
if ((Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE)
|
|
|
- && (i + 1 == in.length()
|
|
|
|
|
- || !Character.isSurrogatePair(c, in.charAt(i + 1)))) {
|
|
|
|
|
|
|
+ && (i + 1 == in.length() || !Character.isSurrogatePair(c, in.charAt(i + 1)))) {
|
|
|
throw new UnpairedSurrogateException(i, utf16Length);
|
|
throw new UnpairedSurrogateException(i, utf16Length);
|
|
|
}
|
|
}
|
|
|
throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + j);
|
|
throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + j);
|
|
@@ -1138,7 +1103,7 @@ final class Utf8 {
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
private static int partialIsValidUtf8NonAscii(byte[] bytes, int index, int limit) {
|
|
private static int partialIsValidUtf8NonAscii(byte[] bytes, int index, int limit) {
|
|
|
- for (;;) {
|
|
|
|
|
|
|
+ for (; ; ) {
|
|
|
int byte1, byte2;
|
|
int byte1, byte2;
|
|
|
|
|
|
|
|
// Optimize for interior runs of ASCII bytes.
|
|
// Optimize for interior runs of ASCII bytes.
|
|
@@ -1158,8 +1123,7 @@ final class Utf8 {
|
|
|
|
|
|
|
|
// Simultaneously checks for illegal trailing-byte in
|
|
// Simultaneously checks for illegal trailing-byte in
|
|
|
// leading position and overlong 2-byte form.
|
|
// leading position and overlong 2-byte form.
|
|
|
- if (byte1 < (byte) 0xC2
|
|
|
|
|
- || bytes[index++] > (byte) 0xBF) {
|
|
|
|
|
|
|
+ if (byte1 < (byte) 0xC2 || bytes[index++] > (byte) 0xBF) {
|
|
|
return MALFORMED;
|
|
return MALFORMED;
|
|
|
}
|
|
}
|
|
|
} else if (byte1 < (byte) 0xF0) {
|
|
} else if (byte1 < (byte) 0xF0) {
|
|
@@ -1180,7 +1144,7 @@ final class Utf8 {
|
|
|
} else {
|
|
} else {
|
|
|
// four-byte form
|
|
// four-byte form
|
|
|
|
|
|
|
|
- if (index >= limit - 2) { // incomplete sequence
|
|
|
|
|
|
|
+ if (index >= limit - 2) { // incomplete sequence
|
|
|
return incompleteStateFor(bytes, index, limit);
|
|
return incompleteStateFor(bytes, index, limit);
|
|
|
}
|
|
}
|
|
|
if ((byte2 = bytes[index++]) > (byte) 0xBF
|
|
if ((byte2 = bytes[index++]) > (byte) 0xBF
|
|
@@ -1200,13 +1164,9 @@ final class Utf8 {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- /**
|
|
|
|
|
- * {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance.
|
|
|
|
|
- */
|
|
|
|
|
|
|
+ /** {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance. */
|
|
|
static final class UnsafeProcessor extends Processor {
|
|
static final class UnsafeProcessor extends Processor {
|
|
|
- /**
|
|
|
|
|
- * Indicates whether or not all required unsafe operations are supported on this platform.
|
|
|
|
|
- */
|
|
|
|
|
|
|
+ /** Indicates whether or not all required unsafe operations are supported on this platform. */
|
|
|
static boolean isAvailable() {
|
|
static boolean isAvailable() {
|
|
|
return hasUnsafeArrayOperations() && hasUnsafeByteBufferOperations();
|
|
return hasUnsafeArrayOperations() && hasUnsafeByteBufferOperations();
|
|
|
}
|
|
}
|
|
@@ -1228,7 +1188,7 @@ final class Utf8 {
|
|
|
//
|
|
//
|
|
|
// We expect such "straddler characters" to be rare.
|
|
// We expect such "straddler characters" to be rare.
|
|
|
|
|
|
|
|
- if (offset >= offsetLimit) { // No bytes? No progress.
|
|
|
|
|
|
|
+ if (offset >= offsetLimit) { // No bytes? No progress.
|
|
|
return state;
|
|
return state;
|
|
|
}
|
|
}
|
|
|
int byte1 = (byte) state;
|
|
int byte1 = (byte) state;
|
|
@@ -1685,8 +1645,8 @@ final class Utf8 {
|
|
|
* @param bytes the array containing the character sequence
|
|
* @param bytes the array containing the character sequence
|
|
|
* @param offset the offset position of the index (same as index + arrayBaseOffset)
|
|
* @param offset the offset position of the index (same as index + arrayBaseOffset)
|
|
|
* @param maxChars the maximum number of characters to count
|
|
* @param maxChars the maximum number of characters to count
|
|
|
- * @return the number of ASCII characters found. The stopping position will be at or
|
|
|
|
|
- * before the first non-ASCII byte.
|
|
|
|
|
|
|
+ * @return the number of ASCII characters found. The stopping position will be at or before the
|
|
|
|
|
+ * first non-ASCII byte.
|
|
|
*/
|
|
*/
|
|
|
private static int unsafeEstimateConsecutiveAscii(
|
|
private static int unsafeEstimateConsecutiveAscii(
|
|
|
byte[] bytes, long offset, final int maxChars) {
|
|
byte[] bytes, long offset, final int maxChars) {
|
|
@@ -1728,24 +1688,24 @@ final class Utf8 {
|
|
|
// To speed things up further, we're reading longs instead of bytes so we use a mask to
|
|
// To speed things up further, we're reading longs instead of bytes so we use a mask to
|
|
|
// determine if any byte in the current long is non-ASCII.
|
|
// determine if any byte in the current long is non-ASCII.
|
|
|
remaining -= unaligned;
|
|
remaining -= unaligned;
|
|
|
- for (; remaining >= 8 && (UnsafeUtil.getLong(address) & ASCII_MASK_LONG) == 0;
|
|
|
|
|
|
|
+ for (;
|
|
|
|
|
+ remaining >= 8 && (UnsafeUtil.getLong(address) & ASCII_MASK_LONG) == 0;
|
|
|
address += 8, remaining -= 8) {}
|
|
address += 8, remaining -= 8) {}
|
|
|
return maxChars - remaining;
|
|
return maxChars - remaining;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
private static int partialIsValidUtf8(final byte[] bytes, long offset, int remaining) {
|
|
private static int partialIsValidUtf8(final byte[] bytes, long offset, int remaining) {
|
|
|
- // Skip past ASCII characters as quickly as possible.
|
|
|
|
|
|
|
+ // Skip past ASCII characters as quickly as possible.
|
|
|
final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remaining);
|
|
final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remaining);
|
|
|
remaining -= skipped;
|
|
remaining -= skipped;
|
|
|
offset += skipped;
|
|
offset += skipped;
|
|
|
|
|
|
|
|
- for (;;) {
|
|
|
|
|
|
|
+ for (; ; ) {
|
|
|
// Optimize for interior runs of ASCII bytes.
|
|
// Optimize for interior runs of ASCII bytes.
|
|
|
// TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
|
|
// TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
|
|
|
// Maybe after seeing a few in a row that are ASCII, go back to fast mode?
|
|
// Maybe after seeing a few in a row that are ASCII, go back to fast mode?
|
|
|
int byte1 = 0;
|
|
int byte1 = 0;
|
|
|
- for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(bytes, offset++)) >= 0; --remaining) {
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(bytes, offset++)) >= 0; --remaining) {}
|
|
|
if (remaining == 0) {
|
|
if (remaining == 0) {
|
|
|
return COMPLETE;
|
|
return COMPLETE;
|
|
|
}
|
|
}
|
|
@@ -1762,8 +1722,7 @@ final class Utf8 {
|
|
|
|
|
|
|
|
// Simultaneously checks for illegal trailing-byte in
|
|
// Simultaneously checks for illegal trailing-byte in
|
|
|
// leading position and overlong 2-byte form.
|
|
// leading position and overlong 2-byte form.
|
|
|
- if (byte1 < (byte) 0xC2
|
|
|
|
|
- || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
|
|
|
|
|
|
|
+ if (byte1 < (byte) 0xC2 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
|
|
|
return MALFORMED;
|
|
return MALFORMED;
|
|
|
}
|
|
}
|
|
|
} else if (byte1 < (byte) 0xF0) {
|
|
} else if (byte1 < (byte) 0xF0) {
|
|
@@ -1815,13 +1774,12 @@ final class Utf8 {
|
|
|
address += skipped;
|
|
address += skipped;
|
|
|
remaining -= skipped;
|
|
remaining -= skipped;
|
|
|
|
|
|
|
|
- for (;;) {
|
|
|
|
|
|
|
+ for (; ; ) {
|
|
|
// Optimize for interior runs of ASCII bytes.
|
|
// Optimize for interior runs of ASCII bytes.
|
|
|
// TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
|
|
// TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
|
|
|
// Maybe after seeing a few in a row that are ASCII, go back to fast mode?
|
|
// Maybe after seeing a few in a row that are ASCII, go back to fast mode?
|
|
|
int byte1 = 0;
|
|
int byte1 = 0;
|
|
|
- for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(address++)) >= 0; --remaining) {
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(address++)) >= 0; --remaining) {}
|
|
|
if (remaining == 0) {
|
|
if (remaining == 0) {
|
|
|
return COMPLETE;
|
|
return COMPLETE;
|
|
|
}
|
|
}
|
|
@@ -1886,40 +1844,32 @@ final class Utf8 {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- private static int unsafeIncompleteStateFor(byte[] bytes, int byte1, long offset,
|
|
|
|
|
- int remaining) {
|
|
|
|
|
|
|
+ private static int unsafeIncompleteStateFor(
|
|
|
|
|
+ byte[] bytes, int byte1, long offset, int remaining) {
|
|
|
switch (remaining) {
|
|
switch (remaining) {
|
|
|
- case 0: {
|
|
|
|
|
|
|
+ case 0:
|
|
|
return incompleteStateFor(byte1);
|
|
return incompleteStateFor(byte1);
|
|
|
- }
|
|
|
|
|
- case 1: {
|
|
|
|
|
|
|
+ case 1:
|
|
|
return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset));
|
|
return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset));
|
|
|
- }
|
|
|
|
|
- case 2: {
|
|
|
|
|
- return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset),
|
|
|
|
|
- UnsafeUtil.getByte(bytes, offset + 1));
|
|
|
|
|
- }
|
|
|
|
|
- default: {
|
|
|
|
|
|
|
+ case 2:
|
|
|
|
|
+ return incompleteStateFor(
|
|
|
|
|
+ byte1, UnsafeUtil.getByte(bytes, offset), UnsafeUtil.getByte(bytes, offset + 1));
|
|
|
|
|
+ default:
|
|
|
throw new AssertionError();
|
|
throw new AssertionError();
|
|
|
- }
|
|
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
private static int unsafeIncompleteStateFor(long address, final int byte1, int remaining) {
|
|
private static int unsafeIncompleteStateFor(long address, final int byte1, int remaining) {
|
|
|
switch (remaining) {
|
|
switch (remaining) {
|
|
|
- case 0: {
|
|
|
|
|
|
|
+ case 0:
|
|
|
return incompleteStateFor(byte1);
|
|
return incompleteStateFor(byte1);
|
|
|
- }
|
|
|
|
|
- case 1: {
|
|
|
|
|
|
|
+ case 1:
|
|
|
return incompleteStateFor(byte1, UnsafeUtil.getByte(address));
|
|
return incompleteStateFor(byte1, UnsafeUtil.getByte(address));
|
|
|
- }
|
|
|
|
|
- case 2: {
|
|
|
|
|
- return incompleteStateFor(byte1, UnsafeUtil.getByte(address),
|
|
|
|
|
- UnsafeUtil.getByte(address + 1));
|
|
|
|
|
- }
|
|
|
|
|
- default: {
|
|
|
|
|
|
|
+ case 2:
|
|
|
|
|
+ return incompleteStateFor(
|
|
|
|
|
+ byte1, UnsafeUtil.getByte(address), UnsafeUtil.getByte(address + 1));
|
|
|
|
|
+ default:
|
|
|
throw new AssertionError();
|
|
throw new AssertionError();
|
|
|
- }
|
|
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
@@ -1931,23 +1881,17 @@ final class Utf8 {
|
|
|
*/
|
|
*/
|
|
|
private static class DecodeUtil {
|
|
private static class DecodeUtil {
|
|
|
|
|
|
|
|
- /**
|
|
|
|
|
- * Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'.
|
|
|
|
|
- */
|
|
|
|
|
|
|
+ /** Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'. */
|
|
|
private static boolean isOneByte(byte b) {
|
|
private static boolean isOneByte(byte b) {
|
|
|
return b >= 0;
|
|
return b >= 0;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- /**
|
|
|
|
|
- * Returns whether this is a two-byte codepoint with the form '10XXXXXX'.
|
|
|
|
|
- */
|
|
|
|
|
|
|
+ /** Returns whether this is a two-byte codepoint with the form '10XXXXXX'. */
|
|
|
private static boolean isTwoBytes(byte b) {
|
|
private static boolean isTwoBytes(byte b) {
|
|
|
return b < (byte) 0xE0;
|
|
return b < (byte) 0xE0;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- /**
|
|
|
|
|
- * Returns whether this is a three-byte codepoint with the form '110XXXXX'.
|
|
|
|
|
- */
|
|
|
|
|
|
|
+ /** Returns whether this is a three-byte codepoint with the form '110XXXXX'. */
|
|
|
private static boolean isThreeBytes(byte b) {
|
|
private static boolean isThreeBytes(byte b) {
|
|
|
return b < (byte) 0xF0;
|
|
return b < (byte) 0xF0;
|
|
|
}
|
|
}
|
|
@@ -1956,13 +1900,11 @@ final class Utf8 {
|
|
|
resultArr[resultPos] = (char) byte1;
|
|
resultArr[resultPos] = (char) byte1;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- private static void handleTwoBytes(
|
|
|
|
|
- byte byte1, byte byte2, char[] resultArr, int resultPos)
|
|
|
|
|
|
|
+ private static void handleTwoBytes(byte byte1, byte byte2, char[] resultArr, int resultPos)
|
|
|
throws InvalidProtocolBufferException {
|
|
throws InvalidProtocolBufferException {
|
|
|
// Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and
|
|
// Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and
|
|
|
// overlong 2-byte, '11000001'.
|
|
// overlong 2-byte, '11000001'.
|
|
|
- if (byte1 < (byte) 0xC2
|
|
|
|
|
- || isNotTrailingByte(byte2)) {
|
|
|
|
|
|
|
+ if (byte1 < (byte) 0xC2 || isNotTrailingByte(byte2)) {
|
|
|
throw InvalidProtocolBufferException.invalidUtf8();
|
|
throw InvalidProtocolBufferException.invalidUtf8();
|
|
|
}
|
|
}
|
|
|
resultArr[resultPos] = (char) (((byte1 & 0x1F) << 6) | trailingByteValue(byte2));
|
|
resultArr[resultPos] = (char) (((byte1 & 0x1F) << 6) | trailingByteValue(byte2));
|
|
@@ -1979,13 +1921,14 @@ final class Utf8 {
|
|
|
|| isNotTrailingByte(byte3)) {
|
|
|| isNotTrailingByte(byte3)) {
|
|
|
throw InvalidProtocolBufferException.invalidUtf8();
|
|
throw InvalidProtocolBufferException.invalidUtf8();
|
|
|
}
|
|
}
|
|
|
- resultArr[resultPos] = (char)
|
|
|
|
|
- (((byte1 & 0x0F) << 12) | (trailingByteValue(byte2) << 6) | trailingByteValue(byte3));
|
|
|
|
|
|
|
+ resultArr[resultPos] =
|
|
|
|
|
+ (char)
|
|
|
|
|
+ (((byte1 & 0x0F) << 12) | (trailingByteValue(byte2) << 6) | trailingByteValue(byte3));
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
private static void handleFourBytes(
|
|
private static void handleFourBytes(
|
|
|
byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)
|
|
byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)
|
|
|
- throws InvalidProtocolBufferException{
|
|
|
|
|
|
|
+ throws InvalidProtocolBufferException {
|
|
|
if (isNotTrailingByte(byte2)
|
|
if (isNotTrailingByte(byte2)
|
|
|
// Check that 1 <= plane <= 16. Tricky optimized form of:
|
|
// Check that 1 <= plane <= 16. Tricky optimized form of:
|
|
|
// valid 4-byte leading byte?
|
|
// valid 4-byte leading byte?
|
|
@@ -1999,31 +1942,28 @@ final class Utf8 {
|
|
|
|| isNotTrailingByte(byte4)) {
|
|
|| isNotTrailingByte(byte4)) {
|
|
|
throw InvalidProtocolBufferException.invalidUtf8();
|
|
throw InvalidProtocolBufferException.invalidUtf8();
|
|
|
}
|
|
}
|
|
|
- int codepoint = ((byte1 & 0x07) << 18)
|
|
|
|
|
- | (trailingByteValue(byte2) << 12)
|
|
|
|
|
- | (trailingByteValue(byte3) << 6)
|
|
|
|
|
- | trailingByteValue(byte4);
|
|
|
|
|
|
|
+ int codepoint =
|
|
|
|
|
+ ((byte1 & 0x07) << 18)
|
|
|
|
|
+ | (trailingByteValue(byte2) << 12)
|
|
|
|
|
+ | (trailingByteValue(byte3) << 6)
|
|
|
|
|
+ | trailingByteValue(byte4);
|
|
|
resultArr[resultPos] = DecodeUtil.highSurrogate(codepoint);
|
|
resultArr[resultPos] = DecodeUtil.highSurrogate(codepoint);
|
|
|
resultArr[resultPos + 1] = DecodeUtil.lowSurrogate(codepoint);
|
|
resultArr[resultPos + 1] = DecodeUtil.lowSurrogate(codepoint);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- /**
|
|
|
|
|
- * Returns whether the byte is not a valid continuation of the form '10XXXXXX'.
|
|
|
|
|
- */
|
|
|
|
|
|
|
+ /** Returns whether the byte is not a valid continuation of the form '10XXXXXX'. */
|
|
|
private static boolean isNotTrailingByte(byte b) {
|
|
private static boolean isNotTrailingByte(byte b) {
|
|
|
return b > (byte) 0xBF;
|
|
return b > (byte) 0xBF;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- /**
|
|
|
|
|
- * Returns the actual value of the trailing byte (removes the prefix '10') for composition.
|
|
|
|
|
- */
|
|
|
|
|
|
|
+ /** Returns the actual value of the trailing byte (removes the prefix '10') for composition. */
|
|
|
private static int trailingByteValue(byte b) {
|
|
private static int trailingByteValue(byte b) {
|
|
|
return b & 0x3F;
|
|
return b & 0x3F;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
private static char highSurrogate(int codePoint) {
|
|
private static char highSurrogate(int codePoint) {
|
|
|
- return (char) ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10))
|
|
|
|
|
- + (codePoint >>> 10));
|
|
|
|
|
|
|
+ return (char)
|
|
|
|
|
+ ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)) + (codePoint >>> 10));
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
private static char lowSurrogate(int codePoint) {
|
|
private static char lowSurrogate(int codePoint) {
|