This source file includes the following definitions.
- generateFourByteShardsExpectedRunnables
- generateFourByteShards
- testBytes
- testBytes
- testBytesUsingByteBuffers
- outputFailure
- outputFailure
- toHexString
- toHexString
package com.google.protobuf;
import static junit.framework.Assert.*;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import java.util.logging.Logger;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.Charset;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
/**
 * Shared helpers for tests that exhaustively enumerate every byte sequence of
 * a fixed length and check that {@code ByteString.isValidUtf8()} and the
 * {@code Utf8} helpers agree with a decode/re-encode round trip through the
 * JDK's UTF-8 codec.
 */
class IsValidUtf8TestUtil {
  private static final Logger logger =
      Logger.getLogger(IsValidUtf8TestUtil.class.getName());

  // Code points 0x0000..0x007F.
  static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x007f - 0x0000 + 1;

  static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT =
      ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  // Code points 0x0080..0x07FF.
  static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x07FF - 0x0080 + 1;

  // Two one-byte characters, or a single two-byte character.
  static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2)
          + TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  // Number of values excluded from the three-byte range as surrogates.
  static final long THREE_BYTE_SURROGATES = 2 * 1024;

  // Code points 0x0800..0xFFFF, minus the surrogates.
  static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
      0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;

  // Three one-byte characters, a two-byte plus a one-byte character in either
  // order, or a single three-byte character.
  static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3)
          + 2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS
              * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
          + THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  // Code points 0x10000..0x10FFFF.
  static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS =
      0x10FFFF - 0x10000L + 1;

  // Four one-byte characters; a three-byte plus a one-byte character in either
  // order; two two-byte characters; one two-byte character in any of three
  // positions among two one-byte characters; or a single four-byte character.
  static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4)
          + 2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS
              * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
          + TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS
          + 3 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS
              * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
              * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
          + FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  /**
   * One contiguous slice {@code [start, lim)} of the enumeration space,
   * together with the number of round-trippable sequences expected in it.
   */
  static class Shard {
    final long index;
    final long start;
    final long lim;
    final long expected;

    /**
     * @param index position of this shard in the shard list
     * @param start first value of the slice (inclusive)
     * @param lim end of the slice (exclusive); must be greater than
     *     {@code start}
     * @param expected number of round-trippable sequences expected in the slice
     */
    public Shard(long index, long start, long lim, long expected) {
      assertTrue(start < lim);
      this.index = index;
      this.start = start;
      this.lim = lim;
      this.expected = expected;
    }
  }

  /** Expected round-trippable count for each of the 128 four-byte shards. */
  static final long[] FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES =
      generateFourByteShardsExpectedRunnables();

  /**
   * Builds the per-shard expectation table for the four-byte enumeration.
   * Slots that are not assigned below (64..96 and 123..127) keep the array's
   * default value of 0.
   */
  private static long[] generateFourByteShardsExpectedRunnables() {
    long[] expected = new long[128];

    for (int i = 0; i <= 63; i++) {
      expected[i] = 5300224;
    }
    for (int i = 97; i <= 111; i++) {
      expected[i] = 2342912;
    }
    for (int i = 113; i <= 117; i++) {
      expected[i] = 1048576;
    }
    expected[112] = 786432;
    expected[118] = 786432;
    expected[119] = 1048576;
    expected[120] = 458752;
    expected[121] = 524288;
    expected[122] = 65536;
    return expected;
  }

  /** The four-byte enumeration space split into 128 equally sized shards. */
  static final List<Shard> FOUR_BYTE_SHARDS = generateFourByteShards(
      128, FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES);

  /**
   * Splits the range {@code [0, 2^32)} into {@code numShards} equal slices.
   *
   * @param numShards number of shards; must equal {@code expected.length} and
   *     divide {@code 2^32} evenly
   * @param expected expected round-trippable count for each shard, by index
   * @return the shards in ascending order of their start value
   */
  private static List<Shard> generateFourByteShards(
      int numShards, long[] expected) {
    assertEquals(numShards, expected.length);
    List<Shard> shards = new ArrayList<Shard>(numShards);
    long LIM = 1L << 32;
    long increment = LIM / numShards;
    assertTrue(LIM % numShards == 0);
    for (int i = 0; i < numShards; i++) {
      shards.add(new Shard(i,
          increment * i,
          increment * (i + 1),
          expected[i]));
    }
    return shards;
  }

  /**
   * Runs {@link #testBytes(int, long, long, long)} over every possible
   * sequence of {@code numBytes} bytes.
   *
   * @param numBytes length of the byte sequences to enumerate
   * @param expectedCount number of sequences expected to be round-trippable
   * @throws UnsupportedEncodingException declared by the charset-name based
   *     {@code String} conversions used by the check
   */
  static void testBytes(int numBytes, long expectedCount)
      throws UnsupportedEncodingException {
    testBytes(numBytes, expectedCount, 0, -1);
  }

  /**
   * Enumerates the {@code numBytes}-byte sequences whose big-endian integer
   * value lies in {@code [start, lim)}. For each one it checks that the
   * various UTF-8 validity APIs agree with each other and with a
   * {@code String} decode/re-encode round trip, then asserts the total number
   * of round-trippable sequences.
   *
   * @param numBytes length of the byte sequences to enumerate
   * @param expectedCount number of sequences expected to be round-trippable
   * @param start first value to test (inclusive)
   * @param lim end of the range (exclusive), or {@code -1} to use
   *     {@code 1L << (numBytes * 8)}
   * @throws UnsupportedEncodingException declared by the charset-name based
   *     {@code String} conversions
   */
  static void testBytes(int numBytes, long expectedCount, long start, long lim)
      throws UnsupportedEncodingException {
    Random rnd = new Random();
    byte[] bytes = new byte[numBytes];

    if (lim == -1) {
      lim = 1L << (numBytes * 8);
    }
    long count = 0;
    long countRoundTripped = 0;
    for (long byteChar = start; byteChar < lim; byteChar++) {
      // Unpack byteChar into the array, least significant byte last.
      long tmpByteChar = byteChar;
      for (int i = 0; i < numBytes; i++) {
        bytes[bytes.length - i - 1] = (byte) tmpByteChar;
        tmpByteChar = tmpByteChar >> 8;
      }
      ByteString bs = ByteString.copyFrom(bytes);
      boolean isRoundTrippable = bs.isValidUtf8();

      // Reference check: decode and re-encode through java.lang.String.
      String s = new String(bytes, "UTF-8");
      byte[] bytesReencoded = s.getBytes("UTF-8");
      boolean bytesEqual = Arrays.equals(bytes, bytesReencoded);

      if (bytesEqual != isRoundTrippable) {
        outputFailure(byteChar, bytes, bytesReencoded);
      }

      // The whole-array helpers must agree with ByteString.
      assertEquals(isRoundTrippable, Utf8.isValidUtf8(bytes));
      assertEquals(isRoundTrippable, Utf8.isValidUtf8(bytes, 0, numBytes));

      // Pick two random split points with i <= j and validate the three
      // pieces incrementally, threading the state from one piece to the next.
      int i = rnd.nextInt(numBytes);
      int j = rnd.nextInt(numBytes);
      if (j < i) {
        int tmp = i; i = j; j = tmp;
      }
      // Utf8.partialIsValidUtf8 is called with (state, bytes, from, to).
      int state1 = Utf8.partialIsValidUtf8(Utf8.COMPLETE, bytes, 0, i);
      int state2 = Utf8.partialIsValidUtf8(state1, bytes, i, j);
      int state3 = Utf8.partialIsValidUtf8(state2, bytes, j, numBytes);
      if (isRoundTrippable != (state3 == Utf8.COMPLETE)) {
        System.out.printf("state=%04x %04x %04x i=%d j=%d%n",
            state1, state2, state3, i, j);
        outputFailure(byteChar, bytes, bytesReencoded);
      }
      assertEquals(isRoundTrippable, (state3 == Utf8.COMPLETE));

      // Rebuild the same content as a rope split at i and j.
      ByteString rope = RopeByteString.newInstanceForTest(
          bs.substring(0, i),
          RopeByteString.newInstanceForTest(
              bs.substring(i, j),
              bs.substring(j, numBytes)));
      assertSame(RopeByteString.class, rope.getClass());

      // Every representation must report the same validity and states.
      // ByteString.partialIsValidUtf8 is called with (state, offset, length).
      ByteString[] byteStrings = { bs, bs.substring(0, numBytes), rope };
      for (ByteString x : byteStrings) {
        assertEquals(isRoundTrippable,
            x.isValidUtf8());
        assertEquals(state3,
            x.partialIsValidUtf8(Utf8.COMPLETE, 0, numBytes));

        assertEquals(state1,
            x.partialIsValidUtf8(Utf8.COMPLETE, 0, i));
        assertEquals(state1,
            x.substring(0, i).partialIsValidUtf8(Utf8.COMPLETE, 0, i));
        assertEquals(state2,
            x.partialIsValidUtf8(state1, i, j - i));
        assertEquals(state2,
            x.substring(i, j).partialIsValidUtf8(state1, 0, j - i));
        assertEquals(state3,
            x.partialIsValidUtf8(state2, j, numBytes - j));
        assertEquals(state3,
            x.substring(j, numBytes)
                .partialIsValidUtf8(state2, 0, numBytes - j));
      }

      // The sequence followed by a copy of itself must have the same validity.
      ByteString ropeADope =
          RopeByteString.newInstanceForTest(bs, bs.substring(0, numBytes));
      assertEquals(isRoundTrippable, ropeADope.isValidUtf8());

      if (isRoundTrippable) {
        countRoundTripped++;
      }
      count++;
      if (byteChar != 0 && byteChar % 1000000L == 0) {
        logger.info("Processed " + (byteChar / 1000000L) +
            " million characters");
      }
    }
    logger.info("Round tripped " + countRoundTripped + " of " + count);
    assertEquals(expectedCount, countRoundTripped);
  }

  /**
   * Variant of {@link #testBytes(int, long, long, long)} that performs the
   * round trip with a reusable {@link CharsetDecoder}/{@link CharsetEncoder}
   * pair and preallocated buffers instead of creating a {@code String} per
   * sequence. It only compares {@code ByteString.isValidUtf8()} against the
   * round trip; the partial-validation and rope checks are not repeated here.
   *
   * <p>This is an instance method although it reads no instance state; the
   * signature is kept as-is for existing callers.
   *
   * @param numBytes length of the byte sequences to enumerate
   * @param expectedCount number of sequences expected to be round-trippable
   * @param start first value to test (inclusive)
   * @param lim end of the range (exclusive), or {@code -1} to use
   *     {@code 1L << (numBytes * 8)}
   * @throws UnsupportedEncodingException never thrown by this body; retained
   *     in the signature for compatibility
   */
  void testBytesUsingByteBuffers(
      int numBytes, long expectedCount, long start, long lim)
      throws UnsupportedEncodingException {
    CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
        .onMalformedInput(CodingErrorAction.REPLACE)
        .onUnmappableCharacter(CodingErrorAction.REPLACE);
    CharsetEncoder encoder = Charset.forName("UTF-8").newEncoder()
        .onMalformedInput(CodingErrorAction.REPLACE)
        .onUnmappableCharacter(CodingErrorAction.REPLACE);
    byte[] bytes = new byte[numBytes];
    // Size the scratch buffers from the codec's stated maximums, plus one.
    int maxChars = (int) (decoder.maxCharsPerByte() * numBytes) + 1;
    char[] charsDecoded = new char[maxChars];
    int maxBytes = (int) (encoder.maxBytesPerChar() * maxChars) + 1;
    byte[] bytesReencoded = new byte[maxBytes];

    // The buffers wrap the arrays, so writes to the arrays are visible
    // through the buffers and vice versa.
    ByteBuffer bb = ByteBuffer.wrap(bytes);
    CharBuffer cb = CharBuffer.wrap(charsDecoded);
    ByteBuffer bbReencoded = ByteBuffer.wrap(bytesReencoded);
    if (lim == -1) {
      lim = 1L << (numBytes * 8);
    }
    long count = 0;
    long countRoundTripped = 0;
    for (long byteChar = start; byteChar < lim; byteChar++) {
      // Reset all buffers and both coders for this iteration.
      bb.rewind();
      bb.limit(bytes.length);
      cb.rewind();
      cb.limit(charsDecoded.length);
      bbReencoded.rewind();
      bbReencoded.limit(bytesReencoded.length);
      encoder.reset();
      decoder.reset();

      // Unpack byteChar into the array, least significant byte last.
      long tmpByteChar = byteChar;
      for (int i = 0; i < bytes.length; i++) {
        bytes[bytes.length - i - 1] = (byte) tmpByteChar;
        tmpByteChar = tmpByteChar >> 8;
      }
      boolean isRoundTrippable = ByteString.copyFrom(bytes).isValidUtf8();

      // Decode bytes -> chars.
      CoderResult result = decoder.decode(bb, cb, true);
      assertFalse(result.isError());
      result = decoder.flush(cb);
      assertFalse(result.isError());

      // Restrict the char buffer to what was decoded, then encode it back.
      int charLen = cb.position();
      cb.rewind();
      cb.limit(charLen);
      result = encoder.encode(cb, bbReencoded, true);
      assertFalse(result.isError());
      result = encoder.flush(bbReencoded);
      assertFalse(result.isError());

      // Compare only the bytes that were actually produced.
      boolean bytesEqual = true;
      int bytesLen = bbReencoded.position();
      if (bytesLen != numBytes) {
        bytesEqual = false;
      } else {
        for (int i = 0; i < numBytes; i++) {
          if (bytes[i] != bytesReencoded[i]) {
            bytesEqual = false;
            break;
          }
        }
      }
      if (bytesEqual != isRoundTrippable) {
        outputFailure(byteChar, bytes, bytesReencoded, bytesLen);
      }

      count++;
      if (isRoundTrippable) {
        countRoundTripped++;
      }
      if (byteChar != 0 && byteChar % 1000000 == 0) {
        logger.info("Processed " + (byteChar / 1000000) +
            " million characters");
      }
    }
    logger.info("Round tripped " + countRoundTripped + " of " + count);
    assertEquals(expectedCount, countRoundTripped);
  }

  /** Fails the test, reporting the whole of {@code after}. */
  private static void outputFailure(long byteChar, byte[] bytes, byte[] after) {
    outputFailure(byteChar, bytes, after, after.length);
  }

  /**
   * Fails the test with a message showing the enumeration value in hex, the
   * input bytes, and the first {@code len} bytes of the re-encoded output.
   */
  private static void outputFailure(long byteChar, byte[] bytes, byte[] after,
      int len) {
    fail("Failure: (" + Long.toHexString(byteChar) + ") " +
        toHexString(bytes) + " => " + toHexString(after, len));
  }

  /** Formats all of {@code b} as a quoted, space-separated hex string. */
  private static String toHexString(byte[] b) {
    return toHexString(b, b.length);
  }

  /**
   * Formats the first {@code len} bytes of {@code b} as two-digit lowercase
   * hex values separated by single spaces and wrapped in double quotes.
   */
  private static String toHexString(byte[] b, int len) {
    StringBuilder s = new StringBuilder();
    s.append("\"");
    for (int i = 0; i < len; i++) {
      if (i > 0) {
        s.append(" ");
      }
      // Mask to avoid sign extension of negative bytes.
      s.append(String.format("%02x", b[i] & 0xFF));
    }
    s.append("\"");
    return s.toString();
  }
}