9 年之前 · c6f3d700b9
--- a/js/binary/decoder.js
+++ b/js/binary/decoder.js
@@ -895,11 +895,9 @@ jspb.BinaryDecoder.prototype.readEnum = function() {
 
				 
			
 
				 /**
				  * Reads and parses a UTF-8 encoded unicode string from the stream.
				  * The code is inspired by maps.vectortown.parse.StreamedDataViewReader.
				  * Supports code points from U+0000 up to U+10FFFF
				  * (http://en.wikipedia.org/wiki/UTF-8). Code points above U+FFFF are
				  * returned as a UTF-16 surrogate pair.
				  *
				  * Malformed input is handled leniently: stray continuation bytes and lead
				  * bytes of 0xF8 or above are skipped, and continuation bytes are not
				  * validated. A multi-byte sequence that starts inside the requested range
				  * is read in full even if it extends past the end of that range.
				  * @param {number} length The length of the string to read, in bytes.
				  * @return {string} The decoded string.
				  */
				 jspb.BinaryDecoder.prototype.readString = function(length) {
				   // Maximum number of code units handed to String.fromCharCode at once.
				   // Function.prototype.apply passes every array element as a separate
				   // argument, and engines throw a RangeError when that count gets too
				   // large, so long strings are converted in bounded chunks.
				   var MAX_CHUNK_CODE_UNITS = 8192;

				   var bytes = this.bytes_;
				   var cursor = this.cursor_;
				   var end = cursor + length;
				   var codeUnits = [];
				   var result = '';
				   var c2, c3, c4;

				   while (cursor < end) {
				     var c = bytes[cursor++];
				     if (c < 128) { // Regular 7-bit ASCII.
				       codeUnits.push(c);
				     } else if (c < 192) {
				       // UTF-8 continuation mark. We are out of sync. This
				       // might happen if we attempted to read a character
				       // with more than four bytes.
				       continue;
				     } else if (c < 224) { // UTF-8 with two bytes.
				       c2 = bytes[cursor++];
				       codeUnits.push(((c & 31) << 6) | (c2 & 63));
				     } else if (c < 240) { // UTF-8 with three bytes.
				       c2 = bytes[cursor++];
				       c3 = bytes[cursor++];
				       codeUnits.push(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
				     } else if (c < 248) { // UTF-8 with four bytes.
				       c2 = bytes[cursor++];
				       c3 = bytes[cursor++];
				       c4 = bytes[cursor++];
				       // Characters written on 4 bytes have 21 bits for a codepoint.
				       // We can't fit that on 16bit characters, so we use surrogates.
				       var codepoint = ((c & 7) << 18) | ((c2 & 63) << 12) |
				           ((c3 & 63) << 6) | (c4 & 63);
				       // Surrogates formula from wikipedia.
				       // 1. Subtract 0x10000 from codepoint
				       codepoint -= 0x10000;
				       // 2. Split this into the high 10-bit value and the low 10-bit value
				       // 3. Add 0xD800 to the high value to form the high surrogate
				       // 4. Add 0xDC00 to the low value to form the low surrogate:
				       var low = (codepoint & 1023) + 0xDC00;
				       var high = ((codepoint >> 10) & 1023) + 0xD800;
				       codeUnits.push(high, low);
				     }
				     // Lead bytes of 248 and above are not valid UTF-8 and are skipped.

				     // Flush a full chunk so the apply() call below never receives an
				     // unbounded number of arguments.
				     if (codeUnits.length >= MAX_CHUNK_CODE_UNITS) {
				       result += String.fromCharCode.apply(null, codeUnits);
				       codeUnits.length = 0;
				     }
				   }

				   // String.fromCharCode.apply is faster than manually appending characters
				   // one at a time; short strings are still converted with a single call.
				   result += String.fromCharCode.apply(null, codeUnits);
				   this.cursor_ = cursor;
				   return result;
				 };
			
--- a/js/binary/decoder_test.js
+++ b/js/binary/decoder_test.js
@@ -209,7 +209,30 @@ describe('binaryDecoderTest', function() {
 
				     assertEquals(hashC, decoder.readFixedHash64());
			
 
				     assertEquals(hashD, decoder.readFixedHash64());
			
 
				   });
			
 
				+  
			
 
				+  /**
			
 
				+   * Test encoding and decoding utf-8.
			
 
				+   */
			
 
				+   it('testUtf8', function() {
			
 
				+    var encoder = new jspb.BinaryEncoder();
			
 
				 
			
 
				+    var ascii = "ASCII should work in 3, 2, 1..."
			
 
				+    var utf8_two_bytes = "©";
			
 
				+    var utf8_three_bytes = "❄";
			
 
				+    var utf8_four_bytes = "😁";
			
 
				+    
			
 
				+    encoder.writeString(ascii);
			
 
				+    encoder.writeString(utf8_two_bytes);
			
 
				+    encoder.writeString(utf8_three_bytes);
			
 
				+    encoder.writeString(utf8_four_bytes);
			
 
				+    
			
 
				+    var decoder = jspb.BinaryDecoder.alloc(encoder.end());
			
 
				+    
			
 
				+    assertEquals(ascii, decoder.readString(ascii.length));
			
 
				+    assertEquals(utf8_two_bytes, decoder.readString(utf8_two_bytes.length));
			
 
				+    assertEquals(utf8_three_bytes, decoder.readString(utf8_three_bytes.length));
			
 
				+    assertEquals(utf8_four_bytes, decoder.readString(utf8_four_bytes.length));
			
 
				+   });
			
 
				 
			
 
				   /**
			
 
				    * Verifies that misuse of the decoder class triggers assertions.
			
--- a/js/binary/encoder.js
+++ b/js/binary/encoder.js
@@ -409,19 +409,36 @@ jspb.BinaryEncoder.prototype.writeFixedHash64 = function(hash) {
 
				  */
			
 
				 jspb.BinaryEncoder.prototype.writeString = function(value) {
			
 
				   var oldLength = this.buffer_.length;
			
 
				-
			
 
				-  // UTF16 to UTF8 conversion loop swiped from goog.crypt.stringToUtf8ByteArray.
			
 
				+ 
			
 
				   for (var i = 0; i < value.length; i++) {
			
 
				+    
			
 
				     var c = value.charCodeAt(i);
			
 
				+
			
 
				     if (c < 128) {
			
 
				       this.buffer_.push(c);
			
 
				     } else if (c < 2048) {
			
 
				       this.buffer_.push((c >> 6) | 192);
			
 
				       this.buffer_.push((c & 63) | 128);
			
 
				-    } else {
			
 
				-      this.buffer_.push((c >> 12) | 224);
			
 
				-      this.buffer_.push(((c >> 6) & 63) | 128);
			
 
				-      this.buffer_.push((c & 63) | 128);
			
 
				+    } else if (c < 65536) {
			
 
				+      // Look for surrogates
			
 
				+      if (c >= 0xD800 && c <= 0xDBFF && i + 1 < value.length) {
			
 
				+        var second = value.charCodeAt(i + 1);
			
 
				+        if (second >= 0xDC00 && second <= 0xDFFF) { // low surrogate
			
 
				+          // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
			
 
				+          c = (c - 0xD800) * 0x400 + second - 0xDC00 + 0x10000;
			
 
				+
			
 
				+          this.buffer_.push((c >> 18) | 240);
			
 
				+          this.buffer_.push(((c >> 12) & 63 ) | 128);
			
 
				+          this.buffer_.push(((c >> 6) & 63) | 128);
			
 
				+          this.buffer_.push((c & 63) | 128);
			
 
				+          i++;
			
 
				+        }
			
 
				+      }
			
 
				+      else {
			
 
				+        this.buffer_.push((c >> 12) | 224);
			
 
				+        this.buffer_.push(((c >> 6) & 63) | 128);
			
 
				+        this.buffer_.push((c & 63) | 128);
			
 
				+      }
			
 
				     }
			
 
				   }