JsonTokenizer.cs 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643
  1. #region Copyright notice and license
  2. // Protocol Buffers - Google's data interchange format
  3. // Copyright 2008 Google Inc. All rights reserved.
  4. // https://developers.google.com/protocol-buffers/
  5. //
  6. // Redistribution and use in source and binary forms, with or without
  7. // modification, are permitted provided that the following conditions are
  8. // met:
  9. //
  10. // * Redistributions of source code must retain the above copyright
  11. // notice, this list of conditions and the following disclaimer.
  12. // * Redistributions in binary form must reproduce the above
  13. // copyright notice, this list of conditions and the following disclaimer
  14. // in the documentation and/or other materials provided with the
  15. // distribution.
  16. // * Neither the name of Google Inc. nor the names of its
  17. // contributors may be used to endorse or promote products derived from
  18. // this software without specific prior written permission.
  19. //
  20. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  21. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  22. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  23. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  24. // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  25. // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  26. // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  27. // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  28. // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  29. // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  30. // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  31. #endregion
  32. using System;
  33. using System.Collections.Generic;
  34. using System.Globalization;
  35. using System.IO;
  36. using System.Text;
  37. namespace Google.Protobuf
  38. {
  39. /// <summary>
  40. /// Simple but strict JSON tokenizer, rigidly following RFC 7159.
  41. /// </summary>
  42. /// <remarks>
  43. /// <para>
  44. /// This tokenizer is stateful, and only returns "useful" tokens - names, values etc.
  45. /// It does not create tokens for the separator between names and values, or for the comma
  46. /// between values. It validates the token stream as it goes - so callers can assume that the
  47. /// tokens it produces are appropriate. For example, it would never produce "start object, end array."
  48. /// </para>
  49. /// <para>Not thread-safe.</para>
  50. /// </remarks>
  51. internal sealed class JsonTokenizer
  52. {
  53. // The set of states in which a value is valid next token.
  54. private static readonly State ValueStates = State.ArrayStart | State.ArrayAfterComma | State.ObjectAfterColon | State.StartOfDocument;
  55. private readonly Stack<ContainerType> containerStack = new Stack<ContainerType>();
  56. private readonly PushBackReader reader;
  57. private JsonToken bufferedToken;
  58. private State state;
  59. internal JsonTokenizer(TextReader reader)
  60. {
  61. this.reader = new PushBackReader(reader);
  62. state = State.StartOfDocument;
  63. containerStack.Push(ContainerType.Document);
  64. }
  65. internal void PushBack(JsonToken token)
  66. {
  67. if (bufferedToken != null)
  68. {
  69. throw new InvalidOperationException("Can't push back twice");
  70. }
  71. bufferedToken = token;
  72. }
  73. /// <summary>
  74. /// Returns the next JSON token in the stream. An EndDocument token is returned to indicate the end of the stream,
  75. /// after which point <c>Next()</c> should not be called again.
  76. /// </summary>
  77. /// <remarks>
  78. /// This method essentially just loops through characters skipping whitespace, validating and
  79. /// changing state (e.g. from ObjectBeforeColon to ObjectAfterColon)
  80. /// until it reaches something which will be a genuine token (e.g. a start object, or a value) at which point
  81. /// it returns the token. Although the method is large, it would be relatively hard to break down further... most
  82. /// of it is the large switch statement, which sometimes returns and sometimes doesn't.
  83. /// </remarks>
  84. /// <returns>The next token in the stream. This is never null.</returns>
  85. /// <exception cref="InvalidOperationException">This method is called after an EndDocument token has been returned</exception>
  86. /// <exception cref="InvalidJsonException">The input text does not comply with RFC 7159</exception>
  87. internal JsonToken Next()
  88. {
  89. if (bufferedToken != null)
  90. {
  91. var ret = bufferedToken;
  92. bufferedToken = null;
  93. return ret;
  94. }
  95. if (state == State.ReaderExhausted)
  96. {
  97. throw new InvalidOperationException("Next() called after end of document");
  98. }
  99. while (true)
  100. {
  101. var next = reader.Read();
  102. if (next == null)
  103. {
  104. ValidateState(State.ExpectedEndOfDocument, "Unexpected end of document in state: ");
  105. state = State.ReaderExhausted;
  106. return JsonToken.EndDocument;
  107. }
  108. switch (next.Value)
  109. {
  110. // Skip whitespace between tokens
  111. case ' ':
  112. case '\t':
  113. case '\r':
  114. case '\n':
  115. break;
  116. case ':':
  117. ValidateState(State.ObjectBeforeColon, "Invalid state to read a colon: ");
  118. state = State.ObjectAfterColon;
  119. break;
  120. case ',':
  121. ValidateState(State.ObjectAfterProperty | State.ArrayAfterValue, "Invalid state to read a colon: ");
  122. state = state == State.ObjectAfterProperty ? State.ObjectAfterComma : State.ArrayAfterComma;
  123. break;
  124. case '"':
  125. string stringValue = ReadString();
  126. if ((state & (State.ObjectStart | State.ObjectAfterComma)) != 0)
  127. {
  128. state = State.ObjectBeforeColon;
  129. return JsonToken.Name(stringValue);
  130. }
  131. else
  132. {
  133. ValidateAndModifyStateForValue("Invalid state to read a double quote: ");
  134. return JsonToken.Value(stringValue);
  135. }
  136. case '{':
  137. ValidateState(ValueStates, "Invalid state to read an open brace: ");
  138. state = State.ObjectStart;
  139. containerStack.Push(ContainerType.Object);
  140. return JsonToken.StartObject;
  141. case '}':
  142. ValidateState(State.ObjectAfterProperty | State.ObjectStart, "Invalid state to read a close brace: ");
  143. PopContainer();
  144. return JsonToken.EndObject;
  145. case '[':
  146. ValidateState(ValueStates, "Invalid state to read an open square bracket: ");
  147. state = State.ArrayStart;
  148. containerStack.Push(ContainerType.Array);
  149. return JsonToken.StartArray;
  150. case ']':
  151. ValidateState(State.ArrayAfterValue | State.ArrayStart, "Invalid state to read a close square bracket: ");
  152. PopContainer();
  153. return JsonToken.EndArray;
  154. case 'n': // Start of null
  155. ConsumeLiteral("null");
  156. ValidateAndModifyStateForValue("Invalid state to read a null literal: ");
  157. return JsonToken.Null;
  158. case 't': // Start of true
  159. ConsumeLiteral("true");
  160. ValidateAndModifyStateForValue("Invalid state to read a true literal: ");
  161. return JsonToken.True;
  162. case 'f': // Start of false
  163. ConsumeLiteral("false");
  164. ValidateAndModifyStateForValue("Invalid state to read a false literal: ");
  165. return JsonToken.False;
  166. case '-': // Start of a number
  167. case '0':
  168. case '1':
  169. case '2':
  170. case '3':
  171. case '4':
  172. case '5':
  173. case '6':
  174. case '7':
  175. case '8':
  176. case '9':
  177. double number = ReadNumber(next.Value);
  178. ValidateAndModifyStateForValue("Invalid state to read a number token: ");
  179. return JsonToken.Value(number);
  180. default:
  181. throw new InvalidJsonException("Invalid first character of token: " + next.Value);
  182. }
  183. }
  184. }
  185. private void ValidateState(State validStates, string errorPrefix)
  186. {
  187. if ((validStates & state) == 0)
  188. {
  189. throw reader.CreateException(errorPrefix + state);
  190. }
  191. }
  192. /// <summary>
  193. /// Reads a string token. It is assumed that the opening " has already been read.
  194. /// </summary>
  195. private string ReadString()
  196. {
  197. var value = new StringBuilder();
  198. bool haveHighSurrogate = false;
  199. while (true)
  200. {
  201. char c = reader.ReadOrFail("Unexpected end of text while reading string");
  202. if (c < ' ')
  203. {
  204. throw reader.CreateException(string.Format(CultureInfo.InvariantCulture, "Invalid character in string literal: U+{0:x4}", (int) c));
  205. }
  206. if (c == '"')
  207. {
  208. if (haveHighSurrogate)
  209. {
  210. throw reader.CreateException("Invalid use of surrogate pair code units");
  211. }
  212. return value.ToString();
  213. }
  214. if (c == '\\')
  215. {
  216. c = ReadEscapedCharacter();
  217. }
  218. // TODO: Consider only allowing surrogate pairs that are either both escaped,
  219. // or both not escaped. It would be a very odd text stream that contained a "lone" high surrogate
  220. // followed by an escaped low surrogate or vice versa... and that couldn't even be represented in UTF-8.
  221. if (haveHighSurrogate != char.IsLowSurrogate(c))
  222. {
  223. throw reader.CreateException("Invalid use of surrogate pair code units");
  224. }
  225. haveHighSurrogate = char.IsHighSurrogate(c);
  226. value.Append(c);
  227. }
  228. }
  229. /// <summary>
  230. /// Reads an escaped character. It is assumed that the leading backslash has already been read.
  231. /// </summary>
  232. private char ReadEscapedCharacter()
  233. {
  234. char c = reader.ReadOrFail("Unexpected end of text while reading character escape sequence");
  235. switch (c)
  236. {
  237. case 'n':
  238. return '\n';
  239. case '\\':
  240. return '\\';
  241. case 'b':
  242. return '\b';
  243. case 'f':
  244. return '\f';
  245. case 'r':
  246. return '\r';
  247. case 't':
  248. return '\t';
  249. case '"':
  250. return '"';
  251. case '/':
  252. return '/';
  253. case 'u':
  254. return ReadUnicodeEscape();
  255. default:
  256. throw reader.CreateException(string.Format(CultureInfo.InvariantCulture, "Invalid character in character escape sequence: U+{0:x4}", (int) c));
  257. }
  258. }
  259. /// <summary>
  260. /// Reads an escaped Unicode 4-nybble hex sequence. It is assumed that the leading \u has already been read.
  261. /// </summary>
  262. private char ReadUnicodeEscape()
  263. {
  264. int result = 0;
  265. for (int i = 0; i < 4; i++)
  266. {
  267. char c = reader.ReadOrFail("Unexpected end of text while reading Unicode escape sequence");
  268. int nybble;
  269. if (c >= '0' && c <= '9')
  270. {
  271. nybble = c - '0';
  272. }
  273. else if (c >= 'a' && c <= 'f')
  274. {
  275. nybble = c - 'a' + 10;
  276. }
  277. else if (c >= 'A' && c <= 'F')
  278. {
  279. nybble = c - 'A' + 10;
  280. }
  281. else
  282. {
  283. throw reader.CreateException(string.Format(CultureInfo.InvariantCulture, "Invalid character in character escape sequence: U+{0:x4}", (int) c));
  284. }
  285. result = (result << 4) + nybble;
  286. }
  287. return (char) result;
  288. }
  289. /// <summary>
  290. /// Consumes a text-only literal, throwing an exception if the read text doesn't match it.
  291. /// It is assumed that the first letter of the literal has already been read.
  292. /// </summary>
  293. private void ConsumeLiteral(string text)
  294. {
  295. for (int i = 1; i < text.Length; i++)
  296. {
  297. char? next = reader.Read();
  298. if (next == null)
  299. {
  300. throw reader.CreateException("Unexpected end of text while reading literal token " + text);
  301. }
  302. if (next.Value != text[i])
  303. {
  304. throw reader.CreateException("Unexpected character while reading literal token " + text);
  305. }
  306. }
  307. }
  308. private double ReadNumber(char initialCharacter)
  309. {
  310. StringBuilder builder = new StringBuilder();
  311. if (initialCharacter == '-')
  312. {
  313. builder.Append("-");
  314. }
  315. else
  316. {
  317. reader.PushBack(initialCharacter);
  318. }
  319. // Each method returns the character it read that doesn't belong in that part,
  320. // so we know what to do next, including pushing the character back at the end.
  321. // null is returned for "end of text".
  322. char? next = ReadInt(builder);
  323. if (next == '.')
  324. {
  325. next = ReadFrac(builder);
  326. }
  327. if (next == 'e' || next == 'E')
  328. {
  329. next = ReadExp(builder);
  330. }
  331. // If we read a character which wasn't part of the number, push it back so we can read it again
  332. // to parse the next token.
  333. if (next != null)
  334. {
  335. reader.PushBack(next.Value);
  336. }
  337. // TODO: What exception should we throw if the value can't be represented as a double?
  338. try
  339. {
  340. return double.Parse(builder.ToString(),
  341. NumberStyles.AllowLeadingSign | NumberStyles.AllowDecimalPoint | NumberStyles.AllowExponent,
  342. CultureInfo.InvariantCulture);
  343. }
  344. catch (OverflowException)
  345. {
  346. throw reader.CreateException("Numeric value out of range: " + builder);
  347. }
  348. }
  349. private char? ReadInt(StringBuilder builder)
  350. {
  351. char first = reader.ReadOrFail("Invalid numeric literal");
  352. if (first < '0' || first > '9')
  353. {
  354. throw reader.CreateException("Invalid numeric literal");
  355. }
  356. builder.Append(first);
  357. int digitCount;
  358. char? next = ConsumeDigits(builder, out digitCount);
  359. if (first == '0' && digitCount != 0)
  360. {
  361. throw reader.CreateException("Invalid numeric literal: leading 0 for non-zero value.");
  362. }
  363. return next;
  364. }
  365. private char? ReadFrac(StringBuilder builder)
  366. {
  367. builder.Append('.'); // Already consumed this
  368. int digitCount;
  369. char? next = ConsumeDigits(builder, out digitCount);
  370. if (digitCount == 0)
  371. {
  372. throw reader.CreateException("Invalid numeric literal: fraction with no trailing digits");
  373. }
  374. return next;
  375. }
  376. private char? ReadExp(StringBuilder builder)
  377. {
  378. builder.Append('E'); // Already consumed this (or 'e')
  379. char? next = reader.Read();
  380. if (next == null)
  381. {
  382. throw reader.CreateException("Invalid numeric literal: exponent with no trailing digits");
  383. }
  384. if (next == '-' || next == '+')
  385. {
  386. builder.Append(next.Value);
  387. }
  388. else
  389. {
  390. reader.PushBack(next.Value);
  391. }
  392. int digitCount;
  393. next = ConsumeDigits(builder, out digitCount);
  394. if (digitCount == 0)
  395. {
  396. throw reader.CreateException("Invalid numeric literal: exponent without value");
  397. }
  398. return next;
  399. }
  400. private char? ConsumeDigits(StringBuilder builder, out int count)
  401. {
  402. count = 0;
  403. while (true)
  404. {
  405. char? next = reader.Read();
  406. if (next == null || next.Value < '0' || next.Value > '9')
  407. {
  408. return next;
  409. }
  410. count++;
  411. builder.Append(next.Value);
  412. }
  413. }
  414. /// <summary>
  415. /// Validates that we're in a valid state to read a value (using the given error prefix if necessary)
  416. /// and changes the state to the appropriate one, e.g. ObjectAfterColon to ObjectAfterProperty.
  417. /// </summary>
  418. private void ValidateAndModifyStateForValue(string errorPrefix)
  419. {
  420. ValidateState(ValueStates, errorPrefix);
  421. switch (state)
  422. {
  423. case State.StartOfDocument:
  424. state = State.ExpectedEndOfDocument;
  425. return;
  426. case State.ObjectAfterColon:
  427. state = State.ObjectAfterProperty;
  428. return;
  429. case State.ArrayStart:
  430. case State.ArrayAfterComma:
  431. state = State.ArrayAfterValue;
  432. return;
  433. default:
  434. throw new InvalidOperationException("ValidateAndModifyStateForValue does not handle all value states (and should)");
  435. }
  436. }
  437. /// <summary>
  438. /// Pops the top-most container, and sets the state to the appropriate one for the end of a value
  439. /// in the parent container.
  440. /// </summary>
  441. private void PopContainer()
  442. {
  443. containerStack.Pop();
  444. var parent = containerStack.Peek();
  445. switch (parent)
  446. {
  447. case ContainerType.Object:
  448. state = State.ObjectAfterProperty;
  449. break;
  450. case ContainerType.Array:
  451. state = State.ArrayAfterValue;
  452. break;
  453. case ContainerType.Document:
  454. state = State.ExpectedEndOfDocument;
  455. break;
  456. default:
  457. throw new InvalidOperationException("Unexpected container type: " + parent);
  458. }
  459. }
  460. private enum ContainerType
  461. {
  462. Document, Object, Array
  463. }
  464. /// <summary>
  465. /// Possible states of the tokenizer.
  466. /// </summary>
  467. /// <remarks>
  468. /// <para>This is a flags enum purely so we can simply and efficiently represent a set of valid states
  469. /// for checking.</para>
  470. /// <para>
  471. /// Each is documented with an example,
  472. /// where ^ represents the current position within the text stream. The examples all use string values,
  473. /// but could be any value, including nested objects/arrays.
  474. /// The complete state of the tokenizer also includes a stack to indicate the contexts (arrays/objects).
  475. /// Any additional notional state of "AfterValue" indicates that a value has been completed, at which
  476. /// point there's an immediate transition to ExpectedEndOfDocument, ObjectAfterProperty or ArrayAfterValue.
  477. /// </para>
  478. /// <para>
  479. /// These states were derived manually by reading RFC 7159 carefully.
  480. /// </para>
  481. /// </remarks>
  482. [Flags]
  483. private enum State
  484. {
  485. /// <summary>
  486. /// ^ { "foo": "bar" }
  487. /// Before the value in a document. Next states: ObjectStart, ArrayStart, "AfterValue"
  488. /// </summary>
  489. StartOfDocument = 1 << 0,
  490. /// <summary>
  491. /// { "foo": "bar" } ^
  492. /// After the value in a document. Next states: ReaderExhausted
  493. /// </summary>
  494. ExpectedEndOfDocument = 1 << 1,
  495. /// <summary>
  496. /// { "foo": "bar" } ^ (and already read to the end of the reader)
  497. /// Terminal state.
  498. /// </summary>
  499. ReaderExhausted = 1 << 2,
  500. /// <summary>
  501. /// { ^ "foo": "bar" }
  502. /// Before the *first* property in an object.
  503. /// Next states:
  504. /// "AfterValue" (empty object)
  505. /// ObjectBeforeColon (read a name)
  506. /// </summary>
  507. ObjectStart = 1 << 3,
  508. /// <summary>
  509. /// { "foo" ^ : "bar", "x": "y" }
  510. /// Next state: ObjectAfterColon
  511. /// </summary>
  512. ObjectBeforeColon = 1 << 4,
  513. /// <summary>
  514. /// { "foo" : ^ "bar", "x": "y" }
  515. /// Before any property other than the first in an object.
  516. /// (Equivalently: after any property in an object)
  517. /// Next states:
  518. /// "AfterValue" (value is simple)
  519. /// ObjectStart (value is object)
  520. /// ArrayStart (value is array)
  521. /// </summary>
  522. ObjectAfterColon = 1 << 5,
  523. /// <summary>
  524. /// { "foo" : "bar" ^ , "x" : "y" }
  525. /// At the end of a property, so expecting either a comma or end-of-object
  526. /// Next states: ObjectAfterComma or "AfterValue"
  527. /// </summary>
  528. ObjectAfterProperty = 1 << 6,
  529. /// <summary>
  530. /// { "foo":"bar", ^ "x":"y" }
  531. /// Read the comma after the previous property, so expecting another property.
  532. /// This is like ObjectStart, but closing brace isn't valid here
  533. /// Next state: ObjectBeforeColon.
  534. /// </summary>
  535. ObjectAfterComma = 1 << 7,
  536. /// <summary>
  537. /// [ ^ "foo", "bar" ]
  538. /// Before the *first* value in an array.
  539. /// Next states:
  540. /// "AfterValue" (read a value)
  541. /// "AfterValue" (end of array; will pop stack)
  542. /// </summary>
  543. ArrayStart = 1 << 8,
  544. /// <summary>
  545. /// [ "foo" ^ , "bar" ]
  546. /// After any value in an array, so expecting either a comma or end-of-array
  547. /// Next states: ArrayAfterComma or "AfterValue"
  548. /// </summary>
  549. ArrayAfterValue = 1 << 9,
  550. /// <summary>
  551. /// [ "foo", ^ "bar" ]
  552. /// After a comma in an array, so there *must* be another value (simple or complex).
  553. /// Next states: "AfterValue" (simple value), StartObject, StartArray
  554. /// </summary>
  555. ArrayAfterComma = 1 << 10
  556. }
  557. /// <summary>
  558. /// Wrapper around a text reader allowing small amounts of buffering and location handling.
  559. /// </summary>
  560. private class PushBackReader
  561. {
  562. // TODO: Add locations for errors etc.
  563. private readonly TextReader reader;
  564. internal PushBackReader(TextReader reader)
  565. {
  566. // TODO: Wrap the reader in a BufferedReader?
  567. this.reader = reader;
  568. }
  569. /// <summary>
  570. /// The buffered next character, if we have one.
  571. /// </summary>
  572. private char? nextChar;
  573. /// <summary>
  574. /// Returns the next character in the stream, or null if we have reached the end.
  575. /// </summary>
  576. /// <returns></returns>
  577. internal char? Read()
  578. {
  579. if (nextChar != null)
  580. {
  581. char? tmp = nextChar;
  582. nextChar = null;
  583. return tmp;
  584. }
  585. int next = reader.Read();
  586. return next == -1 ? null : (char?) next;
  587. }
  588. internal char ReadOrFail(string messageOnFailure)
  589. {
  590. char? next = Read();
  591. if (next == null)
  592. {
  593. throw CreateException(messageOnFailure);
  594. }
  595. return next.Value;
  596. }
  597. internal void PushBack(char c)
  598. {
  599. if (nextChar != null)
  600. {
  601. throw new InvalidOperationException("Cannot push back when already buffering a character");
  602. }
  603. nextChar = c;
  604. }
  605. /// <summary>
  606. /// Creates a new exception appropriate for the current state of the reader.
  607. /// </summary>
  608. internal InvalidJsonException CreateException(string message)
  609. {
  610. // TODO: Keep track of and use the location.
  611. return new InvalidJsonException(message);
  612. }
  613. }
  614. }
  615. }