TextTokenizer.cs 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390
  1. // Protocol Buffers - Google's data interchange format
  2. // Copyright 2008 Google Inc.
  3. // http://code.google.com/p/protobuf/
  4. //
  5. // Licensed under the Apache License, Version 2.0 (the "License");
  6. // you may not use this file except in compliance with the License.
  7. // You may obtain a copy of the License at
  8. //
  9. // http://www.apache.org/licenses/LICENSE-2.0
  10. //
  11. // Unless required by applicable law or agreed to in writing, software
  12. // distributed under the License is distributed on an "AS IS" BASIS,
  13. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. // See the License for the specific language governing permissions and
  15. // limitations under the License.
  16. using System;
  17. using System.Globalization;
  18. using System.Text.RegularExpressions;
  19. namespace Google.ProtocolBuffers {
  /// <summary>
  /// Splits a text-format string into tokens and exposes them one at a time,
  /// together with typed helpers that consume the current token.
  /// </summary>
  /// <remarks>
  /// The tokenizer always holds exactly one "current" token. Whitespace and
  /// <c>#</c> comments between tokens are skipped automatically. When the input
  /// is exhausted the current token is the empty string (see <see cref="AtEnd"/>).
  /// Line and column numbers are tracked internally as zero-based values and
  /// reported as one-based values in exception messages.
  /// </remarks>
  internal sealed class TextTokenizer {
    /// <summary>
    /// The complete input being tokenized.
    /// </summary>
    private readonly string text;

    /// <summary>
    /// The token most recently produced by <see cref="NextToken"/>;
    /// empty once the end of the input has been reached.
    /// </summary>
    private string currentToken;

    /// <summary>
    /// The character index within the text at which the next regex match
    /// is attempted (i.e. where the token after the current one starts).
    /// </summary>
    private int matchPos = 0;

    /// <summary>
    /// The character index within the text up to which line/column
    /// bookkeeping has been performed.
    /// </summary>
    private int pos = 0;

    /// <summary>
    /// The zero-based line number of the current token.
    /// </summary>
    private int line = 0;

    /// <summary>
    /// The zero-based column number of the current token.
    /// </summary>
    private int column = 0;

    /// <summary>
    /// The zero-based line number of the previous token.
    /// </summary>
    private int previousLine = 0;

    /// <summary>
    /// The zero-based column number of the previous token.
    /// </summary>
    private int previousColumn = 0;

    // One or more whitespace characters and/or '#' comments running to the
    // end of their line. \G anchors the match at the requested start index.
    private static readonly Regex WhitespaceAndCommentPattern = new Regex("\\G(\\s|(#.*$))+",
        RegexOptions.Compiled | RegexOptions.Multiline);

    // Every alternative is anchored with \G so that a token is only ever
    // matched exactly at matchPos. A quoted string ends either at its closing
    // quote or at the end of the line (optionally preceded by a dangling
    // backslash); the unterminated form is rejected later by ConsumeByteString.
    // Each string alternative excludes its OWN quote character from the body,
    // so the other kind of quote may appear unescaped inside it.
    private static readonly Regex TokenPattern = new Regex(
        "\\G[a-zA-Z_][0-9a-zA-Z_+-]*|" +                  // an identifier
        "\\G[0-9+-][0-9a-zA-Z_.+-]*|" +                   // a number
        "\\G\"([^\"\\\n\\\\]|\\\\.)*(\"|\\\\?$)|" +       // a double-quoted string
        "\\G\'([^\'\\\n\\\\]|\\\\.)*(\'|\\\\?$)",         // a single-quoted string
        RegexOptions.Compiled | RegexOptions.Multiline);

    private static readonly Regex DoubleInfinity = new Regex("^-?inf(inity)?$", RegexOptions.Compiled | RegexOptions.IgnoreCase);
    private static readonly Regex FloatInfinity = new Regex("^-?inf(inity)?f?$", RegexOptions.Compiled | RegexOptions.IgnoreCase);
    private static readonly Regex FloatNan = new Regex("^nanf?$", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    /// <summary>
    /// Constructs a tokenizer that parses tokens from the given text and
    /// positions it on the first token.
    /// </summary>
    /// <param name="text">The text to tokenize; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public TextTokenizer(string text) {
      if (text == null) {
        throw new ArgumentNullException("text");
      }
      this.text = text;
      SkipWhitespace();
      NextToken();
    }

    /// <summary>
    /// Are we at the end of the input?
    /// </summary>
    public bool AtEnd {
      get { return currentToken.Length == 0; }
    }

    /// <summary>
    /// Advances to the next token, remembering the position of the token
    /// that was current before the call.
    /// </summary>
    public void NextToken() {
      previousLine = line;
      previousColumn = column;

      // Bring the line/column counters up to matchPos, which is where the
      // new current token begins.
      for (; pos < matchPos; ++pos) {
        if (text[pos] == '\n') {
          ++line;
          column = 0;
        } else {
          ++column;
        }
      }

      if (matchPos == text.Length) {
        // End of input: the empty token is what AtEnd tests for.
        currentToken = "";
        return;
      }

      Match match = TokenPattern.Match(text, matchPos);
      if (match.Success) {
        currentToken = match.Value;
        matchPos += match.Length;
      } else {
        // Anything the pattern does not recognise becomes a one-character token.
        currentToken = text[matchPos].ToString();
        matchPos++;
      }
      SkipWhitespace();
    }

    /// <summary>
    /// Skips over any whitespace and comments so that matchPos is positioned
    /// at the start of the next token.
    /// </summary>
    private void SkipWhitespace() {
      Match match = WhitespaceAndCommentPattern.Match(text, matchPos);
      if (match.Success) {
        matchPos += match.Length;
      }
    }

    /// <summary>
    /// If the current token exactly matches the given token, consumes it and
    /// returns true. Otherwise, returns false without doing anything.
    /// </summary>
    /// <param name="token">The exact token text to look for.</param>
    public bool TryConsume(string token) {
      if (currentToken != token) {
        return false;
      }
      NextToken();
      return true;
    }

    /// <summary>
    /// If the current token exactly matches the specified one, consumes it.
    /// Otherwise, throws a FormatException.
    /// </summary>
    /// <param name="token">The exact token text that is required.</param>
    /// <exception cref="FormatException">The current token is not <paramref name="token"/>.</exception>
    public void Consume(string token) {
      if (!TryConsume(token)) {
        throw CreateFormatException("Expected \"" + token + "\".");
      }
    }

    /// <summary>
    /// Returns true if the current token starts like an integer (a digit or
    /// a sign character), but does not consume it.
    /// </summary>
    public bool LookingAtInteger() {
      if (currentToken.Length == 0) {
        return false;
      }
      char first = currentToken[0];
      return ('0' <= first && first <= '9') || first == '-' || first == '+';
    }

    /// <summary>
    /// If the current token consists solely of ASCII letters, digits,
    /// underscores and dots, consumes it and returns its value.
    /// Otherwise, throws a FormatException.
    /// </summary>
    /// <remarks>
    /// The empty end-of-input token contains no offending character, so it
    /// passes this check and is returned as an empty string.
    /// </remarks>
    /// <exception cref="FormatException">The token contains any other character.</exception>
    public string ConsumeIdentifier() {
      foreach (char c in currentToken) {
        bool allowed = ('a' <= c && c <= 'z') ||
                       ('A' <= c && c <= 'Z') ||
                       ('0' <= c && c <= '9') ||
                       (c == '_') || (c == '.');
        if (!allowed) {
          throw CreateFormatException("Expected identifier.");
        }
      }
      string result = currentToken;
      NextToken();
      return result;
    }

    /// <summary>
    /// If the current token is a 32-bit signed integer, consumes it and returns
    /// its value. Otherwise, throws a FormatException.
    /// </summary>
    public int ConsumeInt32() {
      try {
        int result = TextFormat.ParseInt32(currentToken);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateIntegerParseException(e);
      }
    }

    /// <summary>
    /// If the current token is a 32-bit unsigned integer, consumes it and
    /// returns its value. Otherwise, throws a FormatException.
    /// </summary>
    public uint ConsumeUInt32() {
      try {
        uint result = TextFormat.ParseUInt32(currentToken);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateIntegerParseException(e);
      }
    }

    /// <summary>
    /// If the current token is a 64-bit signed integer, consumes it and returns
    /// its value. Otherwise, throws a FormatException.
    /// </summary>
    public long ConsumeInt64() {
      try {
        long result = TextFormat.ParseInt64(currentToken);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateIntegerParseException(e);
      }
    }

    /// <summary>
    /// If the current token is a 64-bit unsigned integer, consumes it and
    /// returns its value. Otherwise, throws a FormatException.
    /// </summary>
    public ulong ConsumeUInt64() {
      try {
        ulong result = TextFormat.ParseUInt64(currentToken);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateIntegerParseException(e);
      }
    }

    /// <summary>
    /// If the current token is a double, consumes it and returns its value.
    /// Otherwise, throws a FormatException.
    /// </summary>
    public double ConsumeDouble() {
      // We need to parse infinity and nan separately because
      // double.Parse() does not accept "inf", "infinity", or "nan".
      if (DoubleInfinity.IsMatch(currentToken)) {
        // The regex matched, so the token is non-empty and a leading '-'
        // is the only thing that distinguishes the two infinities.
        bool negative = currentToken[0] == '-';
        NextToken();
        return negative ? double.NegativeInfinity : double.PositiveInfinity;
      }
      // Tokens are machine-readable, so compare ordinally rather than by culture.
      if (string.Equals(currentToken, "nan", StringComparison.OrdinalIgnoreCase)) {
        NextToken();
        return Double.NaN;
      }
      try {
        double result = double.Parse(currentToken, CultureInfo.InvariantCulture);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateFloatParseException(e);
      } catch (OverflowException e) {
        throw CreateFloatParseException(e);
      }
    }

    /// <summary>
    /// If the current token is a float (optionally carrying a single trailing
    /// <c>f</c> suffix), consumes it and returns its value.
    /// Otherwise, throws a FormatException.
    /// </summary>
    public float ConsumeFloat() {
      // We need to parse infinity and nan separately because
      // float.Parse() does not accept "inf", "infinity", or "nan".
      if (FloatInfinity.IsMatch(currentToken)) {
        bool negative = currentToken[0] == '-';
        NextToken();
        return negative ? float.NegativeInfinity : float.PositiveInfinity;
      }
      if (FloatNan.IsMatch(currentToken)) {
        NextToken();
        return float.NaN;
      }

      // Strip at most ONE 'f' suffix, and do it on a local copy so that
      // currentToken is left intact if parsing fails below.
      string number = currentToken;
      if (number.EndsWith("f", StringComparison.Ordinal)) {
        number = number.Substring(0, number.Length - 1);
      }
      try {
        float result = float.Parse(number, CultureInfo.InvariantCulture);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateFloatParseException(e);
      } catch (OverflowException e) {
        throw CreateFloatParseException(e);
      }
    }

    /// <summary>
    /// If the current token is a Boolean, consumes it and returns its value.
    /// Otherwise, throws a FormatException.
    /// </summary>
    public bool ConsumeBoolean() {
      if (currentToken == "true") {
        NextToken();
        return true;
      }
      if (currentToken == "false") {
        NextToken();
        return false;
      }
      throw CreateFormatException("Expected \"true\" or \"false\".");
    }

    /// <summary>
    /// If the current token is a string, consumes it and returns its
    /// (unescaped) value decoded as UTF-8. Otherwise, throws a FormatException.
    /// </summary>
    public string ConsumeString() {
      return ConsumeByteString().ToStringUtf8();
    }

    /// <summary>
    /// If the current token is a string, consumes it, unescapes it as a
    /// ByteString and returns it. Otherwise, throws a FormatException.
    /// </summary>
    /// <exception cref="FormatException">
    /// The token does not start with a quote, is missing its closing quote,
    /// or its contents cannot be unescaped.
    /// </exception>
    public ByteString ConsumeByteString() {
      char quote = currentToken.Length > 0 ? currentToken[0] : '\0';
      if (quote != '\"' && quote != '\'') {
        throw CreateFormatException("Expected string.");
      }
      // A lone quote character, or a string that ran to the end of its line,
      // has no matching closing quote.
      if (currentToken.Length < 2 ||
          currentToken[currentToken.Length - 1] != quote) {
        throw CreateFormatException("String missing ending quote.");
      }
      try {
        string escaped = currentToken.Substring(1, currentToken.Length - 2);
        ByteString result = TextFormat.UnescapeBytes(escaped);
        NextToken();
        return result;
      } catch (FormatException e) {
        // Re-wrap so that the message carries the line/column prefix.
        throw CreateFormatException(e.Message);
      }
    }

    /// <summary>
    /// Returns a format exception with the current line and column numbers
    /// in the description, suitable for throwing.
    /// </summary>
    /// <param name="description">The text to place after the position prefix.</param>
    public FormatException CreateFormatException(string description) {
      // Note: People generally prefer one-based line and column numbers.
      return new FormatException((line + 1) + ":" + (column + 1) + ": " + description);
    }

    /// <summary>
    /// Returns a format exception with the line and column numbers of the
    /// previous token in the description, suitable for throwing.
    /// </summary>
    /// <param name="description">The text to place after the position prefix.</param>
    public FormatException CreateFormatExceptionPreviousToken(string description) {
      // Note: People generally prefer one-based line and column numbers.
      return new FormatException((previousLine + 1) + ":" + (previousColumn + 1) + ": " + description);
    }

    /// <summary>
    /// Constructs an appropriate FormatException for the given existing exception
    /// when trying to parse an integer.
    /// </summary>
    private FormatException CreateIntegerParseException(FormatException e) {
      return CreateFormatException("Couldn't parse integer: " + e.Message);
    }

    /// <summary>
    /// Constructs an appropriate FormatException for the given existing exception
    /// when trying to parse a float or double.
    /// </summary>
    private FormatException CreateFloatParseException(Exception e) {
      return CreateFormatException("Couldn't parse number: " + e.Message);
    }
  }
  351. }