TextTokenizer.cs 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. using System;
  2. using System.Globalization;
  3. using System.Text.RegularExpressions;
  namespace Google.ProtocolBuffers {
    /// <summary>
    /// Represents a stream of tokens parsed from a string.
    /// </summary>
    /// <remarks>
    /// The constructor reads the first token eagerly, so the "next token" mentioned in the
    /// member documentation is always the token currently held by the tokenizer. An empty
    /// current token means the end of the input has been reached (see <see cref="AtEnd"/>).
    /// </remarks>
    internal sealed class TextTokenizer {
      /// <summary>
      /// Matches a run of whitespace and '#' comments. A comment extends up to (but not
      /// including) the next newline, or to the end of the input when the last line has no
      /// trailing newline; the newline itself is consumed as ordinary whitespace.
      /// </summary>
      private static readonly Regex WhitespaceAndCommentPattern =
          new Regex(@"\G(?:\s|#[^\n]*)+", RegexOptions.Compiled);

      /// <summary>
      /// Matches a single identifier, number or quoted string starting exactly at the
      /// requested position (\G). A quoted string may also run to the end of the input,
      /// optionally with a dangling backslash, so that an unterminated string is still
      /// returned as one token.
      /// </summary>
      private static readonly Regex TokenPattern = new Regex(
          @"\G[a-zA-Z_][0-9a-zA-Z_+-]*|" +              // an identifier
          @"\G[0-9+-][0-9a-zA-Z_.+-]*|" +               // a number
          @"\G""(?:[^""\n\\]|\\[^\n])*(?:""|\\?$)|" +   // a double-quoted string
          @"\G'(?:[^'\n\\]|\\[^\n])*(?:'|\\?$)",        // a single-quoted string
          RegexOptions.Compiled);

      private readonly string text;
      private string currentToken;

      /// <summary>
      /// The character index within the text to perform the next regex match at.
      /// </summary>
      private int matchPos = 0;

      /// <summary>
      /// The character index within the text at which the current token begins.
      /// </summary>
      private int pos = 0;

      /// <summary>
      /// The (zero-based) line number of the current token.
      /// </summary>
      private int line = 0;

      /// <summary>
      /// The (zero-based) column number of the current token.
      /// </summary>
      private int column = 0;

      /// <summary>
      /// The (zero-based) line number of the previous token.
      /// </summary>
      private int previousLine = 0;

      /// <summary>
      /// The (zero-based) column number of the previous token.
      /// </summary>
      private int previousColumn = 0;

      /// <summary>
      /// Constructs a tokenizer that parses tokens from the given text and positions it
      /// on the first token.
      /// </summary>
      /// <param name="text">The text to tokenize; must not be null.</param>
      /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
      public TextTokenizer(string text) {
        if (text == null) {
          throw new ArgumentNullException("text");
        }
        this.text = text;
        SkipWhitespace();
        NextToken();
      }

      /// <summary>
      /// Are we at the end of the input?
      /// </summary>
      public bool AtEnd {
        get { return currentToken.Length == 0; }
      }

      /// <summary>
      /// Advances to the next token.
      /// </summary>
      public void NextToken() {
        // Remember where the token we are leaving started, for
        // CreateFormatExceptionPreviousToken.
        previousLine = line;
        previousColumn = column;

        // Advance the line/column counters up to the start of the new token.
        while (pos < matchPos) {
          if (text[pos] == '\n') {
            ++line;
            column = 0;
          } else {
            ++column;
          }
          ++pos;
        }

        if (matchPos == text.Length) {
          // EOF is represented by the empty token.
          currentToken = "";
          return;
        }

        Match match = TokenPattern.Match(text, matchPos);
        if (match.Success) {
          currentToken = match.Value;
          matchPos += match.Length;
        } else {
          // Anything the pattern does not recognise becomes a one-character token.
          currentToken = text[matchPos].ToString();
          matchPos++;
        }
        SkipWhitespace();
      }

      /// <summary>
      /// Skip over any whitespace and comments so that matchPos starts at the next token.
      /// </summary>
      private void SkipWhitespace() {
        Match match = WhitespaceAndCommentPattern.Match(text, matchPos);
        if (match.Success) {
          matchPos += match.Length;
        }
      }

      /// <summary>
      /// If the next token exactly matches the given token, consume it and return
      /// true. Otherwise, return false without doing anything.
      /// </summary>
      /// <param name="token">The exact token text to look for.</param>
      public bool TryConsume(string token) {
        if (currentToken != token) {
          return false;
        }
        NextToken();
        return true;
      }

      /// <summary>
      /// If the next token exactly matches the specified one, consume it.
      /// Otherwise, throw a FormatException.
      /// </summary>
      /// <param name="token">The exact token text that is required.</param>
      public void Consume(string token) {
        if (!TryConsume(token)) {
          throw CreateFormatException("Expected \"" + token + "\".");
        }
      }

      /// <summary>
      /// Returns true if the next token is an integer, but does not consume it.
      /// Only the first character is inspected: a digit or a sign counts as an integer.
      /// </summary>
      public bool LookingAtInteger() {
        if (currentToken.Length == 0) {
          return false;
        }
        char first = currentToken[0];
        return ('0' <= first && first <= '9') || first == '-' || first == '+';
      }

      /// <summary>
      /// If the next token is an identifier, consume it and return its value.
      /// Otherwise, throw a FormatException. An identifier here is a non-empty token
      /// made up only of ASCII letters, digits, '_' and '.'.
      /// </summary>
      public string ConsumeIdentifier() {
        // The empty token marks end of input; it is not an identifier.
        if (currentToken.Length == 0) {
          throw CreateFormatException("Expected identifier.");
        }
        foreach (char c in currentToken) {
          bool allowed = ('a' <= c && c <= 'z') ||
                         ('A' <= c && c <= 'Z') ||
                         ('0' <= c && c <= '9') ||
                         c == '_' || c == '.';
          if (!allowed) {
            throw CreateFormatException("Expected identifier.");
          }
        }
        string result = currentToken;
        NextToken();
        return result;
      }

      /// <summary>
      /// If the next token is a 32-bit signed integer (as accepted by
      /// TextFormat.ParseInt32), consume it and return its value.
      /// Otherwise, throw a FormatException.
      /// </summary>
      public int ConsumeInt32() {
        try {
          int result = TextFormat.ParseInt32(currentToken);
          NextToken();
          return result;
        } catch (FormatException e) {
          throw CreateIntegerParseException(e);
        }
      }

      /// <summary>
      /// If the next token is a 32-bit unsigned integer (as accepted by
      /// TextFormat.ParseUInt32), consume it and return its value.
      /// Otherwise, throw a FormatException.
      /// </summary>
      public uint ConsumeUInt32() {
        try {
          uint result = TextFormat.ParseUInt32(currentToken);
          NextToken();
          return result;
        } catch (FormatException e) {
          throw CreateIntegerParseException(e);
        }
      }

      /// <summary>
      /// If the next token is a 64-bit signed integer (as accepted by
      /// TextFormat.ParseInt64), consume it and return its value.
      /// Otherwise, throw a FormatException.
      /// </summary>
      public long ConsumeInt64() {
        try {
          long result = TextFormat.ParseInt64(currentToken);
          NextToken();
          return result;
        } catch (FormatException e) {
          throw CreateIntegerParseException(e);
        }
      }

      /// <summary>
      /// If the next token is a 64-bit unsigned integer (as accepted by
      /// TextFormat.ParseUInt64), consume it and return its value.
      /// Otherwise, throw a FormatException.
      /// </summary>
      public ulong ConsumeUInt64() {
        try {
          ulong result = TextFormat.ParseUInt64(currentToken);
          NextToken();
          return result;
        } catch (FormatException e) {
          throw CreateIntegerParseException(e);
        }
      }

      /// <summary>
      /// If the next token is a double, consume it and return its value.
      /// Otherwise, throw a FormatException. Parsing uses the invariant culture.
      /// </summary>
      public double ConsumeDouble() {
        try {
          double result = double.Parse(currentToken, CultureInfo.InvariantCulture);
          NextToken();
          return result;
        } catch (FormatException e) {
          throw CreateFloatParseException(e);
        } catch (OverflowException e) {
          throw CreateFloatParseException(e);
        }
      }

      /// <summary>
      /// If the next token is a float, consume it and return its value.
      /// Otherwise, throw a FormatException. Parsing uses the invariant culture.
      /// </summary>
      public float ConsumeFloat() {
        try {
          float result = float.Parse(currentToken, CultureInfo.InvariantCulture);
          NextToken();
          return result;
        } catch (FormatException e) {
          throw CreateFloatParseException(e);
        } catch (OverflowException e) {
          throw CreateFloatParseException(e);
        }
      }

      /// <summary>
      /// Legacy, non-PascalCase spelling of <see cref="ConsumeFloat"/>, kept so that
      /// existing callers continue to compile. Behaves identically.
      /// </summary>
      public float consumeFloat() {
        return ConsumeFloat();
      }

      /// <summary>
      /// If the next token is a Boolean, consume it and return its value.
      /// Otherwise, throw a FormatException.
      /// </summary>
      public bool ConsumeBoolean() {
        if (currentToken == "true") {
          NextToken();
          return true;
        }
        if (currentToken == "false") {
          NextToken();
          return false;
        }
        throw CreateFormatException("Expected \"true\" or \"false\".");
      }

      /// <summary>
      /// If the next token is a string, consume it and return its (unescaped) value.
      /// Otherwise, throw a FormatException.
      /// </summary>
      public string ConsumeString() {
        return ConsumeByteString().ToStringUtf8();
      }

      /// <summary>
      /// If the next token is a string, consume it, unescape it as a
      /// ByteString and return it. Otherwise, throw a FormatException.
      /// </summary>
      public ByteString ConsumeByteString() {
        char quote = currentToken.Length > 0 ? currentToken[0] : '\0';
        if (quote != '\"' && quote != '\'') {
          throw CreateFormatException("Expected string.");
        }

        // A lone quote character, or a token that does not end with the quote it
        // started with, is an unterminated string.
        if (currentToken.Length < 2 ||
            currentToken[currentToken.Length - 1] != quote) {
          throw CreateFormatException("String missing ending quote.");
        }

        try {
          // Strip the surrounding quotes before unescaping.
          string escaped = currentToken.Substring(1, currentToken.Length - 2);
          ByteString result = TextFormat.UnescapeBytes(escaped);
          NextToken();
          return result;
        } catch (FormatException e) {
          // Re-wrap so the message carries the current line and column.
          throw CreateFormatException(e.Message);
        }
      }

      /// <summary>
      /// Returns a format exception with the current line and column numbers
      /// in the description, suitable for throwing.
      /// </summary>
      /// <param name="description">The message to append after the position prefix.</param>
      public FormatException CreateFormatException(string description) {
        // Note: People generally prefer one-based line and column numbers.
        return new FormatException((line + 1) + ":" + (column + 1) + ": " + description);
      }

      /// <summary>
      /// Returns a format exception with the line and column numbers of the
      /// previous token in the description, suitable for throwing.
      /// </summary>
      /// <param name="description">The message to append after the position prefix.</param>
      public FormatException CreateFormatExceptionPreviousToken(string description) {
        // Note: People generally prefer one-based line and column numbers.
        return new FormatException((previousLine + 1) + ":" + (previousColumn + 1) + ": " + description);
      }

      /// <summary>
      /// Constructs an appropriate FormatException for the given existing exception
      /// when trying to parse an integer.
      /// </summary>
      private FormatException CreateIntegerParseException(FormatException e) {
        return CreateFormatException("Couldn't parse integer: " + e.Message);
      }

      /// <summary>
      /// Constructs an appropriate FormatException for the given existing exception
      /// when trying to parse a float or double.
      /// </summary>
      private FormatException CreateFloatParseException(Exception e) {
        return CreateFormatException("Couldn't parse number: " + e.Message);
      }
    }
  }