tokenizer.js 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434
  1. import { FSM, makeTransition } from "@webassemblyjs/helper-fsm";
  2. import { codeFrameFromSource } from "@webassemblyjs/helper-code-frame";
  3. // eslint-disable-next-line
  4. function getCodeFrame(source, line, column) {
  5. var loc = {
  6. start: {
  7. line: line,
  8. column: column
  9. }
  10. };
  11. return "\n" + codeFrameFromSource(source, loc) + "\n";
  12. }
  13. var WHITESPACE = /\s/;
  14. var PARENS = /\(|\)/;
  15. var LETTERS = /[a-z0-9_/]/i;
  16. var idchar = /[a-z0-9!#$%&*+./:<=>?@\\[\]^_`|~-]/i;
  17. var valtypes = ["i32", "i64", "f32", "f64"];
  18. var NUMBERS = /[0-9|.|_]/;
  19. var NUMBER_KEYWORDS = /nan|inf/;
  20. function isNewLine(char) {
  21. return char.charCodeAt(0) === 10 || char.charCodeAt(0) === 13;
  22. }
  23. function Token(type, value, start, end) {
  24. var opts = arguments.length > 4 && arguments[4] !== undefined ? arguments[4] : {};
  25. var token = {
  26. type: type,
  27. value: value,
  28. loc: {
  29. start: start,
  30. end: end
  31. }
  32. };
  33. if (Object.keys(opts).length > 0) {
  34. // $FlowIgnore
  35. token["opts"] = opts;
  36. }
  37. return token;
  38. }
  39. var tokenTypes = {
  40. openParen: "openParen",
  41. closeParen: "closeParen",
  42. number: "number",
  43. string: "string",
  44. name: "name",
  45. identifier: "identifier",
  46. valtype: "valtype",
  47. dot: "dot",
  48. comment: "comment",
  49. equal: "equal",
  50. keyword: "keyword"
  51. };
  52. export var keywords = {
  53. module: "module",
  54. func: "func",
  55. param: "param",
  56. result: "result",
  57. export: "export",
  58. loop: "loop",
  59. block: "block",
  60. if: "if",
  61. then: "then",
  62. else: "else",
  63. call: "call",
  64. call_indirect: "call_indirect",
  65. import: "import",
  66. memory: "memory",
  67. table: "table",
  68. global: "global",
  69. anyfunc: "anyfunc",
  70. mut: "mut",
  71. data: "data",
  72. type: "type",
  73. elem: "elem",
  74. start: "start",
  75. offset: "offset"
  76. };
  77. var NUMERIC_SEPARATOR = "_";
  78. /**
  79. * Build the FSM for number literals
  80. */
  81. var numberLiteralFSM = new FSM({
  82. START: [makeTransition(/-|\+/, "AFTER_SIGN"), makeTransition(/nan:0x/, "NAN_HEX", {
  83. n: 6
  84. }), makeTransition(/nan|inf/, "STOP", {
  85. n: 3
  86. }), makeTransition(/0x/, "HEX", {
  87. n: 2
  88. }), makeTransition(/[0-9]/, "DEC"), makeTransition(/\./, "DEC_FRAC")],
  89. AFTER_SIGN: [makeTransition(/nan:0x/, "NAN_HEX", {
  90. n: 6
  91. }), makeTransition(/nan|inf/, "STOP", {
  92. n: 3
  93. }), makeTransition(/0x/, "HEX", {
  94. n: 2
  95. }), makeTransition(/[0-9]/, "DEC"), makeTransition(/\./, "DEC_FRAC")],
  96. DEC_FRAC: [makeTransition(/[0-9]/, "DEC_FRAC", {
  97. allowedSeparator: NUMERIC_SEPARATOR
  98. }), makeTransition(/e|E/, "DEC_SIGNED_EXP")],
  99. DEC: [makeTransition(/[0-9]/, "DEC", {
  100. allowedSeparator: NUMERIC_SEPARATOR
  101. }), makeTransition(/\./, "DEC_FRAC"), makeTransition(/e|E/, "DEC_SIGNED_EXP")],
  102. DEC_SIGNED_EXP: [makeTransition(/\+|-/, "DEC_EXP"), makeTransition(/[0-9]/, "DEC_EXP")],
  103. DEC_EXP: [makeTransition(/[0-9]/, "DEC_EXP", {
  104. allowedSeparator: NUMERIC_SEPARATOR
  105. })],
  106. HEX: [makeTransition(/[0-9|A-F|a-f]/, "HEX", {
  107. allowedSeparator: NUMERIC_SEPARATOR
  108. }), makeTransition(/\./, "HEX_FRAC"), makeTransition(/p|P/, "HEX_SIGNED_EXP")],
  109. HEX_FRAC: [makeTransition(/[0-9|A-F|a-f]/, "HEX_FRAC", {
  110. allowedSeparator: NUMERIC_SEPARATOR
  111. }), makeTransition(/p|P|/, "HEX_SIGNED_EXP")],
  112. HEX_SIGNED_EXP: [makeTransition(/[0-9|+|-]/, "HEX_EXP")],
  113. HEX_EXP: [makeTransition(/[0-9]/, "HEX_EXP", {
  114. allowedSeparator: NUMERIC_SEPARATOR
  115. })],
  116. NAN_HEX: [makeTransition(/[0-9|A-F|a-f]/, "NAN_HEX", {
  117. allowedSeparator: NUMERIC_SEPARATOR
  118. })],
  119. STOP: []
  120. }, "START", "STOP");
  121. export function tokenize(input) {
  122. var current = 0;
  123. var char = input[current]; // Used by SourceLocation
  124. var column = 1;
  125. var line = 1;
  126. var tokens = [];
  127. /**
  128. * Creates a pushToken function for a given type
  129. */
  130. function pushToken(type) {
  131. return function (v) {
  132. var opts = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {};
  133. var startColumn = opts.startColumn || column - String(v).length;
  134. delete opts.startColumn;
  135. var endColumn = opts.endColumn || startColumn + String(v).length - 1;
  136. delete opts.endColumn;
  137. var start = {
  138. line: line,
  139. column: startColumn
  140. };
  141. var end = {
  142. line: line,
  143. column: endColumn
  144. };
  145. tokens.push(Token(type, v, start, end, opts));
  146. };
  147. }
  148. /**
  149. * Functions to save newly encountered tokens
  150. */
  151. var pushCloseParenToken = pushToken(tokenTypes.closeParen);
  152. var pushOpenParenToken = pushToken(tokenTypes.openParen);
  153. var pushNumberToken = pushToken(tokenTypes.number);
  154. var pushValtypeToken = pushToken(tokenTypes.valtype);
  155. var pushNameToken = pushToken(tokenTypes.name);
  156. var pushIdentifierToken = pushToken(tokenTypes.identifier);
  157. var pushKeywordToken = pushToken(tokenTypes.keyword);
  158. var pushDotToken = pushToken(tokenTypes.dot);
  159. var pushStringToken = pushToken(tokenTypes.string);
  160. var pushCommentToken = pushToken(tokenTypes.comment);
  161. var pushEqualToken = pushToken(tokenTypes.equal);
  162. /**
  163. * Can be used to look at the next character(s).
  164. *
  165. * The default behavior `lookahead()` simply returns the next character without consuming it.
  166. * Letters are always returned in lowercase.
  167. *
  168. * @param {number} length How many characters to query. Default = 1
  169. * @param {number} offset How many characters to skip forward from current one. Default = 1
  170. *
  171. */
  172. function lookahead() {
  173. var length = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : 1;
  174. var offset = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 1;
  175. return input.substring(current + offset, current + offset + length).toLowerCase();
  176. }
  177. /**
  178. * Advances the cursor in the input by a certain amount
  179. *
  180. * @param {number} amount How many characters to consume. Default = 1
  181. */
  182. function eatCharacter() {
  183. var amount = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : 1;
  184. column += amount;
  185. current += amount;
  186. char = input[current];
  187. }
  188. while (current < input.length) {
  189. // ;;
  190. if (char === ";" && lookahead() === ";") {
  191. var startColumn = column;
  192. eatCharacter(2);
  193. var text = "";
  194. while (!isNewLine(char)) {
  195. text += char;
  196. eatCharacter();
  197. if (char === undefined) {
  198. break;
  199. }
  200. }
  201. var endColumn = column;
  202. pushCommentToken(text, {
  203. type: "leading",
  204. startColumn: startColumn,
  205. endColumn: endColumn
  206. });
  207. continue;
  208. } // (;
  209. if (char === "(" && lookahead() === ";") {
  210. var _startColumn = column;
  211. eatCharacter(2);
  212. var _text = ""; // ;)
  213. while (true) {
  214. char = input[current];
  215. if (char === ";" && lookahead() === ")") {
  216. eatCharacter(2);
  217. break;
  218. }
  219. _text += char;
  220. eatCharacter();
  221. if (isNewLine(char)) {
  222. line++;
  223. column = 0;
  224. }
  225. }
  226. var _endColumn = column;
  227. pushCommentToken(_text, {
  228. type: "block",
  229. startColumn: _startColumn,
  230. endColumn: _endColumn
  231. });
  232. continue;
  233. }
  234. if (char === "(") {
  235. pushOpenParenToken(char);
  236. eatCharacter();
  237. continue;
  238. }
  239. if (char === "=") {
  240. pushEqualToken(char);
  241. eatCharacter();
  242. continue;
  243. }
  244. if (char === ")") {
  245. pushCloseParenToken(char);
  246. eatCharacter();
  247. continue;
  248. }
  249. if (isNewLine(char)) {
  250. line++;
  251. eatCharacter();
  252. column = 0;
  253. continue;
  254. }
  255. if (WHITESPACE.test(char)) {
  256. eatCharacter();
  257. continue;
  258. }
  259. if (char === "$") {
  260. var _startColumn2 = column;
  261. eatCharacter();
  262. var value = "";
  263. while (idchar.test(char)) {
  264. value += char;
  265. eatCharacter();
  266. }
  267. var _endColumn2 = column;
  268. pushIdentifierToken(value, {
  269. startColumn: _startColumn2,
  270. endColumn: _endColumn2
  271. });
  272. continue;
  273. }
  274. if (NUMBERS.test(char) || NUMBER_KEYWORDS.test(lookahead(3, 0)) || char === "-" || char === "+") {
  275. var _startColumn3 = column;
  276. var _value = numberLiteralFSM.run(input.slice(current));
  277. if (_value === "") {
  278. throw new Error(getCodeFrame(input, line, column) + "Unexpected character " + JSON.stringify(char));
  279. }
  280. pushNumberToken(_value, {
  281. startColumn: _startColumn3
  282. });
  283. eatCharacter(_value.length);
  284. if (char && !PARENS.test(char) && !WHITESPACE.test(char)) {
  285. throw new Error(getCodeFrame(input, line, column) + "Unexpected character " + JSON.stringify(char));
  286. }
  287. continue;
  288. }
  289. if (char === '"') {
  290. var _startColumn4 = column;
  291. var _value2 = "";
  292. eatCharacter(); // "
  293. while (char !== '"') {
  294. if (isNewLine(char)) {
  295. throw new Error(getCodeFrame(input, line, column) + "Unexpected character " + JSON.stringify(char));
  296. }
  297. _value2 += char;
  298. eatCharacter(); // char
  299. }
  300. eatCharacter(); // "
  301. var _endColumn3 = column;
  302. pushStringToken(_value2, {
  303. startColumn: _startColumn4,
  304. endColumn: _endColumn3
  305. });
  306. continue;
  307. }
  308. if (LETTERS.test(char)) {
  309. var _value3 = "";
  310. var _startColumn5 = column;
  311. while (char && LETTERS.test(char)) {
  312. _value3 += char;
  313. eatCharacter();
  314. }
  315. /*
  316. * Handle MemberAccess
  317. */
  318. if (char === ".") {
  319. var dotStartColumn = column;
  320. if (valtypes.indexOf(_value3) !== -1) {
  321. pushValtypeToken(_value3, {
  322. startColumn: _startColumn5
  323. });
  324. } else {
  325. pushNameToken(_value3);
  326. }
  327. eatCharacter();
  328. _value3 = "";
  329. var nameStartColumn = column;
  330. while (LETTERS.test(char)) {
  331. _value3 += char;
  332. eatCharacter();
  333. }
  334. pushDotToken(".", {
  335. startColumn: dotStartColumn
  336. });
  337. pushNameToken(_value3, {
  338. startColumn: nameStartColumn
  339. });
  340. continue;
  341. }
  342. /*
  343. * Handle keywords
  344. */
  345. // $FlowIgnore
  346. if (typeof keywords[_value3] === "string") {
  347. pushKeywordToken(_value3, {
  348. startColumn: _startColumn5
  349. });
  350. continue;
  351. }
  352. /*
  353. * Handle types
  354. */
  355. if (valtypes.indexOf(_value3) !== -1) {
  356. pushValtypeToken(_value3, {
  357. startColumn: _startColumn5
  358. });
  359. continue;
  360. }
  361. /*
  362. * Handle literals
  363. */
  364. pushNameToken(_value3, {
  365. startColumn: _startColumn5
  366. });
  367. continue;
  368. }
  369. throw new Error(getCodeFrame(input, line, column) + "Unexpected character " + JSON.stringify(char));
  370. }
  371. return tokens;
  372. }
  373. export var tokens = tokenTypes;