rewrite-pattern.js 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333
  1. 'use strict';
  2. const generate = require('regjsgen').generate;
  3. const parse = require('regjsparser').parse;
  4. const regenerate = require('regenerate');
  5. const unicodeMatchProperty = require('unicode-match-property-ecmascript');
  6. const unicodeMatchPropertyValue = require('unicode-match-property-value-ecmascript');
  7. const iuMappings = require('./data/iu-mappings.js');
  8. const ESCAPE_SETS = require('./data/character-class-escape-sets.js');
  // Every Unicode code point, U+0000 through U+10FFFF. Used as the universe
  // when a set has to be complemented (negated property escapes, and negated
  // character classes under the `u` flag) and as the `u`-mode dot-all set.
  const UNICODE_SET = regenerate().addRange(0x0, 0x10FFFF);

  // Without the `u` flag the universe stops at U+FFFF (BMP only).
  // https://mths.be/es6#sec-pattern-semantics
  const BMP_SET = regenerate().addRange(0x0, 0xFFFF);

  // The code points `/./u` is supposed to match: everything except the four
  // `LineTerminator`s. See https://mths.be/es6#sec-atom and
  // https://mths.be/es6#sec-line-terminators
  const DOT_SET_UNICODE = UNICODE_SET.clone().remove(
    0x000A, // Line Feed <LF>
    0x000D, // Carriage Return <CR>
    0x2028, // Line Separator <LS>
    0x2029 // Paragraph Separator <PS>
  );

  // The code points `/./` is supposed to match: the set above, restricted to
  // BMP code points.
  const DOT_SET = DOT_SET_UNICODE.clone().intersection(BMP_SET);
  /**
   * Looks up the precomputed set for a character class escape.
   *
   * @param {string} character - Key passed to the `.get()` of the chosen
   *   `ESCAPE_SETS` table.
   * @param {*} unicode - Truthy to use one of the Unicode tables.
   * @param {*} ignoreCase - Truthy (together with `unicode`) to use the
   *   `UNICODE_IGNORE_CASE` table; ignored when `unicode` is falsy.
   * @returns {*} Whatever the selected table's `.get(character)` returns.
   */
  const getCharacterClassEscapeSet = (character, unicode, ignoreCase) => {
    let table;
    if (!unicode) {
      table = ESCAPE_SETS.REGULAR;
    } else if (ignoreCase) {
      table = ESCAPE_SETS.UNICODE_IGNORE_CASE;
    } else {
      table = ESCAPE_SETS.UNICODE;
    }
    return table.get(character);
  };
  /**
   * Picks the set of code points that `.` should match.
   *
   * @param {*} unicode - Truthy selects the full-Unicode variants, falsy the
   *   BMP-only variants.
   * @param {*} dotAll - Truthy selects the sets that still contain the line
   *   terminators.
   * @returns {*} One of `UNICODE_SET`, `BMP_SET`, `DOT_SET_UNICODE` or
   *   `DOT_SET`. The shared instance is returned, not a clone.
   */
  const getDotSet = (unicode, dotAll) => {
    if (unicode) {
      return dotAll ? UNICODE_SET : DOT_SET_UNICODE;
    }
    return dotAll ? BMP_SET : DOT_SET;
  };
  /**
   * Loads a precomputed set from the `regenerate-unicode-properties` package.
   *
   * @param {string} property - Property name. When `value` is falsy it is
   *   looked up under `Binary_Property/`.
   * @param {string} [value] - Property value; when truthy the module path is
   *   `<property>/<value>`.
   * @returns {*} Whatever the required module exports.
   * @throws {Error} If requiring the module fails for any reason.
   */
  const getUnicodePropertyValueSet = (property, value) => {
    let path;
    if (value) {
      path = `${ property }/${ value }`;
    } else {
      path = `Binary_Property/${ property }`;
    }
    try {
      return require(`regenerate-unicode-properties/${ path }.js`);
    } catch (exception) {
      // Any failure to load the data file is reported as an unrecognized
      // property/value pair.
      throw new Error(
        `Failed to recognize value \`${ value }\` for property \`${ property }\`.`
      );
    }
  };
  /**
   * Resolves a property escape written without `=`: it is tried first as a
   * `General_Category` value and then as a binary property name.
   *
   * @param {string} value - The lone name found in the escape.
   * @returns {*} The set returned by `getUnicodePropertyValueSet`.
   * @throws Whatever `unicodeMatchProperty` or `getUnicodePropertyValueSet`
   *   throws when the binary-property fallback fails too.
   */
  const handleLoneUnicodePropertyNameOrValue = (value) => {
    try {
      // Note: `unicodeMatchPropertyValue` throws on invalid values.
      return getUnicodePropertyValueSet(
        'General_Category',
        unicodeMatchPropertyValue('General_Category', value)
      );
    } catch (exception) {
      // Deliberately ignored: any failure on the `General_Category` path
      // just means the binary-property interpretation is tried next.
    }
    // Note: `unicodeMatchProperty` throws on invalid properties.
    return getUnicodePropertyValueSet(unicodeMatchProperty(value));
  };
  /**
   * Builds the set for a Unicode property escape body, which is either
   * `Name` or `Property=Value`.
   *
   * @param {string} value - The escape body.
   * @param {*} isNegative - Truthy to return the complement with respect to
   *   `UNICODE_SET`.
   * @returns {*} A fresh set: a clone of the matched set, or a clone of
   *   `UNICODE_SET` with the matched set removed.
   */
  const getUnicodePropertyEscapeSet = (value, isNegative) => {
    const segments = value.split('=');
    let matched;
    if (segments.length === 1) {
      matched = handleLoneUnicodePropertyNameOrValue(segments[0]);
    } else {
      // The pattern consists of two parts, i.e. `Property=Value`.
      const canonicalProperty = unicodeMatchProperty(segments[0]);
      const canonicalValue = unicodeMatchPropertyValue(
        canonicalProperty,
        segments[1]
      );
      matched = getUnicodePropertyValueSet(canonicalProperty, canonicalValue);
    }
    return isNegative ? UNICODE_SET.clone().remove(matched) : matched.clone();
  };
  // Given an inclusive range of code points, look up the case-folded
  // counterpart of each one and add every counterpart that exists to this
  // set. The first code point is always examined, even when `min > max`.
  regenerate.prototype.iuAddRange = function(min, max) {
    let codePoint = min;
    do {
      const folded = caseFold(codePoint);
      if (folded) {
        this.add(folded);
      }
    } while (++codePoint <= max);
    return this;
  };
  /**
   * Re-parses `pattern` and copies the resulting node's properties onto
   * `item`, so the existing AST node is replaced in place.
   *
   * The pattern is parsed with the `u` flag only when
   * `config.useUnicodeFlag` is truthy. A parsed node of type
   * `characterClass`, `group` or `value` is used directly; any other node is
   * first wrapped in a non-capturing group.
   *
   * @param {Object} item - AST node to overwrite (mutated).
   * @param {string} pattern - Regular expression source to parse.
   */
  const update = (item, pattern) => {
    const parsed = parse(pattern, config.useUnicodeFlag ? 'u' : '');
    const needsNoWrapping =
      parsed.type === 'characterClass' ||
      parsed.type === 'group' ||
      parsed.type === 'value';
    Object.assign(item, needsNoWrapping ? parsed : wrap(parsed, pattern));
  };
  /**
   * Wraps an AST node in a synthetic non-capturing group node.
   *
   * @param {Object} tree - Node that becomes the group's only body element.
   * @param {string} pattern - Source text of `tree`; used to build `raw`.
   * @returns {Object} A new `group` node with behavior `ignore`.
   */
  const wrap = (tree, pattern) => ({
    'type': 'group',
    'behavior': 'ignore',
    'body': [tree],
    'raw': `(?:${ pattern })`
  });
  /**
   * Looks up `codePoint` in `iuMappings`.
   *
   * @param {number} codePoint - Key to look up.
   * @returns {*} The mapped value when it is truthy, otherwise `false`.
   */
  const caseFold = (codePoint) => {
    const mapped = iuMappings.get(codePoint);
    return mapped ? mapped : false;
  };
  /**
   * Rewrites a `characterClass` node: collects every member of the class
   * into one regenerate set, complements it if the class is negated, and
   * replaces the node in place with the parsed string form of that set.
   *
   * Reads the module-level `config`. Case-folded counterparts of literal
   * values and ranges are added only when `ignoreCase` and `unicode` are set
   * and `useUnicodeFlag` is not.
   *
   * @param {Object} characterClassItem - The class node (mutated via `update`).
   * @param {Object} regenerateOptions - Passed through to the set's
   *   `toString`.
   * @returns {Object} The same `characterClassItem`.
   * @throws {Error} On a body item of an unknown type.
   */
  const processCharacterClass = (characterClassItem, regenerateOptions) => {
    const needsCaseFolding =
      config.ignoreCase && config.unicode && !config.useUnicodeFlag;
    const members = regenerate();

    for (const entry of characterClassItem.body) {
      if (entry.type === 'value') {
        members.add(entry.codePoint);
        if (needsCaseFolding) {
          const folded = caseFold(entry.codePoint);
          if (folded) {
            members.add(folded);
          }
        }
      } else if (entry.type === 'characterClassRange') {
        const low = entry.min.codePoint;
        const high = entry.max.codePoint;
        members.addRange(low, high);
        if (needsCaseFolding) {
          members.iuAddRange(low, high);
        }
      } else if (entry.type === 'characterClassEscape') {
        members.add(getCharacterClassEscapeSet(
          entry.value,
          config.unicode,
          config.ignoreCase
        ));
      } else if (entry.type === 'unicodePropertyEscape') {
        members.add(getUnicodePropertyEscapeSet(entry.value, entry.negative));
      } else {
        // This branch is only a safeguard; it should never be reached. Code
        // coverage tools should ignore it.
        /* istanbul ignore next */
        throw new Error(`Unknown term type: ${ entry.type }`);
      }
    }

    let result = members;
    if (characterClassItem.negative) {
      // Complement against the universe that matches the `u` flag.
      const universe = config.unicode ? UNICODE_SET : BMP_SET;
      result = universe.clone().remove(members);
    }
    update(characterClassItem, result.toString(regenerateOptions));
    return characterClassItem;
  };
  /**
   * Turns a named backreference node into a numeric one, in place.
   *
   * @param {Object} item - Reference node; its `name` property is deleted.
   * @param {number} index - Stored on the node as `matchIndex`.
   */
  const updateNamedReference = (item, index) => {
    item.matchIndex = index;
    delete item.name;
  };
  /**
   * Verifies that no named reference was left without a matching group.
   *
   * @param {Object} groups - Bookkeeping object; only its
   *   `unmatchedReferences` map (name -> references) is inspected.
   * @throws {Error} Listing the names that are still unmatched.
   */
  const assertNoUnmatchedReferences = (groups) => {
    const pendingNames = Object.keys(groups.unmatchedReferences);
    if (pendingNames.length === 0) {
      return;
    }
    throw new Error(`Unknown group names: ${pendingNames}`);
  };
  /**
   * Recursively rewrites one AST node (and its children) in place.
   *
   * - `dot`, `characterClass`, `unicodePropertyEscape`,
   *   `characterClassEscape` and `value` nodes are replaced by the parsed
   *   string form of the equivalent regenerate set (via `update`).
   * - Named capturing groups lose their `name`, are recorded in
   *   `groups.names`, and resolve any references that appeared before them.
   * - Named references are converted to numeric ones; references to groups
   *   not seen yet are queued in `groups.unmatchedReferences`.
   * - `anchor` and `empty` nodes are left untouched.
   *
   * Reads the module-level `config`.
   *
   * @param {Object} item - AST node to process (mutated).
   * @param {Object} regenerateOptions - Passed to every set's `toString`.
   * @param {Object} groups - Mutable bookkeeping: `lastIndex` (number of
   *   capturing groups seen so far), `names` (name -> index),
   *   `unmatchedReferences` (name -> reference nodes) and an optional
   *   `onNamedGroup(name, index)` callback.
   * @returns {Object} The processed node.
   * @throws {Error} On a duplicate group name or an unknown node type.
   */
  const processTerm = (item, regenerateOptions, groups) => {
    switch (item.type) {
      case 'dot':
        update(
          item,
          getDotSet(config.unicode, config.dotAll).toString(regenerateOptions)
        );
        break;
      case 'characterClass':
        item = processCharacterClass(item, regenerateOptions);
        break;
      case 'unicodePropertyEscape':
        update(
          item,
          getUnicodePropertyEscapeSet(item.value, item.negative)
            .toString(regenerateOptions)
        );
        break;
      case 'characterClassEscape':
        update(
          item,
          getCharacterClassEscapeSet(
            item.value,
            config.unicode,
            config.ignoreCase
          ).toString(regenerateOptions)
        );
        break;
      case 'group':
        // Only capturing groups consume a capture index. The parser marks
        // them with behavior `normal`; non-capturing `(?:…)` groups and
        // lookaround groups carry other behaviors and must not be counted,
        // otherwise the numeric index assigned to a later named group (and
        // to every `\k<name>` that refers to it) is too high.
        if (item.behavior === 'normal') {
          groups.lastIndex++;
          if (item.name) {
            const name = item.name.value;
            if (groups.names[name]) {
              throw new Error(
                `Multiple groups with the same name (${ name }) are not allowed.`
              );
            }
            const index = groups.lastIndex;
            delete item.name;
            groups.names[name] = index;
            if (groups.onNamedGroup) {
              groups.onNamedGroup.call(null, name, index);
            }
            // Resolve references that were used before this group appeared.
            if (groups.unmatchedReferences[name]) {
              groups.unmatchedReferences[name].forEach(reference => {
                updateNamedReference(reference, index);
              });
              delete groups.unmatchedReferences[name];
            }
          }
        }
        /* falls through */
      case 'alternative':
      case 'disjunction':
      case 'quantifier':
        item.body = item.body.map(term => {
          return processTerm(term, regenerateOptions, groups);
        });
        break;
      case 'value': {
        const codePoint = item.codePoint;
        const set = regenerate(codePoint);
        if (config.ignoreCase && config.unicode && !config.useUnicodeFlag) {
          const folded = caseFold(codePoint);
          if (folded) {
            set.add(folded);
          }
        }
        update(item, set.toString(regenerateOptions));
        break;
      }
      case 'reference':
        if (item.name) {
          const name = item.name.value;
          const index = groups.names[name];
          if (index) {
            updateNamedReference(item, index);
            break;
          }
          if (!groups.unmatchedReferences[name]) {
            groups.unmatchedReferences[name] = [];
          }
          // Keep track of references used before the corresponding group.
          groups.unmatchedReferences[name].push(item);
        }
        break;
      case 'anchor':
      case 'empty':
        // Nothing to do here.
        break;
      // The `default` clause is only here as a safeguard; it should never be
      // reached. Code coverage tools should ignore it.
      /* istanbul ignore next */
      default:
        throw new Error(`Unknown term type: ${ item.type }`);
    }
    return item;
  };
  // Module-level settings for the rewrite currently in progress. Every field
  // is overwritten at the start of each `rewritePattern` call and read by the
  // processing helpers while the AST is walked.
  const config = {
    ignoreCase: false,
    unicode: false,
    dotAll: false,
    useUnicodeFlag: false
  };
  /**
   * Rewrites a regular expression pattern according to its flags and the
   * given options, and returns the regenerated source.
   *
   * Side effect: overwrites every field of the module-level `config`.
   *
   * @param {string} pattern - Regular expression source.
   * @param {string} [flags] - Flag string; `i`, `u` and `s` are inspected here
   *   (`s` only when `options.dotAllFlag` is truthy).
   * @param {Object} [options] - Optional settings read here:
   *   `unicodePropertyEscape`, `namedGroup` and `lookbehind` (forwarded to
   *   the parser as `unicodePropertyEscape`, `namedGroups` and `lookbehind`),
   *   `dotAllFlag`, `useUnicodeFlag`, and the `onNamedGroup(name, index)`
   *   callback.
   * @returns {string} The output of `generate` for the processed tree.
   * @throws {Error} If a named reference has no matching group, plus
   *   anything thrown while parsing or processing.
   */
  const rewritePattern = (pattern, flags, options) => {
    // Both helpers keep the short-circuit value of `x && …`, so a missing
    // `flags`/`options` argument yields that falsy value itself.
    const hasFlag = (flag) => flags && flags.includes(flag);
    const option = (key) => options && options[key];

    const regjsparserFeatures = {
      unicodePropertyEscape: option('unicodePropertyEscape'),
      namedGroups: option('namedGroup'),
      lookbehind: option('lookbehind')
    };

    config.ignoreCase = hasFlag('i');
    config.unicode = hasFlag('u');
    // The `s` flag only counts when dotAll support was requested.
    config.dotAll = option('dotAllFlag') && hasFlag('s');
    config.useUnicodeFlag = option('useUnicodeFlag');

    const regenerateOptions = {
      hasUnicodeFlag: config.useUnicodeFlag,
      bmpOnly: !config.unicode
    };
    const groups = {
      onNamedGroup: option('onNamedGroup'),
      lastIndex: 0,
      names: Object.create(null), // { [name]: index }
      unmatchedReferences: Object.create(null) // { [name]: Array<reference> }
    };

    const tree = parse(pattern, flags, regjsparserFeatures);
    // Note: `processTerm` mutates `tree` and `groups`.
    processTerm(tree, regenerateOptions, groups);
    assertNoUnmatchedReferences(groups);
    return generate(tree);
  };
  309. module.exports = rewritePattern;