123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333 |
- 'use strict';
- const generate = require('regjsgen').generate;
- const parse = require('regjsparser').parse;
- const regenerate = require('regenerate');
- const unicodeMatchProperty = require('unicode-match-property-ecmascript');
- const unicodeMatchPropertyValue = require('unicode-match-property-value-ecmascript');
- const iuMappings = require('./data/iu-mappings.js');
- const ESCAPE_SETS = require('./data/character-class-escape-sets.js');
// A Regenerate set holding every Unicode code point (U+0000..U+10FFFF). It is
// the universe that other sets are subtracted from when a negated set is
// needed.
const UNICODE_SET = regenerate().addRange(0x0, 0x10FFFF);

// Without the `u` flag, the range stops at 0xFFFF, so the universe is the
// Basic Multilingual Plane only.
// https://mths.be/es6#sec-pattern-semantics
const BMP_SET = regenerate().addRange(0x0, 0xFFFF);

// Everything `/./u` is supposed to match (https://mths.be/es6#sec-atom): all
// Unicode code points minus the `LineTerminator`s
// (https://mths.be/es6#sec-line-terminators).
const DOT_SET_UNICODE = UNICODE_SET.clone().remove(
  0x000A, // Line Feed <LF>
  0x000D, // Carriage Return <CR>
  0x2028, // Line Separator <LS>
  0x2029 // Paragraph Separator <PS>
);

// Everything `/./` is supposed to match: the same set as above, restricted to
// BMP code points.
const DOT_SET = DOT_SET_UNICODE.clone().intersection(BMP_SET);
// Looks up the precomputed set for a character class escape (the `character`
// key is passed straight to the lookup table).
//
// Three tables exist in `ESCAPE_SETS`: `REGULAR` is used whenever `unicode` is
// falsy; otherwise `UNICODE_IGNORE_CASE` or `UNICODE` is used depending on
// `ignoreCase`.
const getCharacterClassEscapeSet = (character, unicode, ignoreCase) => {
  if (!unicode) {
    return ESCAPE_SETS.REGULAR.get(character);
  }
  const table = ignoreCase ?
    ESCAPE_SETS.UNICODE_IGNORE_CASE :
    ESCAPE_SETS.UNICODE;
  return table.get(character);
};
// Picks the set that `.` should expand to.
//
// With `dotAll`, the dot matches the whole universe (all of Unicode with
// `unicode`, the BMP otherwise). Without it, the line terminators are
// excluded. The shared module-level sets are returned as-is, not cloned.
const getDotSet = (unicode, dotAll) => {
  if (unicode) {
    return dotAll ? UNICODE_SET : DOT_SET_UNICODE;
  }
  return dotAll ? BMP_SET : DOT_SET;
};
// Loads the Regenerate set for a Unicode property from the
// `regenerate-unicode-properties` package.
//
// - `property`: property name; used as the directory name when `value` is
//   given, or as the file name under `Binary_Property/` when it is not.
// - `value`: optional property value. When falsy, `property` is treated as a
//   binary property.
//
// Returns whatever the required module exports. Throws an `Error` when the
// module cannot be loaded.
const getUnicodePropertyValueSet = (property, value) => {
  const path = value ?
    `${ property }/${ value }` :
    `Binary_Property/${ property }`;
  try {
    return require(`regenerate-unicode-properties/${ path }.js`);
  } catch (exception) {
    // Binary properties have no value, so do not report "value `undefined`"
    // for them; name the property that failed to load instead.
    throw new Error(
      value ?
        `Failed to recognize value \`${ value }\` for property ` +
          `\`${ property }\`.` :
        `Failed to recognize binary property \`${ property }\`.`
    );
  }
};
// Resolves a property escape that has no `=` in it. Such a name is ambiguous:
// it is either a `General_Category` value or the name of a binary property.
const handleLoneUnicodePropertyNameOrValue = (value) => {
  // First interpretation: a `General_Category` value.
  // Note: `unicodeMatchPropertyValue` throws on invalid values.
  try {
    const category = unicodeMatchPropertyValue('General_Category', value);
    return getUnicodePropertyValueSet('General_Category', category);
  } catch (exception) {
    // Deliberately ignored: any failure above means we fall back to the
    // binary property interpretation below.
  }
  // Second interpretation: a binary property.
  // Note: `unicodeMatchProperty` throws on invalid properties.
  return getUnicodePropertyValueSet(unicodeMatchProperty(value));
};
// Builds the set matched by a Unicode property escape.
//
// - `value`: the text of the escape, either a lone name (`Name`) or a
//   `Property=Value` pair.
// - `isNegative`: when truthy, the complement (relative to all Unicode code
//   points) is returned instead.
//
// Always returns a fresh set (a clone or a newly built complement), so the
// result can be handed out without exposing the loaded property set.
// Propagates whatever the matching/loading helpers throw for unknown names.
const getUnicodePropertyEscapeSet = (value, isNegative) => {
  const parts = value.split('=');
  const firstPart = parts[0];
  let set;
  if (parts.length === 1) {
    set = handleLoneUnicodePropertyNameOrValue(firstPart);
  } else {
    // The pattern consists of two parts, i.e. `Property=Value`.
    const property = unicodeMatchProperty(firstPart);
    // Named `propertyValue` so it does not shadow the `value` parameter.
    const propertyValue = unicodeMatchPropertyValue(property, parts[1]);
    set = getUnicodePropertyValueSet(property, propertyValue);
  }
  if (isNegative) {
    return UNICODE_SET.clone().remove(set);
  }
  return set.clone();
};
// Walks the code points from `min` up to `max` and adds the case-folded
// counterpart (as reported by `caseFold`) of each one to this set. The code
// points of the range itself are not added here. Returns the set so calls can
// be chained.
//
// Note: the body runs for `min` before the bound is checked, so `min` is
// always looked up at least once.
regenerate.prototype.iuAddRange = function(min, max) {
  let codePoint = min;
  do {
    const folded = caseFold(codePoint);
    if (folded) {
      this.add(folded);
    }
    codePoint++;
  } while (codePoint <= max);
  return this;
};
// Replaces the contents of the AST node `item` with the parse result of
// `pattern`. The pattern is parsed with the `u` flag only when
// `config.useUnicodeFlag` is truthy.
//
// Character classes, groups, and single values are copied onto `item`
// directly; any other node type is first wrapped in a non-capturing group.
// `item` is mutated in place via `Object.assign`.
const update = (item, pattern) => {
  const parsed = parse(pattern, config.useUnicodeFlag ? 'u' : '');
  const type = parsed.type;
  const isSelfContained =
    type === 'characterClass' || type === 'group' || type === 'value';
  // No wrapping needed for self-contained nodes.
  const replacement = isSelfContained ? parsed : wrap(parsed, pattern);
  Object.assign(item, replacement);
};
// Returns a new group node with `behavior: 'ignore'` (a non-capturing group)
// whose only child is `tree` and whose raw source is `pattern` enclosed in
// `(?:` … `)`.
const wrap = (tree, pattern) => ({
  type: 'group',
  behavior: 'ignore',
  body: [tree],
  raw: `(?:${ pattern })`
});
// Looks `codePoint` up in `iuMappings`. Returns the mapped value when it is
// truthy, and `false` otherwise (including when there is no entry).
const caseFold = (codePoint) => {
  const mapped = iuMappings.get(codePoint);
  return mapped ? mapped : false;
};
// Rewrites a character class node in place.
//
// All members of the class are collected into one Regenerate set, the set is
// complemented if the class is negated, and the node is then replaced (via
// `update`) by the parse of that set's string form.
//
// - `characterClassItem`: the character class AST node; mutated and returned.
// - `regenerateOptions`: passed through to the set's `toString`.
//
// Throws if a member has an unrecognized node type.
const processCharacterClass = (characterClassItem, regenerateOptions) => {
  // Case-folded counterparts are only added by hand when translating an `iu`
  // pattern to output that will not carry the `u` flag itself.
  const foldCase =
    config.ignoreCase && config.unicode && !config.useUnicodeFlag;
  const members = regenerate();

  for (const entry of characterClassItem.body) {
    switch (entry.type) {
      case 'value': {
        members.add(entry.codePoint);
        if (foldCase) {
          const folded = caseFold(entry.codePoint);
          if (folded) {
            members.add(folded);
          }
        }
        break;
      }
      case 'characterClassRange': {
        const low = entry.min.codePoint;
        const high = entry.max.codePoint;
        members.addRange(low, high);
        if (foldCase) {
          members.iuAddRange(low, high);
        }
        break;
      }
      case 'characterClassEscape': {
        const escapeSet = getCharacterClassEscapeSet(
          entry.value,
          config.unicode,
          config.ignoreCase
        );
        members.add(escapeSet);
        break;
      }
      case 'unicodePropertyEscape': {
        members.add(getUnicodePropertyEscapeSet(entry.value, entry.negative));
        break;
      }
      // The `default` clause is only here as a safeguard; it should never be
      // reached. Code coverage tools should ignore it.
      /* istanbul ignore next */
      default:
        throw new Error(`Unknown term type: ${ entry.type }`);
    }
  }

  // A negated class matches the universe (all of Unicode, or just the BMP
  // without the `u` flag) minus the collected members.
  const result = characterClassItem.negative ?
    (config.unicode ? UNICODE_SET : BMP_SET).clone().remove(members) :
    members;
  update(characterClassItem, result.toString(regenerateOptions));
  return characterClassItem;
};
// Turns a named reference node into a numbered one, in place: the `name`
// property is removed and `matchIndex` is set to `index`.
const updateNamedReference = (item, index) => {
  delete item.name;
  item.matchIndex = index;
};
// Throws if `groups.unmatchedReferences` still has any keys, i.e. if some
// named reference never found a group with that name. The error message lists
// the leftover names. Returns nothing when there are none.
const assertNoUnmatchedReferences = (groups) => {
  const pendingNames = Object.keys(groups.unmatchedReferences);
  if (pendingNames.length === 0) {
    return;
  }
  throw new Error(`Unknown group names: ${pendingNames}`);
};
// Recursively rewrites one AST node (and its children) in place.
//
// - `item`: the AST node to process. It is mutated and returned.
// - `regenerateOptions`: passed through to the `toString` of generated sets.
// - `groups`: bookkeeping shared across the whole pattern:
//     `lastIndex`            number of capturing groups seen so far,
//     `names`                { [name]: index } of named groups seen so far,
//     `unmatchedReferences`  { [name]: Array<reference> } of named references
//                            seen before their group,
//     `onNamedGroup`         optional callback `(name, index)`.
//
// Throws on duplicate group names and on unknown node types.
const processTerm = (item, regenerateOptions, groups) => {
  switch (item.type) {
    case 'dot':
      update(
        item,
        getDotSet(config.unicode, config.dotAll).toString(regenerateOptions)
      );
      break;
    case 'characterClass':
      item = processCharacterClass(item, regenerateOptions);
      break;
    case 'unicodePropertyEscape':
      update(
        item,
        getUnicodePropertyEscapeSet(item.value, item.negative)
          .toString(regenerateOptions)
      );
      break;
    case 'characterClassEscape':
      update(
        item,
        getCharacterClassEscapeSet(
          item.value,
          config.unicode,
          config.ignoreCase
        ).toString(regenerateOptions)
      );
      break;
    case 'group':
      // Only capturing groups consume a capture index. Non-capturing groups
      // (`behavior: 'ignore'`) and lookarounds must not be counted, otherwise
      // named groups that follow them are mapped to the wrong index.
      if (item.behavior === 'normal') {
        groups.lastIndex++;
      }
      if (item.name) {
        const name = item.name.value;
        if (groups.names[name]) {
          throw new Error(
            `Multiple groups with the same name (${ name }) are not allowed.`
          );
        }
        const index = groups.lastIndex;
        // Dropping the name turns this into a plain numbered group.
        delete item.name;
        groups.names[name] = index;
        if (groups.onNamedGroup) {
          groups.onNamedGroup.call(null, name, index);
        }
        // Resolve references that appeared before this group was seen.
        if (groups.unmatchedReferences[name]) {
          groups.unmatchedReferences[name].forEach(reference => {
            updateNamedReference(reference, index);
          });
          delete groups.unmatchedReferences[name];
        }
      }
      /* falls through */
    case 'alternative':
    case 'disjunction':
    case 'quantifier':
      item.body = item.body.map(term => {
        return processTerm(term, regenerateOptions, groups);
      });
      break;
    case 'value': {
      const codePoint = item.codePoint;
      const set = regenerate(codePoint);
      // Case-folded counterparts are only added by hand when translating an
      // `iu` pattern to output that will not carry the `u` flag itself.
      if (config.ignoreCase && config.unicode && !config.useUnicodeFlag) {
        const folded = caseFold(codePoint);
        if (folded) {
          set.add(folded);
        }
      }
      update(item, set.toString(regenerateOptions));
      break;
    }
    case 'reference':
      if (item.name) {
        const name = item.name.value;
        const index = groups.names[name];
        if (index) {
          updateNamedReference(item, index);
          break;
        }
        if (!groups.unmatchedReferences[name]) {
          groups.unmatchedReferences[name] = [];
        }
        // Keep track of references used before the corresponding group.
        groups.unmatchedReferences[name].push(item);
      }
      break;
    case 'anchor':
    case 'empty':
      // Nothing to do here.
      break;
    // The `default` clause is only here as a safeguard; it should never be
    // reached. Code coverage tools should ignore it.
    /* istanbul ignore next */
    default:
      throw new Error(`Unknown term type: ${ item.type }`);
  }
  return item;
};
// Module-level settings read by the processing helpers. Every field starts
// out `false` and is overwritten by `rewritePattern` at the start of each
// call.
const config = {
  ignoreCase: false,
  unicode: false,
  dotAll: false,
  useUnicodeFlag: false
};
// Entry point: rewrites the regular expression source `pattern` and returns
// the regenerated source string.
//
// - `pattern`: the regular expression source.
// - `flags`: the flags string; `i`, `u`, and `s` are looked for here.
// - `options`: optional object. The fields read here are
//   `unicodePropertyEscape`, `namedGroup`, `lookbehind` (forwarded to the
//   parser as feature switches), `dotAllFlag` (the `s` flag is honored only
//   when this is truthy), `useUnicodeFlag`, and `onNamedGroup`.
//
// Side effect: overwrites the module-level `config` on every call.
// Throws if a named reference has no matching group, and propagates errors
// from parsing and processing.
const rewritePattern = (pattern, flags, options) => {
  const parserFeatures = {
    'unicodePropertyEscape': options && options.unicodePropertyEscape,
    'namedGroups': options && options.namedGroup,
    'lookbehind': options && options.lookbehind
  };

  config.ignoreCase = flags && flags.includes('i');
  config.unicode = flags && flags.includes('u');
  config.dotAll =
    options && options.dotAllFlag && flags && flags.includes('s');
  config.useUnicodeFlag = options && options.useUnicodeFlag;

  const regenerateOptions = {
    'hasUnicodeFlag': config.useUnicodeFlag,
    'bmpOnly': !config.unicode
  };
  const groupState = {
    'onNamedGroup': options && options.onNamedGroup,
    'lastIndex': 0,
    'names': Object.create(null), // { [name]: index }
    'unmatchedReferences': Object.create(null) // { [name]: Array<reference> }
  };

  const ast = parse(pattern, flags, parserFeatures);
  // Note: `processTerm` mutates `ast` and `groupState`.
  processTerm(ast, regenerateOptions, groupState);
  assertNoUnmatchedReferences(groupState);
  return generate(ast);
};
- module.exports = rewritePattern;
|