fix(compiler): support full range of entity decoding in browser builds

BREAKING CHANGE: compiler options have been adjusted. - new option `decodeEntities` is added. - `namedCharacterReferences` option has been removed. - `maxCRNameLength` option has been removed.
2020-04-08 18:51:25 -04:00
parent 8c17535a47
commit 1f6e72b110
11 changed files with 245 additions and 1809 deletions
--- a/packages/compiler-core/tests/snapshots/parse.spec.ts.snap
+++ b/packages/compiler-core/tests/snapshots/parse.spec.ts.snap
--- a/packages/compiler-core/tests/parse.spec.ts
+++ b/packages/compiler-core/tests/parse.spec.ts
@@ -9,7 +9,6 @@ import {
  NodeTypes,
  Position,
  TextNode,
-  AttributeNode,
  InterpolationNode
 } from '../src/ast'

@@ -163,114 +162,6 @@ describe('compiler: parse', () => {
        }
      })
    })
-
-    test('HTML entities compatibility in text (https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state).', () => {
-      const spy = jest.fn()
-      const ast = baseParse('&ampersand;', {
-        namedCharacterReferences: { amp: '&' },
-        onError: spy
-      })
-      const text = ast.children[0] as TextNode
-
-      expect(text).toStrictEqual({
-        type: NodeTypes.TEXT,
-        content: '&ersand;',
-        loc: {
-          start: { offset: 0, line: 1, column: 1 },
-          end: { offset: 11, line: 1, column: 12 },
-          source: '&ampersand;'
-        }
-      })
-      expect(spy.mock.calls).toMatchObject([
-        [
-          {
-            code: ErrorCodes.MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
-            loc: {
-              start: { offset: 4, line: 1, column: 5 }
-            }
-          }
-        ]
-      ])
-    })
-
-    test('HTML entities compatibility in attribute (https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state).', () => {
-      const spy = jest.fn()
-      const ast = baseParse(
-        '<div a="&ampersand;" b="&amp;ersand;" c="&amp!"></div>',
-        {
-          namedCharacterReferences: { amp: '&', 'amp;': '&' },
-          onError: spy
-        }
-      )
-      const element = ast.children[0] as ElementNode
-      const text1 = (element.props[0] as AttributeNode).value
-      const text2 = (element.props[1] as AttributeNode).value
-      const text3 = (element.props[2] as AttributeNode).value
-
-      expect(text1).toStrictEqual({
-        type: NodeTypes.TEXT,
-        content: '&ampersand;',
-        loc: {
-          start: { offset: 7, line: 1, column: 8 },
-          end: { offset: 20, line: 1, column: 21 },
-          source: '"&ampersand;"'
-        }
-      })
-      expect(text2).toStrictEqual({
-        type: NodeTypes.TEXT,
-        content: '&ersand;',
-        loc: {
-          start: { offset: 23, line: 1, column: 24 },
-          end: { offset: 37, line: 1, column: 38 },
-          source: '"&amp;ersand;"'
-        }
-      })
-      expect(text3).toStrictEqual({
-        type: NodeTypes.TEXT,
-        content: '&!',
-        loc: {
-          start: { offset: 40, line: 1, column: 41 },
-          end: { offset: 47, line: 1, column: 48 },
-          source: '"&amp!"'
-        }
-      })
-      expect(spy.mock.calls).toMatchObject([
-        [
-          {
-            code: ErrorCodes.MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
-            loc: {
-              start: { offset: 45, line: 1, column: 46 }
-            }
-          }
-        ]
-      ])
-    })
-
-    test('Some control character reference should be replaced.', () => {
-      const spy = jest.fn()
-      const ast = baseParse('&#x86;', { onError: spy })
-      const text = ast.children[0] as TextNode
-
-      expect(text).toStrictEqual({
-        type: NodeTypes.TEXT,
-        content: '†',
-        loc: {
-          start: { offset: 0, line: 1, column: 1 },
-          end: { offset: 6, line: 1, column: 7 },
-          source: '&#x86;'
-        }
-      })
-      expect(spy.mock.calls).toMatchObject([
-        [
-          {
-            code: ErrorCodes.CONTROL_CHARACTER_REFERENCE,
-            loc: {
-              start: { offset: 0, line: 1, column: 1 }
-            }
-          }
-        ]
-      ])
-    })
  })

  describe('Interpolation', () => {
@@ -1652,12 +1543,10 @@ foo
    expect(baz.loc.end).toEqual({ line: 2, column: 28, offset })
  })

-  describe('namedCharacterReferences option', () => {
+  describe('decodeEntities option', () => {
    test('use the given map', () => {
      const ast: any = baseParse('&amp;&cups;', {
-        namedCharacterReferences: {
-          'cups;': '\u222A\uFE00' // UNION with serifs
-        },
+        decodeEntities: text => text.replace('&cups;', '\u222A\uFE00'),
        onError: () => {} // Ignore errors
      })

@@ -1756,60 +1645,6 @@ foo
          errors: []
        }
      ],
-      ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE: [
-        {
-          code: '<template>&#a;</template>',
-          errors: [
-            {
-              type: ErrorCodes.ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
-              loc: { offset: 10, line: 1, column: 11 }
-            }
-          ]
-        },
-        {
-          code: '<template>&#xg;</template>',
-          errors: [
-            {
-              type: ErrorCodes.ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
-              loc: { offset: 10, line: 1, column: 11 }
-            }
-          ]
-        },
-        {
-          code: '<template>&#99;</template>',
-          errors: []
-        },
-        {
-          code: '<template>&#xff;</template>',
-          errors: []
-        },
-        {
-          code: '<template attr="&#a;"></template>',
-          errors: [
-            {
-              type: ErrorCodes.ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
-              loc: { offset: 16, line: 1, column: 17 }
-            }
-          ]
-        },
-        {
-          code: '<template attr="&#xg;"></template>',
-          errors: [
-            {
-              type: ErrorCodes.ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
-              loc: { offset: 16, line: 1, column: 17 }
-            }
-          ]
-        },
-        {
-          code: '<template attr="&#99;"></template>',
-          errors: []
-        },
-        {
-          code: '<template attr="&#xff;"></template>',
-          errors: []
-        }
-      ],
      CDATA_IN_HTML_CONTENT: [
        {
          code: '<template><![CDATA[cdata]]></template>',
@@ -1825,37 +1660,6 @@ foo
          errors: []
        }
      ],
-      CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE: [
-        {
-          code: '<template>&#1234567;</template>',
-          errors: [
-            {
-              type: ErrorCodes.CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE,
-              loc: { offset: 10, line: 1, column: 11 }
-            }
-          ]
-        }
-      ],
-      CONTROL_CHARACTER_REFERENCE: [
-        {
-          code: '<template>&#0003;</template>',
-          errors: [
-            {
-              type: ErrorCodes.CONTROL_CHARACTER_REFERENCE,
-              loc: { offset: 10, line: 1, column: 11 }
-            }
-          ]
-        },
-        {
-          code: '<template>&#x7F;</template>',
-          errors: [
-            {
-              type: ErrorCodes.CONTROL_CHARACTER_REFERENCE,
-              loc: { offset: 10, line: 1, column: 11 }
-            }
-          ]
-        }
-      ],
      DUPLICATE_ATTRIBUTE: [
        {
          code: '<template><div id="" id=""></div></template>',
@@ -2412,36 +2216,6 @@ foo
          ]
        }
      ],
-      MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE: [
-        {
-          code: '<template>&amp</template>',
-          options: { namedCharacterReferences: { amp: '&' } },
-          errors: [
-            {
-              type: ErrorCodes.MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
-              loc: { offset: 14, line: 1, column: 15 }
-            }
-          ]
-        },
-        {
-          code: '<template>&#40</template>',
-          errors: [
-            {
-              type: ErrorCodes.MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
-              loc: { offset: 14, line: 1, column: 15 }
-            }
-          ]
-        },
-        {
-          code: '<template>&#x40</template>',
-          errors: [
-            {
-              type: ErrorCodes.MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
-              loc: { offset: 15, line: 1, column: 16 }
-            }
-          ]
-        }
-      ],
      MISSING_WHITESPACE_BETWEEN_ATTRIBUTES: [
        {
          code: '<template><div id="foo"class="bar"></div></template>',
@@ -2500,48 +2274,6 @@ foo
          ]
        }
      ],
-      NONCHARACTER_CHARACTER_REFERENCE: [
-        {
-          code: '<template>&#xFFFE;</template>',
-          errors: [
-            {
-              type: ErrorCodes.NONCHARACTER_CHARACTER_REFERENCE,
-              loc: { offset: 10, line: 1, column: 11 }
-            }
-          ]
-        },
-        {
-          code: '<template>&#x1FFFF;</template>',
-          errors: [
-            {
-              type: ErrorCodes.NONCHARACTER_CHARACTER_REFERENCE,
-              loc: { offset: 10, line: 1, column: 11 }
-            }
-          ]
-        }
-      ],
-      NULL_CHARACTER_REFERENCE: [
-        {
-          code: '<template>&#0000;</template>',
-          errors: [
-            {
-              type: ErrorCodes.NULL_CHARACTER_REFERENCE,
-              loc: { offset: 10, line: 1, column: 11 }
-            }
-          ]
-        }
-      ],
-      SURROGATE_CHARACTER_REFERENCE: [
-        {
-          code: '<template>&#xD800;</template>',
-          errors: [
-            {
-              type: ErrorCodes.SURROGATE_CHARACTER_REFERENCE,
-              loc: { offset: 10, line: 1, column: 11 }
-            }
-          ]
-        }
-      ],
      UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME: [
        {
          code: "<template><div a\"bc=''></div></template>",
--- a/packages/compiler-core/src/errors.ts
+++ b/packages/compiler-core/src/errors.ts
@@ -32,10 +32,7 @@ export function createCompilerError<T extends number>(
 export const enum ErrorCodes {
  // parse errors
  ABRUPT_CLOSING_OF_EMPTY_COMMENT,
-  ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
  CDATA_IN_HTML_CONTENT,
-  CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE,
-  CONTROL_CHARACTER_REFERENCE,
  DUPLICATE_ATTRIBUTE,
  END_TAG_WITH_ATTRIBUTES,
  END_TAG_WITH_TRAILING_SOLIDUS,
@@ -49,12 +46,8 @@ export const enum ErrorCodes {
  INVALID_FIRST_CHARACTER_OF_TAG_NAME,
  MISSING_ATTRIBUTE_VALUE,
  MISSING_END_TAG_NAME,
-  MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
  MISSING_WHITESPACE_BETWEEN_ATTRIBUTES,
  NESTED_COMMENT,
-  NONCHARACTER_CHARACTER_REFERENCE,
-  NULL_CHARACTER_REFERENCE,
-  SURROGATE_CHARACTER_REFERENCE,
  UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME,
  UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE,
  UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME,
@@ -101,14 +94,8 @@ export const enum ErrorCodes {
 export const errorMessages: { [code: number]: string } = {
  // parse errors
  [ErrorCodes.ABRUPT_CLOSING_OF_EMPTY_COMMENT]: 'Illegal comment.',
-  [ErrorCodes.ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE]:
-    'Illegal numeric character reference: invalid character.',
  [ErrorCodes.CDATA_IN_HTML_CONTENT]:
    'CDATA section is allowed only in XML context.',
-  [ErrorCodes.CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE]:
-    'Illegal numeric character reference: too big.',
-  [ErrorCodes.CONTROL_CHARACTER_REFERENCE]:
-    'Illegal numeric character reference: control character.',
  [ErrorCodes.DUPLICATE_ATTRIBUTE]: 'Duplicate attribute.',
  [ErrorCodes.END_TAG_WITH_ATTRIBUTES]: 'End tag cannot have attributes.',
  [ErrorCodes.END_TAG_WITH_TRAILING_SOLIDUS]: "Illegal '/' in tags.",
@@ -124,17 +111,9 @@ export const errorMessages: { [code: number]: string } = {
    "Illegal tag name. Use '&lt;' to print '<'.",
  [ErrorCodes.MISSING_ATTRIBUTE_VALUE]: 'Attribute value was expected.',
  [ErrorCodes.MISSING_END_TAG_NAME]: 'End tag name was expected.',
-  [ErrorCodes.MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE]:
-    'Semicolon was expected.',
  [ErrorCodes.MISSING_WHITESPACE_BETWEEN_ATTRIBUTES]:
    'Whitespace was expected.',
  [ErrorCodes.NESTED_COMMENT]: "Unexpected '<!--' in comment.",
-  [ErrorCodes.NONCHARACTER_CHARACTER_REFERENCE]:
-    'Illegal numeric character reference: non character.',
-  [ErrorCodes.NULL_CHARACTER_REFERENCE]:
-    'Illegal numeric character reference: null character.',
-  [ErrorCodes.SURROGATE_CHARACTER_REFERENCE]:
-    'Illegal numeric character reference: non-pair surrogate.',
  [ErrorCodes.UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME]:
    'Attribute name cannot contain U+0022 ("), U+0027 (\'), and U+003C (<).',
  [ErrorCodes.UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE]:
--- a/packages/compiler-core/src/options.ts
+++ b/packages/compiler-core/src/options.ts
@@ -26,13 +26,7 @@ export interface ParserOptions {
    parent: ElementNode | undefined
  ) => TextModes
  delimiters?: [string, string] // ['{{', '}}']
-
-  // Map to HTML entities. E.g., `{ "amp;": "&" }`
-  // The full set is https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
-  namedCharacterReferences?: Record<string, string>
-  // this number is based on the map above, but it should be pre-computed
-  // to avoid the cost on every parse() call.
-  maxCRNameLength?: number
+  decodeEntities?: (rawText: string, asAttr: boolean) => string
  onError?: (error: CompilerError) => void
 }

--- a/packages/compiler-core/src/parse.ts
+++ b/packages/compiler-core/src/parse.ts
@@ -30,6 +30,18 @@ type OptionalOptions = 'isNativeTag' | 'isBuiltInComponent'
 type MergedParserOptions = Omit<Required<ParserOptions>, OptionalOptions> &
  Pick<ParserOptions, OptionalOptions>

+// The default decoder only provides escapes for characters reserved as part of
+// the template syntax, and is only used if the custom renderer did not provide
+// a platform-specific decoder.
+const decodeRE = /&(gt|lt|amp|apos|quot);/g
+const decodeMap: Record<string, string> = {
+  gt: '>',
+  lt: '<',
+  amp: '&',
+  apos: "'",
+  quot: '"'
+}
+
 export const defaultParserOptions: MergedParserOptions = {
  delimiters: [`{{`, `}}`],
  getNamespace: () => Namespaces.HTML,
@@ -37,14 +49,8 @@ export const defaultParserOptions: MergedParserOptions = {
  isVoidTag: NO,
  isPreTag: NO,
  isCustomElement: NO,
-  namedCharacterReferences: {
-    'gt;': '>',
-    'lt;': '<',
-    'amp;': '&',
-    'apos;': "'",
-    'quot;': '"'
-  },
-  maxCRNameLength: 5,
+  decodeEntities: (rawText: string): string =>
+    rawText.replace(decodeRE, (_, p1) => decodeMap[p1]),
  onError: defaultOnError
 }

@@ -57,7 +63,7 @@ export const enum TextModes {
  ATTRIBUTE_VALUE
 }

-interface ParserContext {
+export interface ParserContext {
  options: MergedParserOptions
  readonly originalSource: string
  source: string
@@ -812,128 +818,21 @@ function parseTextData(
  length: number,
  mode: TextModes
 ): string {
-  let rawText = context.source.slice(0, length)
+  const rawText = context.source.slice(0, length)
+  advanceBy(context, length)
  if (
    mode === TextModes.RAWTEXT ||
    mode === TextModes.CDATA ||
    rawText.indexOf('&') === -1
  ) {
-    advanceBy(context, length)
    return rawText
+  } else {
+    // DATA or RCDATA containing "&". Entity decoding required.
+    return context.options.decodeEntities(
+      rawText,
+      mode === TextModes.ATTRIBUTE_VALUE
+    )
  }
-
-  // DATA or RCDATA containing "&"". Entity decoding required.
-  const end = context.offset + length
-  let decodedText = ''
-
-  function advance(length: number) {
-    advanceBy(context, length)
-    rawText = rawText.slice(length)
-  }
-
-  while (context.offset < end) {
-    const head = /&(?:#x?)?/i.exec(rawText)
-    if (!head || context.offset + head.index >= end) {
-      const remaining = end - context.offset
-      decodedText += rawText.slice(0, remaining)
-      advance(remaining)
-      break
-    }
-
-    // Advance to the "&".
-    decodedText += rawText.slice(0, head.index)
-    advance(head.index)
-
-    if (head[0] === '&') {
-      // Named character reference.
-      let name = ''
-      let value: string | undefined = undefined
-      if (/[0-9a-z]/i.test(rawText[1])) {
-        for (
-          let length = context.options.maxCRNameLength;
-          !value && length > 0;
-          --length
-        ) {
-          name = rawText.substr(1, length)
-          value = context.options.namedCharacterReferences[name]
-        }
-        if (value) {
-          const semi = name.endsWith(';')
-          if (
-            mode === TextModes.ATTRIBUTE_VALUE &&
-            !semi &&
-            /[=a-z0-9]/i.test(rawText[name.length + 1] || '')
-          ) {
-            decodedText += '&' + name
-            advance(1 + name.length)
-          } else {
-            decodedText += value
-            advance(1 + name.length)
-            if (!semi) {
-              emitError(
-                context,
-                ErrorCodes.MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE
-              )
-            }
-          }
-        } else {
-          decodedText += '&' + name
-          advance(1 + name.length)
-        }
-      } else {
-        decodedText += '&'
-        advance(1)
-      }
-    } else {
-      // Numeric character reference.
-      const hex = head[0] === '&#x'
-      const pattern = hex ? /^&#x([0-9a-f]+);?/i : /^&#([0-9]+);?/
-      const body = pattern.exec(rawText)
-      if (!body) {
-        decodedText += head[0]
-        emitError(
-          context,
-          ErrorCodes.ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE
-        )
-        advance(head[0].length)
-      } else {
-        // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
-        let cp = Number.parseInt(body[1], hex ? 16 : 10)
-        if (cp === 0) {
-          emitError(context, ErrorCodes.NULL_CHARACTER_REFERENCE)
-          cp = 0xfffd
-        } else if (cp > 0x10ffff) {
-          emitError(
-            context,
-            ErrorCodes.CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE
-          )
-          cp = 0xfffd
-        } else if (cp >= 0xd800 && cp <= 0xdfff) {
-          emitError(context, ErrorCodes.SURROGATE_CHARACTER_REFERENCE)
-          cp = 0xfffd
-        } else if ((cp >= 0xfdd0 && cp <= 0xfdef) || (cp & 0xfffe) === 0xfffe) {
-          emitError(context, ErrorCodes.NONCHARACTER_CHARACTER_REFERENCE)
-        } else if (
-          (cp >= 0x01 && cp <= 0x08) ||
-          cp === 0x0b ||
-          (cp >= 0x0d && cp <= 0x1f) ||
-          (cp >= 0x7f && cp <= 0x9f)
-        ) {
-          emitError(context, ErrorCodes.CONTROL_CHARACTER_REFERENCE)
-          cp = CCR_REPLACEMENTS[cp] || cp
-        }
-        decodedText += String.fromCodePoint(cp)
-        advance(body[0].length)
-        if (!body![0].endsWith(';')) {
-          emitError(
-            context,
-            ErrorCodes.MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE
-          )
-        }
-      }
-    }
-  }
-  return decodedText
 }

 function getCursor(context: ParserContext): Position {
@@ -1052,34 +951,3 @@ function startsWithEndTagOpen(source: string, tag: string): boolean {
    /[\t\n\f />]/.test(source[2 + tag.length] || '>')
  )
 }
-
-// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
-const CCR_REPLACEMENTS: { [key: number]: number | undefined } = {
-  0x80: 0x20ac,
-  0x82: 0x201a,
-  0x83: 0x0192,
-  0x84: 0x201e,
-  0x85: 0x2026,
-  0x86: 0x2020,
-  0x87: 0x2021,
-  0x88: 0x02c6,
-  0x89: 0x2030,
-  0x8a: 0x0160,
-  0x8b: 0x2039,
-  0x8c: 0x0152,
-  0x8e: 0x017d,
-  0x91: 0x2018,
-  0x92: 0x2019,
-  0x93: 0x201c,
-  0x94: 0x201d,
-  0x95: 0x2022,
-  0x96: 0x2013,
-  0x97: 0x2014,
-  0x98: 0x02dc,
-  0x99: 0x2122,
-  0x9a: 0x0161,
-  0x9b: 0x203a,
-  0x9c: 0x0153,
-  0x9e: 0x017e,
-  0x9f: 0x0178
-}