Spaces:
Build error
Build error
"use strict";
// CommonJS/ES-module interop marker emitted by the TypeScript compiler.
Object.defineProperty(exports, "__esModule", { value: true });
// Placeholder; the real `QuoteType` enum is attached to `exports` further down.
exports.QuoteType = void 0;
// Entity decoding tables and helpers used by the entity states of the tokenizer.
var decode_js_1 = require("entities/lib/decode.js");
/**
 * Character codes the tokenizer compares against.
 *
 * Built as a two-way map: `CharCodes.Tab === 9` and `CharCodes[9] === "Tab"`.
 */
var CharCodes;
(function (CharCodes) {
    CharCodes[CharCodes["Tab"] = 9] = "Tab";
    CharCodes[CharCodes["NewLine"] = 10] = "NewLine";
    CharCodes[CharCodes["FormFeed"] = 12] = "FormFeed";
    CharCodes[CharCodes["CarriageReturn"] = 13] = "CarriageReturn";
    CharCodes[CharCodes["Space"] = 32] = "Space";
    CharCodes[CharCodes["ExclamationMark"] = 33] = "ExclamationMark";
    CharCodes[CharCodes["Number"] = 35] = "Number";
    CharCodes[CharCodes["Amp"] = 38] = "Amp";
    CharCodes[CharCodes["SingleQuote"] = 39] = "SingleQuote";
    CharCodes[CharCodes["DoubleQuote"] = 34] = "DoubleQuote";
    CharCodes[CharCodes["Dash"] = 45] = "Dash";
    CharCodes[CharCodes["Slash"] = 47] = "Slash";
    CharCodes[CharCodes["Zero"] = 48] = "Zero";
    CharCodes[CharCodes["Nine"] = 57] = "Nine";
    CharCodes[CharCodes["Semi"] = 59] = "Semi";
    CharCodes[CharCodes["Lt"] = 60] = "Lt";
    CharCodes[CharCodes["Eq"] = 61] = "Eq";
    CharCodes[CharCodes["Gt"] = 62] = "Gt";
    CharCodes[CharCodes["Questionmark"] = 63] = "Questionmark";
    CharCodes[CharCodes["UpperA"] = 65] = "UpperA";
    CharCodes[CharCodes["LowerA"] = 97] = "LowerA";
    CharCodes[CharCodes["UpperF"] = 70] = "UpperF";
    CharCodes[CharCodes["LowerF"] = 102] = "LowerF";
    CharCodes[CharCodes["UpperZ"] = 90] = "UpperZ";
    CharCodes[CharCodes["LowerZ"] = 122] = "LowerZ";
    CharCodes[CharCodes["LowerX"] = 120] = "LowerX";
    CharCodes[CharCodes["OpeningSquareBracket"] = 91] = "OpeningSquareBracket";
})(CharCodes || (CharCodes = {}));
/**
 * All the states the tokenizer can be in.
 *
 * Built as a two-way map: `State.Text === 1` and `State[1] === "Text"`.
 */
var State;
(function (State) {
    State[State["Text"] = 1] = "Text";
    State[State["BeforeTagName"] = 2] = "BeforeTagName";
    State[State["InTagName"] = 3] = "InTagName";
    State[State["InSelfClosingTag"] = 4] = "InSelfClosingTag";
    State[State["BeforeClosingTagName"] = 5] = "BeforeClosingTagName";
    State[State["InClosingTagName"] = 6] = "InClosingTagName";
    State[State["AfterClosingTagName"] = 7] = "AfterClosingTagName";
    // Attributes
    State[State["BeforeAttributeName"] = 8] = "BeforeAttributeName";
    State[State["InAttributeName"] = 9] = "InAttributeName";
    State[State["AfterAttributeName"] = 10] = "AfterAttributeName";
    State[State["BeforeAttributeValue"] = 11] = "BeforeAttributeValue";
    State[State["InAttributeValueDq"] = 12] = "InAttributeValueDq";
    State[State["InAttributeValueSq"] = 13] = "InAttributeValueSq";
    State[State["InAttributeValueNq"] = 14] = "InAttributeValueNq";
    // Declarations
    State[State["BeforeDeclaration"] = 15] = "BeforeDeclaration";
    State[State["InDeclaration"] = 16] = "InDeclaration";
    // Processing instructions
    State[State["InProcessingInstruction"] = 17] = "InProcessingInstruction";
    // Comments & CDATA
    State[State["BeforeComment"] = 18] = "BeforeComment";
    State[State["CDATASequence"] = 19] = "CDATASequence";
    State[State["InSpecialComment"] = 20] = "InSpecialComment";
    State[State["InCommentLike"] = 21] = "InCommentLike";
    // Special tags
    State[State["BeforeSpecialS"] = 22] = "BeforeSpecialS";
    State[State["SpecialStartSequence"] = 23] = "SpecialStartSequence";
    State[State["InSpecialTag"] = 24] = "InSpecialTag";
    // Entities
    State[State["BeforeEntity"] = 25] = "BeforeEntity";
    State[State["BeforeNumericEntity"] = 26] = "BeforeNumericEntity";
    State[State["InNamedEntity"] = 27] = "InNamedEntity";
    State[State["InNumericEntity"] = 28] = "InNumericEntity";
    State[State["InHexEntity"] = 29] = "InHexEntity";
})(State || (State = {}));
/**
 * Checks whether a character code is one of the whitespace characters the
 * tokenizer recognises: space, line feed, tab, form feed or carriage return.
 *
 * @param c Character code to test.
 * @returns `true` if `c` is whitespace.
 */
function isWhitespace(c) {
    return (c === CharCodes.Space ||
        c === CharCodes.NewLine ||
        c === CharCodes.Tab ||
        c === CharCodes.FormFeed ||
        c === CharCodes.CarriageReturn);
}
/**
 * Checks whether a character code terminates a tag name or attribute name:
 * `/`, `>` or any whitespace character.
 *
 * @param c Character code to test.
 * @returns `true` if `c` ends the current section of a tag.
 */
function isEndOfTagSection(c) {
    return c === CharCodes.Slash || c === CharCodes.Gt || isWhitespace(c);
}
/**
 * Checks whether a character code is an ASCII decimal digit (`0`-`9`).
 *
 * @param c Character code to test.
 * @returns `true` if `c` is a decimal digit.
 */
function isNumber(c) {
    return c >= CharCodes.Zero && c <= CharCodes.Nine;
}
/**
 * Checks whether a character code is an ASCII letter (`a`-`z` or `A`-`Z`).
 *
 * @param c Character code to test.
 * @returns `true` if `c` is an ASCII letter.
 */
function isASCIIAlpha(c) {
    return ((c >= CharCodes.LowerA && c <= CharCodes.LowerZ) ||
        (c >= CharCodes.UpperA && c <= CharCodes.UpperZ));
}
/**
 * Checks whether a character code is a hexadecimal *letter* digit
 * (`a`-`f` or `A`-`F`).
 *
 * Decimal digits are intentionally not matched here; use `isNumber` for
 * `0`-`9`.
 *
 * @param c Character code to test.
 * @returns `true` if `c` is one of `a`-`f` / `A`-`F`.
 */
function isHexDigit(c) {
    return ((c >= CharCodes.UpperA && c <= CharCodes.UpperF) ||
        (c >= CharCodes.LowerA && c <= CharCodes.LowerF));
}
/**
 * How an attribute value was quoted; passed as the first argument of the
 * `onattribend` callback. Exported as `exports.QuoteType`.
 */
var QuoteType;
(function (QuoteType) {
    QuoteType[QuoteType["NoValue"] = 0] = "NoValue";
    QuoteType[QuoteType["Unquoted"] = 1] = "Unquoted";
    QuoteType[QuoteType["Single"] = 2] = "Single";
    QuoteType[QuoteType["Double"] = 3] = "Double";
})(QuoteType = exports.QuoteType || (exports.QuoteType = {}));
/**
 * Sequences used to match longer strings.
 *
 * We don't have `Script`, `Style`, or `Title` here. Instead, we re-use the *End
 * sequences with an increased offset.
 */
var Sequences = {
    Cdata: new Uint8Array([0x43, 0x44, 0x41, 0x54, 0x41, 0x5b]), // `CDATA[`
    CdataEnd: new Uint8Array([0x5d, 0x5d, 0x3e]), // `]]>`
    CommentEnd: new Uint8Array([0x2d, 0x2d, 0x3e]), // `-->`
    ScriptEnd: new Uint8Array([0x3c, 0x2f, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74]), // `</script`
    StyleEnd: new Uint8Array([0x3c, 0x2f, 0x73, 0x74, 0x79, 0x6c, 0x65]), // `</style`
    TitleEnd: new Uint8Array([0x3c, 0x2f, 0x74, 0x69, 0x74, 0x6c, 0x65]), // `</title`
};
/**
 * Character-level state machine that splits markup into tokens and reports
 * them through callbacks.
 *
 * Positions passed to callbacks are absolute indices into everything written
 * so far (`offset` + position in the current chunk), not into the current
 * chunk alone.
 */
var Tokenizer = /** @class */ (function () {
    /**
     * @param _a Options object. `xmlMode` (default `false`) disables the HTML
     *     special-tag handling and switches to the XML entity table;
     *     `decodeEntities` (default `true`) enables entity decoding.
     * @param cbs Callback object. The tokenizer calls `ontext`, `ontextentity`,
     *     `onopentagname`, `onopentagend`, `onselfclosingtag`, `onclosetag`,
     *     `onattribname`, `onattribdata`, `onattribentity`, `onattribend`,
     *     `oncomment`, `oncdata`, `ondeclaration`, `onprocessinginstruction`
     *     and `onend` on it.
     */
    function Tokenizer(_a, cbs) {
        var _b = _a.xmlMode, xmlMode = _b === void 0 ? false : _b, _c = _a.decodeEntities, decodeEntities = _c === void 0 ? true : _c;
        this.cbs = cbs;
        /** The current state the tokenizer is in. */
        this.state = State.Text;
        /** The read buffer. */
        this.buffer = "";
        /** The beginning of the section that is currently being read. */
        this.sectionStart = 0;
        /** The index within the buffer that we are currently looking at. */
        this.index = 0;
        /** Some behavior, eg. when decoding entities, is done while we are in another state. This keeps track of the other state type. */
        this.baseState = State.Text;
        /** For special parsing behavior inside of script and style tags. */
        this.isSpecial = false;
        /** Indicates whether the tokenizer has been paused. */
        this.running = true;
        /** The offset of the current buffer. */
        this.offset = 0;
        /** The byte sequence currently being matched (one of `Sequences`). */
        this.currentSequence = undefined;
        /** How many bytes of `currentSequence` have been matched so far. */
        this.sequenceIndex = 0;
        /** Position within the entity trie while matching a named entity. */
        this.trieIndex = 0;
        /** The trie node value at `trieIndex`. */
        this.trieCurrent = 0;
        /** For named entities, the index of the value. For numeric entities, the code point. */
        this.entityResult = 0;
        /** Number of characters consumed for the entity that have not been emitted yet. */
        this.entityExcess = 0;
        this.xmlMode = xmlMode;
        this.decodeEntities = decodeEntities;
        this.entityTrie = xmlMode ? decode_js_1.xmlDecodeTree : decode_js_1.htmlDecodeTree;
    }
    /**
     * Returns the tokenizer to its initial state so it can be reused for a new
     * document.
     *
     * All per-document state is cleared, including `isSpecial`. Leaving
     * `isSpecial` set while `currentSequence` is cleared (eg. when resetting
     * in the middle of `<script`) would make the next ordinary tag enter
     * `InSpecialTag` and dereference an undefined sequence.
     */
    Tokenizer.prototype.reset = function () {
        this.state = State.Text;
        this.buffer = "";
        this.sectionStart = 0;
        this.index = 0;
        this.baseState = State.Text;
        this.isSpecial = false;
        this.currentSequence = undefined;
        this.sequenceIndex = 0;
        this.trieIndex = 0;
        this.trieCurrent = 0;
        this.entityResult = 0;
        this.entityExcess = 0;
        this.running = true;
        this.offset = 0;
    };
    /**
     * Replaces the buffer with a new chunk and parses it.
     *
     * @param chunk The next piece of the document.
     */
    Tokenizer.prototype.write = function (chunk) {
        // Indices are absolute, so account for the chunk being dropped.
        this.offset += this.buffer.length;
        this.buffer = chunk;
        this.parse();
    };
    /** Signals the end of input; flushes trailing data unless paused. */
    Tokenizer.prototype.end = function () {
        if (this.running)
            this.finish();
    };
    /** Stops the parse loop after the current character. */
    Tokenizer.prototype.pause = function () {
        this.running = false;
    };
    /** Resumes parsing, continuing with whatever is left in the buffer. */
    Tokenizer.prototype.resume = function () {
        this.running = true;
        if (this.index < this.buffer.length + this.offset) {
            this.parse();
        }
    };
    /**
     * The current index within all of the written data.
     */
    Tokenizer.prototype.getIndex = function () {
        return this.index;
    };
    /**
     * The start of the current section.
     */
    Tokenizer.prototype.getSectionStart = function () {
        return this.sectionStart;
    };
    /** Plain text: look for the start of a tag or of an entity. */
    Tokenizer.prototype.stateText = function (c) {
        if (c === CharCodes.Lt ||
            (!this.decodeEntities && this.fastForwardTo(CharCodes.Lt))) {
            if (this.index > this.sectionStart) {
                this.cbs.ontext(this.sectionStart, this.index);
            }
            this.state = State.BeforeTagName;
            this.sectionStart = this.index;
        }
        else if (this.decodeEntities && c === CharCodes.Amp) {
            this.state = State.BeforeEntity;
        }
    };
    /**
     * Matches the rest of a special tag name (`script`, `style`, `title`)
     * against `currentSequence`; falls back to a regular tag name on mismatch.
     */
    Tokenizer.prototype.stateSpecialStartSequence = function (c) {
        var isEnd = this.sequenceIndex === this.currentSequence.length;
        var isMatch = isEnd
            ? // If we are at the end of the sequence, make sure the tag name has ended
                isEndOfTagSection(c)
            : // Otherwise, do a case-insensitive comparison
                (c | 0x20) === this.currentSequence[this.sequenceIndex];
        if (!isMatch) {
            this.isSpecial = false;
        }
        else if (!isEnd) {
            this.sequenceIndex++;
            return;
        }
        this.sequenceIndex = 0;
        this.state = State.InTagName;
        this.stateInTagName(c);
    };
    /** Look for an end tag. For <title> tags, also decode entities. */
    Tokenizer.prototype.stateInSpecialTag = function (c) {
        if (this.sequenceIndex === this.currentSequence.length) {
            if (c === CharCodes.Gt || isWhitespace(c)) {
                var endOfText = this.index - this.currentSequence.length;
                if (this.sectionStart < endOfText) {
                    // Spoof the index so that reported locations match up.
                    var actualIndex = this.index;
                    this.index = endOfText;
                    this.cbs.ontext(this.sectionStart, endOfText);
                    this.index = actualIndex;
                }
                this.isSpecial = false;
                this.sectionStart = endOfText + 2; // Skip over the `</`
                this.stateInClosingTagName(c);
                return; // We are done; skip the rest of the function.
            }
            this.sequenceIndex = 0;
        }
        if ((c | 0x20) === this.currentSequence[this.sequenceIndex]) {
            this.sequenceIndex += 1;
        }
        else if (this.sequenceIndex === 0) {
            if (this.currentSequence === Sequences.TitleEnd) {
                // We have to parse entities in <title> tags.
                if (this.decodeEntities && c === CharCodes.Amp) {
                    this.state = State.BeforeEntity;
                }
            }
            else if (this.fastForwardTo(CharCodes.Lt)) {
                // Outside of <title> tags, we can fast-forward.
                this.sequenceIndex = 1;
            }
        }
        else {
            // If we see a `<`, set the sequence index to 1; useful for eg. `<</script>`.
            this.sequenceIndex = Number(c === CharCodes.Lt);
        }
    };
    /** After `<![`: match `CDATA[`, or fall back to a declaration. */
    Tokenizer.prototype.stateCDATASequence = function (c) {
        if (c === Sequences.Cdata[this.sequenceIndex]) {
            if (++this.sequenceIndex === Sequences.Cdata.length) {
                this.state = State.InCommentLike;
                this.currentSequence = Sequences.CdataEnd;
                this.sequenceIndex = 0;
                this.sectionStart = this.index + 1;
            }
        }
        else {
            this.sequenceIndex = 0;
            this.state = State.InDeclaration;
            this.stateInDeclaration(c); // Reconsume the character
        }
    };
    /**
     * When we wait for one specific character, we can speed things up
     * by skipping through the buffer until we find it.
     *
     * @returns Whether the character was found.
     */
    Tokenizer.prototype.fastForwardTo = function (c) {
        while (++this.index < this.buffer.length + this.offset) {
            if (this.buffer.charCodeAt(this.index - this.offset) === c) {
                return true;
            }
        }
        /*
         * We increment the index at the end of the `parse` loop,
         * so set it to `buffer.length - 1` here.
         *
         * TODO: Refactor `parse` to increment index before calling states.
         */
        this.index = this.buffer.length + this.offset - 1;
        return false;
    };
    /**
     * Comments and CDATA end with `-->` and `]]>`.
     *
     * Their common qualities are:
     * - Their end sequences have a distinct character they start with.
     * - That character is then repeated, so we have to check multiple repeats.
     * - All characters but the start character of the sequence can be skipped.
     */
    Tokenizer.prototype.stateInCommentLike = function (c) {
        if (c === this.currentSequence[this.sequenceIndex]) {
            if (++this.sequenceIndex === this.currentSequence.length) {
                // The third argument is the number of trailing characters
                // (`--` / `]]`) that belong to the terminator, not the content.
                if (this.currentSequence === Sequences.CdataEnd) {
                    this.cbs.oncdata(this.sectionStart, this.index, 2);
                }
                else {
                    this.cbs.oncomment(this.sectionStart, this.index, 2);
                }
                this.sequenceIndex = 0;
                this.sectionStart = this.index + 1;
                this.state = State.Text;
            }
        }
        else if (this.sequenceIndex === 0) {
            // Fast-forward to the first character of the sequence
            if (this.fastForwardTo(this.currentSequence[0])) {
                this.sequenceIndex = 1;
            }
        }
        else if (c !== this.currentSequence[this.sequenceIndex - 1]) {
            // Allow long sequences, eg. --->, ]]]>
            this.sequenceIndex = 0;
        }
    };
    /**
     * HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a tag name.
     *
     * XML allows a lot more characters here (@see https://www.w3.org/TR/REC-xml/#NT-NameStartChar).
     * We allow anything that wouldn't end the tag.
     */
    Tokenizer.prototype.isTagStartChar = function (c) {
        return this.xmlMode ? !isEndOfTagSection(c) : isASCIIAlpha(c);
    };
    /**
     * Starts matching a special tag name.
     *
     * @param sequence The matching `*End` sequence.
     * @param offset Index within `sequence` to continue matching from.
     */
    Tokenizer.prototype.startSpecial = function (sequence, offset) {
        this.isSpecial = true;
        this.currentSequence = sequence;
        this.sequenceIndex = offset;
        this.state = State.SpecialStartSequence;
    };
    /** Directly after `<`: decide which kind of construct follows. */
    Tokenizer.prototype.stateBeforeTagName = function (c) {
        if (c === CharCodes.ExclamationMark) {
            this.state = State.BeforeDeclaration;
            this.sectionStart = this.index + 1;
        }
        else if (c === CharCodes.Questionmark) {
            this.state = State.InProcessingInstruction;
            this.sectionStart = this.index + 1;
        }
        else if (this.isTagStartChar(c)) {
            var lower = c | 0x20;
            this.sectionStart = this.index;
            if (!this.xmlMode && lower === Sequences.TitleEnd[2]) {
                // Starts with `t`: might be `<title`.
                this.startSpecial(Sequences.TitleEnd, 3);
            }
            else {
                // Starts with `s`: might be `<script` or `<style`.
                this.state =
                    !this.xmlMode && lower === Sequences.ScriptEnd[2]
                        ? State.BeforeSpecialS
                        : State.InTagName;
            }
        }
        else if (c === CharCodes.Slash) {
            this.state = State.BeforeClosingTagName;
        }
        else {
            // Not a tag after all; treat the `<` as text.
            this.state = State.Text;
            this.stateText(c);
        }
    };
    /** Inside an opening tag name: emit it once it ends. */
    Tokenizer.prototype.stateInTagName = function (c) {
        if (isEndOfTagSection(c)) {
            this.cbs.onopentagname(this.sectionStart, this.index);
            this.sectionStart = -1;
            this.state = State.BeforeAttributeName;
            this.stateBeforeAttributeName(c);
        }
    };
    /** Directly after `</`. */
    Tokenizer.prototype.stateBeforeClosingTagName = function (c) {
        if (isWhitespace(c)) {
            // Ignore
        }
        else if (c === CharCodes.Gt) {
            this.state = State.Text;
        }
        else {
            this.state = this.isTagStartChar(c)
                ? State.InClosingTagName
                : State.InSpecialComment;
            this.sectionStart = this.index;
        }
    };
    /** Inside a closing tag name: emit it once it ends. */
    Tokenizer.prototype.stateInClosingTagName = function (c) {
        if (c === CharCodes.Gt || isWhitespace(c)) {
            this.cbs.onclosetag(this.sectionStart, this.index);
            this.sectionStart = -1;
            this.state = State.AfterClosingTagName;
            this.stateAfterClosingTagName(c);
        }
    };
    /** After a closing tag name: discard everything up to `>`. */
    Tokenizer.prototype.stateAfterClosingTagName = function (c) {
        // Skip everything until ">"
        if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
            this.state = State.Text;
            this.baseState = State.Text;
            this.sectionStart = this.index + 1;
        }
    };
    /** Between attributes: handle the end of the tag or the start of an attribute. */
    Tokenizer.prototype.stateBeforeAttributeName = function (c) {
        if (c === CharCodes.Gt) {
            this.cbs.onopentagend(this.index);
            if (this.isSpecial) {
                this.state = State.InSpecialTag;
                this.sequenceIndex = 0;
            }
            else {
                this.state = State.Text;
            }
            this.baseState = this.state;
            this.sectionStart = this.index + 1;
        }
        else if (c === CharCodes.Slash) {
            this.state = State.InSelfClosingTag;
        }
        else if (!isWhitespace(c)) {
            this.state = State.InAttributeName;
            this.sectionStart = this.index;
        }
    };
    /** After a `/` inside a tag. */
    Tokenizer.prototype.stateInSelfClosingTag = function (c) {
        if (c === CharCodes.Gt) {
            this.cbs.onselfclosingtag(this.index);
            this.state = State.Text;
            this.baseState = State.Text;
            this.sectionStart = this.index + 1;
            this.isSpecial = false; // Reset special state, in case of self-closing special tags
        }
        else if (!isWhitespace(c)) {
            this.state = State.BeforeAttributeName;
            this.stateBeforeAttributeName(c);
        }
    };
    /** Inside an attribute name: emit it once it ends. */
    Tokenizer.prototype.stateInAttributeName = function (c) {
        if (c === CharCodes.Eq || isEndOfTagSection(c)) {
            this.cbs.onattribname(this.sectionStart, this.index);
            this.sectionStart = -1;
            this.state = State.AfterAttributeName;
            this.stateAfterAttributeName(c);
        }
    };
    /** After an attribute name: expect `=`, the end of the tag or another attribute. */
    Tokenizer.prototype.stateAfterAttributeName = function (c) {
        if (c === CharCodes.Eq) {
            this.state = State.BeforeAttributeValue;
        }
        else if (c === CharCodes.Slash || c === CharCodes.Gt) {
            this.cbs.onattribend(QuoteType.NoValue, this.index);
            this.state = State.BeforeAttributeName;
            this.stateBeforeAttributeName(c);
        }
        else if (!isWhitespace(c)) {
            this.cbs.onattribend(QuoteType.NoValue, this.index);
            this.state = State.InAttributeName;
            this.sectionStart = this.index;
        }
    };
    /** After `=`: determine how the value is quoted. */
    Tokenizer.prototype.stateBeforeAttributeValue = function (c) {
        if (c === CharCodes.DoubleQuote) {
            this.state = State.InAttributeValueDq;
            this.sectionStart = this.index + 1;
        }
        else if (c === CharCodes.SingleQuote) {
            this.state = State.InAttributeValueSq;
            this.sectionStart = this.index + 1;
        }
        else if (!isWhitespace(c)) {
            this.sectionStart = this.index;
            this.state = State.InAttributeValueNq;
            this.stateInAttributeValueNoQuotes(c); // Reconsume token
        }
    };
    /**
     * Shared handler for quoted attribute values.
     *
     * @param c Current character code.
     * @param quote Character code of the quote that terminates the value.
     */
    Tokenizer.prototype.handleInAttributeValue = function (c, quote) {
        if (c === quote ||
            (!this.decodeEntities && this.fastForwardTo(quote))) {
            this.cbs.onattribdata(this.sectionStart, this.index);
            this.sectionStart = -1;
            this.cbs.onattribend(quote === CharCodes.DoubleQuote
                ? QuoteType.Double
                : QuoteType.Single, this.index);
            this.state = State.BeforeAttributeName;
        }
        else if (this.decodeEntities && c === CharCodes.Amp) {
            // Remember where to return to once the entity has been handled.
            this.baseState = this.state;
            this.state = State.BeforeEntity;
        }
    };
    /** Inside a double-quoted attribute value. */
    Tokenizer.prototype.stateInAttributeValueDoubleQuotes = function (c) {
        this.handleInAttributeValue(c, CharCodes.DoubleQuote);
    };
    /** Inside a single-quoted attribute value. */
    Tokenizer.prototype.stateInAttributeValueSingleQuotes = function (c) {
        this.handleInAttributeValue(c, CharCodes.SingleQuote);
    };
    /** Inside an unquoted attribute value; ends at whitespace or `>`. */
    Tokenizer.prototype.stateInAttributeValueNoQuotes = function (c) {
        if (isWhitespace(c) || c === CharCodes.Gt) {
            this.cbs.onattribdata(this.sectionStart, this.index);
            this.sectionStart = -1;
            this.cbs.onattribend(QuoteType.Unquoted, this.index);
            this.state = State.BeforeAttributeName;
            this.stateBeforeAttributeName(c);
        }
        else if (this.decodeEntities && c === CharCodes.Amp) {
            this.baseState = this.state;
            this.state = State.BeforeEntity;
        }
    };
    /** Directly after `<!`: CDATA, comment or declaration. */
    Tokenizer.prototype.stateBeforeDeclaration = function (c) {
        if (c === CharCodes.OpeningSquareBracket) {
            this.state = State.CDATASequence;
            this.sequenceIndex = 0;
        }
        else {
            this.state =
                c === CharCodes.Dash
                    ? State.BeforeComment
                    : State.InDeclaration;
        }
    };
    /** Inside a declaration: emit it at the next `>`. */
    Tokenizer.prototype.stateInDeclaration = function (c) {
        if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
            this.cbs.ondeclaration(this.sectionStart, this.index);
            this.state = State.Text;
            this.sectionStart = this.index + 1;
        }
    };
    /** Inside a processing instruction: emit it at the next `>`. */
    Tokenizer.prototype.stateInProcessingInstruction = function (c) {
        if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
            this.cbs.onprocessinginstruction(this.sectionStart, this.index);
            this.state = State.Text;
            this.sectionStart = this.index + 1;
        }
    };
    /** After `<!-`: a second dash starts a comment, anything else a declaration. */
    Tokenizer.prototype.stateBeforeComment = function (c) {
        if (c === CharCodes.Dash) {
            this.state = State.InCommentLike;
            this.currentSequence = Sequences.CommentEnd;
            // Allow short comments (eg. <!-->)
            this.sequenceIndex = 2;
            this.sectionStart = this.index + 1;
        }
        else {
            this.state = State.InDeclaration;
        }
    };
    /** Inside a bogus comment (`</` followed by a non-tag character): ends at `>`. */
    Tokenizer.prototype.stateInSpecialComment = function (c) {
        if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
            this.cbs.oncomment(this.sectionStart, this.index, 0);
            this.state = State.Text;
            this.sectionStart = this.index + 1;
        }
    };
    /** After `<s`: the next character decides between `script`, `style` or a regular tag. */
    Tokenizer.prototype.stateBeforeSpecialS = function (c) {
        var lower = c | 0x20;
        if (lower === Sequences.ScriptEnd[3]) {
            this.startSpecial(Sequences.ScriptEnd, 4);
        }
        else if (lower === Sequences.StyleEnd[3]) {
            this.startSpecial(Sequences.StyleEnd, 4);
        }
        else {
            this.state = State.InTagName;
            this.stateInTagName(c); // Consume the token again
        }
    };
    /** Directly after `&`. */
    Tokenizer.prototype.stateBeforeEntity = function (c) {
        // Start excess with 1 to include the '&'
        this.entityExcess = 1;
        this.entityResult = 0;
        if (c === CharCodes.Number) {
            this.state = State.BeforeNumericEntity;
        }
        else if (c === CharCodes.Amp) {
            // We have two `&` characters in a row. Stay in the current state.
        }
        else {
            this.trieIndex = 0;
            this.trieCurrent = this.entityTrie[0];
            this.state = State.InNamedEntity;
            this.stateInNamedEntity(c);
        }
    };
    /** Walks the entity trie one character at a time. */
    Tokenizer.prototype.stateInNamedEntity = function (c) {
        this.entityExcess += 1;
        this.trieIndex = (0, decode_js_1.determineBranch)(this.entityTrie, this.trieCurrent, this.trieIndex + 1, c);
        if (this.trieIndex < 0) {
            // No branch for this character: emit what matched and reconsume `c`.
            this.emitNamedEntity();
            this.index--;
            return;
        }
        this.trieCurrent = this.entityTrie[this.trieIndex];
        var masked = this.trieCurrent & decode_js_1.BinTrieFlags.VALUE_LENGTH;
        // If the branch is a value, store it and continue
        if (masked) {
            // The mask is the number of bytes of the value, including the current byte.
            var valueLength = (masked >> 14) - 1;
            // If we have a legacy entity while parsing strictly, just skip the number of bytes
            if (!this.allowLegacyEntity() && c !== CharCodes.Semi) {
                this.trieIndex += valueLength;
            }
            else {
                // Add 1 as we have already incremented the excess
                var entityStart = this.index - this.entityExcess + 1;
                if (entityStart > this.sectionStart) {
                    this.emitPartial(this.sectionStart, entityStart);
                }
                // If this is a surrogate pair, consume the next two bytes
                this.entityResult = this.trieIndex;
                this.trieIndex += valueLength;
                this.entityExcess = 0;
                this.sectionStart = this.index + 1;
                if (valueLength === 0) {
                    this.emitNamedEntity();
                }
            }
        }
    };
    /** Emits the code point(s) of the last matched named entity, if any, and returns to `baseState`. */
    Tokenizer.prototype.emitNamedEntity = function () {
        this.state = this.baseState;
        if (this.entityResult === 0) {
            return;
        }
        var valueLength = (this.entityTrie[this.entityResult] & decode_js_1.BinTrieFlags.VALUE_LENGTH) >>
            14;
        switch (valueLength) {
            case 1: {
                // The value is stored inline, next to the length flag.
                this.emitCodePoint(this.entityTrie[this.entityResult] &
                    ~decode_js_1.BinTrieFlags.VALUE_LENGTH);
                break;
            }
            case 2: {
                this.emitCodePoint(this.entityTrie[this.entityResult + 1]);
                break;
            }
            case 3: {
                this.emitCodePoint(this.entityTrie[this.entityResult + 1]);
                this.emitCodePoint(this.entityTrie[this.entityResult + 2]);
            }
        }
    };
    /** After `&#`: an `x`/`X` selects hexadecimal, anything else decimal. */
    Tokenizer.prototype.stateBeforeNumericEntity = function (c) {
        if ((c | 0x20) === CharCodes.LowerX) {
            this.entityExcess++;
            this.state = State.InHexEntity;
        }
        else {
            this.state = State.InNumericEntity;
            this.stateInNumericEntity(c);
        }
    };
    /**
     * Emits the numeric entity collected in `entityResult`, provided at least
     * one digit was read, and returns to `baseState`.
     *
     * @param strict Whether the entity was terminated by a `;`.
     */
    Tokenizer.prototype.emitNumericEntity = function (strict) {
        var entityStart = this.index - this.entityExcess - 1;
        var numberStart = entityStart + 2 + Number(this.state === State.InHexEntity);
        if (numberStart !== this.index) {
            // Emit leading data if any
            if (entityStart > this.sectionStart) {
                this.emitPartial(this.sectionStart, entityStart);
            }
            // With a `;`, the next section starts after it.
            this.sectionStart = this.index + Number(strict);
            this.emitCodePoint((0, decode_js_1.replaceCodePoint)(this.entityResult));
        }
        this.state = this.baseState;
    };
    /** Inside a decimal numeric entity. */
    Tokenizer.prototype.stateInNumericEntity = function (c) {
        if (c === CharCodes.Semi) {
            this.emitNumericEntity(true);
        }
        else if (isNumber(c)) {
            this.entityResult = this.entityResult * 10 + (c - CharCodes.Zero);
            this.entityExcess++;
        }
        else {
            if (this.allowLegacyEntity()) {
                this.emitNumericEntity(false);
            }
            else {
                this.state = this.baseState;
            }
            // Reconsume the character in the base state.
            this.index--;
        }
    };
    /** Inside a hexadecimal numeric entity. */
    Tokenizer.prototype.stateInHexEntity = function (c) {
        if (c === CharCodes.Semi) {
            this.emitNumericEntity(true);
        }
        else if (isNumber(c)) {
            this.entityResult = this.entityResult * 16 + (c - CharCodes.Zero);
            this.entityExcess++;
        }
        else if (isHexDigit(c)) {
            this.entityResult =
                this.entityResult * 16 + ((c | 0x20) - CharCodes.LowerA + 10);
            this.entityExcess++;
        }
        else {
            if (this.allowLegacyEntity()) {
                this.emitNumericEntity(false);
            }
            else {
                this.state = this.baseState;
            }
            // Reconsume the character in the base state.
            this.index--;
        }
    };
    /**
     * Whether entities without a trailing `;` are accepted: only outside of
     * XML mode, and only in text (not in attribute values).
     */
    Tokenizer.prototype.allowLegacyEntity = function () {
        return (!this.xmlMode &&
            (this.baseState === State.Text ||
                this.baseState === State.InSpecialTag));
    };
    /**
     * Called once the current chunk is exhausted: emits the text or attribute
     * data read so far, so that it is not lost when the buffer is replaced.
     */
    Tokenizer.prototype.cleanup = function () {
        // If we are inside of text or attributes, emit what we already have.
        if (this.running && this.sectionStart !== this.index) {
            if (this.state === State.Text ||
                (this.state === State.InSpecialTag && this.sequenceIndex === 0)) {
                this.cbs.ontext(this.sectionStart, this.index);
                this.sectionStart = this.index;
            }
            else if (this.state === State.InAttributeValueDq ||
                this.state === State.InAttributeValueSq ||
                this.state === State.InAttributeValueNq) {
                this.cbs.onattribdata(this.sectionStart, this.index);
                this.sectionStart = this.index;
            }
        }
    };
    /** Whether there is unread data in the buffer and the tokenizer is not paused. */
    Tokenizer.prototype.shouldContinue = function () {
        return this.index < this.buffer.length + this.offset && this.running;
    };
    /**
     * Iterates through the buffer, calling the function corresponding to the current state.
     *
     * States that are more likely to be hit are higher up, as a performance improvement.
     */
    Tokenizer.prototype.parse = function () {
        while (this.shouldContinue()) {
            var c = this.buffer.charCodeAt(this.index - this.offset);
            switch (this.state) {
                case State.Text: {
                    this.stateText(c);
                    break;
                }
                case State.SpecialStartSequence: {
                    this.stateSpecialStartSequence(c);
                    break;
                }
                case State.InSpecialTag: {
                    this.stateInSpecialTag(c);
                    break;
                }
                case State.CDATASequence: {
                    this.stateCDATASequence(c);
                    break;
                }
                case State.InAttributeValueDq: {
                    this.stateInAttributeValueDoubleQuotes(c);
                    break;
                }
                case State.InAttributeName: {
                    this.stateInAttributeName(c);
                    break;
                }
                case State.InCommentLike: {
                    this.stateInCommentLike(c);
                    break;
                }
                case State.InSpecialComment: {
                    this.stateInSpecialComment(c);
                    break;
                }
                case State.BeforeAttributeName: {
                    this.stateBeforeAttributeName(c);
                    break;
                }
                case State.InTagName: {
                    this.stateInTagName(c);
                    break;
                }
                case State.InClosingTagName: {
                    this.stateInClosingTagName(c);
                    break;
                }
                case State.BeforeTagName: {
                    this.stateBeforeTagName(c);
                    break;
                }
                case State.AfterAttributeName: {
                    this.stateAfterAttributeName(c);
                    break;
                }
                case State.InAttributeValueSq: {
                    this.stateInAttributeValueSingleQuotes(c);
                    break;
                }
                case State.BeforeAttributeValue: {
                    this.stateBeforeAttributeValue(c);
                    break;
                }
                case State.BeforeClosingTagName: {
                    this.stateBeforeClosingTagName(c);
                    break;
                }
                case State.AfterClosingTagName: {
                    this.stateAfterClosingTagName(c);
                    break;
                }
                case State.BeforeSpecialS: {
                    this.stateBeforeSpecialS(c);
                    break;
                }
                case State.InAttributeValueNq: {
                    this.stateInAttributeValueNoQuotes(c);
                    break;
                }
                case State.InSelfClosingTag: {
                    this.stateInSelfClosingTag(c);
                    break;
                }
                case State.InDeclaration: {
                    this.stateInDeclaration(c);
                    break;
                }
                case State.BeforeDeclaration: {
                    this.stateBeforeDeclaration(c);
                    break;
                }
                case State.BeforeComment: {
                    this.stateBeforeComment(c);
                    break;
                }
                case State.InProcessingInstruction: {
                    this.stateInProcessingInstruction(c);
                    break;
                }
                case State.InNamedEntity: {
                    this.stateInNamedEntity(c);
                    break;
                }
                case State.BeforeEntity: {
                    this.stateBeforeEntity(c);
                    break;
                }
                case State.InHexEntity: {
                    this.stateInHexEntity(c);
                    break;
                }
                case State.InNumericEntity: {
                    this.stateInNumericEntity(c);
                    break;
                }
                default: {
                    // `this.state === State.BeforeNumericEntity`
                    this.stateBeforeNumericEntity(c);
                }
            }
            this.index++;
        }
        this.cleanup();
    };
    /** Flushes a pending named entity and any trailing data, then calls `onend`. */
    Tokenizer.prototype.finish = function () {
        if (this.state === State.InNamedEntity) {
            this.emitNamedEntity();
        }
        // If there is remaining data, emit it in a reasonable way
        if (this.sectionStart < this.index) {
            this.handleTrailingData();
        }
        this.cbs.onend();
    };
    /** Handle any trailing data. */
    Tokenizer.prototype.handleTrailingData = function () {
        var endIndex = this.buffer.length + this.offset;
        if (this.state === State.InCommentLike) {
            // Unterminated comment / CDATA: emit what we have, with no terminator.
            if (this.currentSequence === Sequences.CdataEnd) {
                this.cbs.oncdata(this.sectionStart, endIndex, 0);
            }
            else {
                this.cbs.oncomment(this.sectionStart, endIndex, 0);
            }
        }
        else if ((this.state === State.InNumericEntity ||
            this.state === State.InHexEntity) &&
            this.allowLegacyEntity()) {
            this.emitNumericEntity(false);
            // All trailing data will have been consumed
        }
        else if (this.state === State.InTagName ||
            this.state === State.BeforeAttributeName ||
            this.state === State.BeforeAttributeValue ||
            this.state === State.AfterAttributeName ||
            this.state === State.InAttributeName ||
            this.state === State.InAttributeValueSq ||
            this.state === State.InAttributeValueDq ||
            this.state === State.InAttributeValueNq ||
            this.state === State.InClosingTagName) {
            /*
             * If we are currently in an opening or closing tag, us not calling the
             * respective callback signals that the tag should be ignored.
             */
        }
        else {
            this.cbs.ontext(this.sectionStart, endIndex);
        }
    };
    /** Emits the data preceding an entity as text or attribute data, depending on `baseState`. */
    Tokenizer.prototype.emitPartial = function (start, endIndex) {
        if (this.baseState !== State.Text &&
            this.baseState !== State.InSpecialTag) {
            this.cbs.onattribdata(start, endIndex);
        }
        else {
            this.cbs.ontext(start, endIndex);
        }
    };
    /** Emits a decoded code point as a text or attribute entity, depending on `baseState`. */
    Tokenizer.prototype.emitCodePoint = function (cp) {
        if (this.baseState !== State.Text &&
            this.baseState !== State.InSpecialTag) {
            this.cbs.onattribentity(cp);
        }
        else {
            this.cbs.ontextentity(cp);
        }
    };
    return Tokenizer;
}());
// The tokenizer class is the module's default export.
exports.default = Tokenizer;
//# sourceMappingURL=Tokenizer.js.map