|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import ParseError from "./ParseError"; |
|
import SourceLocation from "./SourceLocation"; |
|
import {Token} from "./Token"; |
|
|
|
import type {LexerInterface} from "./Token"; |
|
import type Settings from "./Settings"; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Building blocks for the tokenizer regex. They are kept as strings so they
// can be composed; the capture-group numbering of `tokenRegexString` (see
// below) is relied upon by the code that consumes the match result.

/** One whitespace character: space, carriage return, newline or tab. */
const spaceRegexString = "[ \r\n\t]";

/** A control word: a backslash followed by one or more letters or `@`. */
const controlWordRegexString = "\\\\[a-zA-Z@]+";

/**
 * A control symbol: a backslash followed by any single UTF-16 code unit
 * that is not a surrogate.
 */
const controlSymbolRegexString = "\\\\[^\uD800-\uDFFF]";

/**
 * A control word (captured) followed by any amount of whitespace. The
 * whitespace is part of the match but not of the capture group.
 */
const controlWordWhitespaceRegexString =
    `(${controlWordRegexString})${spaceRegexString}*`;

/**
 * A control space: a backslash followed by either a single newline, or a run
 * of spaces/CRs/tabs with at most one newline after it (this part is
 * captured), followed by further spaces/CRs/tabs. A second newline is never
 * consumed.
 */
const controlSpaceRegexString = "\\\\(\n|[ \r\t]+\n?)[ \r\t]*";

/** One character from the Combining Diacritical Marks range U+0300-U+036F. */
const combiningDiacriticalMarkString = "[\u0300-\u036f]";

/** Matches a run of one or more combining diacritical marks at end of string. */
export const combiningDiacriticalMarksEndRegex: RegExp =
    new RegExp(`${combiningDiacriticalMarkString}+$`);

/**
 * The full tokenizer pattern. Capture groups:
 *   1 - a run of whitespace
 *   2 - the whitespace part of a control space (backslash + whitespace)
 *   3 - any other token (the alternatives listed inline below)
 *   4 - delimiter character of a starred \verb (used as back-reference \4)
 *   5 - delimiter character of an unstarred \verb (back-reference \5)
 *   6 - the control word itself, without the whitespace that follows it
 *
 * The single-code-unit class deliberately skips the backslash (U+005C),
 * U+2028/U+2029, and everything from U+D800 through U+F8FF; surrogate pairs
 * are handled by their own alternative.
 */
const tokenRegexString = `(${spaceRegexString}+)|` +  // whitespace
    `${controlSpaceRegexString}|` +                   // backslash + whitespace
    "([!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" +  // single code unit
    `${combiningDiacriticalMarkString}*` +            // ...plus accents
    "|[\uD800-\uDBFF][\uDC00-\uDFFF]" +               // surrogate pair
    `${combiningDiacriticalMarkString}*` +            // ...plus accents
    "|\\\\verb\\*([^]).*?\\4" +                       // starred \verb
    "|\\\\verb([^*a-zA-Z]).*?\\5" +                   // unstarred \verb
    `|${controlWordWhitespaceRegexString}` +          // control word + spaces
    `|${controlSymbolRegexString})`;                  // control symbol
|
|
|
|
|
/**
 * Splits an input string into `Token`s, one per call to `lex()`.
 *
 * The current position is not stored separately: it is the `lastIndex` of
 * the instance's global `tokenRegex`, which advances with every successful
 * match.
 */
export default class Lexer implements LexerInterface {
    input: string;
    settings: Settings;
    tokenRegex: RegExp;
    // Category codes by character. `lex()` only acts on code 14, which it
    // treats as "start of a comment"; other codes are stored for callers.
    catcodes: {[char: string]: number};

    /**
     * @param input The full text to tokenize.
     * @param settings Used to report non-strict input via `reportNonstrict`.
     */
    constructor(input: string, settings: Settings) {
        this.input = input;
        this.settings = settings;
        // A fresh 'g' regex per lexer so each instance has its own lastIndex.
        this.tokenRegex = new RegExp(tokenRegexString, 'g');
        this.catcodes = {
            "%": 14, // comment character
            "~": 13, // 13 is TeX's "active character" category; unused here
        };
    }

    /** Sets (or overrides) the category code of `char`. */
    setCatcode(char: string, code: number) {
        this.catcodes[char] = code;
    }

    /**
     * Returns the next token and advances past it.
     *
     * - At end of input, returns a token with text "EOF" and an empty span.
     * - A run of whitespace yields " "; a control space yields "\\ "; a
     *   control word yields its name without the whitespace that follows it
     *   (that whitespace is still consumed and included in the location).
     * - A token whose catcode is 14 starts a comment: everything up to and
     *   including the next newline is skipped. If there is no newline, the
     *   rest of the input is skipped and "commentAtEnd" is reported through
     *   `settings.reportNonstrict`.
     *
     * @throws ParseError if no token pattern matches at the current position.
     */
    lex(): Token {
        const input = this.input;
        // Comments are skipped in a loop rather than by recursing, so a long
        // run of consecutive comment lines cannot exhaust the call stack.
        for (;;) {
            const pos = this.tokenRegex.lastIndex;
            if (pos === input.length) {
                return new Token("EOF", new SourceLocation(this, pos, pos));
            }
            const match = this.tokenRegex.exec(input);
            // The regex is global but not sticky, so it may skip ahead; a
            // match anywhere other than `pos` means `pos` is not a token.
            if (match === null || match.index !== pos) {
                throw new ParseError(
                    `Unexpected character: '${input[pos]}'`,
                    new Token(input[pos],
                        new SourceLocation(this, pos, pos + 1)));
            }
            // Group 6: control word, group 3: other token, group 2: control
            // space; otherwise the match was plain whitespace (group 1).
            const text = match[6] || match[3] || (match[2] ? "\\ " : " ");

            if (this.catcodes[text] !== 14) {
                return new Token(text, new SourceLocation(this, pos,
                    this.tokenRegex.lastIndex));
            }

            // Comment character: discard through the end of the line.
            const nlIndex = input.indexOf('\n', this.tokenRegex.lastIndex);
            if (nlIndex === -1) {
                this.tokenRegex.lastIndex = input.length;
                this.settings.reportNonstrict("commentAtEnd",
                    "% comment has no terminating newline; LaTeX would " +
                    "fail because of commenting the end of math mode (e.g. $)");
            } else {
                this.tokenRegex.lastIndex = nlIndex + 1;
            }
        }
    }
}
|
|