/**
 * @typedef {import('micromark-util-types').Code} Code
 */

import {unicodePunctuationRegex} from './lib/unicode-punctuation-regex.js'
/**
 * Check whether the character code represents an ASCII alpha (`a` through `z`,
 * case insensitive).
 *
 * An **ASCII alpha** is an ASCII upper alpha or ASCII lower alpha.
 *
 * An **ASCII upper alpha** is a character in the inclusive range U+0041 (`A`)
 * to U+005A (`Z`).
 *
 * An **ASCII lower alpha** is a character in the inclusive range U+0061 (`a`)
 * to U+007A (`z`).
 *
 * @param code
 *   Code.
 * @returns
 *   Whether it matches.
 */
export const asciiAlpha = regexCheck(/[A-Za-z]/)
/**
 * Check whether the character code represents an ASCII alphanumeric (`a`
 * through `z`, case insensitive, or `0` through `9`).
 *
 * An **ASCII alphanumeric** is an ASCII digit (see `asciiDigit`) or ASCII alpha
 * (see `asciiAlpha`).
 *
 * @param code
 *   Code.
 * @returns
 *   Whether it matches.
 */
export const asciiAlphanumeric = regexCheck(/[\dA-Za-z]/)
/**
 * Check whether the character code represents an ASCII atext.
 *
 * atext is an ASCII alphanumeric (see `asciiAlphanumeric`), or a character in
 * the inclusive ranges U+0023 NUMBER SIGN (`#`) to U+0027 APOSTROPHE (`'`),
 * U+002A ASTERISK (`*`), U+002B PLUS SIGN (`+`), U+002D DASH (`-`) to U+002F
 * SLASH (`/`), U+003D EQUALS TO (`=`), U+003F QUESTION MARK (`?`), U+005E
 * CARET (`^`) to U+0060 GRAVE ACCENT (`` ` ``), or U+007B LEFT CURLY BRACE
 * (`{`) to U+007E TILDE (`~`).
 *
 * Note that the `\--9` range in the pattern runs from U+002D to U+0039, so
 * U+002E DOT (`.`) is matched as well.
 *
 * NOTE(review): U+0021 EXCLAMATION MARK (`!`) is not matched by the pattern,
 * although RFC 5322 lists it as atext — confirm whether that is intended.
 *
 * See:
 * **\[RFC5322]**:
 * [Internet Message Format](https://tools.ietf.org/html/rfc5322).
 * P. Resnick.
 * IETF.
 *
 * @param code
 *   Code.
 * @returns
 *   Whether it matches.
 */
export const asciiAtext = regexCheck(/[#-'*+\--9=?A-Z^-~]/)
/**
 * Check whether a character code is an ASCII control character.
 *
 * An **ASCII control** is a character in the inclusive range U+0000 NULL (NUL)
 * to U+001F (US), or U+007F (DEL).
 *
 * Negative codes also satisfy the check, because the comparison is a plain
 * `code < 32`; `null` never matches.
 *
 * @param {Code} code
 *   Code.
 * @returns {boolean}
 *   Whether it matches.
 */
export function asciiControl(code) {
  return (
    // Special whitespace codes (which have negative values), C0 controls, and
    // the control character DEL.
    code !== null && (code < 32 || code === 127)
  )
}
/**
 * Check whether the character code represents an ASCII digit (`0` through `9`).
 *
 * An **ASCII digit** is a character in the inclusive range U+0030 (`0`) to
 * U+0039 (`9`).
 *
 * @param code
 *   Code.
 * @returns
 *   Whether it matches.
 */
export const asciiDigit = regexCheck(/\d/)
/**
 * Check whether the character code represents an ASCII hex digit (`a` through
 * `f`, case insensitive, or `0` through `9`).
 *
 * An **ASCII hex digit** is an ASCII digit (see `asciiDigit`), ASCII upper hex
 * digit, or an ASCII lower hex digit.
 *
 * An **ASCII upper hex digit** is a character in the inclusive range U+0041
 * (`A`) to U+0046 (`F`).
 *
 * An **ASCII lower hex digit** is a character in the inclusive range U+0061
 * (`a`) to U+0066 (`f`).
 *
 * @param code
 *   Code.
 * @returns
 *   Whether it matches.
 */
export const asciiHexDigit = regexCheck(/[\dA-Fa-f]/)
/**
 * Check whether the character code represents ASCII punctuation.
 *
 * An **ASCII punctuation** is a character in the inclusive ranges U+0021
 * EXCLAMATION MARK (`!`) to U+002F SLASH (`/`), U+003A COLON (`:`) to U+0040 AT
 * SIGN (`@`), U+005B LEFT SQUARE BRACKET (`[`) to U+0060 GRAVE ACCENT
 * (`` ` ``), or U+007B LEFT CURLY BRACE (`{`) to U+007E TILDE (`~`).
 *
 * @param code
 *   Code.
 * @returns
 *   Whether it matches.
 */
export const asciiPunctuation = regexCheck(/[!-/:-@[-`{-~]/)
/**
 * Check whether a character code is a markdown line ending.
 *
 * A **markdown line ending** is the virtual characters M-0003 CARRIAGE RETURN
 * LINE FEED (CRLF), M-0004 LINE FEED (LF) and M-0005 CARRIAGE RETURN (CR).
 *
 * In micromark, the actual character U+000A LINE FEED (LF) and U+000D CARRIAGE
 * RETURN (CR) are replaced by these virtual characters depending on whether
 * they occurred together.
 *
 * @param {Code} code
 *   Code.
 * @returns {boolean}
 *   Whether it matches.
 */
export function markdownLineEnding(code) {
  // Any non-null code below -2 passes; -2 and -1 are deliberately excluded.
  return code !== null && code < -2
}
/**
 * Check whether a character code is a markdown line ending (see
 * `markdownLineEnding`) or markdown space (see `markdownSpace`).
 *
 * @param {Code} code
 *   Code.
 * @returns {boolean}
 *   Whether it matches.
 */
export function markdownLineEndingOrSpace(code) {
  // Every negative (virtual) code counts, as does a literal U+0020 SPACE.
  return code !== null && (code < 0 || code === 32)
}
/**
 * Check whether a character code is a markdown space.
 *
 * A **markdown space** is the concrete character U+0020 SPACE (SP) and the
 * virtual characters M-0001 VIRTUAL SPACE (VS) and M-0002 HORIZONTAL TAB (HT).
 *
 * In micromark, the actual character U+0009 CHARACTER TABULATION (HT) is
 * replaced by one M-0002 HORIZONTAL TAB (HT) and between 0 and 3 M-0001 VIRTUAL
 * SPACE (VS) characters, depending on the column at which the tab occurred.
 *
 * @param {Code} code
 *   Code.
 * @returns {boolean}
 *   Whether it matches.
 */
export function markdownSpace(code) {
  // Strict equality means `null` falls through to `false` without a guard.
  return code === -2 || code === -1 || code === 32
}
// Size note: removing ASCII from the regex and using `asciiPunctuation` here
// in fact adds to the bundle size.
/**
 * Check whether the character code represents Unicode punctuation.
 *
 * A **Unicode punctuation** is a character in the Unicode `Pc` (Punctuation,
 * Connector), `Pd` (Punctuation, Dash), `Pe` (Punctuation, Close), `Pf`
 * (Punctuation, Final quote), `Pi` (Punctuation, Initial quote), `Po`
 * (Punctuation, Other), or `Ps` (Punctuation, Open) categories, or an ASCII
 * punctuation (see `asciiPunctuation`).
 *
 * See:
 * **\[UNICODE]**:
 * [The Unicode Standard](https://www.unicode.org/versions/).
 * Unicode Consortium.
 *
 * @param code
 *   Code.
 * @returns
 *   Whether it matches.
 */
export const unicodePunctuation = regexCheck(unicodePunctuationRegex)
/**
 * Check whether the character code represents Unicode whitespace.
 *
 * Note that this does **not** handle micromark specific markdown whitespace
 * characters (the virtual, negative codes).
 * See `markdownLineEndingOrSpace` to check that.
 *
 * A **Unicode whitespace** is a character in the Unicode `Zs` (Separator,
 * Space) category, or U+0009 CHARACTER TABULATION (HT), U+000A LINE FEED (LF),
 * U+000C (FF), or U+000D CARRIAGE RETURN (CR) (**\[UNICODE]**).
 *
 * See:
 * **\[UNICODE]**:
 * [The Unicode Standard](https://www.unicode.org/versions/).
 * Unicode Consortium.
 *
 * @param code
 *   Code.
 * @returns
 *   Whether it matches.
 */
export const unicodeWhitespace = regexCheck(/\s/)
/**
 * Create a code check from a regex.
 *
 * The returned predicate turns the code into a one-character string and tests
 * it against `regex`.
 * `null` and negative codes never match.
 *
 * @param {RegExp} regex
 *   Expression to test single characters against.
 * @returns {(code: Code) => boolean}
 *   Check bound to `regex`.
 */
function regexCheck(regex) {
  return check

  /**
   * Check whether a code matches the bound regex.
   *
   * @param {Code} code
   *   Character code.
   * @returns {boolean}
   *   Whether the character code matches the bound regex.
   */
  function check(code) {
    // `String.fromCharCode` coerces its argument with ToUint16, so without the
    // `code > -1` guard a negative code such as `-1` would be tested as
    // U+FFFF and could match by accident.
    return code !== null && code > -1 && regex.test(String.fromCharCode(code))
  }
}