from .exceptions_types import EmailSyntaxError, ValidatedEmail from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \ DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_DOT_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS import re import unicodedata import idna # implements IDNA 2008; Python's codec is only IDNA 2003 import ipaddress from typing import Optional, Tuple, TypedDict, Union def split_email(email: str) -> Tuple[Optional[str], str, str, bool]: # Return the display name, unescaped local part, and domain part # of the address, and whether the local part was quoted. If no # display name was present and angle brackets do not surround # the address, display name will be None; otherwise, it will be # set to the display name or the empty string if there were # angle brackets but no display name. # Typical email addresses have a single @-sign and no quote # characters, but the awkward "quoted string" local part form # (RFC 5321 4.1.2) allows @-signs and escaped quotes to appear # in the local part if the local part is quoted. # A `display name ` format is also present in MIME messages # (RFC 5322 3.4) and this format is also often recognized in # mail UIs. It's not allowed in SMTP commands or in typical web # login forms, but parsing it has been requested, so it's done # here as a convenience. It's implemented in the spirit but not # the letter of RFC 5322 3.4 because MIME messages allow newlines # and comments as a part of the CFWS rule, but this is typically # not allowed in mail UIs (although comment syntax was requested # once too). # # Display names are either basic characters (the same basic characters # permitted in email addresses, but periods are not allowed and spaces # are allowed; see RFC 5322 Appendix A.1.2), or or a quoted string with # the same rules as a quoted local part. (Multiple quoted strings might # be allowed? Unclear.) Optional space (RFC 5322 3.4 CFWS) and then the # email address follows in angle brackets. # # An initial quote is ambiguous between starting a display name or # a quoted local part --- fun. # # We assume the input string is already stripped of leading and # trailing CFWS. def split_string_at_unquoted_special(text: str, specials: Tuple[str, ...]) -> Tuple[str, str]: # Split the string at the first character in specials (an @-sign # or left angle bracket) that does not occur within quotes and # is not followed by a Unicode combining character. # If no special character is found, raise an error. inside_quote = False escaped = False left_part = "" for i, c in enumerate(text): # < plus U+0338 (Combining Long Solidus Overlay) normalizes to # ≮ U+226E (Not Less-Than), and it would be confusing to treat # the < as the start of "" syntax in that case. Liekwise, # if anything combines with an @ or ", we should probably not # treat it as a special character. if unicodedata.normalize("NFC", text[i:])[0] != c: left_part += c elif inside_quote: left_part += c if c == '\\' and not escaped: escaped = True elif c == '"' and not escaped: # The only way to exit the quote is an unescaped quote. inside_quote = False escaped = False else: escaped = False elif c == '"': left_part += c inside_quote = True elif c in specials: # When unquoted, stop before a special character. break else: left_part += c if len(left_part) == len(text): raise EmailSyntaxError("An email address must have an @-sign.") # The right part is whatever is left. right_part = text[len(left_part):] return left_part, right_part def unquote_quoted_string(text: str) -> Tuple[str, bool]: # Remove surrounding quotes and unescape escaped backslashes # and quotes. Escapes are parsed liberally. I think only # backslashes and quotes can be escaped but we'll allow anything # to be. quoted = False escaped = False value = "" for i, c in enumerate(text): if quoted: if escaped: value += c escaped = False elif c == '\\': escaped = True elif c == '"': if i != len(text) - 1: raise EmailSyntaxError("Extra character(s) found after close quote: " + ", ".join(safe_character_display(c) for c in text[i + 1:])) break else: value += c elif i == 0 and c == '"': quoted = True else: value += c return value, quoted # Split the string at the first unquoted @-sign or left angle bracket. left_part, right_part = split_string_at_unquoted_special(email, ("@", "<")) # If the right part starts with an angle bracket, # then the left part is a display name and the rest # of the right part up to the final right angle bracket # is the email address, . if right_part.startswith("<"): # Remove space between the display name and angle bracket. left_part = left_part.rstrip() # Unquote and unescape the display name. display_name, display_name_quoted = unquote_quoted_string(left_part) # Check that only basic characters are present in a # non-quoted display name. if not display_name_quoted: bad_chars = { safe_character_display(c) for c in display_name if (not ATEXT_RE.match(c) and c != ' ') or c == '.' } if bad_chars: raise EmailSyntaxError("The display name contains invalid characters when not quoted: " + ", ".join(sorted(bad_chars)) + ".") # Check for other unsafe characters. check_unsafe_chars(display_name, allow_space=True) # Check that the right part ends with an angle bracket # but allow spaces after it, I guess. if ">" not in right_part: raise EmailSyntaxError("An open angle bracket at the start of the email address has to be followed by a close angle bracket at the end.") right_part = right_part.rstrip(" ") if right_part[-1] != ">": raise EmailSyntaxError("There can't be anything after the email address.") # Remove the initial and trailing angle brackets. addr_spec = right_part[1:].rstrip(">") # Split the email address at the first unquoted @-sign. local_part, domain_part = split_string_at_unquoted_special(addr_spec, ("@",)) # Otherwise there is no display name. The left part is the local # part and the right part is the domain. else: display_name = None local_part, domain_part = left_part, right_part if domain_part.startswith("@"): domain_part = domain_part[1:] # Unquote the local part if it is quoted. local_part, is_quoted_local_part = unquote_quoted_string(local_part) return display_name, local_part, domain_part, is_quoted_local_part def get_length_reason(addr: str, limit: int) -> str: """Helper function to return an error message related to invalid length.""" diff = len(addr) - limit suffix = "s" if diff > 1 else "" return f"({diff} character{suffix} too many)" def safe_character_display(c: str) -> str: # Return safely displayable characters in quotes. if c == '\\': return f"\"{c}\"" # can't use repr because it escapes it if unicodedata.category(c)[0] in ("L", "N", "P", "S"): return repr(c) # Construct a hex string in case the unicode name doesn't exist. if ord(c) < 0xFFFF: h = f"U+{ord(c):04x}".upper() else: h = f"U+{ord(c):08x}".upper() # Return the character name or, if it has no name, the hex string. return unicodedata.name(c, h) class LocalPartValidationResult(TypedDict): local_part: str ascii_local_part: Optional[str] smtputf8: bool def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_empty_local: bool = False, quoted_local_part: bool = False) -> LocalPartValidationResult: """Validates the syntax of the local part of an email address.""" if len(local) == 0: if not allow_empty_local: raise EmailSyntaxError("There must be something before the @-sign.") # The caller allows an empty local part. Useful for validating certain # Postfix aliases. return { "local_part": local, "ascii_local_part": local, "smtputf8": False, } # Check the length of the local part by counting characters. # (RFC 5321 4.5.3.1.1) # We're checking the number of characters here. If the local part # is ASCII-only, then that's the same as bytes (octets). If it's # internationalized, then the UTF-8 encoding may be longer, but # that may not be relevant. We will check the total address length # instead. if len(local) > LOCAL_PART_MAX_LENGTH: reason = get_length_reason(local, limit=LOCAL_PART_MAX_LENGTH) raise EmailSyntaxError(f"The email address is too long before the @-sign {reason}.") # Check the local part against the non-internationalized regular expression. # Most email addresses match this regex so it's probably fastest to check this first. # (RFC 5322 3.2.3) # All local parts matching the dot-atom rule are also valid as a quoted string # so if it was originally quoted (quoted_local_part is True) and this regex matches, # it's ok. # (RFC 5321 4.1.2 / RFC 5322 3.2.4). if DOT_ATOM_TEXT.match(local): # It's valid. And since it's just the permitted ASCII characters, # it's normalized and safe. If the local part was originally quoted, # the quoting was unnecessary and it'll be returned as normalized to # non-quoted form. # Return the local part and flag that SMTPUTF8 is not needed. return { "local_part": local, "ascii_local_part": local, "smtputf8": False, } # The local part failed the basic dot-atom check. Try the extended character set # for internationalized addresses. It's the same pattern but with additional # characters permitted. # RFC 6531 section 3.3. valid: Optional[str] = None requires_smtputf8 = False if DOT_ATOM_TEXT_INTL.match(local): # But international characters in the local part may not be permitted. if not allow_smtputf8: # Check for invalid characters against the non-internationalized # permitted character set. # (RFC 5322 3.2.3) bad_chars = { safe_character_display(c) for c in local if not ATEXT_RE.match(c) } if bad_chars: raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(sorted(bad_chars)) + ".") # Although the check above should always find something, fall back to this just in case. raise EmailSyntaxError("Internationalized characters before the @-sign are not supported.") # It's valid. valid = "dot-atom" requires_smtputf8 = True # There are no syntactic restrictions on quoted local parts, so if # it was originally quoted, it is probably valid. More characters # are allowed, like @-signs, spaces, and quotes, and there are no # restrictions on the placement of dots, as in dot-atom local parts. elif quoted_local_part: # Check for invalid characters in a quoted string local part. # (RFC 5321 4.1.2. RFC 5322 lists additional permitted *obsolete* # characters which are *not* allowed here. RFC 6531 section 3.3 # extends the range to UTF8 strings.) bad_chars = { safe_character_display(c) for c in local if not QTEXT_INTL.match(c) } if bad_chars: raise EmailSyntaxError("The email address contains invalid characters in quotes before the @-sign: " + ", ".join(sorted(bad_chars)) + ".") # See if any characters are outside of the ASCII range. bad_chars = { safe_character_display(c) for c in local if not (32 <= ord(c) <= 126) } if bad_chars: requires_smtputf8 = True # International characters in the local part may not be permitted. if not allow_smtputf8: raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(sorted(bad_chars)) + ".") # It's valid. valid = "quoted" # If the local part matches the internationalized dot-atom form or was quoted, # perform additional checks for Unicode strings. if valid: # Check that the local part is a valid, safe, and sensible Unicode string. # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked # by DOT_ATOM_TEXT_INTL and QTEXT_INTL. Other characters may be permitted by the # email specs, but they may not be valid, safe, or sensible Unicode strings. # See the function for rationale. check_unsafe_chars(local, allow_space=(valid == "quoted")) # Try encoding to UTF-8. Failure is possible with some characters like # surrogate code points, but those are checked above. Still, we don't # want to have an unhandled exception later. try: local.encode("utf8") except ValueError as e: raise EmailSyntaxError("The email address contains an invalid character.") from e # If this address passes only by the quoted string form, re-quote it # and backslash-escape quotes and backslashes (removing any unnecessary # escapes). Per RFC 5321 4.1.2, "all quoted forms MUST be treated as equivalent, # and the sending system SHOULD transmit the form that uses the minimum quoting possible." if valid == "quoted": local = '"' + re.sub(r'(["\\])', r'\\\1', local) + '"' return { "local_part": local, "ascii_local_part": local if not requires_smtputf8 else None, "smtputf8": requires_smtputf8, } # It's not a valid local part. Let's find out why. # (Since quoted local parts are all valid or handled above, these checks # don't apply in those cases.) # Check for invalid characters. # (RFC 5322 3.2.3, plus RFC 6531 3.3) bad_chars = { safe_character_display(c) for c in local if not ATEXT_INTL_DOT_RE.match(c) } if bad_chars: raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".") # Check for dot errors imposted by the dot-atom rule. # (RFC 5322 3.2.3) check_dot_atom(local, 'An email address cannot start with a {}.', 'An email address cannot have a {} immediately before the @-sign.', is_hostname=False) # All of the reasons should already have been checked, but just in case # we have a fallback message. raise EmailSyntaxError("The email address contains invalid characters before the @-sign.") def check_unsafe_chars(s: str, allow_space: bool = False) -> None: # Check for unsafe characters or characters that would make the string # invalid or non-sensible Unicode. bad_chars = set() for i, c in enumerate(s): category = unicodedata.category(c) if category[0] in ("L", "N", "P", "S"): # Letters, numbers, punctuation, and symbols are permitted. pass elif category[0] == "M": # Combining character in first position would combine with something # outside of the email address if concatenated, so they are not safe. # We also check if this occurs after the @-sign, which would not be # sensible because it would modify the @-sign. if i == 0: bad_chars.add(c) elif category == "Zs": # Spaces outside of the ASCII range are not specifically disallowed in # internationalized addresses as far as I can tell, but they violate # the spirit of the non-internationalized specification that email # addresses do not contain ASCII spaces when not quoted. Excluding # ASCII spaces when not quoted is handled directly by the atom regex. # # In quoted-string local parts, spaces are explicitly permitted, and # the ASCII space has category Zs, so we must allow it here, and we'll # allow all Unicode spaces to be consistent. if not allow_space: bad_chars.add(c) elif category[0] == "Z": # The two line and paragraph separator characters (in categories Zl and Zp) # are not specifically disallowed in internationalized addresses # as far as I can tell, but they violate the spirit of the non-internationalized # specification that email addresses do not contain line breaks when not quoted. bad_chars.add(c) elif category[0] == "C": # Control, format, surrogate, private use, and unassigned code points (C) # are all unsafe in various ways. Control and format characters can affect # text rendering if the email address is concatenated with other text. # Bidirectional format characters are unsafe, even if used properly, because # they cause an email address to render as a different email address. # Private use characters do not make sense for publicly deliverable # email addresses. bad_chars.add(c) else: # All categories should be handled above, but in case there is something new # to the Unicode specification in the future, reject all other categories. bad_chars.add(c) if bad_chars: raise EmailSyntaxError("The email address contains unsafe characters: " + ", ".join(safe_character_display(c) for c in sorted(bad_chars)) + ".") def check_dot_atom(label: str, start_descr: str, end_descr: str, is_hostname: bool) -> None: # RFC 5322 3.2.3 if label.endswith("."): raise EmailSyntaxError(end_descr.format("period")) if label.startswith("."): raise EmailSyntaxError(start_descr.format("period")) if ".." in label: raise EmailSyntaxError("An email address cannot have two periods in a row.") if is_hostname: # RFC 952 if label.endswith("-"): raise EmailSyntaxError(end_descr.format("hyphen")) if label.startswith("-"): raise EmailSyntaxError(start_descr.format("hyphen")) if ".-" in label or "-." in label: raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.") class DomainNameValidationResult(TypedDict): ascii_domain: str domain: str def validate_email_domain_name(domain: str, test_environment: bool = False, globally_deliverable: bool = True) -> DomainNameValidationResult: """Validates the syntax of the domain part of an email address.""" # Check for invalid characters. # (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses) bad_chars = { safe_character_display(c) for c in domain if not ATEXT_HOSTNAME_INTL.match(c) } if bad_chars: raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".") # Check for unsafe characters. # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked # by DOT_ATOM_TEXT_INTL. Other characters may be permitted by the email specs, but # they may not be valid, safe, or sensible Unicode strings. check_unsafe_chars(domain) # Perform UTS-46 normalization, which includes casefolding, NFC normalization, # and converting all label separators (the period/full stop, fullwidth full stop, # ideographic full stop, and halfwidth ideographic full stop) to regular dots. # It will also raise an exception if there is an invalid character in the input, # such as "⒈" which is invalid because it would expand to include a dot and # U+1FEF which normalizes to a backtick, which is not an allowed hostname character. # Since several characters *are* normalized to a dot, this has to come before # checks related to dots, like check_dot_atom which comes next. original_domain = domain try: domain = idna.uts46_remap(domain, std3_rules=False, transitional=False) except idna.IDNAError as e: raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") from e # Check for invalid characters after Unicode normalization which are not caught # by uts46_remap (see tests for examples). bad_chars = { safe_character_display(c) for c in domain if not ATEXT_HOSTNAME_INTL.match(c) } if bad_chars: raise EmailSyntaxError("The part after the @-sign contains invalid characters after Unicode normalization: " + ", ".join(sorted(bad_chars)) + ".") # The domain part is made up dot-separated "labels." Each label must # have at least one character and cannot start or end with dashes, which # means there are some surprising restrictions on periods and dashes. # Check that before we do IDNA encoding because the IDNA library gives # unfriendly errors for these cases, but after UTS-46 normalization because # it can insert periods and hyphens (from fullwidth characters). # (RFC 952, RFC 1123 2.1, RFC 5322 3.2.3) check_dot_atom(domain, 'An email address cannot have a {} immediately after the @-sign.', 'An email address cannot end with a {}.', is_hostname=True) # Check for RFC 5890's invalid R-LDH labels, which are labels that start # with two characters other than "xn" and two dashes. for label in domain.split("."): if re.match(r"(?!xn)..--", label, re.I): raise EmailSyntaxError("An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.") if DOT_ATOM_TEXT_HOSTNAME.match(domain): # This is a valid non-internationalized domain. ascii_domain = domain else: # If international characters are present in the domain name, convert # the domain to IDNA ASCII. If internationalized characters are present, # the MTA must either support SMTPUTF8 or the mail client must convert the # domain name to IDNA before submission. # # For ASCII-only domains, the transformation does nothing and is safe to # apply. However, to ensure we don't rely on the idna library for basic # syntax checks, we don't use it if it's not needed. # # idna.encode also checks the domain name length after encoding but it # doesn't give a nice error, so we call the underlying idna.alabel method # directly. idna.alabel checks label length and doesn't give great messages, # but we can't easily go to lower level methods. try: ascii_domain = ".".join( idna.alabel(label).decode("ascii") for label in domain.split(".") ) except idna.IDNAError as e: # Some errors would have already been raised by idna.uts46_remap. raise EmailSyntaxError(f"The part after the @-sign is invalid ({e}).") from e # Check the syntax of the string returned by idna.encode. # It should never fail. if not DOT_ATOM_TEXT_HOSTNAME.match(ascii_domain): raise EmailSyntaxError("The email address contains invalid characters after the @-sign after IDNA encoding.") # Check the length of the domain name in bytes. # (RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2) # We're checking the number of bytes ("octets") here, which can be much # higher than the number of characters in internationalized domains, # on the assumption that the domain may be transmitted without SMTPUTF8 # as IDNA ASCII. (This is also checked by idna.encode, so this exception # is never reached for internationalized domains.) if len(ascii_domain) > DOMAIN_MAX_LENGTH: if ascii_domain == original_domain: reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH) raise EmailSyntaxError(f"The email address is too long after the @-sign {reason}.") else: diff = len(ascii_domain) - DOMAIN_MAX_LENGTH s = "" if diff == 1 else "s" raise EmailSyntaxError(f"The email address is too long after the @-sign ({diff} byte{s} too many after IDNA encoding).") # Also check the label length limit. # (RFC 1035 2.3.1) for label in ascii_domain.split("."): if len(label) > DNS_LABEL_LENGTH_LIMIT: reason = get_length_reason(label, limit=DNS_LABEL_LENGTH_LIMIT) raise EmailSyntaxError(f"After the @-sign, periods cannot be separated by so many characters {reason}.") if globally_deliverable: # All publicly deliverable addresses have domain names with at least # one period, at least for gTLDs created since 2013 (per the ICANN Board # New gTLD Program Committee, https://www.icann.org/en/announcements/details/new-gtld-dotless-domain-names-prohibited-30-8-2013-en). # We'll consider the lack of a period a syntax error # since that will match people's sense of what an email address looks # like. We'll skip this in test environments to allow '@test' email # addresses. if "." not in ascii_domain and not (ascii_domain == "test" and test_environment): raise EmailSyntaxError("The part after the @-sign is not valid. It should have a period.") # We also know that all TLDs currently end with a letter. if not DOMAIN_NAME_REGEX.search(ascii_domain): raise EmailSyntaxError("The part after the @-sign is not valid. It is not within a valid top-level domain.") # Check special-use and reserved domain names. # Some might fail DNS-based deliverability checks, but that # can be turned off, so we should fail them all sooner. # See the references in __init__.py. from . import SPECIAL_USE_DOMAIN_NAMES for d in SPECIAL_USE_DOMAIN_NAMES: # See the note near the definition of SPECIAL_USE_DOMAIN_NAMES. if d == "test" and test_environment: continue if ascii_domain == d or ascii_domain.endswith("." + d): raise EmailSyntaxError("The part after the @-sign is a special-use or reserved name that cannot be used with email.") # We may have been given an IDNA ASCII domain to begin with. Check # that the domain actually conforms to IDNA. It could look like IDNA # but not be actual IDNA. For ASCII-only domains, the conversion out # of IDNA just gives the same thing back. # # This gives us the canonical internationalized form of the domain, # which we return to the caller as a part of the normalized email # address. try: domain_i18n = idna.decode(ascii_domain.encode('ascii')) except idna.IDNAError as e: raise EmailSyntaxError(f"The part after the @-sign is not valid IDNA ({e}).") from e # Check that this normalized domain name has not somehow become # an invalid domain name. All of the checks before this point # using the idna package probably guarantee that we now have # a valid international domain name in most respects. But it # doesn't hurt to re-apply some tests to be sure. See the similar # tests above. # Check for invalid and unsafe characters. We have no test # case for this. bad_chars = { safe_character_display(c) for c in domain if not ATEXT_HOSTNAME_INTL.match(c) } if bad_chars: raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".") check_unsafe_chars(domain) # Check that it can be encoded back to IDNA ASCII. We have no test # case for this. try: idna.encode(domain_i18n) except idna.IDNAError as e: raise EmailSyntaxError(f"The part after the @-sign became invalid after normalizing to international characters ({e}).") from e # Return the IDNA ASCII-encoded form of the domain, which is how it # would be transmitted on the wire (except when used with SMTPUTF8 # possibly), as well as the canonical Unicode form of the domain, # which is better for display purposes. This should also take care # of RFC 6532 section 3.1's suggestion to apply Unicode NFC # normalization to addresses. return { "ascii_domain": ascii_domain, "domain": domain_i18n, } def validate_email_length(addrinfo: ValidatedEmail) -> None: # There are three forms of the email address whose length must be checked: # # 1) The original email address string. Since callers may continue to use # this string, even though we recommend using the normalized form, we # should not pass validation when the original input is not valid. This # form is checked first because it is the original input. # 2) The normalized email address. We perform Unicode NFC normalization of # the local part, we normalize the domain to internationalized characters # (if originaly IDNA ASCII) which also includes Unicode normalization, # and we may remove quotes in quoted local parts. We recommend that # callers use this string, so it must be valid. # 3) The email address with the IDNA ASCII representation of the domain # name, since this string may be used with email stacks that don't # support UTF-8. Since this is the least likely to be used by callers, # it is checked last. Note that ascii_email will only be set if the # local part is ASCII, but conceivably the caller may combine a # internationalized local part with an ASCII domain, so we check this # on that combination also. Since we only return the normalized local # part, we use that (and not the unnormalized local part). # # In all cases, the length is checked in UTF-8 because the SMTPUTF8 # extension to SMTP validates the length in bytes. addresses_to_check = [ (addrinfo.original, None), (addrinfo.normalized, "after normalization"), ((addrinfo.ascii_local_part or addrinfo.local_part or "") + "@" + addrinfo.ascii_domain, "when the part after the @-sign is converted to IDNA ASCII"), ] for addr, reason in addresses_to_check: addr_len = len(addr) addr_utf8_len = len(addr.encode("utf8")) diff = addr_utf8_len - EMAIL_MAX_LENGTH if diff > 0: if reason is None and addr_len == addr_utf8_len: # If there is no normalization or transcoding, # we can give a simple count of the number of # characters over the limit. reason = get_length_reason(addr, limit=EMAIL_MAX_LENGTH) elif reason is None: # If there is no normalization but there is # some transcoding to UTF-8, we can compute # the minimum number of characters over the # limit by dividing the number of bytes over # the limit by the maximum number of bytes # per character. mbpc = max(len(c.encode("utf8")) for c in addr) mchars = max(1, diff // mbpc) suffix = "s" if diff > 1 else "" if mchars == diff: reason = f"({diff} character{suffix} too many)" else: reason = f"({mchars}-{diff} character{suffix} too many)" else: # Since there is normalization, the number of # characters in the input that need to change is # impossible to know. suffix = "s" if diff > 1 else "" reason += f" ({diff} byte{suffix} too many)" raise EmailSyntaxError(f"The email address is too long {reason}.") class DomainLiteralValidationResult(TypedDict): domain_address: Union[ipaddress.IPv4Address, ipaddress.IPv6Address] domain: str def validate_email_domain_literal(domain_literal: str) -> DomainLiteralValidationResult: # This is obscure domain-literal syntax. Parse it and return # a compressed/normalized address. # RFC 5321 4.1.3 and RFC 5322 3.4.1. addr: Union[ipaddress.IPv4Address, ipaddress.IPv6Address] # Try to parse the domain literal as an IPv4 address. # There is no tag for IPv4 addresses, so we can never # be sure if the user intends an IPv4 address. if re.match(r"^[0-9\.]+$", domain_literal): try: addr = ipaddress.IPv4Address(domain_literal) except ValueError as e: raise EmailSyntaxError(f"The address in brackets after the @-sign is not valid: It is not an IPv4 address ({e}) or is missing an address literal tag.") from e # Return the IPv4Address object and the domain back unchanged. return { "domain_address": addr, "domain": f"[{addr}]", } # If it begins with "IPv6:" it's an IPv6 address. if domain_literal.startswith("IPv6:"): try: addr = ipaddress.IPv6Address(domain_literal[5:]) except ValueError as e: raise EmailSyntaxError(f"The IPv6 address in brackets after the @-sign is not valid ({e}).") from e # Return the IPv6Address object and construct a normalized # domain literal. return { "domain_address": addr, "domain": f"[IPv6:{addr.compressed}]", } # Nothing else is valid. if ":" not in domain_literal: raise EmailSyntaxError("The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.") # The tag (the part before the colon) has character restrictions, # but since it must come from a registry of tags (in which only "IPv6" is defined), # there's no need to check the syntax of the tag. See RFC 5321 4.1.2. # Check for permitted ASCII characters. This actually doesn't matter # since there will be an exception after anyway. bad_chars = { safe_character_display(c) for c in domain_literal if not DOMAIN_LITERAL_CHARS.match(c) } if bad_chars: raise EmailSyntaxError("The part after the @-sign contains invalid characters in brackets: " + ", ".join(sorted(bad_chars)) + ".") # There are no other domain literal tags. # https://www.iana.org/assignments/address-literal-tags/address-literal-tags.xhtml raise EmailSyntaxError("The part after the @-sign contains an invalid address literal tag in brackets.")