from .exceptions_types import EmailSyntaxError
from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \
    DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \
    DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS, \
    QUOTED_LOCAL_PART_ADDR

import re
import unicodedata
import idna  # implements IDNA 2008; Python's codec is only IDNA 2003
import ipaddress
from typing import Optional


def split_email(email):
    """Split an address into its local part and domain part.

    Returns a three-tuple ``(local_part, domain_part, was_quoted)`` where
    ``was_quoted`` tells whether the local part was given in quoted-string
    form. Raises EmailSyntaxError if an unquoted address does not contain
    exactly one @-sign.
    """

    # Typical email addresses have a single @-sign, but the
    # awkward "quoted string" local part form (RFC 5321 4.1.2)
    # allows @-signs (and escaped quotes) to appear in the local
    # part if the local part is quoted. If the address is quoted,
    # split it at a non-escaped @-sign and unescape the escaping.
    if m := QUOTED_LOCAL_PART_ADDR.match(email):
        local_part, domain_part = m.groups()

        # Since backslash-escaping is no longer needed because
        # the quotes are removed, remove backslash-escaping
        # to return in the normalized form.
        local_part = re.sub(r"\\(.)", "\\1", local_part)

        return local_part, domain_part, True

    else:
        # Split at the one and only at-sign.
        parts = email.split('@')
        if len(parts) != 2:
            raise EmailSyntaxError("The email address is not valid. It must have exactly one @-sign.")
        local_part, domain_part = parts
        return local_part, domain_part, False


def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH):
    """Helper function to return an error message related to invalid length."""
    diff = len(addr) - limit
    # With utf8=True the count is of characters, which is only a lower
    # bound on how far over the limit the encoded form is.
    prefix = "at least " if utf8 else ""
    suffix = "s" if diff > 1 else ""
    return f"({prefix}{diff} character{suffix} too many)"


def safe_character_display(c):
    """Return a representation of the character ``c`` that is safe to show in an error message.

    Letters, numbers, punctuation, and symbols are shown quoted. Anything
    else is shown by its Unicode name or, if it has no name, as a
    ``U+XXXX`` hex string.
    """
    # Return safely displayable characters in quotes.
    if c == '\\':
        return f"\"{c}\""  # can't use repr because it escapes it
    if unicodedata.category(c)[0] in ("L", "N", "P", "S"):
        return repr(c)

    # Construct a hex string in case the unicode name doesn't exist.
    # Code points up to and including U+FFFF use the four-digit form.
    if ord(c) <= 0xFFFF:
        h = f"U+{ord(c):04x}".upper()
    else:
        h = f"U+{ord(c):08x}".upper()

    # Return the character name or, if it has no name, the hex string.
    return unicodedata.name(c, h)


def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_empty_local: bool = False,
                              quoted_local_part: bool = False):
    """Validates the syntax of the local part of an email address.

    Returns a dict with the keys "local_part" (the normalized local part),
    "ascii_local_part" (the same value, or None when SMTPUTF8 is required),
    and "smtputf8" (whether SMTPUTF8 is required). Raises EmailSyntaxError
    if the local part is not valid.
    """

    if len(local) == 0:
        if not allow_empty_local:
            raise EmailSyntaxError("There must be something before the @-sign.")

        # The caller allows an empty local part. Useful for validating certain
        # Postfix aliases.
        return {
            "local_part": local,
            "ascii_local_part": local,
            "smtputf8": False,
        }

    # Check the length of the local part by counting characters.
    # (RFC 5321 4.5.3.1.1)
    # We're checking the number of characters here. If the local part
    # is ASCII-only, then that's the same as bytes (octets). If it's
    # internationalized, then the UTF-8 encoding may be longer, but
    # that may not be relevant. We will check the total address length
    # instead.
    if len(local) > LOCAL_PART_MAX_LENGTH:
        reason = get_length_reason(local, limit=LOCAL_PART_MAX_LENGTH)
        raise EmailSyntaxError(f"The email address is too long before the @-sign {reason}.")

    # Check the local part against the non-internationalized regular expression.
    # Most email addresses match this regex so it's probably fastest to check this first.
    # (RFC 5322 3.2.3)
    # All local parts matching the dot-atom rule are also valid as a quoted string
    # so if it was originally quoted (quoted_local_part is True) and this regex matches,
    # it's ok.
    # (RFC 5321 4.1.2 / RFC 5322 3.2.4).
    if DOT_ATOM_TEXT.match(local):
        # It's valid. And since it's just the permitted ASCII characters,
        # it's normalized and safe. If the local part was originally quoted,
        # the quoting was unnecessary and it'll be returned as normalized to
        # non-quoted form.

        # Return the local part and flag that SMTPUTF8 is not needed.
        return {
            "local_part": local,
            "ascii_local_part": local,
            "smtputf8": False,
        }

    # The local part failed the basic dot-atom check. Try the extended character set
    # for internationalized addresses. It's the same pattern but with additional
    # characters permitted.
    # RFC 6531 section 3.3.
    valid: Optional[str] = None
    requires_smtputf8 = False
    if DOT_ATOM_TEXT_INTL.match(local):
        # But international characters in the local part may not be permitted.
        if not allow_smtputf8:
            # Check for invalid characters against the non-internationalized
            # permitted character set.
            # (RFC 5322 3.2.3)
            bad_chars = {
                safe_character_display(c)
                for c in local
                if not ATEXT_RE.match(c)
            }
            if bad_chars:
                raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(sorted(bad_chars)) + ".")

            # Although the check above should always find something, fall back to this just in case.
            raise EmailSyntaxError("Internationalized characters before the @-sign are not supported.")

        # It's valid.
        valid = "dot-atom"
        requires_smtputf8 = True

    elif quoted_local_part:
        # There are no syntactic restrictions on quoted local parts, so if
        # it was originally quoted, it is probably valid. More characters
        # are allowed, like @-signs, spaces, and quotes, and there are no
        # restrictions on the placement of dots, as in dot-atom local parts.

        # Check for invalid characters in a quoted string local part.
        # (RFC 5321 4.1.2. RFC 5322 lists additional permitted *obsolete*
        # characters which are *not* allowed here. RFC 6531 section 3.3
        # extends the range to UTF8 strings.)
        bad_chars = {
            safe_character_display(c)
            for c in local
            if not QTEXT_INTL.match(c)
        }
        if bad_chars:
            raise EmailSyntaxError("The email address contains invalid characters in quotes before the @-sign: " + ", ".join(sorted(bad_chars)) + ".")

        # See if any characters are outside of the ASCII range.
        bad_chars = {
            safe_character_display(c)
            for c in local
            if not (32 <= ord(c) <= 126)
        }
        if bad_chars:
            requires_smtputf8 = True

            # International characters in the local part may not be permitted.
            if not allow_smtputf8:
                raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(sorted(bad_chars)) + ".")

        # It's valid.
        valid = "quoted"

    # If the local part matches the internationalized dot-atom form or was quoted,
    # perform normalization and additional checks for Unicode strings.
    if valid:
        # RFC 6532 section 3.1 says that Unicode NFC normalization should be applied,
        # so we'll return the normalized local part in the return value.
        local = unicodedata.normalize("NFC", local)

        # Check that the local part is a valid, safe, and sensible Unicode string.
        # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked
        # by DOT_ATOM_TEXT_INTL and QTEXT_INTL. Other characters may be permitted by the
        # email specs, but they may not be valid, safe, or sensible Unicode strings.
        # See the function for rationale.
        check_unsafe_chars(local, allow_space=(valid == "quoted"))

        # Try encoding to UTF-8. Failure is possible with some characters like
        # surrogate code points, but those are checked above. Still, we don't
        # want to have an unhandled exception later.
        try:
            local.encode("utf8")
        except ValueError as e:
            raise EmailSyntaxError("The email address contains an invalid character.") from e

        # If this address passes only by the quoted string form, re-quote it
        # and backslash-escape quotes and backslashes (removing any unnecessary
        # escapes). Per RFC 5321 4.1.2, "all quoted forms MUST be treated as equivalent,
        # and the sending system SHOULD transmit the form that uses the minimum quoting possible."
        if valid == "quoted":
            local = '"' + re.sub(r'(["\\])', r'\\\1', local) + '"'

        return {
            "local_part": local,
            "ascii_local_part": local if not requires_smtputf8 else None,
            "smtputf8": requires_smtputf8,
        }

    # It's not a valid local part. Let's find out why.
    # (Since quoted local parts are all valid or handled above, these checks
    # don't apply in those cases.)

    # Check for invalid characters.
    # (RFC 5322 3.2.3, plus RFC 6531 3.3)
    bad_chars = {
        safe_character_display(c)
        for c in local
        if not ATEXT_INTL_RE.match(c)
    }
    if bad_chars:
        raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".")

    # Check for dot errors imposed by the dot-atom rule.
    # (RFC 5322 3.2.3)
    check_dot_atom(local, 'An email address cannot start with a {}.', 'An email address cannot have a {} immediately before the @-sign.', is_hostname=False)

    # All of the reasons should already have been checked, but just in case
    # we have a fallback message.
    raise EmailSyntaxError("The email address contains invalid characters before the @-sign.")


def check_unsafe_chars(s, allow_space=False):
    """Raise EmailSyntaxError if ``s`` contains unsafe or non-sensible Unicode characters.

    Characters are judged by Unicode general category. When ``allow_space``
    is true, characters in category Zs (spaces) are permitted.
    """
    # Check for unsafe characters or characters that would make the string
    # invalid or non-sensible Unicode.
    bad_chars = set()
    for i, c in enumerate(s):
        category = unicodedata.category(c)
        if category[0] in ("L", "N", "P", "S"):
            # Letters, numbers, punctuation, and symbols are permitted.
            pass
        elif category[0] == "M":
            # Combining character in first position would combine with something
            # outside of the email address if concatenated, so they are not safe.
            # We also check if this occurs after the @-sign, which would not be
            # sensible.
            if i == 0:
                bad_chars.add(c)
        elif category == "Zs":
            # Spaces outside of the ASCII range are not specifically disallowed in
            # internationalized addresses as far as I can tell, but they violate
            # the spirit of the non-internationalized specification that email
            # addresses do not contain ASCII spaces when not quoted. Excluding
            # ASCII spaces when not quoted is handled directly by the atom regex.
            #
            # In quoted-string local parts, spaces are explicitly permitted, and
            # the ASCII space has category Zs, so we must allow it here, and we'll
            # allow all Unicode spaces to be consistent.
            if not allow_space:
                bad_chars.add(c)
        elif category[0] == "Z":
            # The two line and paragraph separator characters (in categories Zl and Zp)
            # are not specifically disallowed in internationalized addresses
            # as far as I can tell, but they violate the spirit of the non-internationalized
            # specification that email addresses do not contain line breaks when not quoted.
            bad_chars.add(c)
        elif category[0] == "C":
            # Control, format, surrogate, private use, and unassigned code points (C)
            # are all unsafe in various ways. Control and format characters can affect
            # text rendering if the email address is concatenated with other text.
            # Bidirectional format characters are unsafe, even if used properly, because
            # they cause an email address to render as a different email address.
            # Private use characters do not make sense for publicly deliverable
            # email addresses.
            bad_chars.add(c)
        else:
            # All categories should be handled above, but in case there is something new
            # to the Unicode specification in the future, reject all other categories.
            bad_chars.add(c)
    if bad_chars:
        raise EmailSyntaxError("The email address contains unsafe characters: "
                               + ", ".join(safe_character_display(c) for c in sorted(bad_chars)) + ".")


def check_dot_atom(label, start_descr, end_descr, is_hostname):
    """Raise EmailSyntaxError if ``label`` has misplaced periods (and, for hostnames, hyphens).

    ``start_descr`` and ``end_descr`` are message templates with a single
    ``{}`` placeholder that is filled with "period" or "hyphen".
    """
    # RFC 5322 3.2.3
    if label.endswith("."):
        raise EmailSyntaxError(end_descr.format("period"))
    if label.startswith("."):
        raise EmailSyntaxError(start_descr.format("period"))
    if ".." in label:
        raise EmailSyntaxError("An email address cannot have two periods in a row.")

    if is_hostname:
        # RFC 952
        if label.endswith("-"):
            raise EmailSyntaxError(end_descr.format("hyphen"))
        if label.startswith("-"):
            raise EmailSyntaxError(start_descr.format("hyphen"))
        if ".-" in label or "-." in label:
            raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.")


def validate_email_domain_name(domain, test_environment=False, globally_deliverable=True):
    """Validates the syntax of the domain part of an email address.

    Returns a dict with the keys "ascii_domain" (the IDNA ASCII form) and
    "domain" (the canonical internationalized form). Raises
    EmailSyntaxError if the domain is not valid.
    """

    # Check for invalid characters before normalization.
    # (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses)
    bad_chars = {
        safe_character_display(c)
        for c in domain
        if not ATEXT_HOSTNAME_INTL.match(c)
    }
    if bad_chars:
        raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".")

    # Check for unsafe characters.
    # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked
    # by DOT_ATOM_TEXT_INTL. Other characters may be permitted by the email specs, but
    # they may not be valid, safe, or sensible Unicode strings.
    check_unsafe_chars(domain)

    # Perform UTS-46 normalization, which includes casefolding, NFC normalization,
    # and converting all label separators (the period/full stop, fullwidth full stop,
    # ideographic full stop, and halfwidth ideographic full stop) to regular dots.
    # It will also raise an exception if there is an invalid character in the input,
    # such as "⒈" which is invalid because it would expand to include a dot.
    # Since several characters are normalized to a dot, this has to come before
    # checks related to dots, like check_dot_atom which comes next.
    try:
        domain = idna.uts46_remap(domain, std3_rules=False, transitional=False)
    except idna.IDNAError as e:
        raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") from e

    # The domain part is made up of dot-separated "labels." Each label must
    # have at least one character and cannot start or end with dashes, which
    # means there are some surprising restrictions on periods and dashes.
    # Check that before we do IDNA encoding because the IDNA library gives
    # unfriendly errors for these cases, but after UTS-46 normalization because
    # it can insert periods and hyphens (from fullwidth characters).
    # (RFC 952, RFC 1123 2.1, RFC 5322 3.2.3)
    check_dot_atom(domain, 'An email address cannot have a {} immediately after the @-sign.', 'An email address cannot end with a {}.', is_hostname=True)

    # Check for RFC 5890's invalid R-LDH labels, which are labels that start
    # with two characters other than "xn" and two dashes.
    for label in domain.split("."):
        if re.match(r"(?!xn)..--", label, re.I):
            raise EmailSyntaxError("An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.")

    if DOT_ATOM_TEXT_HOSTNAME.match(domain):
        # This is a valid non-internationalized domain.
        ascii_domain = domain
    else:
        # If international characters are present in the domain name, convert
        # the domain to IDNA ASCII. If internationalized characters are present,
        # the MTA must either support SMTPUTF8 or the mail client must convert the
        # domain name to IDNA before submission.
        #
        # Unfortunately this step incorrectly 'fixes' domain names with leading
        # periods by removing them, so we have to check for this above. It also gives
        # a funky error message ("No input") when there are two periods in a
        # row, also checked separately above.
        #
        # For ASCII-only domains, the transformation does nothing and is safe to
        # apply. However, to ensure we don't rely on the idna library for basic
        # syntax checks, we don't use it if it's not needed.
        #
        # uts46 is off here because it is handled above.
        try:
            ascii_domain = idna.encode(domain, uts46=False).decode("ascii")
        except idna.IDNAError as e:
            if "Domain too long" in str(e):
                # We can't really be more specific because UTS-46 normalization means
                # the length check is applied to a string that is different from the
                # one the user supplied. Also I'm not sure if the length check applies
                # to the internationalized form, the IDNA ASCII form, or even both!
                raise EmailSyntaxError("The email address is too long after the @-sign.") from e

            # Other errors seem to not be possible because the call to idna.uts46_remap
            # would have already raised them.
            raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") from e

        # Check the syntax of the string returned by idna.encode.
        # It should never fail.
        if not DOT_ATOM_TEXT_HOSTNAME.match(ascii_domain):
            raise EmailSyntaxError("The email address contains invalid characters after the @-sign after IDNA encoding.")

    # Check the length of the domain name in bytes.
    # (RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2)
    # We're checking the number of bytes ("octets") here, which can be much
    # higher than the number of characters in internationalized domains,
    # on the assumption that the domain may be transmitted without SMTPUTF8
    # as IDNA ASCII. (This is also checked by idna.encode, so this exception
    # is never reached for internationalized domains.)
    if len(ascii_domain) > DOMAIN_MAX_LENGTH:
        reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH)
        raise EmailSyntaxError(f"The email address is too long after the @-sign {reason}.")

    # Also check the label length limit.
    # (RFC 1035 2.3.1)
    for label in ascii_domain.split("."):
        if len(label) > DNS_LABEL_LENGTH_LIMIT:
            reason = get_length_reason(label, limit=DNS_LABEL_LENGTH_LIMIT)
            raise EmailSyntaxError(f"After the @-sign, periods cannot be separated by so many characters {reason}.")

    if globally_deliverable:
        # All publicly deliverable addresses have domain names with at least
        # one period, at least for gTLDs created since 2013 (per the ICANN Board
        # New gTLD Program Committee, https://www.icann.org/en/announcements/details/new-gtld-dotless-domain-names-prohibited-30-8-2013-en).
        # We'll consider the lack of a period a syntax error
        # since that will match people's sense of what an email address looks
        # like. We'll skip this in test environments to allow '@test' email
        # addresses.
        if "." not in ascii_domain and not (ascii_domain == "test" and test_environment):
            raise EmailSyntaxError("The part after the @-sign is not valid. It should have a period.")

        # We also know that all TLDs currently end with a letter.
        if not DOMAIN_NAME_REGEX.search(ascii_domain):
            raise EmailSyntaxError("The part after the @-sign is not valid. It is not within a valid top-level domain.")

    # Check special-use and reserved domain names.
    # Some might fail DNS-based deliverability checks, but that
    # can be turned off, so we should fail them all sooner.
    # See the references in __init__.py.
    # (Imported here rather than at the top of the file, presumably to
    # avoid a circular import with the package's __init__ module.)
    from . import SPECIAL_USE_DOMAIN_NAMES
    for d in SPECIAL_USE_DOMAIN_NAMES:
        # See the note near the definition of SPECIAL_USE_DOMAIN_NAMES.
        if d == "test" and test_environment:
            continue

        if ascii_domain == d or ascii_domain.endswith("." + d):
            raise EmailSyntaxError("The part after the @-sign is a special-use or reserved name that cannot be used with email.")

    # We may have been given an IDNA ASCII domain to begin with. Check
    # that the domain actually conforms to IDNA. It could look like IDNA
    # but not be actual IDNA. For ASCII-only domains, the conversion out
    # of IDNA just gives the same thing back.
    #
    # This gives us the canonical internationalized form of the domain.
    try:
        domain_i18n = idna.decode(ascii_domain.encode('ascii'))
    except idna.IDNAError as e:
        raise EmailSyntaxError(f"The part after the @-sign is not valid IDNA ({e}).") from e

    # Check for invalid and unsafe characters after normalization. See the
    # similar checks above. Both the UTS-46 normalized input and the
    # IDNA-decoded form are checked: when the caller supplies a Punycode
    # ("xn--") domain, the input is plain ASCII and only the decoded form
    # (which is what is returned for display) contains the Unicode
    # characters that need to be vetted.
    for form in (domain, domain_i18n):
        bad_chars = {
            safe_character_display(c)
            for c in form
            if not ATEXT_HOSTNAME_INTL.match(c)
        }
        if bad_chars:
            raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".")
        check_unsafe_chars(form)

    # Return the IDNA ASCII-encoded form of the domain, which is how it
    # would be transmitted on the wire (except when used with SMTPUTF8
    # possibly), as well as the canonical Unicode form of the domain,
    # which is better for display purposes. This should also take care
    # of RFC 6532 section 3.1's suggestion to apply Unicode NFC
    # normalization to addresses.
    return {
        "ascii_domain": ascii_domain,
        "domain": domain_i18n,
    }


def validate_email_length(addrinfo):
    """Raise EmailSyntaxError if the whole address is too long.

    Reads ``addrinfo.ascii_email`` and ``addrinfo.normalized`` and compares
    their lengths against EMAIL_MAX_LENGTH.
    """
    # If the email address has an ASCII representation, then we assume it may be
    # transmitted in ASCII (we can't assume SMTPUTF8 will be used on all hops to
    # the destination) and the length limit applies to ASCII characters (which is
    # the same as octets). The number of characters in the internationalized form
    # may be many fewer (because IDNA ASCII is verbose) and could be less than 254
    # Unicode characters, and of course the number of octets over the limit may
    # not be the number of characters over the limit, so if the email address is
    # internationalized, we can't give any simple information about why the address
    # is too long.
    if addrinfo.ascii_email and len(addrinfo.ascii_email) > EMAIL_MAX_LENGTH:
        if addrinfo.ascii_email == addrinfo.normalized:
            reason = get_length_reason(addrinfo.ascii_email)
        elif len(addrinfo.normalized) > EMAIL_MAX_LENGTH:
            # If there are more than 254 characters, then the ASCII
            # form is definitely going to be too long.
            reason = get_length_reason(addrinfo.normalized, utf8=True)
        else:
            reason = "(when converted to IDNA ASCII)"
        raise EmailSyntaxError(f"The email address is too long {reason}.")

    # In addition, check that the UTF-8 encoding (i.e. not IDNA ASCII and not
    # Unicode characters) is at most 254 octets. If the address is transmitted using
    # SMTPUTF8, then the length limit probably applies to the UTF-8 encoded octets.
    # If the email address has an ASCII form that differs from its internationalized
    # form, I don't think the internationalized form can be longer, and so the ASCII
    # form length check would be sufficient. If there is no ASCII form, then we have
    # to check the UTF-8 encoding. The UTF-8 encoding could be up to about four times
    # longer than the number of characters.
    #
    # See the length checks on the local part and the domain.
    if len(addrinfo.normalized.encode("utf8")) > EMAIL_MAX_LENGTH:
        if len(addrinfo.normalized) > EMAIL_MAX_LENGTH:
            # If there are more than 254 characters, then the UTF-8
            # encoding is definitely going to be too long.
            reason = get_length_reason(addrinfo.normalized, utf8=True)
        else:
            reason = "(when encoded in bytes)"
        raise EmailSyntaxError(f"The email address is too long {reason}.")


def validate_email_domain_literal(domain_literal):
    """Parse a domain literal (the text inside the brackets) as an IP address.

    Returns a dict with the keys "domain_address" (an ipaddress.IPv4Address
    or ipaddress.IPv6Address) and "domain" (the normalized, bracketed
    literal). Raises EmailSyntaxError if the literal is not a valid IPv4
    address or "IPv6:"-tagged IPv6 address.
    """
    # This is obscure domain-literal syntax. Parse it and return
    # a compressed/normalized address.
    # RFC 5321 4.1.3 and RFC 5322 3.4.1.

    # Try to parse the domain literal as an IPv4 address.
    # There is no tag for IPv4 addresses, so we can never
    # be sure if the user intends an IPv4 address.
    if re.match(r"^[0-9\.]+$", domain_literal):
        try:
            addr = ipaddress.IPv4Address(domain_literal)
        except ValueError as e:
            raise EmailSyntaxError(f"The address in brackets after the @-sign is not valid: It is not an IPv4 address ({e}) or is missing an address literal tag.") from e

        # Return the IPv4Address object and the domain back unchanged.
        return {
            "domain_address": addr,
            "domain": f"[{addr}]",
        }

    # If it begins with "IPv6:" it's an IPv6 address.
    if domain_literal.startswith("IPv6:"):
        try:
            addr = ipaddress.IPv6Address(domain_literal[5:])
        except ValueError as e:
            raise EmailSyntaxError(f"The IPv6 address in brackets after the @-sign is not valid ({e}).") from e

        # Return the IPv6Address object and construct a normalized
        # domain literal.
        return {
            "domain_address": addr,
            "domain": f"[IPv6:{addr.compressed}]",
        }

    # Nothing else is valid.

    if ":" not in domain_literal:
        raise EmailSyntaxError("The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.")

    # The tag (the part before the colon) has character restrictions,
    # but since it must come from a registry of tags (in which only "IPv6" is defined),
    # there's no need to check the syntax of the tag. See RFC 5321 4.1.2.

    # Check for permitted ASCII characters. This actually doesn't matter
    # since there will be an exception after anyway.
    bad_chars = {
        safe_character_display(c)
        for c in domain_literal
        if not DOMAIN_LITERAL_CHARS.match(c)
    }
    if bad_chars:
        raise EmailSyntaxError("The part after the @-sign contains invalid characters in brackets: " + ", ".join(sorted(bad_chars)) + ".")

    # There are no other domain literal tags.
    # https://www.iana.org/assignments/address-literal-tags/address-literal-tags.xhtml
    raise EmailSyntaxError("The part after the @-sign contains an invalid address literal tag in brackets.")