import email import imaplib import logging from datetime import datetime from email.header import decode_header from email.message import Message from typing import Any, Dict, List, Optional import pytz from pydantic import Field, PrivateAttr, SecretStr from pydantic_settings import BaseSettings from obsei.payload import TextPayload from obsei.misc.utils import ( DATETIME_STRING_PATTERN, DEFAULT_LOOKUP_PERIOD, convert_utc_time, text_from_html, ) from obsei.source.base_source import BaseSource, BaseSourceConfig logger = logging.getLogger(__name__) class EmailCredInfo(BaseSettings): username: Optional[SecretStr] = Field(None, env="email_username") password: Optional[SecretStr] = Field(None, env="email_password") class EmailConfig(BaseSourceConfig): # This is done to avoid exposing member to API response _imap_client: imaplib.IMAP4 = PrivateAttr() TYPE: str = "Email" # List of IMAP servers for most commonly used email providers # https://www.systoolsgroup.com/imap/ # Also, if you're using a Gmail account then make sure you allow less secure apps on your account - # https://myaccount.google.com/lesssecureapps?pli=1 # Also enable IMAP access - # https://mail.google.com/mail/u/0/#settings/fwdandpop imap_server: str imap_port: Optional[int] = None download_attachments: Optional[bool] = False mailboxes: List[str] = Field(["INBOX"]) cred_info: Optional[EmailCredInfo] = Field(None) lookup_period: Optional[str] = None def __init__(self, **data: Any): super().__init__(**data) self.cred_info = self.cred_info or EmailCredInfo() if self.cred_info.password is None or self.cred_info.username is None: raise ValueError("Email account `username` and `password` is required") if self.imap_port: self._imap_client = imaplib.IMAP4_SSL( host=self.imap_server, port=self.imap_port ) else: self._imap_client = imaplib.IMAP4_SSL(self.imap_server) self._imap_client.login( user=self.cred_info.username.get_secret_value(), password=self.cred_info.password.get_secret_value(), ) def __del__(self) -> None: # self._imap_client.close() self._imap_client.logout() def get_client(self) -> imaplib.IMAP4: return self._imap_client class EmailSource(BaseSource): NAME: str = "Email" @staticmethod def clean(text: str) -> str: # clean text for creating a folder return "".join(c if c.isalnum() else "_" for c in text) def lookup(self, config: EmailConfig, **kwargs: Any) -> List[TextPayload]: # type: ignore[override] source_responses: List[TextPayload] = [] # Get data from state id: str = kwargs.get("id", None) state: Optional[Dict[str, Any]] = ( None if id is None or self.store is None else self.store.get_source_state(id) ) update_state: bool = True if id else False state = state or dict() imap_client = config.get_client() for mailbox in config.mailboxes: need_more_lookup = True status, messages = imap_client.select(mailbox=mailbox, readonly=True) if status != "OK": logger.warning(f"Not able to connect with {mailbox}: {status}") continue mailbox_stat: Dict[str, Any] = state.get(mailbox, dict()) lookup_period: str = mailbox_stat.get( "since_time", config.lookup_period or DEFAULT_LOOKUP_PERIOD ) if len(lookup_period) <= 5: since_time = convert_utc_time(lookup_period) else: since_time = datetime.strptime(lookup_period, DATETIME_STRING_PATTERN) if since_time.tzinfo is None: since_time = since_time.replace(tzinfo=pytz.utc) else: since_time = since_time.astimezone(pytz.utc) last_since_time: datetime = since_time since_id: Optional[int] = mailbox_stat.get("since_message_id", None) last_index = since_id state[mailbox] = mailbox_stat num_of_emails = int(str(messages[0])) # Read in reverse order means latest emails first # Most of code is borrowed from https://www.thepythoncode.com/article/reading-emails-in-python and # modified to suite here for index in range(num_of_emails, 0, -1): email_meta: Dict[str, Any] = dict() # fetch the email message by ID status, email_message = imap_client.fetch(str(index), "(RFC822)") email_content: str = "" for response in email_message: if isinstance(response, tuple): # parse a bytes email into a message object msg = email.message_from_bytes(response[1]) email_meta["subject"] = self._parse_email_header(msg, "Subject") email_meta["from_address"] = self._parse_email_header( msg, "From" ) email_meta["to_address"] = self._parse_email_header(msg, "To") date_received_str = self._parse_email_header(msg, "Date") try: date_received = datetime.strptime( date_received_str, "%a, %d %b %Y %H:%M:%S %Z" ) except Exception: try: date_received = datetime.strptime( date_received_str, "%a, %d %b %Y %H:%M:%S %z" ) except Exception: date_received = datetime.strptime( date_received_str, "%a, %d %b %Y %H:%M:%S %z (%Z)" ) if date_received.tzinfo is None: date_received = date_received.replace(tzinfo=pytz.utc) else: date_received = date_received.astimezone(pytz.utc) email_meta["date_received"] = date_received email_meta["message_id"] = self._parse_email_header( msg, "Message-ID" ) part_id = 0 # if the email message is multipart if msg.is_multipart(): # iterate over email parts for part in msg.walk(): part_id_str = f"part_{part_id}" # extract content type of email content_type = part.get_content_type() content_disposition = str( part.get("Content-Disposition") ) email_meta[part_id_str] = dict() email_meta[part_id_str]["content_type"] = content_type email_meta[part_id_str][ "content_disposition" ] = content_disposition if ( "attachment" not in content_disposition and "text/" in content_type ): try: # get the email body email_body = part.get_payload( decode=True ).decode() if content_type == "text/html": email_body = text_from_html(email_body) # append email body with existing email_meta[part_id_str][ "email_body" ] = email_body email_content = ( email_content + "\n" + email_body ) except Exception: logger.error("Unable to parse email body") elif "attachment" in content_disposition: logger.warning( "Email attachment download is not supported" ) # Download attachment is commented currently # # download attachment # filename = part.get_filename() # if filename: # folder_name = self.clean(subject) # if not os.path.isdir(folder_name): # # make a folder for this email (named after the subject) # os.mkdir(folder_name) # filepath = os.path.join(folder_name, filename) # # download attachment and save it # open(filepath, "wb").write(part.get_payload(decode=True)) part_id = part_id + 1 else: part_id_str = f"part_{part_id}" email_meta[part_id_str] = dict() # extract content type of email content_type = msg.get_content_type() email_meta[part_id_str]["content_type"] = content_type # get the email body email_body = msg.get_payload(decode=True).decode() if content_type == "text/html": email_body = text_from_html(email_body) email_meta[part_id_str]["email_body"] = email_body email_content = email_content + "\n" + email_body if date_received <= since_time: need_more_lookup = False break if last_index and last_index == email_meta["message_id"]: need_more_lookup = False break if last_since_time is None or last_since_time < date_received: last_since_time = date_received if last_index is None: last_index = email_meta["message_id"] source_responses.append( TextPayload( processed_text="\n".join( [email_meta.get("subject", ""), email_content] ), meta=email_meta, source_name=self.NAME, ) ) if not need_more_lookup: break mailbox_stat["since_time"] = last_since_time.strftime( DATETIME_STRING_PATTERN ) mailbox_stat["since_comment_id"] = last_index if update_state and self.store is not None: self.store.update_source_state(workflow_id=id, state=state) return source_responses @staticmethod def _email_cleanup(content: str): # type: ignore[no-untyped-def] # TODO: Implement the method to cleanup email contents pass @staticmethod def _parse_email_header(header: Message, key: str) -> str: value, encoding = decode_header(header[key])[0] if isinstance(value, bytes): # if it's a bytes, decode to str return "" if not encoding else value.decode(encoding) return str(value)