"""Streamlit tool that extracts the newest message from an HTML email.

Quoted material is removed in two passes: first, HTML containers that mail
clients use for quoted threads are dropped from the parsed document; then the
remaining text is scanned line by line and cut at the first line that looks
like the header or separator of an earlier message.
"""

import re
from typing import Iterable

from bs4 import BeautifulSoup
import streamlit as st

# Patterns that indicate the start of a previous message.
#
# Every pattern is matched against a line that has already been stripped of
# surrounding whitespace. Header-field patterns therefore accept either
# whitespace or end-of-line after the colon: when the label sits in its own
# tag (e.g. ``<b>From:</b> John``) the extracted line is the bare ``From:``.
PRIOR_MESSAGE_MARKERS = [
    # English patterns
    re.compile(r'^On .* wrote:', re.IGNORECASE),
    re.compile(r'^From:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Sent:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Subject:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^To:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Date:(?:\s|$)', re.IGNORECASE),
    # Any run of three or more dashes on each side, which includes the common
    # five-dash "-----Original Message-----" separator.
    re.compile(r'^-{3,}\s*Original Message\s*-{3,}$', re.IGNORECASE),
    re.compile(r'^__+', re.IGNORECASE),
    # Spaced-dash variant, e.g. "- - - Original Message - - -".
    re.compile(r'^-\s?-\s?-\s?Original Message\s?-\s?-\s?-$', re.IGNORECASE),
    re.compile(r'^Begin forwarded message:', re.IGNORECASE),
    # Optional leading dashes, e.g. "---------- Forwarded message ---------".
    re.compile(r'^-*\s*Forwarded message', re.IGNORECASE),

    # Portuguese patterns
    re.compile(r'^Em .* escreveu:', re.IGNORECASE),
    re.compile(r'^De:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Para:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Data:(?:\s|$)', re.IGNORECASE),  # also the Italian "Date:"
    re.compile(r'^Assunto:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^-*\s*Mensagem original', re.IGNORECASE),

    # French patterns
    re.compile(r'^De :(?:\s|$)', re.IGNORECASE),  # "From:"
    re.compile(r'^Le .* a écrit :', re.IGNORECASE),  # "On DATE, NAME wrote:"

    # German patterns
    re.compile(r'^Am .* schrieb.*:', re.IGNORECASE),  # "On DATE, NAME wrote:"
    re.compile(r'^Von:(?:\s|$)', re.IGNORECASE),  # "From:"

    # Spanish patterns
    re.compile(r'^El .* escribió:', re.IGNORECASE),  # "On DATE, NAME wrote:"

    # Chinese patterns
    re.compile(r'^历史邮件$', re.IGNORECASE),  # "Historical Emails"

    # Dutch patterns
    re.compile(r'^Op .* schreef.*:', re.IGNORECASE),
    re.compile(r'^Van:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Aan:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Onderwerp:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Verzonden:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^-*\s*Oorspronkelijk bericht', re.IGNORECASE),

    # Italian patterns ("Data:" is covered by the Portuguese entry above)
    re.compile(r'^Il .* ha scritto:', re.IGNORECASE),
    re.compile(r'^Da:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^A:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Oggetto:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^-*\s*Messaggio originale', re.IGNORECASE),
]

# (tag name, attribute filter) pairs for elements that hold quoted content.
# They are processed in this order.
_QUOTED_ELEMENT_SELECTORS = (
    ('div', {'class': 'gmail_quote'}),           # Gmail quoted text
    ('div', {'class': 'gmail_extra'}),           # Gmail extra
    ('div', {'class': 'OutlookMessageHeader'}),  # Outlook quoted text
    ('blockquote', {}),                          # Generic blockquotes
    ('div', {'class': 'yahoo_quoted'}),          # Yahoo quoted text
    ('div', {'id': 'reply-intro'}),              # Reply intros
    ('div', {'class': 'moz-cite-prefix'}),       # Mozilla cite prefix
)


def remove_quoted_text(soup):
    """Remove quoted-text elements from the parsed email, in place.

    Args:
        soup: Parsed email document (a ``BeautifulSoup`` object).

    Returns:
        The same ``soup`` object, with every element matching
        ``_QUOTED_ELEMENT_SELECTORS`` decomposed.
    """
    for tag_name, attrs in _QUOTED_ELEMENT_SELECTORS:
        for element in soup.find_all(tag_name, attrs=attrs):
            element.decompose()
    return soup


def extract_latest_message_from_lines(lines: Iterable[str]) -> str:
    """Return the text that precedes the first prior-message marker.

    Each line is stripped of surrounding whitespace before it is tested
    against ``PRIOR_MESSAGE_MARKERS``. Scanning stops at the first matching
    line; that line and everything after it are discarded.

    Args:
        lines: Lines of the email's plain text, newest content first.

    Returns:
        The kept lines joined with newlines, with leading and trailing
        whitespace removed. Empty if the first line is already a marker.
    """
    latest_message_lines = []
    for raw_line in lines:
        line = raw_line.strip()
        if any(marker.match(line) for marker in PRIOR_MESSAGE_MARKERS):
            break  # Everything from here on belongs to an earlier message.
        latest_message_lines.append(line)
    return '\n'.join(latest_message_lines).strip()


def extract_latest_email_text(email_html: str) -> str:
    """Extract the text of the latest message from an HTML email.

    Quoted threads and previous messages are removed, first by dropping known
    quote containers from the HTML and then by cutting the text at the first
    prior-message marker.

    Args:
        email_html: Raw HTML of the email. ``None`` or an empty string is
            treated as an email with no content.

    Returns:
        The latest message as plain text, or ``''`` when there is none.
    """
    if not email_html:
        return ''

    soup = remove_quoted_text(BeautifulSoup(email_html, 'html.parser'))

    # Each text fragment is stripped and the fragments are joined with
    # newlines, so a label held in its own tag ends up alone on a line.
    email_text = soup.get_text(separator='\n', strip=True)
    return extract_latest_message_from_lines(email_text.split('\n'))


# Streamlit app
def main() -> None:
    """Render the Streamlit page: HTML input, extract button, and result."""
    st.title("Email Latest Message Extractor")
    st.write("""
    This tool extracts the latest message from an HTML email and removes any quoted thread or previous messages.
    Paste the raw HTML of your email in the text area below, and the tool will parse and display the latest message.
    """)

    # Input field for the raw HTML email content
    email_html = st.text_area("Paste the HTML email content here", height=300)

    # Nothing to do until the button is pressed.
    if not st.button("Extract Latest Message"):
        return

    if not email_html.strip():
        st.warning("Please paste the HTML content of the email.")
        return

    try:
        latest_message = extract_latest_email_text(email_html)
    except Exception as e:
        # Top-level UI boundary: report the failure on the page instead of
        # letting the app crash.
        st.error(f"An error occurred: {e}")
        return

    st.subheader("Extracted Latest Message:")
    st.text_area("Latest Message", value=latest_message, height=200)


if __name__ == "__main__":
    main()