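# Spaces: Sleeping
# (The line above is status text captured from the hosting page together with
# the source; it is not part of the program.)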
import re

import streamlit as st
from bs4 import BeautifulSoup
# Regular expressions that recognise the first line of an earlier (quoted or
# forwarded) message, in several languages. Every pattern is anchored at the
# start of the line and is applied with ``match`` to one stripped line of text.
# Order is kept as-is; all rules are case-insensitive except the dash-separator
# rule, which contains no letters.
PRIOR_MESSAGE_MARKERS = [
    re.compile(pattern, flags)
    for pattern, flags in (
        # English
        (r'^From:', re.I),
        (r'^Sent:', re.I),
        (r'^Subject:', re.I),
        (r'^To:', re.I),
        (r'^Date:', re.I),
        (r'^On .* wrote:', re.I),
        (r'^----\s?Original Message\s?----$', re.I),
        (r'^Begin forwarded message:', re.I),
        # Separator lines that start and end with dashes,
        # e.g. "--------------------------------------------------"
        (r'^-+.*-+$', 0),
        # Portuguese
        (r'^Em .* escreveu:', re.I),
        (r'^De:\s', re.I),
        (r'^Para:\s', re.I),
        (r'^Data:\s', re.I),
        (r'^Assunto:\s', re.I),
        (r'^Mensagem original', re.I),
        # French
        (r'^De :\s', re.I),            # "From:"
        (r'^Le .* a écrit :', re.I),   # "On DATE, NAME wrote:"
        # German
        (r'^Am .* schrieb.*:', re.I),  # "On DATE, NAME wrote:"
        (r'^Von:\s', re.I),            # "From:"
        # Spanish
        (r'^El .* escribió:', re.I),   # "On DATE, NAME wrote:"
        # Chinese
        (r'^历史邮件$', re.I),          # "Historical Emails"
        # Dutch
        (r'^Op .* schreef.*:', re.I),
        (r'^Van:\s', re.I),
        (r'^Aan:\s', re.I),
        (r'^Onderwerp:\s', re.I),
        (r'^Verzonden:\s', re.I),
        (r'^Oorspronkelijk bericht', re.I),
        # Italian
        (r'^Il .* ha scritto:', re.I),
        (r'^Da:\s', re.I),
        (r'^A:\s', re.I),
        (r'^Oggetto:\s', re.I),
        (r'^Data:\s', re.I),
        (r'^Messaggio originale', re.I),
    )
]
def remove_quoted_text(soup):
    """Strip quoted or forwarded sections from a parsed email, in place.

    Removes, in this order:

    * every ``<blockquote>``;
    * every ``<div>`` with class ``ms-outlook-mobile-reference-message``;
    * every ``<hr>``;
    * every ``<table>`` whose inline ``style`` contains
      ``border-top:1px dotted``.

    Args:
        soup: Parsed document (BeautifulSoup object or tag) to clean. It is
            modified in place.

    Returns:
        The same ``soup`` object that was passed in.
    """
    # Blockquotes are the usual container for quoted replies.
    for blockquote in soup.find_all('blockquote'):
        blockquote.decompose()

    # Wrapper div used for the referenced (previous) message.
    for div in soup.find_all('div', class_='ms-outlook-mobile-reference-message'):
        div.decompose()

    # Horizontal rules are often used to separate replies.
    for hr in soup.find_all('hr'):
        hr.decompose()

    # Choose the tables to drop BEFORE destroying any of them. Decomposing a
    # table also destroys tables nested inside it, and reading an attribute of
    # an already-destroyed tag fails, so every ``style`` lookup has to happen
    # while the tree is still intact.
    dotted_tables = [
        table
        for table in soup.find_all('table')
        if 'border-top:1px dotted' in (table.get('style') or '')
    ]
    for table in dotted_tables:
        table.decompose()

    return soup
def extract_latest_message_from_lines(lines, markers=None):
    """Return the leading lines of an email, up to the first prior-message marker.

    Each line is stripped of surrounding whitespace. Blank lines and ``None``
    entries are skipped. Scanning stops at the first line that matches any
    marker; that line and everything after it are discarded.

    Args:
        lines: Iterable of text lines (``None`` entries are tolerated).
        markers: Optional iterable of compiled patterns whose ``match`` method
            flags the start of a previous message. When ``None``, the
            module-level ``PRIOR_MESSAGE_MARKERS`` list is used.

    Returns:
        The kept lines joined with ``'\\n'``; an empty string if nothing is kept.
    """
    if markers is None:
        markers = PRIOR_MESSAGE_MARKERS

    latest_message_lines = []
    for raw_line in lines:
        # Guard against None *before* calling .strip(); checking afterwards
        # would never be reached because .strip() on None raises.
        if raw_line is None:
            continue
        line = raw_line.strip()
        if not line:
            continue
        # The first marker hit means the rest of the text is an older message.
        if any(marker.match(line) for marker in markers):
            break
        latest_message_lines.append(line)

    # Every kept line is already stripped and non-empty, so the joined text
    # needs no further trimming.
    return '\n'.join(latest_message_lines)
def extract_latest_email_text(email_html):
    """Return the newest message contained in an HTML email as plain text.

    The HTML is parsed with the ``html.parser`` backend, quoted sections are
    removed by ``remove_quoted_text``, the remaining text is split into lines
    and everything from the first prior-message marker on is dropped by
    ``extract_latest_message_from_lines``.
    """
    # Parse, then prune quoted/forwarded markup from the tree.
    cleaned = remove_quoted_text(BeautifulSoup(email_html, 'html.parser'))

    # One text fragment per line, each fragment whitespace-stripped.
    plain_text = cleaned.get_text(separator='\n', strip=True)

    # Keep only what precedes the first marker of an earlier message.
    return extract_latest_message_from_lines(plain_text.split('\n'))
# Streamlit app
def main():
    """Render the page: an input box, a button, and the extracted message.

    Nothing is processed until the button is pressed. Empty input produces a
    warning; any exception raised while extracting or displaying the result is
    shown as an error message on the page.
    """
    st.title("Email Latest Message Extractor")
    st.write("""
This tool extracts the latest message from an HTML email and removes any quoted thread or previous messages.
Paste the raw HTML of your email in the text area below, and the tool will parse and display the latest message.
""")

    # Raw HTML supplied by the user.
    email_html = st.text_area("Paste the HTML email content here", height=300)

    # Do nothing further until the user asks for extraction.
    if not st.button("Extract Latest Message"):
        return

    # Whitespace-only input counts as empty.
    if not email_html.strip():
        st.warning("Please paste the HTML content of the email.")
        return

    try:
        latest_message = extract_latest_email_text(email_html)
        st.subheader("Extracted Latest Message:")
        st.text_area("Latest Message", value=latest_message, height=200)
    except Exception as e:
        # Top-level UI boundary: report the failure on the page.
        st.error(f"An error occurred: {e}")


if __name__ == "__main__":
    main()