Spaces:
Sleeping
Sleeping
import re | |
from bs4 import BeautifulSoup | |
import streamlit as st | |
# Patterns that mark the first line of a quoted/previous message.
# Each pattern is applied with ``.match()`` to a whitespace-stripped line, so
# every pattern is effectively anchored at the start of the line.
PRIOR_MESSAGE_MARKERS = [
    # English patterns
    re.compile(r'^On .* wrote:', re.IGNORECASE),
    re.compile(r'^From:\s', re.IGNORECASE),
    re.compile(r'^Sent:\s', re.IGNORECASE),
    re.compile(r'^Subject:\s', re.IGNORECASE),
    re.compile(r'^To:\s', re.IGNORECASE),
    re.compile(r'^Date:\s', re.IGNORECASE),
    # Any run of two or more dashes on each side, so the common Outlook
    # separator "-----Original Message-----" (five dashes) is recognised,
    # not only the exact four-dash form.
    re.compile(r'^-{2,}\s*Original Message\s*-{2,}$', re.IGNORECASE),
    re.compile(r'^__+', re.IGNORECASE),
    # Spaced-out dashes: "- - - Original Message - - -"
    re.compile(r'^-\s?-\s?-\s?Original Message\s?-\s?-\s?-$', re.IGNORECASE),
    re.compile(r'^Begin forwarded message:', re.IGNORECASE),
    # Optional leading dashes cover "---------- Forwarded message ---------".
    re.compile(r'^-*\s*Forwarded message', re.IGNORECASE),
    # Portuguese patterns
    re.compile(r'^Em .* escreveu:', re.IGNORECASE),
    re.compile(r'^De:\s', re.IGNORECASE),
    re.compile(r'^Para:\s', re.IGNORECASE),
    re.compile(r'^Data:\s', re.IGNORECASE),  # also the Italian "Date:" header
    re.compile(r'^Assunto:\s', re.IGNORECASE),
    re.compile(r'^-*\s*Mensagem original', re.IGNORECASE),
    # French patterns
    # ``\s`` before the colon also accepts the no-break space used in
    # French typography ("De\u00a0:"), not just an ASCII space.
    re.compile(r'^De\s:\s', re.IGNORECASE),  # "From:"
    re.compile(r'^Le .* a écrit\s?:', re.IGNORECASE),  # "On DATE, NAME wrote:"
    # German patterns
    re.compile(r'^Am .* schrieb.*:', re.IGNORECASE),  # "On DATE, NAME wrote:"
    re.compile(r'^Von:\s', re.IGNORECASE),  # "From:"
    # Spanish patterns
    re.compile(r'^El .* escribió:', re.IGNORECASE),  # "On DATE, NAME wrote:"
    # Chinese patterns
    re.compile(r'^历史邮件$', re.IGNORECASE),  # "Historical Emails"
    # Dutch patterns
    re.compile(r'^Op .* schreef.*:', re.IGNORECASE),
    re.compile(r'^Van:\s', re.IGNORECASE),
    re.compile(r'^Aan:\s', re.IGNORECASE),
    re.compile(r'^Onderwerp:\s', re.IGNORECASE),
    re.compile(r'^Verzonden:\s', re.IGNORECASE),
    re.compile(r'^-*\s*Oorspronkelijk bericht', re.IGNORECASE),
    # Italian patterns ("Data:" is already covered by the Portuguese entry)
    re.compile(r'^Il .* ha scritto:', re.IGNORECASE),
    re.compile(r'^Da:\s', re.IGNORECASE),
    re.compile(r'^A:\s', re.IGNORECASE),
    re.compile(r'^Oggetto:\s', re.IGNORECASE),
    re.compile(r'^-*\s*Messaggio originale', re.IGNORECASE),
]
def remove_quoted_text(soup):
    """Strip known quoted-reply containers out of a parsed email.

    Every element matching one of the selectors below is destroyed with
    ``decompose()``. The tree is modified in place and the same ``soup``
    object is returned.
    """
    # (tag name, find_all keyword filters), processed in this order.
    quoted_selectors = (
        ('div', {'class_': 'gmail_quote'}),           # Gmail quoted text
        ('div', {'class_': 'gmail_extra'}),           # Gmail extra
        ('div', {'class_': 'OutlookMessageHeader'}),  # Outlook quoted text
        ('blockquote', {}),                           # any blockquote
        ('div', {'class_': 'yahoo_quoted'}),          # Yahoo quoted text
        ('div', {'id': 'reply-intro'}),               # reply intros
        ('div', {'class_': 'moz-cite-prefix'}),       # Mozilla cite prefix
    )
    for tag_name, filters in quoted_selectors:
        for node in soup.find_all(tag_name, **filters):
            node.decompose()
    return soup
def extract_latest_message_from_lines(lines):
    """Return the text that precedes the first prior-message marker.

    Each line is stripped of surrounding whitespace and collected until one
    matches a pattern in ``PRIOR_MESSAGE_MARKERS``; that line and everything
    after it are discarded. The collected lines are joined with newlines and
    the result is stripped.
    """
    def starts_prior_message(text):
        # True as soon as any marker pattern matches at the start of text.
        for pattern in PRIOR_MESSAGE_MARKERS:
            if pattern.match(text):
                return True
        return False

    kept = []
    for raw_line in lines:
        cleaned = raw_line.strip()
        if starts_prior_message(cleaned):
            break
        kept.append(cleaned)
    return '\n'.join(kept).strip()
def extract_latest_email_text(email_html):
    """Return the newest message's plain text from an HTML email.

    The HTML is parsed with the ``html.parser`` backend, quoted-reply
    containers are removed, the remaining text is split into lines, and
    everything from the first prior-message marker onward is dropped.
    """
    cleaned_tree = remove_quoted_text(BeautifulSoup(email_html, 'html.parser'))
    # One text fragment per line, each already stripped by get_text.
    text_lines = cleaned_tree.get_text(separator='\n', strip=True).split('\n')
    return extract_latest_message_from_lines(text_lines)
# Streamlit app | |
def main():
    """Render the Streamlit page and run the extraction on button press."""
    st.title("Email Latest Message Extractor")
    st.write("""
This tool extracts the latest message from an HTML email and removes any quoted thread or previous messages.
Paste the raw HTML of your email in the text area below, and the tool will parse and display the latest message.
""")
    # Input field for the raw HTML email content
    email_html = st.text_area("Paste the HTML email content here", height=300)

    # Nothing more to do until the user presses the button.
    if not st.button("Extract Latest Message"):
        return

    # Whitespace-only input counts as empty.
    if not email_html.strip():
        st.warning("Please paste the HTML content of the email.")
        return

    try:
        latest_message = extract_latest_email_text(email_html)
        st.subheader("Extracted Latest Message:")
        st.text_area("Latest Message", value=latest_message, height=200)
    except Exception as exc:
        # Top-level UI boundary: report the failure on the page.
        st.error(f"An error occurred: {exc}")


# Run the app only when executed as a script, not when imported.
if __name__ == "__main__":
    main()