Spaces:
Sleeping
Sleeping
File size: 5,312 Bytes
0a4c2f3 5e4312a 0a4c2f3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
import re
from bs4 import BeautifulSoup
import streamlit as st
# Patterns that recognise the first line of an earlier (quoted or forwarded)
# message. Candidate lines are whitespace-stripped before matching, and text
# extracted from HTML with a newline between text nodes turns markup such as
# "<b>From:</b> Alice" into the bare line "From:". Header labels therefore
# accept either whitespace or end-of-line after the colon: (?:\s|$).
# Separator banners tolerate any number of leading dashes, because mail
# clients commonly emit "-----Original Message-----" (five dashes).
_PRIOR_MESSAGE_PATTERNS = [
    # English patterns
    r'^On .* wrote:',
    r'^From:(?:\s|$)',
    r'^Sent:(?:\s|$)',
    r'^Subject:(?:\s|$)',
    r'^To:(?:\s|$)',
    r'^Date:(?:\s|$)',
    r'^-{2,}\s*Original Message\s*-{2,}$',
    r'^__+',
    r'^-\s?-\s?-\s?Original Message\s?-\s?-\s?-$',
    r'^Begin forwarded message:',
    r'^-*\s*Forwarded message',
    # Portuguese patterns
    r'^Em .* escreveu:',
    r'^De:(?:\s|$)',
    r'^Para:(?:\s|$)',
    r'^Data:(?:\s|$)',  # also the Italian "Date:" label
    r'^Assunto:(?:\s|$)',
    r'^-*\s*Mensagem original',
    # French patterns (\s before the colon also accepts a no-break space)
    r'^De\s:(?:\s|$)',  # "From:"
    r'^Le .* a écrit\s:',  # "On DATE, NAME wrote:"
    # German patterns
    r'^Am .* schrieb.*:',  # "On DATE, NAME wrote:"
    r'^Von:(?:\s|$)',  # "From:"
    # Spanish patterns
    r'^El .* escribió:',  # "On DATE, NAME wrote:"
    # Chinese patterns
    r'^历史邮件$',  # "Historical Emails"
    # Dutch patterns
    r'^Op .* schreef.*:',
    r'^Van:(?:\s|$)',
    r'^Aan:(?:\s|$)',
    r'^Onderwerp:(?:\s|$)',
    r'^Verzonden:(?:\s|$)',
    r'^-*\s*Oorspronkelijk bericht',
    # Italian patterns ("Data:" is already listed under Portuguese)
    r'^Il .* ha scritto:',
    r'^Da:(?:\s|$)',
    r'^A:(?:\s|$)',
    r'^Oggetto:(?:\s|$)',
    r'^-*\s*Messaggio originale',
]

# Compiled, case-insensitive markers; a line matching any of them starts a
# previous message.
PRIOR_MESSAGE_MARKERS = [
    re.compile(pattern, re.IGNORECASE) for pattern in _PRIOR_MESSAGE_PATTERNS
]
def remove_quoted_text(soup):
    """Strip known quoted-reply containers out of a parsed email document.

    For every (tag, filter) pair below, all matching elements are looked up
    with ``soup.find_all`` and ``decompose()`` is called on each one. The
    pairs are processed in the listed order, and the same ``soup`` object is
    returned after being modified in place.
    """
    quoted_sections = (
        ('div', {'class_': 'gmail_quote'}),           # Gmail quoted text
        ('div', {'class_': 'gmail_extra'}),           # Gmail extra
        ('div', {'class_': 'OutlookMessageHeader'}),  # Outlook quoted text
        ('blockquote', {}),                           # Blockquotes
        ('div', {'class_': 'yahoo_quoted'}),          # Yahoo quoted text
        ('div', {'id': 'reply-intro'}),               # Reply intros
        ('div', {'class_': 'moz-cite-prefix'}),       # Mozilla's quoted text
    )
    for tag_name, filters in quoted_sections:
        for element in soup.find_all(tag_name, **filters):
            element.decompose()
    return soup
def extract_latest_message_from_lines(lines):
"""Extract the latest message from the list of lines."""
latest_message_lines = []
for line in lines:
# Clean up the line
line = line.strip()
# Check if the line matches any prior message markers
if any(marker.match(line) for marker in PRIOR_MESSAGE_MARKERS):
break # Stop if a prior message marker is found
latest_message_lines.append(line)
return '\n'.join(latest_message_lines).strip()
def extract_latest_email_text(email_html):
    """
    Return the plain text of the newest message contained in an HTML email.

    The HTML is parsed with the 'html.parser' backend, quoted sections are
    removed via remove_quoted_text(), the remaining text is split into lines
    (one per text node, each stripped), and the lines are trimmed at the
    first prior-message marker by extract_latest_message_from_lines().
    """
    parsed = BeautifulSoup(email_html, 'html.parser')
    cleaned = remove_quoted_text(parsed)
    # One text node per line, so the marker patterns can be applied line by line.
    text_lines = cleaned.get_text(separator='\n', strip=True).split('\n')
    return extract_latest_message_from_lines(text_lines)
# Streamlit app
def main():
    """Render the Streamlit page: HTML input box, button, extracted result."""
    st.title("Email Latest Message Extractor")
    st.write(
        "\n"
        "This tool extracts the latest message from an HTML email and removes any quoted thread or previous messages.\n"
        "Paste the raw HTML of your email in the text area below, and the tool will parse and display the latest message.\n"
    )

    # Raw HTML email content supplied by the user
    email_html = st.text_area("Paste the HTML email content here", height=300)

    # Nothing to do until the button is pressed
    if not st.button("Extract Latest Message"):
        return

    # Blank or whitespace-only input: ask for content instead of parsing
    if not email_html.strip():
        st.warning("Please paste the HTML content of the email.")
        return

    try:
        latest_message = extract_latest_email_text(email_html)
        st.subheader("Extracted Latest Message:")
        st.text_area("Latest Message", value=latest_message, height=200)
    except Exception as exc:
        # Top-level UI boundary: show the failure on the page
        st.error(f"An error occurred: {exc}")
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|