Spaces:

albertoarrigoni
/

email_parser

Sleeping

App Files Files Community

albertoarrigoni committed on Sep 15, 2024

Commit

0a4c2f3

verified ·

1 Parent(s): 8f10e35

Create app.py

Browse files

Files changed (1) hide show

app.py +134 -0

app.py ADDED Viewed

	@@ -0,0 +1,134 @@

+import re
+from bs4 import BeautifulSoup
+import streamlit as st
# Define patterns that indicate the start of a previous message.
#
# Every pattern is applied with ``match`` (anchored at the start) to a single
# line that has already been whitespace-stripped by
# extract_latest_message_from_lines. Two consequences drive the pattern shapes:
#   * Header labels ("From:", "De:", ...) accept either whitespace or
#     end-of-line after the colon. Once a line is stripped, a label that sits
#     alone on its line has no trailing whitespace left to match.
#   * Separator banners accept any number of leading dashes (three or more
#     for "Original Message"), because mail clients differ in how many they
#     emit, e.g. "-----Original Message-----".
# All patterns are case-insensitive.
PRIOR_MESSAGE_MARKERS = [
    # English patterns
    re.compile(r'^On .* wrote:', re.IGNORECASE),
    re.compile(r'^From:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Sent:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Subject:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^To:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Date:(?:\s|$)', re.IGNORECASE),
    # Three or more dashes (optionally space-separated) on each side, which
    # covers "----Original Message----", "- - - Original Message - - -" and
    # the five-dash form "-----Original Message-----".
    re.compile(
        r'^(?:-\s?){3,}Original Message\s?(?:-\s?){2,}-$', re.IGNORECASE
    ),
    re.compile(r'^__+', re.IGNORECASE),
    re.compile(r'^Begin forwarded message:', re.IGNORECASE),
    # Leading dashes allowed: "---------- Forwarded message ---------".
    re.compile(r'^[-\s]*Forwarded message', re.IGNORECASE),
    # Portuguese patterns
    re.compile(r'^Em .* escreveu:', re.IGNORECASE),
    re.compile(r'^De:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Para:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Data:(?:\s|$)', re.IGNORECASE),  # also the Italian label
    re.compile(r'^Assunto:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^[-\s]*Mensagem original', re.IGNORECASE),
    # Dutch patterns
    re.compile(r'^Op .* schreef.*:', re.IGNORECASE),
    re.compile(r'^Van:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Aan:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Onderwerp:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Verzonden:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^[-\s]*Oorspronkelijk bericht', re.IGNORECASE),
    # Italian patterns ("Data:" is already listed under Portuguese)
    re.compile(r'^Il .* ha scritto:', re.IGNORECASE),
    re.compile(r'^Da:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^A:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Oggetto:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^[-\s]*Messaggio originale', re.IGNORECASE),
]
def remove_quoted_text(soup):
    """Strip known quoted-reply containers out of a parsed email document.

    For each selector below, every matching element is looked up with
    ``soup.find_all`` and destroyed with ``decompose``. The tree is modified
    in place and the same ``soup`` object is returned.
    """
    # (tag name, find_all keyword filters), processed from top to bottom.
    quoted_selectors = (
        ('div', {'class_': 'gmail_quote'}),           # Gmail quoted text
        ('div', {'class_': 'gmail_extra'}),           # Gmail extra
        ('div', {'class_': 'OutlookMessageHeader'}),  # Outlook quoted text
        ('blockquote', {}),                           # Any blockquote
        ('div', {'class_': 'yahoo_quoted'}),          # Yahoo quoted text
        ('div', {'id': 'reply-intro'}),               # Reply intros
        ('div', {'class_': 'moz-cite-prefix'}),       # Mozilla's quoted text
    )
    for tag_name, attr_filters in quoted_selectors:
        for element in soup.find_all(tag_name, **attr_filters):
            element.decompose()
    return soup
def extract_latest_message_from_lines(lines):
    """Return the text that precedes the first prior-message marker.

    Each line is whitespace-stripped and tested against every pattern in
    PRIOR_MESSAGE_MARKERS. Collection stops at the first line that matches
    any of them; that line and everything after it are discarded. The kept
    (stripped) lines are joined with newlines and the result is stripped
    once more.
    """
    def starts_prior_message(text):
        # True as soon as one marker pattern matches at the start of the line.
        for pattern in PRIOR_MESSAGE_MARKERS:
            if pattern.match(text):
                return True
        return False

    kept = []
    for raw_line in lines:
        stripped = raw_line.strip()
        if starts_prior_message(stripped):
            break
        kept.append(stripped)

    joined = '\n'.join(kept)
    return joined.strip()
def extract_latest_email_text(email_html):
    """Return the newest message's plain text from an HTML email.

    The HTML is parsed with BeautifulSoup's ``html.parser``, known quoted
    containers are removed via ``remove_quoted_text``, the remaining text is
    flattened with newline separators, and the resulting lines are passed to
    ``extract_latest_message_from_lines`` to cut off any earlier messages.
    """
    # Parse, then drop quoted-reply containers from the tree.
    cleaned = remove_quoted_text(BeautifulSoup(email_html, 'html.parser'))

    # Flatten to text with newline separators and split back into lines.
    text_lines = cleaned.get_text(separator='\n', strip=True).split('\n')

    # Keep only what comes before the first prior-message marker.
    return extract_latest_message_from_lines(text_lines)
# Streamlit app
def main():
    """Render the Streamlit page and run the extraction when requested.

    Shows a title, a short description and an input text area. When the
    "Extract Latest Message" button is pressed, the pasted HTML is passed to
    ``extract_latest_email_text`` and the result is shown in a second text
    area. Blank input produces a warning instead, and any exception raised
    while extracting or displaying is reported through ``st.error``.
    """
    st.title("Email Latest Message Extractor")
    st.write("""
        This tool extracts the latest message from an HTML email and removes any quoted thread or previous messages.
        Paste the raw HTML of your email in the text area below, and the tool will parse and display the latest message.
    """)

    # Raw HTML supplied by the user.
    email_html = st.text_area("Paste the HTML email content here", height=300)

    # Nothing further to do until the button is pressed.
    if not st.button("Extract Latest Message"):
        return

    # Whitespace-only input counts as empty.
    if not email_html.strip():
        st.warning("Please paste the HTML content of the email.")
        return

    try:
        latest_message = extract_latest_email_text(email_html)
        st.subheader("Extracted Latest Message:")
        st.text_area("Latest Message", value=latest_message, height=200)
    except Exception as exc:
        # UI boundary: report the failure on the page.
        st.error(f"An error occurred: {exc}")


if __name__ == "__main__":
    main()