Spaces:

albertoarrigoni
/

email_parser

Sleeping

App Files Files Community

albertoarrigoni committed on Sep 16, 2024

Commit

7b35151

verified ·

1 Parent(s): 36c634e

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -23

app.py CHANGED Viewed

@@ -2,46 +2,70 @@ import re
 from bs4 import BeautifulSoup
 import streamlit as st
-# Define patterns that indicate the start of a previous message
 PRIOR_MESSAGE_MARKERS = [
     # English patterns
     re.compile(r'^On .* wrote:', re.IGNORECASE),
-    re.compile(r'^From:\s', re.IGNORECASE),
-    re.compile(r'^Sent:\s', re.IGNORECASE),
-    re.compile(r'^Subject:\s', re.IGNORECASE),
-    re.compile(r'^To:\s', re.IGNORECASE),
-    re.compile(r'^Date:\s', re.IGNORECASE),
     re.compile(r'^----\s?Original Message\s?----$', re.IGNORECASE),
-    re.compile(r'^__+', re.IGNORECASE),  # a line beginning with two or more underscores
-    re.compile(r'^-\s?-\s?-\s?Original Message\s?-\s?-\s?-$', re.IGNORECASE),
     re.compile(r'^Begin forwarded message:', re.IGNORECASE),
-    re.compile(r'^Forwarded message', re.IGNORECASE),
-    # Italian patterns specific to the example email
     re.compile(r'^Il .* ha scritto:', re.IGNORECASE),
     re.compile(r'^Da:\s', re.IGNORECASE),
     re.compile(r'^A:\s', re.IGNORECASE),
     re.compile(r'^Oggetto:\s', re.IGNORECASE),
     re.compile(r'^Data:\s', re.IGNORECASE),
     re.compile(r'^Messaggio originale', re.IGNORECASE),
-    # General timestamp pattern, e.g. "Sep 16, 2024, 10:30 GMT+2"
-    re.compile(r'^\w{3,9} \d{1,2}, \d{4}, \d{1,2}:\d{2} GMT[+-]?\d*', re.IGNORECASE),
 ]
 def remove_quoted_text(soup):
     """Remove quoted text sections from the email HTML content."""
-    # Remove table cells (td) whose inline style contains 'dotted' (e.g. dotted borders)
-    for quote in soup.find_all('td', style=lambda value: value and 'dotted' in value):
-        quote.decompose()
-    # Remove divs that carry the 'zd-comment' class
-    for quote in soup.find_all('div', class_='zd-comment'):
-        quote.decompose()
-    # Additional cleaning specific to the example provided: divs styled with 'color:#aaaaaa'
-    for quote in soup.find_all('div', style=lambda value: value and 'color:#aaaaaa' in value):
-        quote.decompose()
     return soup

 from bs4 import BeautifulSoup
 import streamlit as st
+# Define patterns that indicate the start of a previous message in multiple languages
 PRIOR_MESSAGE_MARKERS = [
     # English patterns
+    re.compile(r'^From:', re.IGNORECASE),
+    re.compile(r'^Sent:', re.IGNORECASE),
+    re.compile(r'^Subject:', re.IGNORECASE),
+    re.compile(r'^To:', re.IGNORECASE),
+    re.compile(r'^Date:', re.IGNORECASE),
     re.compile(r'^On .* wrote:', re.IGNORECASE),
     re.compile(r'^----\s?Original Message\s?----$', re.IGNORECASE),
     re.compile(r'^Begin forwarded message:', re.IGNORECASE),
+    # Portuguese patterns
+    re.compile(r'^Em .* escreveu:', re.IGNORECASE),  # "On DATE, NAME wrote:"
+    re.compile(r'^De:\s', re.IGNORECASE),  # "From:"
+    re.compile(r'^Para:\s', re.IGNORECASE),  # "To:"
+    re.compile(r'^Data:\s', re.IGNORECASE),  # "Date:" (same regex as the Italian "Data:" entry below)
+    re.compile(r'^Assunto:\s', re.IGNORECASE),  # "Subject:"
+    re.compile(r'^Mensagem original', re.IGNORECASE),  # "Original message"
+    # French patterns
+    re.compile(r'^De :\s', re.IGNORECASE),  # "From:"
+    re.compile(r'^Le .* a écrit :', re.IGNORECASE),  # "On DATE, NAME wrote:"
+    # German patterns
+    re.compile(r'^Am .* schrieb.*:', re.IGNORECASE),  # "On DATE, NAME wrote:"
+    re.compile(r'^Von:\s', re.IGNORECASE),  # "From:"
+    # Spanish patterns
+    re.compile(r'^El .* escribió:', re.IGNORECASE),  # "On DATE, NAME wrote:"
+    # Chinese patterns
+    re.compile(r'^历史邮件$', re.IGNORECASE),  # "Historical Emails"
+    # Dutch patterns
+    re.compile(r'^Op .* schreef.*:', re.IGNORECASE),  # "On DATE, NAME wrote:"
+    re.compile(r'^Van:\s', re.IGNORECASE),  # "From:"
+    re.compile(r'^Aan:\s', re.IGNORECASE),  # "To:"
+    re.compile(r'^Onderwerp:\s', re.IGNORECASE),  # "Subject:"
+    re.compile(r'^Verzonden:\s', re.IGNORECASE),  # "Sent:"
+    re.compile(r'^Oorspronkelijk bericht', re.IGNORECASE),  # "Original message"
+    # Italian patterns
     re.compile(r'^Il .* ha scritto:', re.IGNORECASE),
     re.compile(r'^Da:\s', re.IGNORECASE),
     re.compile(r'^A:\s', re.IGNORECASE),
     re.compile(r'^Oggetto:\s', re.IGNORECASE),
     re.compile(r'^Data:\s', re.IGNORECASE),
     re.compile(r'^Messaggio originale', re.IGNORECASE),
 ]
 def remove_quoted_text(soup):
     """Remove quoted text sections from the email HTML content."""
+    # Remove every <blockquote> element together with its contents (typical for email threads)
+    for blockquote in soup.find_all('blockquote'):
+        blockquote.decompose()
+    # Remove divs with the 'ms-outlook-mobile-reference-message' class (presumably Outlook mobile's quoted-message wrapper; confirm)
+    for div in soup.find_all('div', class_='ms-outlook-mobile-reference-message'):
+        div.decompose()
+    # Remove <hr> elements only (often used to separate replies); content after a rule is left in place
+    for hr in soup.find_all('hr'):
+        hr.decompose()
     return soup