Spaces:

albertoarrigoni
/

email_parser

Sleeping

App Files Files Community

albertoarrigoni committed on Sep 16, 2024

Commit

36c634e

verified ·

1 Parent(s): 5e4312a

Update app.py

Browse files

Files changed (1) hide show

app.py +10 -53

app.py CHANGED Viewed

@@ -17,75 +17,32 @@ PRIOR_MESSAGE_MARKERS = [
     re.compile(r'^Begin forwarded message:', re.IGNORECASE),
     re.compile(r'^Forwarded message', re.IGNORECASE),
-    # Portuguese patterns
-    re.compile(r'^Em .* escreveu:', re.IGNORECASE),
-    re.compile(r'^De:\s', re.IGNORECASE),
-    re.compile(r'^Para:\s', re.IGNORECASE),
-    re.compile(r'^Data:\s', re.IGNORECASE),
-    re.compile(r'^Assunto:\s', re.IGNORECASE),
-    re.compile(r'^Mensagem original', re.IGNORECASE),
-    # French patterns
-    re.compile(r'^De :\s', re.IGNORECASE),  # "From:"
-    re.compile(r'^Le .* a écrit :', re.IGNORECASE),  # "On DATE, NAME wrote:"
-    # German patterns
-    re.compile(r'^Am .* schrieb.*:', re.IGNORECASE),  # "On DATE, NAME wrote:"
-    re.compile(r'^Von:\s', re.IGNORECASE),  # "From:"
-    # Spanish patterns
-    re.compile(r'^El .* escribió:', re.IGNORECASE),  # "On DATE, NAME wrote:"
-    # Chinese patterns
-    re.compile(r'^历史邮件$', re.IGNORECASE),  # "Historical Emails"
-    # Dutch patterns
-    re.compile(r'^Op .* schreef.*:', re.IGNORECASE),
-    re.compile(r'^Van:\s', re.IGNORECASE),
-    re.compile(r'^Aan:\s', re.IGNORECASE),
-    re.compile(r'^Onderwerp:\s', re.IGNORECASE),
-    re.compile(r'^Verzonden:\s', re.IGNORECASE),
-    re.compile(r'^Oorspronkelijk bericht', re.IGNORECASE),
-    # Italian patterns
     re.compile(r'^Il .* ha scritto:', re.IGNORECASE),
     re.compile(r'^Da:\s', re.IGNORECASE),
     re.compile(r'^A:\s', re.IGNORECASE),
     re.compile(r'^Oggetto:\s', re.IGNORECASE),
     re.compile(r'^Data:\s', re.IGNORECASE),
     re.compile(r'^Messaggio originale', re.IGNORECASE),
 ]
 def remove_quoted_text(soup):
     """Remove quoted text sections from the email HTML content."""
-    # Remove Gmail quoted text
-    for quote in soup.find_all('div', class_='gmail_quote'):
         quote.decompose()
-    # Remove Gmail extra
-    for extra in soup.find_all('div', class_='gmail_extra'):
-        extra.decompose()
-    # Remove Outlook quoted text
-    for quote in soup.find_all('div', class_='OutlookMessageHeader'):
         quote.decompose()
-    # Remove blockquotes
-    for blockquote in soup.find_all('blockquote'):
-        blockquote.decompose()
-    # Remove Yahoo quoted text
-    for quote in soup.find_all('div', class_='yahoo_quoted'):
         quote.decompose()
-    # Remove reply intros
-    for intro in soup.find_all('div', id='reply-intro'):
-        intro.decompose()
-    # Remove Mozilla's quoted text
-    for cite in soup.find_all('div', class_='moz-cite-prefix'):
-        cite.decompose()
     return soup
 def extract_latest_message_from_lines(lines):

     re.compile(r'^Begin forwarded message:', re.IGNORECASE),
     re.compile(r'^Forwarded message', re.IGNORECASE),
+    # Italian patterns specific to the example email
     re.compile(r'^Il .* ha scritto:', re.IGNORECASE),
     re.compile(r'^Da:\s', re.IGNORECASE),
     re.compile(r'^A:\s', re.IGNORECASE),
     re.compile(r'^Oggetto:\s', re.IGNORECASE),
     re.compile(r'^Data:\s', re.IGNORECASE),
     re.compile(r'^Messaggio originale', re.IGNORECASE),
+    # General timestamp patterns
+    re.compile(r'^\w{3,9} \d{1,2}, \d{4}, \d{1,2}:\d{2} GMT[+-]?\d*', re.IGNORECASE),
 ]
 def remove_quoted_text(soup):
     """Remove quoted text sections from the email HTML content."""
+    # Remove any divs with inline styles that indicate quoted text (like dotted borders)
+    for quote in soup.find_all('td', style=lambda value: value and 'dotted' in value):
         quote.decompose()
+    # Remove common quoted sections based on the structure
+    for quote in soup.find_all('div', class_='zd-comment'):
         quote.decompose()
+    # Additional cleaning specific to the example provided
+    for quote in soup.find_all('div', style=lambda value: value and 'color:#aaaaaa' in value):
         quote.decompose()
     return soup
 def extract_latest_message_from_lines(lines):