albertoarrigoni committed on
Commit
7b35151
verified
1 Parent(s): 36c634e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -23
app.py CHANGED
@@ -2,46 +2,70 @@ import re
2
  from bs4 import BeautifulSoup
3
  import streamlit as st
4
 
5
- # Define patterns that indicate the start of a previous message
6
  PRIOR_MESSAGE_MARKERS = [
7
  # English patterns
 
 
 
 
 
8
  re.compile(r'^On .* wrote:', re.IGNORECASE),
9
- re.compile(r'^From:\s', re.IGNORECASE),
10
- re.compile(r'^Sent:\s', re.IGNORECASE),
11
- re.compile(r'^Subject:\s', re.IGNORECASE),
12
- re.compile(r'^To:\s', re.IGNORECASE),
13
- re.compile(r'^Date:\s', re.IGNORECASE),
14
  re.compile(r'^----\s?Original Message\s?----$', re.IGNORECASE),
15
- re.compile(r'^__+', re.IGNORECASE),
16
- re.compile(r'^-\s?-\s?-\s?Original Message\s?-\s?-\s?-$', re.IGNORECASE),
17
  re.compile(r'^Begin forwarded message:', re.IGNORECASE),
18
- re.compile(r'^Forwarded message', re.IGNORECASE),
19
-
20
- # Italian patterns specific to the example email
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  re.compile(r'^Il .* ha scritto:', re.IGNORECASE),
22
  re.compile(r'^Da:\s', re.IGNORECASE),
23
  re.compile(r'^A:\s', re.IGNORECASE),
24
  re.compile(r'^Oggetto:\s', re.IGNORECASE),
25
  re.compile(r'^Data:\s', re.IGNORECASE),
26
  re.compile(r'^Messaggio originale', re.IGNORECASE),
27
-
28
- # General timestamp patterns
29
- re.compile(r'^\w{3,9} \d{1,2}, \d{4}, \d{1,2}:\d{2} GMT[+-]?\d*', re.IGNORECASE),
30
  ]
31
 
32
  def remove_quoted_text(soup):
33
  """Remove quoted text sections from the email HTML content."""
34
- # Remove any divs with inline styles that indicate quoted text (like dotted borders)
35
- for quote in soup.find_all('td', style=lambda value: value and 'dotted' in value):
36
- quote.decompose()
37
 
38
- # Remove common quoted sections based on the structure
39
- for quote in soup.find_all('div', class_='zd-comment'):
40
- quote.decompose()
41
 
42
- # Additional cleaning specific to the example provided
43
- for quote in soup.find_all('div', style=lambda value: value and 'color:#aaaaaa' in value):
44
- quote.decompose()
45
 
46
  return soup
47
 
 
2
  from bs4 import BeautifulSoup
3
  import streamlit as st
4
 
5
+ # Define patterns that indicate the start of a previous message in multiple languages
6
  PRIOR_MESSAGE_MARKERS = [
7
  # English patterns
8
+ re.compile(r'^From:', re.IGNORECASE),
9
+ re.compile(r'^Sent:', re.IGNORECASE),
10
+ re.compile(r'^Subject:', re.IGNORECASE),
11
+ re.compile(r'^To:', re.IGNORECASE),
12
+ re.compile(r'^Date:', re.IGNORECASE),
13
  re.compile(r'^On .* wrote:', re.IGNORECASE),
 
 
 
 
 
14
  re.compile(r'^----\s?Original Message\s?----$', re.IGNORECASE),
 
 
15
  re.compile(r'^Begin forwarded message:', re.IGNORECASE),
16
+
17
+ # Portuguese patterns
18
+ re.compile(r'^Em .* escreveu:', re.IGNORECASE),
19
+ re.compile(r'^De:\s', re.IGNORECASE),
20
+ re.compile(r'^Para:\s', re.IGNORECASE),
21
+ re.compile(r'^Data:\s', re.IGNORECASE),
22
+ re.compile(r'^Assunto:\s', re.IGNORECASE),
23
+ re.compile(r'^Mensagem original', re.IGNORECASE),
24
+
25
+ # French patterns
26
+ re.compile(r'^De :\s', re.IGNORECASE), # "From:"
27
+ re.compile(r'^Le .* a écrit :', re.IGNORECASE), # "On DATE, NAME wrote:"
28
+
29
+ # German patterns
30
+ re.compile(r'^Am .* schrieb.*:', re.IGNORECASE), # "On DATE, NAME wrote:"
31
+ re.compile(r'^Von:\s', re.IGNORECASE), # "From:"
32
+
33
+ # Spanish patterns
34
+ re.compile(r'^El .* escribió:', re.IGNORECASE), # "On DATE, NAME wrote:"
35
+
36
+ # Chinese patterns
37
+ re.compile(r'^历史邮件$', re.IGNORECASE), # "Historical Emails"
38
+
39
+ # Dutch patterns
40
+ re.compile(r'^Op .* schreef.*:', re.IGNORECASE),
41
+ re.compile(r'^Van:\s', re.IGNORECASE),
42
+ re.compile(r'^Aan:\s', re.IGNORECASE),
43
+ re.compile(r'^Onderwerp:\s', re.IGNORECASE),
44
+ re.compile(r'^Verzonden:\s', re.IGNORECASE),
45
+ re.compile(r'^Oorspronkelijk bericht', re.IGNORECASE),
46
+
47
+ # Italian patterns
48
  re.compile(r'^Il .* ha scritto:', re.IGNORECASE),
49
  re.compile(r'^Da:\s', re.IGNORECASE),
50
  re.compile(r'^A:\s', re.IGNORECASE),
51
  re.compile(r'^Oggetto:\s', re.IGNORECASE),
52
  re.compile(r'^Data:\s', re.IGNORECASE),
53
  re.compile(r'^Messaggio originale', re.IGNORECASE),
 
 
 
54
  ]
55
 
56
  def remove_quoted_text(soup):
57
  """Remove quoted text sections from the email HTML content."""
58
+ # Remove blockquotes or quoted sections (typical for email threads)
59
+ for blockquote in soup.find_all('blockquote'):
60
+ blockquote.decompose()
61
 
62
+ # Remove any divs that might indicate forwarded or quoted messages
63
+ for div in soup.find_all('div', class_='ms-outlook-mobile-reference-message'):
64
+ div.decompose()
65
 
66
+ # Remove horizontal rules (often used to separate replies)
67
+ for hr in soup.find_all('hr'):
68
+ hr.decompose()
69
 
70
  return soup
71