albertoarrigoni committed on
Commit
36c634e
verified
1 Parent(s): 5e4312a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -53
app.py CHANGED
@@ -17,75 +17,32 @@ PRIOR_MESSAGE_MARKERS = [
17
  re.compile(r'^Begin forwarded message:', re.IGNORECASE),
18
  re.compile(r'^Forwarded message', re.IGNORECASE),
19
 
20
- # Portuguese patterns
21
- re.compile(r'^Em .* escreveu:', re.IGNORECASE),
22
- re.compile(r'^De:\s', re.IGNORECASE),
23
- re.compile(r'^Para:\s', re.IGNORECASE),
24
- re.compile(r'^Data:\s', re.IGNORECASE),
25
- re.compile(r'^Assunto:\s', re.IGNORECASE),
26
- re.compile(r'^Mensagem original', re.IGNORECASE),
27
-
28
- # French patterns
29
- re.compile(r'^De :\s', re.IGNORECASE), # "From:"
30
- re.compile(r'^Le .* a écrit :', re.IGNORECASE), # "On DATE, NAME wrote:"
31
-
32
- # German patterns
33
- re.compile(r'^Am .* schrieb.*:', re.IGNORECASE), # "On DATE, NAME wrote:"
34
- re.compile(r'^Von:\s', re.IGNORECASE), # "From:"
35
-
36
- # Spanish patterns
37
- re.compile(r'^El .* escribió:', re.IGNORECASE), # "On DATE, NAME wrote:"
38
-
39
- # Chinese patterns
40
- re.compile(r'^历史邮件$', re.IGNORECASE), # "Historical Emails"
41
-
42
- # Dutch patterns
43
- re.compile(r'^Op .* schreef.*:', re.IGNORECASE),
44
- re.compile(r'^Van:\s', re.IGNORECASE),
45
- re.compile(r'^Aan:\s', re.IGNORECASE),
46
- re.compile(r'^Onderwerp:\s', re.IGNORECASE),
47
- re.compile(r'^Verzonden:\s', re.IGNORECASE),
48
- re.compile(r'^Oorspronkelijk bericht', re.IGNORECASE),
49
-
50
- # Italian patterns
51
  re.compile(r'^Il .* ha scritto:', re.IGNORECASE),
52
  re.compile(r'^Da:\s', re.IGNORECASE),
53
  re.compile(r'^A:\s', re.IGNORECASE),
54
  re.compile(r'^Oggetto:\s', re.IGNORECASE),
55
  re.compile(r'^Data:\s', re.IGNORECASE),
56
  re.compile(r'^Messaggio originale', re.IGNORECASE),
 
 
 
57
  ]
58
 
59
  def remove_quoted_text(soup):
60
  """Remove quoted text sections from the email HTML content."""
61
- # Remove Gmail quoted text
62
- for quote in soup.find_all('div', class_='gmail_quote'):
63
  quote.decompose()
64
 
65
- # Remove Gmail extra
66
- for extra in soup.find_all('div', class_='gmail_extra'):
67
- extra.decompose()
68
-
69
- # Remove Outlook quoted text
70
- for quote in soup.find_all('div', class_='OutlookMessageHeader'):
71
  quote.decompose()
72
 
73
- # Remove blockquotes
74
- for blockquote in soup.find_all('blockquote'):
75
- blockquote.decompose()
76
-
77
- # Remove Yahoo quoted text
78
- for quote in soup.find_all('div', class_='yahoo_quoted'):
79
  quote.decompose()
80
 
81
- # Remove reply intros
82
- for intro in soup.find_all('div', id='reply-intro'):
83
- intro.decompose()
84
-
85
- # Remove Mozilla's quoted text
86
- for cite in soup.find_all('div', class_='moz-cite-prefix'):
87
- cite.decompose()
88
-
89
  return soup
90
 
91
  def extract_latest_message_from_lines(lines):
 
17
  re.compile(r'^Begin forwarded message:', re.IGNORECASE),
18
  re.compile(r'^Forwarded message', re.IGNORECASE),
19
 
20
+ # Italian patterns specific to the example email
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  re.compile(r'^Il .* ha scritto:', re.IGNORECASE),
22
  re.compile(r'^Da:\s', re.IGNORECASE),
23
  re.compile(r'^A:\s', re.IGNORECASE),
24
  re.compile(r'^Oggetto:\s', re.IGNORECASE),
25
  re.compile(r'^Data:\s', re.IGNORECASE),
26
  re.compile(r'^Messaggio originale', re.IGNORECASE),
27
+
28
+ # General timestamp patterns
29
+ re.compile(r'^\w{3,9} \d{1,2}, \d{4}, \d{1,2}:\d{2} GMT[+-]?\d*', re.IGNORECASE),
30
  ]
31
 
32
  def remove_quoted_text(soup):
33
  """Remove quoted text sections from the email HTML content."""
34
+ # Remove any table cells (td) with inline styles that indicate quoted text (like dotted borders)
35
+ for quote in soup.find_all('td', style=lambda value: value and 'dotted' in value):
36
  quote.decompose()
37
 
38
+ # Remove common quoted sections based on the structure
39
+ for quote in soup.find_all('div', class_='zd-comment'):
 
 
 
 
40
  quote.decompose()
41
 
42
+ # Additional cleaning specific to the example provided
43
+ for quote in soup.find_all('div', style=lambda value: value and 'color:#aaaaaa' in value):
 
 
 
 
44
  quote.decompose()
45
 
 
 
 
 
 
 
 
 
46
  return soup
47
 
48
  def extract_latest_message_from_lines(lines):