Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -17,75 +17,32 @@ PRIOR_MESSAGE_MARKERS = [
|
|
17 |
re.compile(r'^Begin forwarded message:', re.IGNORECASE),
|
18 |
re.compile(r'^Forwarded message', re.IGNORECASE),
|
19 |
|
20 |
-
#
|
21 |
-
re.compile(r'^Em .* escreveu:', re.IGNORECASE),
|
22 |
-
re.compile(r'^De:\s', re.IGNORECASE),
|
23 |
-
re.compile(r'^Para:\s', re.IGNORECASE),
|
24 |
-
re.compile(r'^Data:\s', re.IGNORECASE),
|
25 |
-
re.compile(r'^Assunto:\s', re.IGNORECASE),
|
26 |
-
re.compile(r'^Mensagem original', re.IGNORECASE),
|
27 |
-
|
28 |
-
# French patterns
|
29 |
-
re.compile(r'^De :\s', re.IGNORECASE), # "From:"
|
30 |
-
re.compile(r'^Le .* a écrit :', re.IGNORECASE), # "On DATE, NAME wrote:"
|
31 |
-
|
32 |
-
# German patterns
|
33 |
-
re.compile(r'^Am .* schrieb.*:', re.IGNORECASE), # "On DATE, NAME wrote:"
|
34 |
-
re.compile(r'^Von:\s', re.IGNORECASE), # "From:"
|
35 |
-
|
36 |
-
# Spanish patterns
|
37 |
-
re.compile(r'^El .* escribió:', re.IGNORECASE), # "On DATE, NAME wrote:"
|
38 |
-
|
39 |
-
# Chinese patterns
|
40 |
-
re.compile(r'^历史邮件$', re.IGNORECASE), # "Historical Emails"
|
41 |
-
|
42 |
-
# Dutch patterns
|
43 |
-
re.compile(r'^Op .* schreef.*:', re.IGNORECASE),
|
44 |
-
re.compile(r'^Van:\s', re.IGNORECASE),
|
45 |
-
re.compile(r'^Aan:\s', re.IGNORECASE),
|
46 |
-
re.compile(r'^Onderwerp:\s', re.IGNORECASE),
|
47 |
-
re.compile(r'^Verzonden:\s', re.IGNORECASE),
|
48 |
-
re.compile(r'^Oorspronkelijk bericht', re.IGNORECASE),
|
49 |
-
|
50 |
-
# Italian patterns
|
51 |
re.compile(r'^Il .* ha scritto:', re.IGNORECASE),
|
52 |
re.compile(r'^Da:\s', re.IGNORECASE),
|
53 |
re.compile(r'^A:\s', re.IGNORECASE),
|
54 |
re.compile(r'^Oggetto:\s', re.IGNORECASE),
|
55 |
re.compile(r'^Data:\s', re.IGNORECASE),
|
56 |
re.compile(r'^Messaggio originale', re.IGNORECASE),
|
|
|
|
|
|
|
57 |
]
|
58 |
|
59 |
def remove_quoted_text(soup):
|
60 |
"""Remove quoted text sections from the email HTML content."""
|
61 |
-
# Remove
|
62 |
-
for quote in soup.find_all('
|
63 |
quote.decompose()
|
64 |
|
65 |
-
# Remove
|
66 |
-
for
|
67 |
-
extra.decompose()
|
68 |
-
|
69 |
-
# Remove Outlook quoted text
|
70 |
-
for quote in soup.find_all('div', class_='OutlookMessageHeader'):
|
71 |
quote.decompose()
|
72 |
|
73 |
-
#
|
74 |
-
for
|
75 |
-
blockquote.decompose()
|
76 |
-
|
77 |
-
# Remove Yahoo quoted text
|
78 |
-
for quote in soup.find_all('div', class_='yahoo_quoted'):
|
79 |
quote.decompose()
|
80 |
|
81 |
-
# Remove reply intros
|
82 |
-
for intro in soup.find_all('div', id='reply-intro'):
|
83 |
-
intro.decompose()
|
84 |
-
|
85 |
-
# Remove Mozilla's quoted text
|
86 |
-
for cite in soup.find_all('div', class_='moz-cite-prefix'):
|
87 |
-
cite.decompose()
|
88 |
-
|
89 |
return soup
|
90 |
|
91 |
def extract_latest_message_from_lines(lines):
|
|
|
17 |
re.compile(r'^Begin forwarded message:', re.IGNORECASE),
|
18 |
re.compile(r'^Forwarded message', re.IGNORECASE),
|
19 |
|
20 |
+
# Italian patterns specific to the example email
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
re.compile(r'^Il .* ha scritto:', re.IGNORECASE),
|
22 |
re.compile(r'^Da:\s', re.IGNORECASE),
|
23 |
re.compile(r'^A:\s', re.IGNORECASE),
|
24 |
re.compile(r'^Oggetto:\s', re.IGNORECASE),
|
25 |
re.compile(r'^Data:\s', re.IGNORECASE),
|
26 |
re.compile(r'^Messaggio originale', re.IGNORECASE),
|
27 |
+
|
28 |
+
# General timestamp patterns
|
29 |
+
re.compile(r'^\w{3,9} \d{1,2}, \d{4}, \d{1,2}:\d{2} GMT[+-]?\d*', re.IGNORECASE),
|
30 |
]
|
31 |
|
32 |
def remove_quoted_text(soup):
|
33 |
"""Remove quoted text sections from the email HTML content."""
|
34 |
+
# Remove any table cells (td) with inline styles that indicate quoted text (like dotted borders)
|
35 |
+
for quote in soup.find_all('td', style=lambda value: value and 'dotted' in value):
|
36 |
quote.decompose()
|
37 |
|
38 |
+
# Remove common quoted sections based on the structure
|
39 |
+
for quote in soup.find_all('div', class_='zd-comment'):
|
|
|
|
|
|
|
|
|
40 |
quote.decompose()
|
41 |
|
42 |
+
# Additional cleaning specific to the example provided
|
43 |
+
for quote in soup.find_all('div', style=lambda value: value and 'color:#aaaaaa' in value):
|
|
|
|
|
|
|
|
|
44 |
quote.decompose()
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
return soup
|
47 |
|
48 |
def extract_latest_message_from_lines(lines):
|