Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -2,46 +2,70 @@ import re
|
|
2 |
from bs4 import BeautifulSoup
|
3 |
import streamlit as st
|
4 |
|
5 |
-
# Define patterns that indicate the start of a previous message
|
6 |
PRIOR_MESSAGE_MARKERS = [
|
7 |
# English patterns
|
|
|
|
|
|
|
|
|
|
|
8 |
re.compile(r'^On .* wrote:', re.IGNORECASE),
|
9 |
-
re.compile(r'^From:\s', re.IGNORECASE),
|
10 |
-
re.compile(r'^Sent:\s', re.IGNORECASE),
|
11 |
-
re.compile(r'^Subject:\s', re.IGNORECASE),
|
12 |
-
re.compile(r'^To:\s', re.IGNORECASE),
|
13 |
-
re.compile(r'^Date:\s', re.IGNORECASE),
|
14 |
re.compile(r'^----\s?Original Message\s?----$', re.IGNORECASE),
|
15 |
-
re.compile(r'^__+', re.IGNORECASE),
|
16 |
-
re.compile(r'^-\s?-\s?-\s?Original Message\s?-\s?-\s?-$', re.IGNORECASE),
|
17 |
re.compile(r'^Begin forwarded message:', re.IGNORECASE),
|
18 |
-
|
19 |
-
|
20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
re.compile(r'^Il .* ha scritto:', re.IGNORECASE),
|
22 |
re.compile(r'^Da:\s', re.IGNORECASE),
|
23 |
re.compile(r'^A:\s', re.IGNORECASE),
|
24 |
re.compile(r'^Oggetto:\s', re.IGNORECASE),
|
25 |
re.compile(r'^Data:\s', re.IGNORECASE),
|
26 |
re.compile(r'^Messaggio originale', re.IGNORECASE),
|
27 |
-
|
28 |
-
# General timestamp patterns
|
29 |
-
re.compile(r'^\w{3,9} \d{1,2}, \d{4}, \d{1,2}:\d{2} GMT[+-]?\d*', re.IGNORECASE),
|
30 |
]
|
31 |
|
32 |
def remove_quoted_text(soup):
|
33 |
"""Remove quoted text sections from the email HTML content."""
|
34 |
-
# Remove
|
35 |
-
for
|
36 |
-
|
37 |
|
38 |
-
# Remove
|
39 |
-
for
|
40 |
-
|
41 |
|
42 |
-
#
|
43 |
-
for
|
44 |
-
|
45 |
|
46 |
return soup
|
47 |
|
|
|
2 |
from bs4 import BeautifulSoup
|
3 |
import streamlit as st
|
4 |
|
5 |
+
# Define patterns that indicate the start of a previous message in multiple languages.
# Every pattern is anchored with ``^`` (start of the tested string) and compiled
# case-insensitively. Non-ASCII literals are written as real Unicode text: the
# French/Spanish/Chinese entries must contain "é", "ó" and "历史邮件" exactly,
# otherwise they can never match an actual email header.
PRIOR_MESSAGE_MARKERS = [
    # English patterns
    re.compile(r'^From:', re.IGNORECASE),
    re.compile(r'^Sent:', re.IGNORECASE),
    re.compile(r'^Subject:', re.IGNORECASE),
    re.compile(r'^To:', re.IGNORECASE),
    re.compile(r'^Date:', re.IGNORECASE),
    re.compile(r'^On .* wrote:', re.IGNORECASE),
    re.compile(r'^----\s?Original Message\s?----$', re.IGNORECASE),
    re.compile(r'^Begin forwarded message:', re.IGNORECASE),

    # Portuguese patterns
    re.compile(r'^Em .* escreveu:', re.IGNORECASE),
    re.compile(r'^De:\s', re.IGNORECASE),
    re.compile(r'^Para:\s', re.IGNORECASE),
    re.compile(r'^Data:\s', re.IGNORECASE),
    re.compile(r'^Assunto:\s', re.IGNORECASE),
    re.compile(r'^Mensagem original', re.IGNORECASE),

    # French patterns
    re.compile(r'^De :\s', re.IGNORECASE),  # "From:"
    re.compile(r'^Le .* a écrit :', re.IGNORECASE),  # "On DATE, NAME wrote:"

    # German patterns
    re.compile(r'^Am .* schrieb.*:', re.IGNORECASE),  # "On DATE, NAME wrote:"
    re.compile(r'^Von:\s', re.IGNORECASE),  # "From:"

    # Spanish patterns
    re.compile(r'^El .* escribió:', re.IGNORECASE),  # "On DATE, NAME wrote:"

    # Chinese patterns
    re.compile(r'^历史邮件$', re.IGNORECASE),  # "Historical Emails"

    # Dutch patterns
    re.compile(r'^Op .* schreef.*:', re.IGNORECASE),
    re.compile(r'^Van:\s', re.IGNORECASE),
    re.compile(r'^Aan:\s', re.IGNORECASE),
    re.compile(r'^Onderwerp:\s', re.IGNORECASE),
    re.compile(r'^Verzonden:\s', re.IGNORECASE),
    re.compile(r'^Oorspronkelijk bericht', re.IGNORECASE),

    # Italian patterns
    re.compile(r'^Il .* ha scritto:', re.IGNORECASE),
    re.compile(r'^Da:\s', re.IGNORECASE),
    re.compile(r'^A:\s', re.IGNORECASE),
    re.compile(r'^Oggetto:\s', re.IGNORECASE),
    # Same pattern as the Portuguese "Data:" entry; kept so the list keeps
    # its per-language grouping and length.
    re.compile(r'^Data:\s', re.IGNORECASE),
    re.compile(r'^Messaggio originale', re.IGNORECASE),
]
|
55 |
|
56 |
def remove_quoted_text(soup):
    """Strip quoted or forwarded material from parsed email HTML.

    The tree is modified in place and the same object is returned.

    Args:
        soup: Parsed HTML tree exposing ``find_all``; matched nodes are
            destroyed with ``decompose()``.

    Returns:
        The same ``soup`` object, with the matched elements removed.
    """
    # (tag name, extra find_all filters), processed strictly in this order so
    # each lookup runs only after the previous group has been removed.
    removal_targets = (
        # Blockquotes or quoted sections (typical for email threads)
        ('blockquote', {}),
        # Divs that might indicate forwarded or quoted messages
        ('div', {'class_': 'ms-outlook-mobile-reference-message'}),
        # Horizontal rules (often used to separate replies)
        ('hr', {}),
    )

    for tag_name, filters in removal_targets:
        for node in soup.find_all(tag_name, **filters):
            node.decompose()

    return soup
|
71 |
|