File size: 5,286 Bytes
0a4c2f3
 
 
 
7b35151
0a4c2f3
 
7b35151
 
 
 
 
0a4c2f3
 
 
4c6a55e
75a6666
f69ab93
7b35151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0a4c2f3
 
 
 
 
 
 
 
 
 
7b35151
 
 
0a4c2f3
7b35151
 
 
0a4c2f3
7b35151
 
 
0a4c2f3
4c6a55e
 
f69ab93
4c6a55e
 
0a4c2f3
 
 
 
 
 
 
 
f69ab93
 
 
0a4c2f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import re
from bs4 import BeautifulSoup
import streamlit as st

# Patterns that indicate the first line of a previous (quoted) message, grouped
# by language. Each pattern is applied with ``match`` to a single line that has
# already been stripped of surrounding whitespace. Header labels are therefore
# written as ``Label:(?:\s|$)`` so they match both "De: Name" and a bare "De:"
# line; a plain trailing ``\s`` could never match a label that is alone on its
# (stripped) line.
PRIOR_MESSAGE_MARKERS = [
    # English patterns
    re.compile(r'^From:', re.IGNORECASE),
    re.compile(r'^Sent:', re.IGNORECASE),
    re.compile(r'^Subject:', re.IGNORECASE),
    re.compile(r'^To:', re.IGNORECASE),
    re.compile(r'^Date:', re.IGNORECASE),
    re.compile(r'^On .* wrote:', re.IGNORECASE),
    re.compile(r'^----\s?Original Message\s?----$', re.IGNORECASE),
    re.compile(r'^Begin forwarded message:', re.IGNORECASE),

    # Custom separators in email (like lines of dashes or borders).
    # Matches any line that both starts and ends with a dash, e.g.
    # "--------------------" or "-----Original Message-----".
    re.compile(r'^-+.*-+$'),

    # Portuguese patterns
    re.compile(r'^Em .* escreveu:', re.IGNORECASE),
    re.compile(r'^De:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Para:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Data:(?:\s|$)', re.IGNORECASE),  # also the Italian "Date:" label
    re.compile(r'^Assunto:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Mensagem original', re.IGNORECASE),

    # French patterns
    re.compile(r'^De :(?:\s|$)', re.IGNORECASE),  # "From:"
    re.compile(r'^Le .* a écrit :', re.IGNORECASE),  # "On DATE, NAME wrote:"

    # German patterns
    re.compile(r'^Am .* schrieb.*:', re.IGNORECASE),  # "On DATE, NAME wrote:"
    re.compile(r'^Von:(?:\s|$)', re.IGNORECASE),  # "From:"

    # Spanish patterns
    re.compile(r'^El .* escribió:', re.IGNORECASE),  # "On DATE, NAME wrote:"

    # Chinese patterns
    re.compile(r'^历史邮件$', re.IGNORECASE),  # "Historical Emails"

    # Dutch patterns
    re.compile(r'^Op .* schreef.*:', re.IGNORECASE),
    re.compile(r'^Van:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Aan:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Onderwerp:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Verzonden:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Oorspronkelijk bericht', re.IGNORECASE),

    # Italian patterns ("Data:" is already covered by the Portuguese entry)
    re.compile(r'^Il .* ha scritto:', re.IGNORECASE),
    re.compile(r'^Da:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^A:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Oggetto:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Messaggio originale', re.IGNORECASE),
]

def _is_dotted_border_table(tag):
    """Return True for a <table> whose inline style has the dotted top border."""
    return tag.name == 'table' and 'border-top:1px dotted' in (tag.get('style') or '')

def _decompose_all(soup, *args, **kwargs):
    """Destroy every element that ``soup.find(*args, **kwargs)`` can locate.

    The tree is searched again after each removal instead of iterating over a
    ``find_all`` snapshot. A snapshot may still contain elements nested inside
    one that was just destroyed, and a decomposed element must not be
    inspected or decomposed a second time.
    """
    node = soup.find(*args, **kwargs)
    while node is not None:
        node.decompose()
        node = soup.find(*args, **kwargs)

def remove_quoted_text(soup):
    """Remove quoted text sections from the email HTML content.

    The following elements are destroyed in place:
      * every ``<blockquote>``,
      * every ``<div class="ms-outlook-mobile-reference-message">``,
      * every ``<hr>``,
      * every ``<table>`` whose ``style`` attribute contains
        ``border-top:1px dotted``.

    Args:
        soup: Parsed document (a BeautifulSoup object); it is modified in place.

    Returns:
        The same ``soup`` object, for call chaining.
    """
    # Blockquotes are the typical container for quoted email threads.
    _decompose_all(soup, 'blockquote')

    # Divs that might indicate forwarded or quoted messages.
    _decompose_all(soup, 'div', class_='ms-outlook-mobile-reference-message')

    # Horizontal rules (often used to separate replies).
    _decompose_all(soup, 'hr')

    # Tables with dotted borders (a typical marker of a previous conversation).
    _decompose_all(soup, _is_dotted_border_table)

    return soup

def extract_latest_message_from_lines(lines):
    """Extract the latest message from the list of lines.

    Lines are consumed in order. Each one is stripped; ``None`` entries and
    blank lines are skipped. Collection stops at the first line that matches
    one of ``PRIOR_MESSAGE_MARKERS``; that line and everything after it are
    discarded.

    Args:
        lines: Iterable of text lines (entries may be ``None``).

    Returns:
        The kept lines joined with newlines, or ``''`` when nothing was kept.
    """
    latest_message_lines = []
    for raw_line in lines:
        # The None guard has to run before strip(); calling strip() on None
        # raises AttributeError, which made the previous guard unreachable.
        if raw_line is None:
            continue
        line = raw_line.strip()
        if not line:
            continue
        # Stop as soon as a prior-message marker is found.
        if any(marker.match(line) for marker in PRIOR_MESSAGE_MARKERS):
            break
        latest_message_lines.append(line)
    return '\n'.join(latest_message_lines).strip()

def extract_latest_email_text(email_html):
    """
    Return the text of the newest message contained in an HTML email.

    The markup is parsed with the 'html.parser' backend, quoted sections are
    removed via remove_quoted_text, and the remaining text is split into
    lines that are handed to extract_latest_message_from_lines.
    """
    # Parse, then drop the quoted parts of the document.
    cleaned_soup = remove_quoted_text(BeautifulSoup(email_html, 'html.parser'))

    # One stripped text fragment per line.
    text_lines = cleaned_soup.get_text(separator='\n', strip=True).split('\n')

    # Keep only what precedes the first prior-message marker.
    return extract_latest_message_from_lines(text_lines)

# Streamlit app
def main():
    """Render the page: title, intro text, HTML input box and extract button.

    When the button is pressed with non-blank input, the extracted message is
    shown in a second text area. Blank input produces a warning, and any
    exception raised while extracting or rendering is reported with st.error.
    """
    intro = """
        This tool extracts the latest message from an HTML email and removes any quoted thread or previous messages.
        Paste the raw HTML of your email in the text area below, and the tool will parse and display the latest message.
    """
    st.title("Email Latest Message Extractor")
    st.write(intro)

    # Input field for the raw HTML email content
    pasted_html = st.text_area("Paste the HTML email content here", height=300)

    # Nothing more to do until the button is pressed.
    if not st.button("Extract Latest Message"):
        return

    if not pasted_html.strip():
        st.warning("Please paste the HTML content of the email.")
        return

    try:
        extracted = extract_latest_email_text(pasted_html)
        st.subheader("Extracted Latest Message:")
        st.text_area("Latest Message", value=extracted, height=200)
    except Exception as e:
        st.error(f"An error occurred: {e}")

if __name__ == "__main__":
    main()