Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
from bs4 import BeautifulSoup
|
3 |
+
import streamlit as st
|
4 |
+
|
5 |
+
# Patterns that indicate the first line of a previous (quoted) message.
#
# Each pattern is meant to be applied with ``.match()`` to one
# whitespace-stripped line of text, so every pattern is anchored at ``^``.
#
# Notes on the pattern shapes:
# * Header labels use ``(?:\s|$)`` after the colon so that a label standing
#   alone on its line (e.g. ``From:`` extracted from ``<b>From:</b> Name``)
#   is recognised as well as ``From: Name``.
# * Separator lines accept any run of leading dashes, because mail clients
#   differ in how many they emit (``-----Original Message-----``,
#   ``---------- Forwarded message ---------``).
PRIOR_MESSAGE_MARKERS = [
    # English patterns
    re.compile(r'^On .* wrote:', re.IGNORECASE),
    re.compile(r'^From:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Sent:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Subject:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^To:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Date:(?:\s|$)', re.IGNORECASE),
    # Two or more dashes on each side, e.g. "-----Original Message-----".
    re.compile(r'^-{2,}\s*Original Message\s*-{2,}$', re.IGNORECASE),
    re.compile(r'^__+', re.IGNORECASE),
    # Spaced-out variant, e.g. "- - - Original Message - - -".
    re.compile(r'^-\s?-\s?-\s?Original Message\s?-\s?-\s?-$', re.IGNORECASE),
    re.compile(r'^Begin forwarded message:', re.IGNORECASE),
    re.compile(r'^-*\s*Forwarded message', re.IGNORECASE),

    # Portuguese patterns
    re.compile(r'^Em .* escreveu:', re.IGNORECASE),
    re.compile(r'^De:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Para:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Data:(?:\s|$)', re.IGNORECASE),  # also the Italian label
    re.compile(r'^Assunto:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^-*\s*Mensagem original', re.IGNORECASE),

    # Dutch patterns
    re.compile(r'^Op .* schreef.*:', re.IGNORECASE),
    re.compile(r'^Van:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Aan:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Onderwerp:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Verzonden:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^-*\s*Oorspronkelijk bericht', re.IGNORECASE),

    # Italian patterns ("Data:" is already covered by the Portuguese entry)
    re.compile(r'^Il .* ha scritto:', re.IGNORECASE),
    re.compile(r'^Da:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^A:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^Oggetto:(?:\s|$)', re.IGNORECASE),
    re.compile(r'^-*\s*Messaggio originale', re.IGNORECASE),
]
|
44 |
+
|
45 |
+
def remove_quoted_text(soup):
    """Delete quoted-reply containers from a parsed email document.

    For each selector below, every element returned by ``soup.find_all`` is
    destroyed with ``decompose()``. The tree is modified in place and the
    same ``soup`` object is returned.
    """
    # (tag name, keyword filters for find_all), processed in this order.
    quoted_selectors = (
        ('div', {'class_': 'gmail_quote'}),           # Gmail quoted text
        ('div', {'class_': 'gmail_extra'}),           # Gmail extra
        ('div', {'class_': 'OutlookMessageHeader'}),  # Outlook quoted text
        ('blockquote', {}),                           # blockquotes
        ('div', {'class_': 'yahoo_quoted'}),          # Yahoo quoted text
        ('div', {'id': 'reply-intro'}),               # reply intros
        ('div', {'class_': 'moz-cite-prefix'}),       # Mozilla's quoted text
    )

    for tag_name, filters in quoted_selectors:
        for element in soup.find_all(tag_name, **filters):
            element.decompose()

    return soup
|
76 |
+
|
77 |
+
def extract_latest_message_from_lines(lines):
    """Return the text that precedes the first prior-message marker.

    Each line is whitespace-stripped and tested against every pattern in
    ``PRIOR_MESSAGE_MARKERS``. Collection stops at the first line that
    matches; the lines gathered up to that point are joined with newlines
    and the joined text is stripped before being returned.
    """
    kept = []
    for raw_line in lines:
        text = raw_line.strip()

        starts_prior_message = False
        for pattern in PRIOR_MESSAGE_MARKERS:
            if pattern.match(text):
                starts_prior_message = True
                break

        if starts_prior_message:
            # This line and everything after it is left out of the result.
            break

        kept.append(text)

    joined = '\n'.join(kept)
    return joined.strip()
|
88 |
+
|
89 |
+
def extract_latest_email_text(email_html):
    """
    Return the newest message of an HTML email as plain text.

    The HTML is parsed with the ``html.parser`` backend, quoted sections are
    removed by ``remove_quoted_text``, the remaining text is extracted one
    fragment per line, and ``extract_latest_message_from_lines`` cuts it off
    at the first prior-message marker.
    """
    parsed = remove_quoted_text(BeautifulSoup(email_html, 'html.parser'))

    # One stripped text fragment per line.
    plain_text = parsed.get_text(separator='\n', strip=True)

    return extract_latest_message_from_lines(plain_text.split('\n'))
|
108 |
+
|
109 |
+
# Streamlit app
|
110 |
+
def main():
    """Render the Streamlit page: input area, button, and extracted result."""
    st.title("Email Latest Message Extractor")

    intro = (
        "\n"
        "This tool extracts the latest message from an HTML email and removes any quoted thread or previous messages.\n"
        "Paste the raw HTML of your email in the text area below, and the tool will parse and display the latest message.\n"
    )
    st.write(intro)

    # Raw HTML pasted by the user.
    email_html = st.text_area("Paste the HTML email content here", height=300)

    # Nothing to do until the button is pressed.
    if not st.button("Extract Latest Message"):
        return

    # Blank or whitespace-only input: prompt instead of processing.
    if not email_html.strip():
        st.warning("Please paste the HTML content of the email.")
        return

    try:
        latest_message = extract_latest_email_text(email_html)
        st.subheader("Extracted Latest Message:")
        st.text_area("Latest Message", value=latest_message, height=200)
    except Exception as e:
        # Show the failure in the page rather than letting it propagate.
        st.error(f"An error occurred: {e}")
|
132 |
+
|
133 |
+
# Build the page only when this file is executed as the main script.
if __name__ == "__main__":
    main()
|