albertoarrigoni committed on
Commit
0a4c2f3
·
verified ·
1 Parent(s): 8f10e35

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +134 -0
app.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from bs4 import BeautifulSoup
3
+ import streamlit as st
4
+
5
# Patterns that indicate the start of a previous (quoted or forwarded) message.
# Every pattern is anchored with ``^`` and compiled case-insensitively; the
# result is a list of compiled patterns intended to be used via ``.match``.
PRIOR_MESSAGE_MARKERS = [
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        # English patterns
        r'^On .* wrote:',
        r'^From:\s',
        r'^Sent:\s',
        r'^Subject:\s',
        r'^To:\s',
        r'^Date:\s',
        # Accept any run of two or more dashes on each side: Outlook emits
        # five ("-----Original Message-----"), which a fixed "----" misses.
        r'^-{2,}\s*Original Message\s*-{2,}$',
        r'^__+',
        # Spaced-out variant such as "- - - Original Message - - -".
        r'^-\s?-\s?-\s?Original Message\s?-\s?-\s?-$',
        r'^Begin forwarded message:',
        # Optional leading dashes cover "---------- Forwarded message ---------".
        r'^-*\s*Forwarded message',

        # Portuguese patterns
        r'^Em .* escreveu:',
        r'^De:\s',
        r'^Para:\s',
        r'^Data:\s',
        r'^Assunto:\s',
        r'^-*\s*Mensagem original',

        # Dutch patterns
        r'^Op .* schreef.*:',
        r'^Van:\s',
        r'^Aan:\s',
        r'^Onderwerp:\s',
        r'^Verzonden:\s',
        r'^-*\s*Oorspronkelijk bericht',

        # Italian patterns ("Data:" is already covered by the Portuguese entry)
        r'^Il .* ha scritto:',
        r'^Da:\s',
        r'^A:\s',
        r'^Oggetto:\s',
        r'^-*\s*Messaggio originale',
    )
]
44
+
45
def remove_quoted_text(soup):
    """Delete known quoted-reply containers from a parsed email document.

    The given ``soup`` is modified in place (matching elements are
    decomposed) and the same object is returned.
    """
    # (tag name, find_all keyword filters), processed in this order.
    quoted_selectors = (
        ('div', {'class_': 'gmail_quote'}),           # Gmail quoted text
        ('div', {'class_': 'gmail_extra'}),           # Gmail extra
        ('div', {'class_': 'OutlookMessageHeader'}),  # Outlook quoted text
        ('blockquote', {}),                           # any blockquote
        ('div', {'class_': 'yahoo_quoted'}),          # Yahoo quoted text
        ('div', {'id': 'reply-intro'}),               # reply intros
        ('div', {'class_': 'moz-cite-prefix'}),       # Mozilla cite prefix
    )

    for tag_name, filters in quoted_selectors:
        # Search again for every selector so elements already removed by an
        # earlier selector are not revisited.
        for node in soup.find_all(tag_name, **filters):
            node.decompose()

    return soup
76
+
77
def extract_latest_message_from_lines(lines, markers=None):
    """Return the text that precedes the first prior-message marker.

    Each line is stripped of surrounding whitespace; iteration stops at the
    first stripped line that matches any marker, and that line and everything
    after it are discarded.

    Args:
        lines: Iterable of text lines, in document order.
        markers: Optional iterable of compiled patterns (objects exposing a
            ``match`` method). When omitted, ``PRIOR_MESSAGE_MARKERS`` is used.

    Returns:
        The kept lines joined with ``'\\n'``, with leading and trailing
        whitespace removed from the joined result. An empty string is
        returned when nothing is kept.
    """
    if markers is None:
        markers = PRIOR_MESSAGE_MARKERS
    # Materialise once so a one-shot iterable is still usable for every line.
    markers = tuple(markers)

    latest_message_lines = []
    for line in lines:
        line = line.strip()
        if any(marker.match(line) for marker in markers):
            break  # Everything from here on belongs to an earlier message.
        latest_message_lines.append(line)
    return '\n'.join(latest_message_lines).strip()
88
+
89
def extract_latest_email_text(email_html):
    """
    Return the plain text of the newest message in an HTML email.

    The HTML is parsed with the 'html.parser' backend, quoted sections are
    removed by remove_quoted_text, the remaining text is split into lines,
    and extract_latest_message_from_lines keeps only the lines that come
    before the first prior-message marker.
    """
    # Parse and drop the quoted containers in one step.
    cleaned = remove_quoted_text(BeautifulSoup(email_html, 'html.parser'))

    # One text fragment per line, each fragment already stripped.
    text_lines = cleaned.get_text(separator='\n', strip=True).split('\n')

    return extract_latest_message_from_lines(text_lines)
108
+
109
# Streamlit app
def main():
    """Render the extractor page and handle the extract button.

    Shows a title, a short description and a text area for the raw HTML.
    When the button is pressed, either the extracted message is displayed,
    a warning is shown for blank input, or an error is shown if extraction
    raises.
    """
    st.title("Email Latest Message Extractor")

    st.write("""
    This tool extracts the latest message from an HTML email and removes any quoted thread or previous messages.
    Paste the raw HTML of your email in the text area below, and the tool will parse and display the latest message.
    """)

    # Input field for the raw HTML email content
    email_html = st.text_area("Paste the HTML email content here", height=300)

    # Nothing to do until the button is pressed.
    if not st.button("Extract Latest Message"):
        return

    # Blank or whitespace-only input: ask the user for content.
    if not email_html.strip():
        st.warning("Please paste the HTML content of the email.")
        return

    try:
        latest_message = extract_latest_email_text(email_html)
        st.subheader("Extracted Latest Message:")
        st.text_area("Latest Message", value=latest_message, height=200)
    except Exception as e:
        # Top-level UI boundary: report the failure on the page.
        st.error(f"An error occurred: {e}")


if __name__ == "__main__":
    main()