albertoarrigoni committed on
Commit
4c6a55e
·
verified ·
1 Parent(s): 7b35151

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -0
app.py CHANGED
@@ -13,6 +13,9 @@ PRIOR_MESSAGE_MARKERS = [
13
  re.compile(r'^On .* wrote:', re.IGNORECASE),
14
  re.compile(r'^----\s?Original Message\s?----$', re.IGNORECASE),
15
  re.compile(r'^Begin forwarded message:', re.IGNORECASE),
 
 
 
16
 
17
  # Portuguese patterns
18
  re.compile(r'^Em .* escreveu:', re.IGNORECASE),
@@ -67,6 +70,11 @@ def remove_quoted_text(soup):
67
  for hr in soup.find_all('hr'):
68
  hr.decompose()
69
 
 
 
 
 
 
70
  return soup
71
 
72
  def extract_latest_message_from_lines(lines):
 
13
  re.compile(r'^On .* wrote:', re.IGNORECASE),
14
  re.compile(r'^----\s?Original Message\s?----$', re.IGNORECASE),
15
  re.compile(r'^Begin forwarded message:', re.IGNORECASE),
16
+
17
+ # Custom separators in email
18
+ re.compile(r'^-+.*-+$'), # For lines like "----------------------------------------------------------------------------------------------------------------"
19
 
20
  # Portuguese patterns
21
  re.compile(r'^Em .* escreveu:', re.IGNORECASE),
 
70
  for hr in soup.find_all('hr'):
71
  hr.decompose()
72
 
73
+ # Remove tables with dotted borders (a typical marker of a previous conversation)
74
+ for table in soup.find_all('table'):
75
+ if 'border-top:1px dotted' in str(table):
76
+ table.decompose()
77
+
78
  return soup
79
 
80
  def extract_latest_message_from_lines(lines):