Spaces:
Runtime error
Runtime error
allenchienxxx
committed on
Commit
·
ae55d84
1
Parent(s):
7aed46a
Update modules.py
Browse files
- modules.py +20 -21
modules.py
CHANGED
@@ -1,3 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
def get_text_from_html(html_content):
|
2 |
soup = BeautifulSoup(html_content, 'html.parser')
|
3 |
# extract all the texts
|
@@ -16,8 +23,7 @@ def get_text(file):
|
|
16 |
return text_content.replace("\n","")
|
17 |
if text_content == "":
|
18 |
return get_text_from_html(get_html_general(file));
|
19 |
-
|
20 |
-
import email
|
21 |
def get_email_html(file):
|
22 |
content = email.message_from_bytes(file.read())
|
23 |
html_content = ""
|
@@ -157,12 +163,6 @@ def get_tags_from_html(html_content):
|
|
157 |
tag_list += [tag.name]
|
158 |
# print(tag_list)
|
159 |
return tag_list
|
160 |
-
import ipaddress
|
161 |
-
from urllib.parse import urlparse
|
162 |
-
import urllib.request
|
163 |
-
from bs4 import BeautifulSoup
|
164 |
-
import re
|
165 |
-
import email
|
166 |
|
167 |
#get urls in html content
|
168 |
def get_urls_from_html(html_content):
|
@@ -231,19 +231,18 @@ def get_num_FunctionWords(file):
|
|
231 |
|
232 |
|
233 |
def get_email_html(file):
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
return ""
|
247 |
|
248 |
#get how many words in subject
|
249 |
def get_num_sbj(file):
|
|
|
1 |
+
import ipaddress
|
2 |
+
from urllib.parse import urlparse
|
3 |
+
import urllib.request
|
4 |
+
from bs4 import BeautifulSoup
|
5 |
+
import re
|
6 |
+
import email
|
7 |
+
|
8 |
def get_text_from_html(html_content):
|
9 |
soup = BeautifulSoup(html_content, 'html.parser')
|
10 |
# extract all the texts
|
|
|
23 |
return text_content.replace("\n","")
|
24 |
if text_content == "":
|
25 |
return get_text_from_html(get_html_general(file));
|
26 |
+
|
|
|
27 |
def get_email_html(file):
|
28 |
content = email.message_from_bytes(file.read())
|
29 |
html_content = ""
|
|
|
163 |
tag_list += [tag.name]
|
164 |
# print(tag_list)
|
165 |
return tag_list
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
|
167 |
#get urls in html content
|
168 |
def get_urls_from_html(html_content):
|
|
|
231 |
|
232 |
|
233 |
def get_email_html(file):
|
234 |
+
content = email.message_from_bytes(file.read())
|
235 |
+
html_content = ""
|
236 |
+
for part in content.walk():
|
237 |
+
if part.get_content_type() == 'text/html':
|
238 |
+
html_content += part.get_payload(decode=True).decode('iso-8859-1')
|
239 |
+
html_content.replace("\n","")
|
240 |
+
if html_content != "":
|
241 |
+
# print("Found html at "+file)
|
242 |
+
return html_content
|
243 |
+
else:
|
244 |
+
# print("No html content found at "+file)
|
245 |
+
return ""
|
|
|
246 |
|
247 |
#get how many words in subject
|
248 |
def get_num_sbj(file):
|