Update NER/html/extractHTML.py

NER/html/extractHTML.py  (+225 -221)

@@ -1,222 +1,226 @@
On the removed side, lines 1-91 (the module imports, `__init__`, `openHTMLFile`, `getText`, and the `getListSection` signature) already matched the new file; everything after that (old lines 92-222) was blank except for stray fragments (`if`, `link`, `def`, `pass`), so every method body from `getListSection` onward was effectively missing. This commit restores the complete file:
# reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
from bs4 import BeautifulSoup
from lxml.etree import ParserError, XMLSyntaxError
import requests
from DefaultPackages import openFile, saveFile
from NER import cleanText
import pandas as pd

class HTML():
    def __init__(self, htmlFile, htmlLink):
        self.htmlLink = htmlLink
        self.htmlFile = htmlFile

    # Earlier draft of openHTMLFile, left commented out:
    # def openHTMLFile(self):
    #     headers = {
    #         "User-Agent": (
    #             "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    #             "AppleWebKit/537.36 (KHTML, like Gecko) "
    #             "Chrome/114.0.0.0 Safari/537.36"
    #         ),
    #         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    #         "Referer": self.htmlLink,
    #         "Connection": "keep-alive"
    #     }
    #
    #     session = requests.Session()
    #     session.headers.update(headers)
    #
    #     if self.htmlLink != "None":
    #         try:
    #             r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
    #             if r.status_code != 200:
    #                 print(f"❌ HTML GET failed: {r.status_code} — {self.htmlLink}")
    #                 return BeautifulSoup("", 'html.parser')
    #             soup = BeautifulSoup(r.content, 'html.parser')
    #         except Exception as e:
    #             print(f"❌ Exception fetching HTML: {e}")
    #             return BeautifulSoup("", 'html.parser')
    #     else:
    #         with open(self.htmlFile) as fp:
    #             soup = BeautifulSoup(fp, 'html.parser')
    #     return soup

    def openHTMLFile(self):
        # domains that never serve extractable article HTML, so skip the request
        not_need_domain = ['https://broadinstitute.github.io/picard/',
                           'https://software.broadinstitute.org/gatk/best-practices/',
                           'https://www.ncbi.nlm.nih.gov/genbank/',
                           'https://www.mitomap.org/']
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/114.0.0.0 Safari/537.36"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Referer": self.htmlLink,
            "Connection": "keep-alive"
        }

        session = requests.Session()
        session.headers.update(headers)
        if self.htmlLink in not_need_domain:
            return BeautifulSoup("", 'html.parser')
        try:
            if self.htmlLink and self.htmlLink != "None":
                r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
                if r.status_code != 200 or not r.text.strip():
                    print(f"❌ HTML GET failed ({r.status_code}) or empty page: {self.htmlLink}")
                    return BeautifulSoup("", 'html.parser')
                soup = BeautifulSoup(r.content, 'html.parser')
            else:
                with open(self.htmlFile, encoding='utf-8') as fp:
                    soup = BeautifulSoup(fp, 'html.parser')
        except (ParserError, XMLSyntaxError, OSError) as e:
            print(f"🚫 HTML parse error for {self.htmlLink}: {type(e).__name__}")
            return BeautifulSoup("", 'html.parser')
        except Exception as e:
            print(f"❌ General exception for {self.htmlLink}: {e}")
            return BeautifulSoup("", 'html.parser')

        return soup
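A quick sanity check of the fetch-or-file fallback above. This sketch is hypothetical: the module path mirrors the repo layout, the URL and file name are placeholders, and the `DefaultPackages`/`NER` helpers must be importable for the import to succeed.

from NER.html.extractHTML import HTML

# Network path: session GET with the browser-like headers.
page = HTML(htmlFile="None", htmlLink="https://www.example.com/article")
soup = page.openHTMLFile()
print(soup.title.text if soup.title else "empty soup")

# File path: an htmlLink of "None" parses the saved file from disk instead.
local = HTML(htmlFile="saved_article.html", htmlLink="None")
soup = local.openHTMLFile()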
    def getText(self):
        soup = self.openHTMLFile()
        s = soup.find_all("html")
        text = ""
        if s:
            for t in range(len(s)):
                text += s[t].get_text()  # accumulate text across every <html> block
        cl = cleanText.cleanGenText()
        text = cl.removeExtraSpaceBetweenWords(text)
        return text

    def getListSection(self, scienceDirect=None):
        try:
            json = {}
            text = ""
            textJson, textHTML = "", ""
            if scienceDirect == None:
                soup = self.openHTMLFile()
                # build {section heading: [paragraph, ...]} from the <h2> structure
                json = {}
                for h2Pos in range(len(soup.find_all('h2'))):
                    if soup.find_all('h2')[h2Pos].text not in json:
                        json[soup.find_all('h2')[h2Pos].text] = []
                    if h2Pos + 1 < len(soup.find_all('h2')):
                        content = soup.find_all('h2')[h2Pos].find_next("p")
                        nexth2Content = soup.find_all('h2')[h2Pos+1].find_next("p")
                        while content.text != nexth2Content.text:
                            json[soup.find_all('h2')[h2Pos].text].append(content.text)
                            content = content.find_next("p")
                    else:
                        content = soup.find_all('h2')[h2Pos].find_all_next("p", string=True)
                        json[soup.find_all('h2')[h2Pos].text] = list(i.text for i in content)
            # expected shape:
            '''json = {'Abstract':[], 'Introduction':[], 'Methods':[],
                       'Results':[], 'Discussion':[], 'References':[],
                       'Acknowledgements':[], 'Author information':[], 'Ethics declarations':[],
                       'Additional information':[], 'Electronic supplementary material':[],
                       'Rights and permissions':[], 'About this article':[], 'Search':[], 'Navigation':[]}'''
            if scienceDirect != None or len(json) == 0:
                # Replace with your actual Elsevier API key
                api_key = "d0f25e6ae2b275e0d2b68e0e98f68d70"
                # ScienceDirect article DOI or PII (example DOI)
                doi = self.htmlLink.split("https://doi.org/")[-1]  # "10.1016/j.ajhg.2011.01.009"
                # Base URL for the Elsevier Article Retrieval API
                base_url = "https://api.elsevier.com/content/article/doi/"
                # Set headers with the API key
                headers = {
                    "Accept": "application/json",
                    "X-ELS-APIKey": api_key
                }
                # Make the API request
                response = requests.get(base_url + doi, headers=headers)
                # Check whether the request was successful
                if response.status_code == 200:
                    data = response.json()
                    supp_data = data["full-text-retrieval-response"]  # ["coredata"]["link"]
                    if "originalText" in list(supp_data.keys()):
                        if type(supp_data["originalText"]) == str:
                            json["originalText"] = [supp_data["originalText"]]
                        if type(supp_data["originalText"]) == dict:
                            json["originalText"] = [supp_data["originalText"][key] for key in supp_data["originalText"]]
                    else:
                        if type(supp_data) == dict:
                            for key in supp_data:
                                json[key] = [supp_data[key]]

            textJson = self.mergeTextInJson(json)
            textHTML = self.getText()
            if len(textHTML) > len(textJson):
                text = textHTML
            else:
                text = textJson
            return text  # json
        except Exception as e:
            print(f"❌ getListSection failed for {self.htmlLink}: {e}")
            return ""
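The ScienceDirect branch above reduces to one authenticated GET against Elsevier's Article Retrieval endpoint. A minimal standalone sketch, assuming a valid key in an `ELSEVIER_API_KEY` environment variable (the DOI is the example one from the code):

import os
import requests

api_key = os.environ["ELSEVIER_API_KEY"]  # assumed to be set; avoid hard-coding keys
doi = "10.1016/j.ajhg.2011.01.009"
resp = requests.get(
    "https://api.elsevier.com/content/article/doi/" + doi,
    headers={"Accept": "application/json", "X-ELS-APIKey": api_key},
    timeout=15,
)
if resp.status_code == 200:
    body = resp.json()["full-text-retrieval-response"]
    print(sorted(body.keys()))  # "originalText" appears when full text is entitled
else:
    print("Elsevier GET failed:", resp.status_code)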
    def getReference(self):
        # collect the References section so its entries can seed further crawling
        ref = []
        json = self.getListSection()
        for key in json["References"]:
            ct = cleanText.cleanGenText(key)
            cleaned, filteredWord = ct.cleanText()  # local name avoids shadowing the cleanText module
            if cleaned not in ref:
                ref.append(cleaned)
        return ref

    def getSupMaterial(self):
        # collect links that sit under supplementary/material-style headings
        json = {}
        soup = self.openHTMLFile()
        for h2Pos in range(len(soup.find_all('h2'))):
            heading = soup.find_all('h2')[h2Pos].text
            if any(k in heading.lower() for k in ("supplementary", "material", "additional", "support")):
                #print(soup.find_all('h2')[h2Pos].find_next("a").get("href"))
                link, output = [], []
                if heading not in json:
                    json[heading] = []
                for l in soup.find_all('h2')[h2Pos].find_all_next("a", href=True):
                    link.append(l["href"])
                if h2Pos + 1 < len(soup.find_all('h2')):
                    nexth2Link = soup.find_all('h2')[h2Pos+1].find_next("a", href=True)["href"]
                    if nexth2Link in link:
                        link = link[:link.index(nexth2Link)]
                # only keep links that contain "https"
                for i in link:
                    if "https" in i:
                        output.append(i)
                json[heading].extend(output)
        return json

    def extractTable(self):
        soup = self.openHTMLFile()
        df = []
        if len(soup) > 0:
            try:
                df = pd.read_html(str(soup))
            except ValueError:
                df = []
                print("No tables found in HTML file")
        return df
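`pd.read_html` returns a list of DataFrames, one per parsed `<table>`. Newer pandas versions warn when handed a raw HTML string, so wrapping it in `StringIO` is safer; a self-contained check with a made-up table:

from io import StringIO
import pandas as pd

html = ("<table><tr><th>gene</th><th>variant</th></tr>"
        "<tr><td>MT-ND1</td><td>m.3460G>A</td></tr></table>")
tables = pd.read_html(StringIO(html))  # file-like input avoids the literal-string warning
print(len(tables))        # 1
print(tables[0].iloc[0])  # first data row as a Series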
    def mergeTextInJson(self, jsonHTML):
        cl = cleanText.cleanGenText()
        #cl = cleanGenText()
        htmlText = ""
        for sec in jsonHTML:
            # sections are separated by "\n\n"
            if len(jsonHTML[sec]) > 0:
                for i in range(len(jsonHTML[sec])):
                    # items inside a section are separated by a period
                    text = jsonHTML[sec][i]
                    if len(text) > 0:
                        #text = cl.removeTabWhiteSpaceNewLine(text)
                        #text = cl.removeExtraSpaceBetweenWords(text)
                        text, filteredWord = cl.textPreprocessing(text, keepPeriod=True)
                        jsonHTML[sec][i] = text
                    if i - 1 >= 0:
                        if len(jsonHTML[sec][i-1]) > 0:
                            if jsonHTML[sec][i-1][-1] != ".":
                                htmlText += ". "
                    htmlText += jsonHTML[sec][i]
                    if len(jsonHTML[sec][i]) > 0:
                        if jsonHTML[sec][i][-1] != ".":
                            htmlText += "."
            htmlText += "\n\n"
        return htmlText
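The joining rule is easiest to see on a toy input. Assuming `textPreprocessing(..., keepPeriod=True)` returns these strings unchanged (an assumption; the real cleaner may rewrite them), the merge yields period-terminated sections separated by blank lines:

sections = {
    "Abstract": ["We sequenced complete mitochondrial genomes"],
    "Methods": ["Reads were aligned with BWA"],
}
# HTML("None", "None").mergeTextInJson(sections) would return:
# "We sequenced complete mitochondrial genomes.\n\nReads were aligned with BWA.\n\n"

Note that `mergeTextInJson` also rewrites the lists in `jsonHTML` in place with the preprocessed text.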
    def removeHeaders(self):
        pass

    def removeFooters(self):
        pass

    def removeReferences(self):
        pass
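Putting it together, a typical end-to-end pass over one article might look like this (hypothetical link; the section, table, and supplementary calls are the methods defined above):

from NER.html.extractHTML import HTML

html = HTML(htmlFile="None", htmlLink="https://doi.org/10.1016/j.ajhg.2011.01.009")
body_text = html.getListSection()  # section text, falling back to the Elsevier API
tables = html.extractTable()       # list of DataFrames; [] when nothing parses
supp = html.getSupMaterial()       # {heading: [https links]} for supplementary sections
print(len(body_text), len(tables), sum(len(v) for v in supp.values()))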