VyLala committed (verified)
Commit f412fc0 · 1 Parent(s): fcceb43

Update NER/html/extractHTML.py

Files changed (1)
  1. NER/html/extractHTML.py +225 -221
NER/html/extractHTML.py CHANGED
@@ -1,222 +1,226 @@
- # reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
- from bs4 import BeautifulSoup
- import requests
- from DefaultPackages import openFile, saveFile
- from NER import cleanText
- import pandas as pd
- class HTML():
- def __init__(self, htmlFile, htmlLink):
- self.htmlLink = htmlLink
- self.htmlFile = htmlFile
- # def openHTMLFile(self):
- # headers = {
- # "User-Agent": (
- # "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
- # "AppleWebKit/537.36 (KHTML, like Gecko) "
- # "Chrome/114.0.0.0 Safari/537.36"
- # ),
- # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
- # "Referer": self.htmlLink,
- # "Connection": "keep-alive"
- # }
-
- # session = requests.Session()
- # session.headers.update(headers)
-
- # if self.htmlLink != "None":
- # try:
- # r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
- # if r.status_code != 200:
- # print(f"❌ HTML GET failed: {r.status_code} — {self.htmlLink}")
- # return BeautifulSoup("", 'html.parser')
- # soup = BeautifulSoup(r.content, 'html.parser')
- # except Exception as e:
- # print(f"❌ Exception fetching HTML: {e}")
- # return BeautifulSoup("", 'html.parser')
- # else:
- # with open(self.htmlFile) as fp:
- # soup = BeautifulSoup(fp, 'html.parser')
- # return soup
- from lxml.etree import ParserError, XMLSyntaxError
-
- def openHTMLFile(self):
- not_need_domain = ['https://broadinstitute.github.io/picard/',
- 'https://software.broadinstitute.org/gatk/best-practices/',
- 'https://www.ncbi.nlm.nih.gov/genbank/',
- 'https://www.mitomap.org/']
- headers = {
- "User-Agent": (
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
- "AppleWebKit/537.36 (KHTML, like Gecko) "
- "Chrome/114.0.0.0 Safari/537.36"
- ),
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
- "Referer": self.htmlLink,
- "Connection": "keep-alive"
- }
-
- session = requests.Session()
- session.headers.update(headers)
- if self.htmlLink in not_need_domain:
- return BeautifulSoup("", 'html.parser')
- try:
- if self.htmlLink and self.htmlLink != "None":
- r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
- if r.status_code != 200 or not r.text.strip():
- print(f"❌ HTML GET failed ({r.status_code}) or empty page: {self.htmlLink}")
- return BeautifulSoup("", 'html.parser')
- soup = BeautifulSoup(r.content, 'html.parser')
- else:
- with open(self.htmlFile, encoding='utf-8') as fp:
- soup = BeautifulSoup(fp, 'html.parser')
- except (ParserError, XMLSyntaxError, OSError) as e:
- print(f"🚫 HTML parse error for {self.htmlLink}: {type(e).__name__}")
- return BeautifulSoup("", 'html.parser')
- except Exception as e:
- print(f"❌ General exception for {self.htmlLink}: {e}")
- return BeautifulSoup("", 'html.parser')
-
- return soup
-
- def getText(self):
- soup = self.openHTMLFile()
- s = soup.find_all("html")
- text = ""
- if s:
- for t in range(len(s)):
- text = s[t].get_text()
- cl = cleanText.cleanGenText()
- text = cl.removeExtraSpaceBetweenWords(text)
- return text
- def getListSection(self, scienceDirect=None):
- json = {}
- text = ""
- textJson, textHTML = "",""
- if scienceDirect == None:
- soup = self.openHTMLFile()
- # get list of section
- json = {}
- for h2Pos in range(len(soup.find_all('h2'))):
- if soup.find_all('h2')[h2Pos].text not in json:
- json[soup.find_all('h2')[h2Pos].text] = []
- if h2Pos + 1 < len(soup.find_all('h2')):
- content = soup.find_all('h2')[h2Pos].find_next("p")
- nexth2Content = soup.find_all('h2')[h2Pos+1].find_next("p")
- while content.text != nexth2Content.text:
- json[soup.find_all('h2')[h2Pos].text].append(content.text)
- content = content.find_next("p")
- else:
- content = soup.find_all('h2')[h2Pos].find_all_next("p",string=True)
- json[soup.find_all('h2')[h2Pos].text] = list(i.text for i in content)
- # format
- '''json = {'Abstract':[], 'Introduction':[], 'Methods'[],
- 'Results':[], 'Discussion':[], 'References':[],
- 'Acknowledgements':[], 'Author information':[], 'Ethics declarations':[],
- 'Additional information':[], 'Electronic supplementary material':[],
- 'Rights and permissions':[], 'About this article':[], 'Search':[], 'Navigation':[]}'''
- if scienceDirect!= None or len(json)==0:
- # Replace with your actual Elsevier API key
- api_key = "d0f25e6ae2b275e0d2b68e0e98f68d70"
- # ScienceDirect article DOI or PI (Example DOI)
- doi = self.htmlLink.split("https://doi.org/")[-1] #"10.1016/j.ajhg.2011.01.009"
- # Base URL for the Elsevier API
- base_url = "https://api.elsevier.com/content/article/doi/"
- # Set headers with API key
- headers = {
- "Accept": "application/json",
- "X-ELS-APIKey": api_key
- }
- # Make the API request
- response = requests.get(base_url + doi, headers=headers)
- # Check if the request was successful
- if response.status_code == 200:
- data = response.json()
- supp_data = data["full-text-retrieval-response"]#["coredata"]["link"]
- if "originalText" in list(supp_data.keys()):
- if type(supp_data["originalText"])==str:
- json["originalText"] = [supp_data["originalText"]]
- if type(supp_data["originalText"])==dict:
- json["originalText"] = [supp_data["originalText"][key] for key in supp_data["originalText"]]
- else:
- if type(supp_data)==dict:
- for key in supp_data:
- json[key] = [supp_data[key]]
-
- textJson = self.mergeTextInJson(json)
- textHTML = self.getText()
- if len(textHTML) > len(textJson):
- text = textHTML
- else: text = textJson
- return text #json
- def getReference(self):
- # get reference to collect more next data
- ref = []
- json = self.getListSection()
- for key in json["References"]:
- ct = cleanText.cleanGenText(key)
- cleanText, filteredWord = ct.cleanText()
- if cleanText not in ref:
- ref.append(cleanText)
- return ref
- def getSupMaterial(self):
- # check if there is material or not
- json = {}
- soup = self.openHTMLFile()
- for h2Pos in range(len(soup.find_all('h2'))):
- if "supplementary" in soup.find_all('h2')[h2Pos].text.lower() or "material" in soup.find_all('h2')[h2Pos].text.lower() or "additional" in soup.find_all('h2')[h2Pos].text.lower() or "support" in soup.find_all('h2')[h2Pos].text.lower():
- #print(soup.find_all('h2')[h2Pos].find_next("a").get("href"))
- link, output = [],[]
- if soup.find_all('h2')[h2Pos].text not in json:
- json[soup.find_all('h2')[h2Pos].text] = []
- for l in soup.find_all('h2')[h2Pos].find_all_next("a",href=True):
- link.append(l["href"])
- if h2Pos + 1 < len(soup.find_all('h2')):
- nexth2Link = soup.find_all('h2')[h2Pos+1].find_next("a",href=True)["href"]
- if nexth2Link in link:
- link = link[:link.index(nexth2Link)]
- # only take links having "https" in that
- for i in link:
- if "https" in i: output.append(i)
- json[soup.find_all('h2')[h2Pos].text].extend(output)
- return json
- def extractTable(self):
- soup = self.openHTMLFile()
- df = []
- if len(soup)>0:
- try:
- df = pd.read_html(str(soup))
- except ValueError:
- df = []
- print("No tables found in HTML file")
- return df
- def mergeTextInJson(self,jsonHTML):
- cl = cleanText.cleanGenText()
- #cl = cleanGenText()
- htmlText = ""
- for sec in jsonHTML:
- # section is "\n\n"
- if len(jsonHTML[sec]) > 0:
- for i in range(len(jsonHTML[sec])):
- # same section is just a dot.
- text = jsonHTML[sec][i]
- if len(text)>0:
- #text = cl.removeTabWhiteSpaceNewLine(text)
- #text = cl.removeExtraSpaceBetweenWords(text)
- text, filteredWord = cl.textPreprocessing(text, keepPeriod=True)
- jsonHTML[sec][i] = text
- if i-1 >= 0:
- if len(jsonHTML[sec][i-1])>0:
- if jsonHTML[sec][i-1][-1] != ".":
- htmlText += ". "
- htmlText += jsonHTML[sec][i]
- if len(jsonHTML[sec][i]) > 0:
- if jsonHTML[sec][i][-1]!=".":
- htmlText += "."
- htmlText += "\n\n"
- return htmlText
- def removeHeaders(self):
- pass
- def removeFooters(self):
- pass
- def removeReferences(self):
- pass
 
+ # reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
+ from bs4 import BeautifulSoup
+ import requests
+ from DefaultPackages import openFile, saveFile
+ from NER import cleanText
+ import pandas as pd
+ class HTML():
+ def __init__(self, htmlFile, htmlLink):
+ self.htmlLink = htmlLink
+ self.htmlFile = htmlFile
+ # def openHTMLFile(self):
+ # headers = {
+ # "User-Agent": (
+ # "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+ # "AppleWebKit/537.36 (KHTML, like Gecko) "
+ # "Chrome/114.0.0.0 Safari/537.36"
+ # ),
+ # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+ # "Referer": self.htmlLink,
+ # "Connection": "keep-alive"
+ # }
+
+ # session = requests.Session()
+ # session.headers.update(headers)
+
+ # if self.htmlLink != "None":
+ # try:
+ # r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
+ # if r.status_code != 200:
+ # print(f"❌ HTML GET failed: {r.status_code} — {self.htmlLink}")
+ # return BeautifulSoup("", 'html.parser')
+ # soup = BeautifulSoup(r.content, 'html.parser')
+ # except Exception as e:
+ # print(f"❌ Exception fetching HTML: {e}")
+ # return BeautifulSoup("", 'html.parser')
+ # else:
+ # with open(self.htmlFile) as fp:
+ # soup = BeautifulSoup(fp, 'html.parser')
+ # return soup
+ from lxml.etree import ParserError, XMLSyntaxError
+
+ def openHTMLFile(self):
+ not_need_domain = ['https://broadinstitute.github.io/picard/',
+ 'https://software.broadinstitute.org/gatk/best-practices/',
+ 'https://www.ncbi.nlm.nih.gov/genbank/',
+ 'https://www.mitomap.org/']
+ headers = {
+ "User-Agent": (
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
+ "Chrome/114.0.0.0 Safari/537.36"
+ ),
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+ "Referer": self.htmlLink,
+ "Connection": "keep-alive"
+ }
+
+ session = requests.Session()
+ session.headers.update(headers)
+ if self.htmlLink in not_need_domain:
+ return BeautifulSoup("", 'html.parser')
+ try:
+ if self.htmlLink and self.htmlLink != "None":
+ r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
+ if r.status_code != 200 or not r.text.strip():
+ print(f"❌ HTML GET failed ({r.status_code}) or empty page: {self.htmlLink}")
+ return BeautifulSoup("", 'html.parser')
+ soup = BeautifulSoup(r.content, 'html.parser')
+ else:
+ with open(self.htmlFile, encoding='utf-8') as fp:
+ soup = BeautifulSoup(fp, 'html.parser')
+ except (ParserError, XMLSyntaxError, OSError) as e:
+ print(f"🚫 HTML parse error for {self.htmlLink}: {type(e).__name__}")
+ return BeautifulSoup("", 'html.parser')
+ except Exception as e:
+ print(f"❌ General exception for {self.htmlLink}: {e}")
+ return BeautifulSoup("", 'html.parser')
+
+ return soup
+
+ def getText(self):
+ soup = self.openHTMLFile()
+ s = soup.find_all("html")
+ text = ""
+ if s:
+ for t in range(len(s)):
+ text = s[t].get_text()
+ cl = cleanText.cleanGenText()
+ text = cl.removeExtraSpaceBetweenWords(text)
+ return text
+ def getListSection(self, scienceDirect=None):
+ try:
+ json = {}
+ text = ""
+ textJson, textHTML = "",""
+ if scienceDirect == None:
+ soup = self.openHTMLFile()
+ # get list of section
+ json = {}
+ for h2Pos in range(len(soup.find_all('h2'))):
+ if soup.find_all('h2')[h2Pos].text not in json:
+ json[soup.find_all('h2')[h2Pos].text] = []
+ if h2Pos + 1 < len(soup.find_all('h2')):
+ content = soup.find_all('h2')[h2Pos].find_next("p")
+ nexth2Content = soup.find_all('h2')[h2Pos+1].find_next("p")
+ while content.text != nexth2Content.text:
+ json[soup.find_all('h2')[h2Pos].text].append(content.text)
+ content = content.find_next("p")
+ else:
+ content = soup.find_all('h2')[h2Pos].find_all_next("p",string=True)
+ json[soup.find_all('h2')[h2Pos].text] = list(i.text for i in content)
+ # format
+ '''json = {'Abstract':[], 'Introduction':[], 'Methods'[],
+ 'Results':[], 'Discussion':[], 'References':[],
+ 'Acknowledgements':[], 'Author information':[], 'Ethics declarations':[],
+ 'Additional information':[], 'Electronic supplementary material':[],
+ 'Rights and permissions':[], 'About this article':[], 'Search':[], 'Navigation':[]}'''
+ if scienceDirect!= None or len(json)==0:
+ # Replace with your actual Elsevier API key
+ api_key = "d0f25e6ae2b275e0d2b68e0e98f68d70"
+ # ScienceDirect article DOI or PI (Example DOI)
+ doi = self.htmlLink.split("https://doi.org/")[-1] #"10.1016/j.ajhg.2011.01.009"
+ # Base URL for the Elsevier API
+ base_url = "https://api.elsevier.com/content/article/doi/"
+ # Set headers with API key
+ headers = {
+ "Accept": "application/json",
+ "X-ELS-APIKey": api_key
+ }
+ # Make the API request
+ response = requests.get(base_url + doi, headers=headers)
+ # Check if the request was successful
+ if response.status_code == 200:
+ data = response.json()
+ supp_data = data["full-text-retrieval-response"]#["coredata"]["link"]
+ if "originalText" in list(supp_data.keys()):
+ if type(supp_data["originalText"])==str:
+ json["originalText"] = [supp_data["originalText"]]
+ if type(supp_data["originalText"])==dict:
+ json["originalText"] = [supp_data["originalText"][key] for key in supp_data["originalText"]]
+ else:
+ if type(supp_data)==dict:
+ for key in supp_data:
+ json[key] = [supp_data[key]]
+
+ textJson = self.mergeTextInJson(json)
+ textHTML = self.getText()
+ if len(textHTML) > len(textJson):
+ text = textHTML
+ else: text = textJson
+ return text #json
+ except:
+ print("failed all")
+ return ""
+ def getReference(self):
+ # get reference to collect more next data
+ ref = []
+ json = self.getListSection()
+ for key in json["References"]:
+ ct = cleanText.cleanGenText(key)
+ cleanText, filteredWord = ct.cleanText()
+ if cleanText not in ref:
+ ref.append(cleanText)
+ return ref
+ def getSupMaterial(self):
+ # check if there is material or not
+ json = {}
+ soup = self.openHTMLFile()
+ for h2Pos in range(len(soup.find_all('h2'))):
+ if "supplementary" in soup.find_all('h2')[h2Pos].text.lower() or "material" in soup.find_all('h2')[h2Pos].text.lower() or "additional" in soup.find_all('h2')[h2Pos].text.lower() or "support" in soup.find_all('h2')[h2Pos].text.lower():
+ #print(soup.find_all('h2')[h2Pos].find_next("a").get("href"))
+ link, output = [],[]
+ if soup.find_all('h2')[h2Pos].text not in json:
+ json[soup.find_all('h2')[h2Pos].text] = []
+ for l in soup.find_all('h2')[h2Pos].find_all_next("a",href=True):
+ link.append(l["href"])
+ if h2Pos + 1 < len(soup.find_all('h2')):
+ nexth2Link = soup.find_all('h2')[h2Pos+1].find_next("a",href=True)["href"]
+ if nexth2Link in link:
+ link = link[:link.index(nexth2Link)]
+ # only take links having "https" in that
+ for i in link:
+ if "https" in i: output.append(i)
+ json[soup.find_all('h2')[h2Pos].text].extend(output)
+ return json
+ def extractTable(self):
+ soup = self.openHTMLFile()
+ df = []
+ if len(soup)>0:
+ try:
+ df = pd.read_html(str(soup))
+ except ValueError:
+ df = []
+ print("No tables found in HTML file")
+ return df
+ def mergeTextInJson(self,jsonHTML):
+ cl = cleanText.cleanGenText()
+ #cl = cleanGenText()
+ htmlText = ""
+ for sec in jsonHTML:
+ # section is "\n\n"
+ if len(jsonHTML[sec]) > 0:
+ for i in range(len(jsonHTML[sec])):
+ # same section is just a dot.
+ text = jsonHTML[sec][i]
+ if len(text)>0:
+ #text = cl.removeTabWhiteSpaceNewLine(text)
+ #text = cl.removeExtraSpaceBetweenWords(text)
+ text, filteredWord = cl.textPreprocessing(text, keepPeriod=True)
+ jsonHTML[sec][i] = text
+ if i-1 >= 0:
+ if len(jsonHTML[sec][i-1])>0:
+ if jsonHTML[sec][i-1][-1] != ".":
+ htmlText += ". "
+ htmlText += jsonHTML[sec][i]
+ if len(jsonHTML[sec][i]) > 0:
+ if jsonHTML[sec][i][-1]!=".":
+ htmlText += "."
+ htmlText += "\n\n"
+ return htmlText
+ def removeHeaders(self):
+ pass
+ def removeFooters(self):
+ pass
+ def removeReferences(self):
+ pass
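
A minimal usage sketch of the class in this file, assuming the repository root is on PYTHONPATH so that NER.html.extractHTML, DefaultPackages, and NER.cleanText are importable; the local file name and URL are placeholders (the DOI is the example already cited in the code comments). With this commit, getListSection catches any exception and returns an empty string instead of raising:

# Sketch only: import path and inputs are assumptions, not part of this commit.
from NER.html.extractHTML import HTML

# htmlFile is read only when htmlLink is falsy or the literal string "None";
# otherwise openHTMLFile() fetches htmlLink over HTTP.
page = HTML(htmlFile="local_copy.html",
            htmlLink="https://doi.org/10.1016/j.ajhg.2011.01.009")

text = page.getListSection()   # after this commit: "" on any failure, instead of an exception
tables = page.extractTable()   # list of DataFrames from pandas.read_html, [] if no tables found
supp = page.getSupMaterial()   # {h2 heading: [https links]} for supplementary-material sections

print(len(text), len(tables), list(supp.keys()))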