# Spaces: Running
# File size: 9,253 Bytes
# f412fc0 bf89fa9 f412fc0 2621d77
# reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
import io
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml.etree import ParserError, XMLSyntaxError

from DefaultPackages import openFile, saveFile
from NER import cleanText
class HTML():
    """Parse one HTML page and extract its text, sections, links and tables.

    The page is fetched from ``htmlLink`` when that is a usable URL; otherwise
    it is read from the local file ``htmlFile``.
    """

    # Links that are never fetched; matching is by exact string equality.
    _SKIPPED_LINKS = (
        'https://broadinstitute.github.io/picard/',
        'https://software.broadinstitute.org/gatk/best-practices/',
        'https://www.ncbi.nlm.nih.gov/genbank/',
        'https://www.mitomap.org/',
    )
    # Seconds to wait on any HTTP request before giving up.
    _REQUEST_TIMEOUT = 15
    # Base URL of the Elsevier full-text-by-DOI endpoint.
    _ELSEVIER_DOI_URL = "https://api.elsevier.com/content/article/doi/"
    # An <h2> whose lower-cased text contains any of these is scanned for links.
    _SUPPLEMENT_KEYWORDS = ("supplementary", "material", "additional", "support")

    def __init__(self, htmlFile, htmlLink):
        """Store where the page comes from.

        Args:
            htmlFile: Path of a local HTML file; read only when ``htmlLink``
                is empty or the string ``"None"``.
            htmlLink: URL of the page, or ``"None"``/empty to use ``htmlFile``.
        """
        self.htmlLink = htmlLink
        self.htmlFile = htmlFile

    def openHTMLFile(self):
        """Return the page parsed with BeautifulSoup's ``html.parser``.

        Returns:
            The parsed document, or an empty soup when the link is in the
            skip list, the response is not HTTP 200, the body is blank, or
            any exception occurs while fetching, reading or parsing.
        """
        if self.htmlLink in self._SKIPPED_LINKS:
            return BeautifulSoup("", 'html.parser')
        try:
            if self.htmlLink and self.htmlLink != "None":
                headers = {
                    "User-Agent": (
                        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                        "AppleWebKit/537.36 (KHTML, like Gecko) "
                        "Chrome/114.0.0.0 Safari/537.36"
                    ),
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                    "Referer": self.htmlLink,
                    "Connection": "keep-alive"
                }
                # The context manager closes the session's connections; the
                # response body is already fully read when get() returns.
                with requests.Session() as session:
                    session.headers.update(headers)
                    r = session.get(self.htmlLink, allow_redirects=True,
                                    timeout=self._REQUEST_TIMEOUT)
                if r.status_code != 200 or not r.text.strip():
                    print(f"❌ HTML GET failed ({r.status_code}) or empty page: {self.htmlLink}")
                    return BeautifulSoup("", 'html.parser')
                return BeautifulSoup(r.content, 'html.parser')
            with open(self.htmlFile, encoding='utf-8') as fp:
                return BeautifulSoup(fp, 'html.parser')
        except (ParserError, XMLSyntaxError, OSError) as e:
            # requests' exceptions derive from OSError, so network failures
            # are reported through this branch as well.
            print(f"🚫 HTML parse error for {self.htmlLink}: {type(e).__name__}")
            return BeautifulSoup("", 'html.parser')
        except Exception as e:
            print(f"❌ General exception for {self.htmlLink}: {e}")
            return BeautifulSoup("", 'html.parser')

    def getText(self):
        """Return the whitespace-normalised text of the page ("" if no <html>)."""
        return self._textFromSoup(self.openHTMLFile())

    def _textFromSoup(self, soup):
        """Return the cleaned text of the last <html> element in ``soup``."""
        roots = soup.find_all("html")
        if not roots:
            return ""
        # When several <html> elements exist only the last one is used.
        cl = cleanText.cleanGenText()
        return cl.removeExtraSpaceBetweenWords(roots[-1].get_text())

    def _collectSections(self, soup):
        """Map each <h2> heading's text to the paragraph texts that follow it."""
        sections = {}
        headings = soup.find_all('h2')
        for pos, heading in enumerate(headings):
            paragraphs = sections.setdefault(heading.text, [])
            if pos + 1 < len(headings):
                # The section ends at the first <p> after the next heading.
                # Compare by identity, not text, so a repeated paragraph text
                # cannot cut the section short. If either lookup yields None
                # the walk simply ends instead of raising.
                boundary = headings[pos + 1].find_next("p")
                content = heading.find_next("p")
                while content is not None and content is not boundary:
                    paragraphs.append(content.text)
                    content = content.find_next("p")
            else:
                # Last heading: take every later <p> that has a single string.
                paragraphs.extend(
                    p.text for p in heading.find_all_next("p", string=True))
        return sections

    def _fetchScienceDirectSections(self):
        """Fetch the article through the Elsevier API, keyed by response field.

        Returns:
            ``{"originalText": [...]}`` when the response carries that field,
            otherwise every field of ``full-text-retrieval-response`` wrapped
            in a one-item list. ``{}`` when the ``SCIENCE_DIRECT_API`` key or
            the link is missing, or the request or response is unusable.
        """
        api_key = os.environ.get("SCIENCE_DIRECT_API")
        if not api_key or not self.htmlLink or self.htmlLink == "None":
            return {}
        # Everything after the doi.org prefix, e.g. "10.1016/j.ajhg.2011.01.009".
        doi = self.htmlLink.split("https://doi.org/")[-1]
        headers = {
            "Accept": "application/json",
            "X-ELS-APIKey": api_key
        }
        try:
            response = requests.get(self._ELSEVIER_DOI_URL + doi, headers=headers,
                                    timeout=self._REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(f"ScienceDirect request failed for {doi}: {type(e).__name__}")
            return {}
        if response.status_code != 200:
            return {}
        try:
            data = response.json()
        except ValueError:
            return {}
        supp_data = data.get("full-text-retrieval-response") if isinstance(data, dict) else None
        if not isinstance(supp_data, dict):
            return {}
        if "originalText" in supp_data:
            original = supp_data["originalText"]
            if isinstance(original, str):
                return {"originalText": [original]}
            if isinstance(original, dict):
                return {"originalText": list(original.values())}
            return {}
        # NOTE(review): these values may not be strings; mergeTextInJson
        # passes them to the text cleaner unchanged - confirm it copes.
        return {key: [value] for key, value in supp_data.items()}

    def getListSection(self, scienceDirect=None):
        """Return the article text as a single string.

        Despite the name, the result is text, not a section mapping.

        Args:
            scienceDirect: Any non-None value skips <h2> parsing and asks the
                Elsevier API directly. With None the API is used only when
                the page has no <h2> sections.

        Returns:
            Whichever is longer: the merged section text or the whole-page
            text. "" when anything fails.
        """
        try:
            # Fetch once and reuse the soup for both sections and page text.
            soup = self.openHTMLFile()
            sections = {}
            if scienceDirect is None:
                sections = self._collectSections(soup)
            if scienceDirect is not None or not sections:
                sections = self._fetchScienceDirectSections()
            textJson = self.mergeTextInJson(sections)
            textHTML = self._textFromSoup(soup)
            return textHTML if len(textHTML) > len(textJson) else textJson
        except Exception:
            print("failed all")
            return ""

    def getReference(self):
        """Return the distinct cleaned entries under the "References" heading."""
        ref = []
        sections = self._collectSections(self.openHTMLFile())
        for entry in sections.get("References", []):
            # NOTE(review): other methods build cleanGenText() without
            # arguments; confirm this constructor call and ct.cleanText()
            # match the NER.cleanText API.
            ct = cleanText.cleanGenText(entry)
            cleaned, filteredWord = ct.cleanText()
            if cleaned not in ref:
                ref.append(cleaned)
        return ref

    def getSupMaterial(self):
        """Collect links found under supplementary-material style headings.

        Returns:
            A dict mapping the text of each matching <h2> to the ``href``
            values containing "https" that appear after it and before the
            first link of the next <h2>.
        """
        json = {}
        soup = self.openHTMLFile()
        headings = soup.find_all('h2')
        for pos, heading in enumerate(headings):
            title = heading.text
            lowered = title.lower()
            if not any(word in lowered for word in self._SUPPLEMENT_KEYWORDS):
                continue
            # Stop at the next section's first anchor (None if there is none).
            boundary = None
            if pos + 1 < len(headings):
                boundary = headings[pos + 1].find_next("a", href=True)
            output = []
            for anchor in heading.find_all_next("a", href=True):
                if anchor is boundary:
                    break
                # only take links having "https" in that
                if "https" in anchor["href"]:
                    output.append(anchor["href"])
            json.setdefault(title, []).extend(output)
        return json

    def extractTable(self):
        """Return the page's tables as a list of DataFrames ([] if none)."""
        soup = self.openHTMLFile()
        if len(soup) == 0:
            return []
        try:
            # read_html expects a file-like object; literal HTML strings are
            # deprecated from pandas 2.1.
            return pd.read_html(io.StringIO(str(soup)))
        except ValueError:
            print("No tables found in HTML file")
            return []

    def mergeTextInJson(self, jsonHTML):
        """Clean every paragraph and join all sections into one string.

        Args:
            jsonHTML: Mapping of section name to list of paragraph texts.
                The lists are updated in place with the cleaned text.

        Returns:
            The paragraphs of each non-empty section run together, each
            section followed by a blank line.
        """
        cl = cleanText.cleanGenText()
        pieces = []
        for paragraphs in jsonHTML.values():
            if not paragraphs:
                continue
            for i, text in enumerate(paragraphs):
                if len(text) > 0:
                    text, filteredWord = cl.textPreprocessing(text, keepPeriod=True)
                paragraphs[i] = text
                # NOTE(review): a paragraph without a final period gets "."
                # appended below and the next one is prefixed with ". ",
                # giving ".. " between them - kept as is, confirm intent.
                if i > 0 and len(paragraphs[i - 1]) > 0 and paragraphs[i - 1][-1] != ".":
                    pieces.append(". ")
                pieces.append(text)
                if len(text) > 0 and text[-1] != ".":
                    pieces.append(".")
            pieces.append("\n\n")
        return "".join(pieces)

    def removeHeaders(self):
        """Placeholder; not implemented."""
        pass

    def removeFooters(self):
        """Placeholder; not implemented."""
        pass

    def removeReferences(self):
        """Placeholder; not implemented."""
        pass