# reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
# lxml parse errors are imported at module level so the except clauses below can resolve them
from lxml.etree import ParserError, XMLSyntaxError
from DefaultPackages import openFile, saveFile
from NER import cleanText
class HTML():
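    """Helpers for pulling text, sections, tables, and supplementary links out of an
    article page, fetched from ``htmlLink`` or read from a local ``htmlFile``."""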
def __init__(self, htmlFile, htmlLink):
self.htmlLink = htmlLink
self.htmlFile = htmlFile
def openHTMLFile(self):
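        """Return a BeautifulSoup for the page.

        Known landing pages listed in ``not_need_domain`` are skipped, a remote
        ``htmlLink`` is fetched with a browser-like User-Agent, and a local
        ``htmlFile`` is parsed otherwise; any failure falls back to an empty soup."""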
not_need_domain = ['https://broadinstitute.github.io/picard/',
'https://software.broadinstitute.org/gatk/best-practices/',
'https://www.ncbi.nlm.nih.gov/genbank/',
'https://www.mitomap.org/']
headers = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/114.0.0.0 Safari/537.36"
),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Referer": self.htmlLink,
"Connection": "keep-alive"
}
session = requests.Session()
session.headers.update(headers)
if self.htmlLink in not_need_domain:
return BeautifulSoup("", 'html.parser')
try:
if self.htmlLink and self.htmlLink != "None":
r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
if r.status_code != 200 or not r.text.strip():
print(f"❌ HTML GET failed ({r.status_code}) or empty page: {self.htmlLink}")
return BeautifulSoup("", 'html.parser')
soup = BeautifulSoup(r.content, 'html.parser')
else:
with open(self.htmlFile, encoding='utf-8') as fp:
soup = BeautifulSoup(fp, 'html.parser')
except (ParserError, XMLSyntaxError, OSError) as e:
print(f"🚫 HTML parse error for {self.htmlLink}: {type(e).__name__}")
return BeautifulSoup("", 'html.parser')
except Exception as e:
print(f"❌ General exception for {self.htmlLink}: {e}")
return BeautifulSoup("", 'html.parser')
return soup
    def getText(self):
        soup = self.openHTMLFile()
        s = soup.find_all("html")
        text = ""
        if s:
            cl = cleanText.cleanGenText()
            for node in s:
                # accumulate text from every <html> node instead of overwriting it each pass
                text += node.get_text()
            text = cl.removeExtraSpaceBetweenWords(text)
        return text
def getListSection(self, scienceDirect=None):
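        """Map section headings to their paragraphs and return the merged text.

        First scrapes <h2>/<p> pairs from the page; when that yields nothing (or
        ``scienceDirect`` is set) it falls back to the Elsevier article API using the
        SCIENCE_DIRECT_API key from the environment. Returns the longer of the merged
        section text and the raw page text, or "" on failure."""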
try:
json = {}
text = ""
textJson, textHTML = "",""
            if scienceDirect is None:
                soup = self.openHTMLFile()
                # get the list of sections: each <h2> heading maps to the <p> blocks that follow it
                headings = soup.find_all('h2')
                for h2Pos in range(len(headings)):
                    title = headings[h2Pos].text
                    if title not in json:
                        json[title] = []
                    if h2Pos + 1 < len(headings):
                        # collect paragraphs until the first paragraph of the next heading is reached
                        content = headings[h2Pos].find_next("p")
                        nexth2Content = headings[h2Pos + 1].find_next("p")
                        while content is not None and content.text != nexth2Content.text:
                            json[title].append(content.text)
                            content = content.find_next("p")
                    else:
                        # last heading: take every remaining paragraph
                        content = headings[h2Pos].find_all_next("p", string=True)
                        json[title] = [i.text for i in content]
            # format
            '''json = {'Abstract':[], 'Introduction':[], 'Methods':[],
                'Results':[], 'Discussion':[], 'References':[],
                'Acknowledgements':[], 'Author information':[], 'Ethics declarations':[],
                'Additional information':[], 'Electronic supplementary material':[],
                'Rights and permissions':[], 'About this article':[], 'Search':[], 'Navigation':[]}'''
            if scienceDirect is not None or len(json) == 0:
                # Elsevier API key is read from the environment
                api_key = os.environ["SCIENCE_DIRECT_API"]
                # ScienceDirect article DOI (example: "10.1016/j.ajhg.2011.01.009")
                doi = self.htmlLink.split("https://doi.org/")[-1]
                # Base URL for the Elsevier article-retrieval API
                base_url = "https://api.elsevier.com/content/article/doi/"
                # Set headers with the API key
                headers = {
                    "Accept": "application/json",
                    "X-ELS-APIKey": api_key
                }
                # Make the API request
                response = requests.get(base_url + doi, headers=headers)
                # Check if the request was successful
                if response.status_code == 200:
                    data = response.json()
                    supp_data = data["full-text-retrieval-response"]
                    if "originalText" in supp_data:
                        if isinstance(supp_data["originalText"], str):
                            json["originalText"] = [supp_data["originalText"]]
                        elif isinstance(supp_data["originalText"], dict):
                            json["originalText"] = [supp_data["originalText"][key] for key in supp_data["originalText"]]
                    elif isinstance(supp_data, dict):
                        for key in supp_data:
                            json[key] = [supp_data[key]]
            textJson = self.mergeTextInJson(json)
            textHTML = self.getText()
            # keep whichever extraction recovered more text
            if len(textHTML) > len(textJson):
                text = textHTML
            else:
                text = textJson
            return text  # previously returned the section dict (json); getReference() still expects that shape
        except Exception as e:
            print(f"getListSection failed for {self.htmlLink}: {e}")
            return ""
    def getReference(self):
        # collect the reference list so further sources can be followed up later
        # NOTE: assumes getListSection() returns the section dict with a "References" key
        ref = []
        json = self.getListSection()
        for key in json["References"]:
            ct = cleanText.cleanGenText(key)
            # use a local name that does not shadow the imported cleanText module
            cleaned, filteredWord = ct.cleanText()
            if cleaned not in ref:
                ref.append(cleaned)
        return ref
    def getSupMaterial(self):
        # check whether the page has supplementary-material sections and collect their links
        json = {}
        soup = self.openHTMLFile()
        headings = soup.find_all('h2')
        keywords = ("supplementary", "material", "additional", "support")
        for h2Pos in range(len(headings)):
            title = headings[h2Pos].text
            if any(k in title.lower() for k in keywords):
                link, output = [], []
                if title not in json:
                    json[title] = []
                # collect every link that follows this heading ...
                for l in headings[h2Pos].find_all_next("a", href=True):
                    link.append(l["href"])
                # ... but cut the list off at the first link belonging to the next heading
                if h2Pos + 1 < len(headings):
                    nexth2Link = headings[h2Pos + 1].find_next("a", href=True)["href"]
                    if nexth2Link in link:
                        link = link[:link.index(nexth2Link)]
                # only keep absolute https links
                for i in link:
                    if "https" in i:
                        output.append(i)
                json[title].extend(output)
        return json
def extractTable(self):
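        """Parse every <table> on the page into a pandas DataFrame; returns [] when none are found."""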
soup = self.openHTMLFile()
df = []
if len(soup)>0:
try:
df = pd.read_html(str(soup))
except ValueError:
df = []
print("No tables found in HTML file")
return df
def mergeTextInJson(self,jsonHTML):
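        """Flatten the section dict into one string: items inside a section are joined
        with periods, and sections are separated by blank lines."""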
cl = cleanText.cleanGenText()
#cl = cleanGenText()
htmlText = ""
for sec in jsonHTML:
                # sections are separated by a blank line
if len(jsonHTML[sec]) > 0:
for i in range(len(jsonHTML[sec])):
                    # consecutive items within the same section are separated by a period
text = jsonHTML[sec][i]
if len(text)>0:
#text = cl.removeTabWhiteSpaceNewLine(text)
#text = cl.removeExtraSpaceBetweenWords(text)
text, filteredWord = cl.textPreprocessing(text, keepPeriod=True)
jsonHTML[sec][i] = text
if i-1 >= 0:
if len(jsonHTML[sec][i-1])>0:
if jsonHTML[sec][i-1][-1] != ".":
htmlText += ". "
htmlText += jsonHTML[sec][i]
if len(jsonHTML[sec][i]) > 0:
if jsonHTML[sec][i][-1]!=".":
htmlText += "."
htmlText += "\n\n"
return htmlText
def removeHeaders(self):
pass
def removeFooters(self):
pass
def removeReferences(self):
pass
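
# A minimal usage sketch (not part of the original module): the article URL below is
# only an illustrative placeholder, and it assumes the NER/DefaultPackages helpers are
# importable from the working directory.
if __name__ == "__main__":
    page = HTML(htmlFile=None, htmlLink="https://www.nature.com/articles/s41586-020-2649-2")
    print(len(page.getText()))       # plain text of the whole page
    print(page.getSupMaterial())     # {heading: [supplementary links]}
    print(len(page.extractTable()))  # number of tables parsed into DataFrames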