# reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
# lxml parse errors are imported at module level so the except clauses below can resolve them
from lxml.etree import ParserError, XMLSyntaxError
from DefaultPackages import openFile, saveFile
from NER import cleanText
class HTML():
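    """Helpers for pulling text, sections, tables, and supplementary links out of an
    article page, fetched from ``htmlLink`` or read from a local ``htmlFile``."""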
def __init__(self, htmlFile, htmlLink):
self.htmlLink = htmlLink
self.htmlFile = htmlFile
def openHTMLFile(self):
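        """Return a BeautifulSoup for the page.

        Known landing pages listed in ``not_need_domain`` are skipped, a remote
        ``htmlLink`` is fetched with a browser-like User-Agent, and a local
        ``htmlFile`` is parsed otherwise; any failure falls back to an empty soup."""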
not_need_domain = ['https://broadinstitute.github.io/picard/',
'https://software.broadinstitute.org/gatk/best-practices/',
'https://www.ncbi.nlm.nih.gov/genbank/',
'https://www.mitomap.org/']
headers = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/114.0.0.0 Safari/537.36"
),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Referer": self.htmlLink,
"Connection": "keep-alive"
}
session = requests.Session()
session.headers.update(headers)
if self.htmlLink in not_need_domain:
return BeautifulSoup("", 'html.parser')
try:
if self.htmlLink and self.htmlLink != "None":
r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
if r.status_code != 200 or not r.text.strip():
print(f"❌ HTML GET failed ({r.status_code}) or empty page: {self.htmlLink}")
return BeautifulSoup("", 'html.parser')
soup = BeautifulSoup(r.content, 'html.parser')
else:
with open(self.htmlFile, encoding='utf-8') as fp:
soup = BeautifulSoup(fp, 'html.parser')
except (ParserError, XMLSyntaxError, OSError) as e:
print(f"🚫 HTML parse error for {self.htmlLink}: {type(e).__name__}")
return BeautifulSoup("", 'html.parser')
except Exception as e:
print(f"❌ General exception for {self.htmlLink}: {e}")
return BeautifulSoup("", 'html.parser')
return soup
    def getText(self):
        soup = self.openHTMLFile()
        s = soup.find_all("html")
        text = ""
        if s:
            cl = cleanText.cleanGenText()
            for node in s:
                # accumulate text from every <html> node instead of overwriting it each pass
                text += node.get_text()
            text = cl.removeExtraSpaceBetweenWords(text)
        return text
def getListSection(self, scienceDirect=None):
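        """Map section headings to their paragraphs and return the merged text.

        First scrapes <h2>/<p> pairs from the page; when that yields nothing (or
        ``scienceDirect`` is set) it falls back to the Elsevier article API using the
        SCIENCE_DIRECT_API key from the environment. Returns the longer of the merged
        section text and the raw page text, or "" on failure."""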
try:
json = {}
text = ""
textJson, textHTML = "",""
            if scienceDirect is None:
                soup = self.openHTMLFile()
                # get the list of sections: each <h2> heading maps to the <p> blocks that follow it
                headings = soup.find_all('h2')
                for h2Pos in range(len(headings)):
                    title = headings[h2Pos].text
                    if title not in json:
                        json[title] = []
                    if h2Pos + 1 < len(headings):
                        # collect paragraphs until the first paragraph of the next heading is reached
                        content = headings[h2Pos].find_next("p")
                        nexth2Content = headings[h2Pos + 1].find_next("p")
                        while content is not None and content.text != nexth2Content.text:
                            json[title].append(content.text)
                            content = content.find_next("p")
                    else:
                        # last heading: take every remaining paragraph
                        content = headings[h2Pos].find_all_next("p", string=True)
                        json[title] = [i.text for i in content]
            # format
            '''json = {'Abstract':[], 'Introduction':[], 'Methods':[],
                'Results':[], 'Discussion':[], 'References':[],
                'Acknowledgements':[], 'Author information':[], 'Ethics declarations':[],
                'Additional information':[], 'Electronic supplementary material':[],
                'Rights and permissions':[], 'About this article':[], 'Search':[], 'Navigation':[]}'''
            if scienceDirect is not None or len(json) == 0:
                # Elsevier API key is read from the environment
                api_key = os.environ["SCIENCE_DIRECT_API"]
                # ScienceDirect article DOI (example: "10.1016/j.ajhg.2011.01.009")
                doi = self.htmlLink.split("https://doi.org/")[-1]
                # Base URL for the Elsevier article-retrieval API
                base_url = "https://api.elsevier.com/content/article/doi/"
                # Set headers with the API key
                headers = {
                    "Accept": "application/json",
                    "X-ELS-APIKey": api_key
                }
                # Make the API request
                response = requests.get(base_url + doi, headers=headers)
                # Check if the request was successful
                if response.status_code == 200:
                    data = response.json()
                    supp_data = data["full-text-retrieval-response"]
                    if "originalText" in supp_data:
                        if isinstance(supp_data["originalText"], str):
                            json["originalText"] = [supp_data["originalText"]]
                        elif isinstance(supp_data["originalText"], dict):
                            json["originalText"] = [supp_data["originalText"][key] for key in supp_data["originalText"]]
                    elif isinstance(supp_data, dict):
                        for key in supp_data:
                            json[key] = [supp_data[key]]
            textJson = self.mergeTextInJson(json)
            textHTML = self.getText()
            # keep whichever extraction recovered more text
            if len(textHTML) > len(textJson):
                text = textHTML
            else:
                text = textJson
            return text  # previously returned the section dict (json); getReference() still expects that shape
        except Exception as e:
            print(f"getListSection failed for {self.htmlLink}: {e}")
            return ""
    def getReference(self):
        # collect the reference list so further sources can be followed up later
        # NOTE: assumes getListSection() returns the section dict with a "References" key
        ref = []
        json = self.getListSection()
        for key in json["References"]:
            ct = cleanText.cleanGenText(key)
            # use a local name that does not shadow the imported cleanText module
            cleaned, filteredWord = ct.cleanText()
            if cleaned not in ref:
                ref.append(cleaned)
        return ref
    def getSupMaterial(self):
        # check whether the page has supplementary-material sections and collect their links
        json = {}
        soup = self.openHTMLFile()
        headings = soup.find_all('h2')
        keywords = ("supplementary", "material", "additional", "support")
        for h2Pos in range(len(headings)):
            title = headings[h2Pos].text
            if any(k in title.lower() for k in keywords):
                link, output = [], []
                if title not in json:
                    json[title] = []
                # collect every link that follows this heading ...
                for l in headings[h2Pos].find_all_next("a", href=True):
                    link.append(l["href"])
                # ... but cut the list off at the first link belonging to the next heading
                if h2Pos + 1 < len(headings):
                    nexth2Link = headings[h2Pos + 1].find_next("a", href=True)["href"]
                    if nexth2Link in link:
                        link = link[:link.index(nexth2Link)]
                # only keep absolute https links
                for i in link:
                    if "https" in i:
                        output.append(i)
                json[title].extend(output)
        return json
def extractTable(self):
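        """Parse every <table> on the page into a pandas DataFrame; returns [] when none are found."""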
soup = self.openHTMLFile()
df = []
if len(soup)>0:
try:
df = pd.read_html(str(soup))
except ValueError:
df = []
print("No tables found in HTML file")
return df
def mergeTextInJson(self,jsonHTML):
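        """Flatten the section dict into one string: items inside a section are joined
        with periods, and sections are separated by blank lines."""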
cl = cleanText.cleanGenText()
#cl = cleanGenText()
htmlText = ""
for sec in jsonHTML:
                # sections are separated by a blank line
if len(jsonHTML[sec]) > 0:
for i in range(len(jsonHTML[sec])):
                    # consecutive items within the same section are separated by a period
text = jsonHTML[sec][i]
if len(text)>0:
#text = cl.removeTabWhiteSpaceNewLine(text)
#text = cl.removeExtraSpaceBetweenWords(text)
text, filteredWord = cl.textPreprocessing(text, keepPeriod=True)
jsonHTML[sec][i] = text
if i-1 >= 0:
if len(jsonHTML[sec][i-1])>0:
if jsonHTML[sec][i-1][-1] != ".":
htmlText += ". "
htmlText += jsonHTML[sec][i]
if len(jsonHTML[sec][i]) > 0:
if jsonHTML[sec][i][-1]!=".":
htmlText += "."
htmlText += "\n\n"
return htmlText
def removeHeaders(self):
pass
def removeFooters(self):
pass
def removeReferences(self):
pass
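
# A minimal usage sketch (not part of the original module): the article URL below is
# only an illustrative placeholder, and it assumes the NER/DefaultPackages helpers are
# importable from the working directory.
if __name__ == "__main__":
    page = HTML(htmlFile=None, htmlLink="https://www.nature.com/articles/s41586-020-2649-2")
    print(len(page.getText()))       # plain text of the whole page
    print(page.getSupMaterial())     # {heading: [supplementary links]}
    print(len(page.extractTable()))  # number of tables parsed into DataFrames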