# reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
import os
from bs4 import BeautifulSoup
from lxml.etree import ParserError, XMLSyntaxError
import requests
from DefaultPackages import openFile, saveFile
from NER import cleanText
import pandas as pd
class HTML():
    def __init__(self, htmlFile, htmlLink):
        self.htmlLink = htmlLink
        self.htmlFile = htmlFile
    def openHTMLFile(self):
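        """Fetch self.htmlLink (or open the local self.htmlFile when the link is "None"/empty)
        and return it as a BeautifulSoup object; skipped domains and any fetch or parse
        failure return an empty soup instead of raising."""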
        not_need_domain = ['https://broadinstitute.github.io/picard/',
                           'https://software.broadinstitute.org/gatk/best-practices/',
                           'https://www.ncbi.nlm.nih.gov/genbank/',
                           'https://www.mitomap.org/']
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/114.0.0.0 Safari/537.36"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Referer": self.htmlLink,
            "Connection": "keep-alive"
        }
        session = requests.Session()
        session.headers.update(headers)
        if self.htmlLink in not_need_domain:
            return BeautifulSoup("", 'html.parser')
        try:
            if self.htmlLink and self.htmlLink != "None":
                r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
                if r.status_code != 200 or not r.text.strip():
                    print(f"HTML GET failed ({r.status_code}) or empty page: {self.htmlLink}")
                    return BeautifulSoup("", 'html.parser')
                soup = BeautifulSoup(r.content, 'html.parser')
            else:
                with open(self.htmlFile, encoding='utf-8') as fp:
                    soup = BeautifulSoup(fp, 'html.parser')
        except (ParserError, XMLSyntaxError, OSError) as e:
            print(f"HTML parse error for {self.htmlLink}: {type(e).__name__}")
            return BeautifulSoup("", 'html.parser')
        except Exception as e:
            print(f"General exception for {self.htmlLink}: {e}")
            return BeautifulSoup("", 'html.parser')
        return soup
    def getText(self):
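        """Return the visible text of the page's <html> element(s), with extra spaces removed."""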
        soup = self.openHTMLFile()
        s = soup.find_all("html")
        text = ""
        if s:
            # accumulate the text of every <html> element (usually just one)
            for t in range(len(s)):
                text += s[t].get_text()
            cl = cleanText.cleanGenText()
            text = cl.removeExtraSpaceBetweenWords(text)
        return text
    def getListSection(self, scienceDirect=None):
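        """Group the article's paragraph text by <h2> heading; if scienceDirect is set or no
        <h2> sections are found, fall back to the Elsevier article API (SCIENCE_DIRECT_API key).
        Returns whichever is longer: the merged section text or the raw page text."""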
        try:
            json = {}
            text = ""
            textJson, textHTML = "", ""
            if scienceDirect is None:
                soup = self.openHTMLFile()
                # get list of sections from the <h2> headings
                json = {}
                h2_tags = soup.find_all('h2')
                for h2Pos in range(len(h2_tags)):
                    if h2_tags[h2Pos].text not in json:
                        json[h2_tags[h2Pos].text] = []
                    if h2Pos + 1 < len(h2_tags):
                        content = h2_tags[h2Pos].find_next("p")
                        nexth2Content = h2_tags[h2Pos+1].find_next("p")
                        # walk the <p> tags until the next section's first paragraph is reached
                        while content is not None and nexth2Content is not None and content.text != nexth2Content.text:
                            json[h2_tags[h2Pos].text].append(content.text)
                            content = content.find_next("p")
                    else:
                        content = h2_tags[h2Pos].find_all_next("p", string=True)
                        json[h2_tags[h2Pos].text] = list(i.text for i in content)
                # format
                '''json = {'Abstract':[], 'Introduction':[], 'Methods':[],
                           'Results':[], 'Discussion':[], 'References':[],
                           'Acknowledgements':[], 'Author information':[], 'Ethics declarations':[],
                           'Additional information':[], 'Electronic supplementary material':[],
                           'Rights and permissions':[], 'About this article':[], 'Search':[], 'Navigation':[]}'''
            if scienceDirect is not None or len(json) == 0:
                # Elsevier API key (set SCIENCE_DIRECT_API in the environment)
                api_key = os.environ["SCIENCE_DIRECT_API"]
                # ScienceDirect article DOI or PII (example DOI)
                doi = self.htmlLink.split("https://doi.org/")[-1]  # "10.1016/j.ajhg.2011.01.009"
                # Base URL for the Elsevier API
                base_url = "https://api.elsevier.com/content/article/doi/"
                # Set headers with API key
                headers = {
                    "Accept": "application/json",
                    "X-ELS-APIKey": api_key
                }
                # Make the API request
                response = requests.get(base_url + doi, headers=headers)
                # Check if the request was successful
                if response.status_code == 200:
                    data = response.json()
                    supp_data = data["full-text-retrieval-response"]  # ["coredata"]["link"]
                    if "originalText" in list(supp_data.keys()):
                        if type(supp_data["originalText"]) == str:
                            json["originalText"] = [supp_data["originalText"]]
                        if type(supp_data["originalText"]) == dict:
                            json["originalText"] = [supp_data["originalText"][key] for key in supp_data["originalText"]]
                    else:
                        if type(supp_data) == dict:
                            for key in supp_data:
                                json[key] = [supp_data[key]]
            textJson = self.mergeTextInJson(json)
            textHTML = self.getText()
            if len(textHTML) > len(textJson):
                text = textHTML
            else:
                text = textJson
            return text  # json
        except Exception as e:
            print(f"getListSection failed: {e}")
            return ""
    def getReference(self):
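        """Return cleaned reference entries taken from the "References" section."""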
        # get references to collect further data from
        ref = []
        json = self.getListSection()  # expects the per-section dict built in getListSection
        for key in json["References"]:
            ct = cleanText.cleanGenText(key)
            cleaned, filteredWord = ct.cleanText()
            if cleaned not in ref:
                ref.append(cleaned)
        return ref
    def getSupMaterial(self):
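        """Map supplementary/additional-material <h2> headings to the https links listed under them."""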
        # check whether there is supplementary material or not
        json = {}
        soup = self.openHTMLFile()
        h2_tags = soup.find_all('h2')
        keywords = ("supplementary", "material", "additional", "support")
        for h2Pos in range(len(h2_tags)):
            if any(k in h2_tags[h2Pos].text.lower() for k in keywords):
                #print(h2_tags[h2Pos].find_next("a").get("href"))
                link, output = [], []
                if h2_tags[h2Pos].text not in json:
                    json[h2_tags[h2Pos].text] = []
                for l in h2_tags[h2Pos].find_all_next("a", href=True):
                    link.append(l["href"])
                if h2Pos + 1 < len(h2_tags):
                    nexth2 = h2_tags[h2Pos+1].find_next("a", href=True)
                    # cut the list at the first link belonging to the next section
                    if nexth2 is not None and nexth2["href"] in link:
                        link = link[:link.index(nexth2["href"])]
                # only take links having "https" in them
                for i in link:
                    if "https" in i:
                        output.append(i)
                json[h2_tags[h2Pos].text].extend(output)
        return json
    def extractTable(self):
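        """Parse every <table> on the page into a list of pandas DataFrames (empty list if none)."""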
        soup = self.openHTMLFile()
        df = []
        if len(soup) > 0:
            try:
                df = pd.read_html(str(soup))
            except ValueError:
                df = []
                print("No tables found in HTML file")
        return df
    def mergeTextInJson(self, jsonHTML):
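        """Merge the per-section text lists in jsonHTML into one string: entries within a
        section are separated by periods, sections by blank lines."""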
        cl = cleanText.cleanGenText()
        #cl = cleanGenText()
        htmlText = ""
        for sec in jsonHTML:
            # sections are separated by "\n\n"
            if len(jsonHTML[sec]) > 0:
                for i in range(len(jsonHTML[sec])):
                    # entries within the same section are separated by just a dot
                    text = jsonHTML[sec][i]
                    if len(text) > 0:
                        #text = cl.removeTabWhiteSpaceNewLine(text)
                        #text = cl.removeExtraSpaceBetweenWords(text)
                        text, filteredWord = cl.textPreprocessing(text, keepPeriod=True)
                        jsonHTML[sec][i] = text
                    if i-1 >= 0:
                        if len(jsonHTML[sec][i-1]) > 0:
                            if jsonHTML[sec][i-1][-1] != ".":
                                htmlText += ". "
                    htmlText += jsonHTML[sec][i]
                    if len(jsonHTML[sec][i]) > 0:
                        if jsonHTML[sec][i][-1] != ".":
                            htmlText += "."
                htmlText += "\n\n"
        return htmlText
    def removeHeaders(self):
        pass
    def removeFooters(self):
        pass
    def removeReferences(self):
        pass
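
# Example usage (a minimal sketch, assuming the DefaultPackages/NER helper modules are importable
# and, for the ScienceDirect fallback, that the SCIENCE_DIRECT_API environment variable is set;
# "article.html" is a hypothetical local file).
if __name__ == "__main__":
    # local file: pass the link as the string "None" so openHTMLFile reads self.htmlFile instead of fetching
    localPage = HTML(htmlFile="article.html", htmlLink="None")
    print(len(localPage.getText()))

    # remote article: section text, supplementary links, and tables
    remotePage = HTML(htmlFile="", htmlLink="https://doi.org/10.1016/j.ajhg.2011.01.009")
    sectionText = remotePage.getListSection()
    supLinks = remotePage.getSupMaterial()
    tables = remotePage.extractTable()
    print(len(sectionText), len(supLinks), len(tables))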