# reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
import os
from bs4 import BeautifulSoup
import requests
from lxml.etree import ParserError, XMLSyntaxError
from DefaultPackages import openFile, saveFile
from NER import cleanText
import pandas as pd

class HTML():
    def __init__(self, htmlFile, htmlLink):
        self.htmlLink = htmlLink
        self.htmlFile = htmlFile

    # def openHTMLFile(self):
    #     headers = {
    #         "User-Agent": (
    #             "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    #             "AppleWebKit/537.36 (KHTML, like Gecko) "
    #             "Chrome/114.0.0.0 Safari/537.36"
    #         ),
    #         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    #         "Referer": self.htmlLink,
    #         "Connection": "keep-alive"
    #     }
    #     session = requests.Session()
    #     session.headers.update(headers)
    #     if self.htmlLink != "None":
    #         try:
    #             r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
    #             if r.status_code != 200:
    #                 print(f"❌ HTML GET failed: {r.status_code} — {self.htmlLink}")
    #                 return BeautifulSoup("", 'html.parser')
    #             soup = BeautifulSoup(r.content, 'html.parser')
    #         except Exception as e:
    #             print(f"❌ Exception fetching HTML: {e}")
    #             return BeautifulSoup("", 'html.parser')
    #     else:
    #         with open(self.htmlFile) as fp:
    #             soup = BeautifulSoup(fp, 'html.parser')
    #     return soup

    def openHTMLFile(self):
        # domains that never need to be fetched
        not_need_domain = ['https://broadinstitute.github.io/picard/',
                           'https://software.broadinstitute.org/gatk/best-practices/',
                           'https://www.ncbi.nlm.nih.gov/genbank/',
                           'https://www.mitomap.org/']
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/114.0.0.0 Safari/537.36"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Referer": self.htmlLink,
            "Connection": "keep-alive"
        }
        session = requests.Session()
        session.headers.update(headers)
        if self.htmlLink in not_need_domain:
            return BeautifulSoup("", 'html.parser')
        try:
            if self.htmlLink and self.htmlLink != "None":
                r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
                if r.status_code != 200 or not r.text.strip():
                    print(f"❌ HTML GET failed ({r.status_code}) or empty page: {self.htmlLink}")
                    return BeautifulSoup("", 'html.parser')
                soup = BeautifulSoup(r.content, 'html.parser')
            else:
                with open(self.htmlFile, encoding='utf-8') as fp:
                    soup = BeautifulSoup(fp, 'html.parser')
        except (ParserError, XMLSyntaxError, OSError) as e:
            print(f"🚫 HTML parse error for {self.htmlLink}: {type(e).__name__}")
            return BeautifulSoup("", 'html.parser')
        except Exception as e:
            print(f"❌ General exception for {self.htmlLink}: {e}")
            return BeautifulSoup("", 'html.parser')
        return soup

    def getText(self):
        soup = self.openHTMLFile()
        s = soup.find_all("html")
        text = ""
        if s:
            # concatenate the text of every <html> element (usually just one)
            for t in range(len(s)):
                text += s[t].get_text()
            cl = cleanText.cleanGenText()
            text = cl.removeExtraSpaceBetweenWords(text)
        return text

    def getListSection(self, scienceDirect=None):
        try:
            json = {}
            text = ""
            textJson, textHTML = "", ""
            if scienceDirect is None:
                soup = self.openHTMLFile()
                # build a dict of sections keyed by <h2> heading
                for h2Pos in range(len(soup.find_all('h2'))):
                    if soup.find_all('h2')[h2Pos].text not in json:
                        json[soup.find_all('h2')[h2Pos].text] = []
                    if h2Pos + 1 < len(soup.find_all('h2')):
                        # collect every <p> between this heading and the next one
                        content = soup.find_all('h2')[h2Pos].find_next("p")
                        nexth2Content = soup.find_all('h2')[h2Pos+1].find_next("p")
                        while content.text != nexth2Content.text:
                            json[soup.find_all('h2')[h2Pos].text].append(content.text)
                            content = content.find_next("p")
                    else:
                        # last heading: take all remaining <p> elements
                        content = soup.find_all('h2')[h2Pos].find_all_next("p", string=True)
                        json[soup.find_all('h2')[h2Pos].text] = list(i.text for i in content)
            # format
            '''json = {'Abstract':[], 'Introduction':[], 'Methods':[],
                       'Results':[], 'Discussion':[], 'References':[],
                       'Acknowledgements':[], 'Author information':[],
                       'Ethics declarations':[], 'Additional information':[],
                       'Electronic supplementary material':[], 'Rights and permissions':[],
                       'About this article':[], 'Search':[], 'Navigation':[]}'''
            if scienceDirect is not None or len(json) == 0:
                # Elsevier API key, read from the environment
                api_key = os.environ["SCIENCE_DIRECT_API"]
                # ScienceDirect article DOI or PII (example DOI)
                doi = self.htmlLink.split("https://doi.org/")[-1]  # "10.1016/j.ajhg.2011.01.009"
                # Base URL for the Elsevier API
                base_url = "https://api.elsevier.com/content/article/doi/"
                # Set headers with API key
                headers = {
                    "Accept": "application/json",
                    "X-ELS-APIKey": api_key
                }
                # Make the API request
                response = requests.get(base_url + doi, headers=headers)
                # Check if the request was successful
                if response.status_code == 200:
                    data = response.json()
                    supp_data = data["full-text-retrieval-response"]  # ["coredata"]["link"]
                    if "originalText" in list(supp_data.keys()):
                        if type(supp_data["originalText"]) == str:
                            json["originalText"] = [supp_data["originalText"]]
                        if type(supp_data["originalText"]) == dict:
                            json["originalText"] = [supp_data["originalText"][key] for key in supp_data["originalText"]]
                    else:
                        if type(supp_data) == dict:
                            for key in supp_data:
                                json[key] = [supp_data[key]]
            textJson = self.mergeTextInJson(json)
            textHTML = self.getText()
            # keep whichever extraction yielded more text
            if len(textHTML) > len(textJson):
                text = textHTML
            else:
                text = textJson
            return text  # json
        except Exception as e:
            print(f"failed all: {e}")
            return ""

    def getReference(self):
        # get references to collect more data next
        # NOTE: this expects the section dict built in getListSection;
        # getListSection currently returns merged text, so the dict would
        # need to be exposed for this method to work as written.
        ref = []
        json = self.getListSection()
        for key in json["References"]:
            ct = cleanText.cleanGenText(key)
            cleaned, filteredWord = ct.cleanText()
            if cleaned not in ref:
                ref.append(cleaned)
        return ref

    def getSupMaterial(self):
        # check if there is supplementary material or not
        json = {}
        soup = self.openHTMLFile()
        for h2Pos in range(len(soup.find_all('h2'))):
            heading = soup.find_all('h2')[h2Pos].text.lower()
            if ("supplementary" in heading or "material" in heading
                    or "additional" in heading or "support" in heading):
                # print(soup.find_all('h2')[h2Pos].find_next("a").get("href"))
                link, output = [], []
                if soup.find_all('h2')[h2Pos].text not in json:
                    json[soup.find_all('h2')[h2Pos].text] = []
                for l in soup.find_all('h2')[h2Pos].find_all_next("a", href=True):
                    link.append(l["href"])
                if h2Pos + 1 < len(soup.find_all('h2')):
                    # stop at the first link that belongs to the next section
                    nexth2Link = soup.find_all('h2')[h2Pos+1].find_next("a", href=True)["href"]
                    if nexth2Link in link:
                        link = link[:link.index(nexth2Link)]
                # only take links having "https" in them
                for i in link:
                    if "https" in i:
                        output.append(i)
                json[soup.find_all('h2')[h2Pos].text].extend(output)
        return json

    def extractTable(self):
        soup = self.openHTMLFile()
        df = []
        if len(soup) > 0:
            try:
                df = pd.read_html(str(soup))
            except ValueError:
                df = []
                print("No tables found in HTML file")
        return df

    def mergeTextInJson(self, jsonHTML):
        cl = cleanText.cleanGenText()
        # cl = cleanGenText()
        htmlText = ""
        for sec in jsonHTML:
            # sections are separated by "\n\n"
            if len(jsonHTML[sec]) > 0:
                for i in range(len(jsonHTML[sec])):
                    # paragraphs within the same section are separated by just a dot
                    text = jsonHTML[sec][i]
                    if len(text) > 0:
                        # text = cl.removeTabWhiteSpaceNewLine(text)
                        # text = cl.removeExtraSpaceBetweenWords(text)
                        text, filteredWord = cl.textPreprocessing(text, keepPeriod=True)
                    jsonHTML[sec][i] = text
                    if i-1 >= 0:
                        if len(jsonHTML[sec][i-1]) > 0:
                            if jsonHTML[sec][i-1][-1] != ".":
                                htmlText += ". "
                    htmlText += jsonHTML[sec][i]
                if len(jsonHTML[sec][i]) > 0:
                    if jsonHTML[sec][i][-1] != ".":
                        htmlText += "."
                htmlText += "\n\n"
        return htmlText

    def removeHeaders(self):
        pass

    def removeFooters(self):
        pass

    def removeReferences(self):
        pass
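
# ---------------------------------------------------------------------------
# Minimal usage sketch. Assumptions: the DefaultPackages and NER helper
# modules imported above are available, SCIENCE_DIRECT_API is set in the
# environment if the ScienceDirect fallback is triggered, and the URL below
# is only a placeholder, not a tested endpoint.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    page = HTML(htmlFile="", htmlLink="https://example.org/article")  # placeholder link
    merged_text = page.getListSection()   # merged section text (longer of HTML scrape vs. ScienceDirect API)
    tables = page.extractTable()          # list of DataFrames, [] if no tables found
    supplements = page.getSupMaterial()   # {section heading: [https links]}
    print(len(merged_text), len(tables), supplements)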