Spaces:

VyLala
/

mtDNALocation

Running

File size: 9,253 Bytes

import os
from io import StringIO

import pandas as pd
import requests
# reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
from bs4 import BeautifulSoup
from lxml.etree import ParserError, XMLSyntaxError

from DefaultPackages import openFile, saveFile
from NER import cleanText
class HTML():
  def __init__(self, htmlFile, htmlLink):
    self.htmlLink = htmlLink
    self.htmlFile = htmlFile
  # def openHTMLFile(self):
  #   headers = {
  #       "User-Agent": (
  #           "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
  #           "AppleWebKit/537.36 (KHTML, like Gecko) "
  #           "Chrome/114.0.0.0 Safari/537.36"
  #       ),
  #       "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  #       "Referer": self.htmlLink,
  #       "Connection": "keep-alive"
  #   }

  #   session = requests.Session()
  #   session.headers.update(headers)

  #   if self.htmlLink != "None":
  #       try:
  #           r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
  #           if r.status_code != 200:
  #               print(f"❌ HTML GET failed: {r.status_code} — {self.htmlLink}")
  #               return BeautifulSoup("", 'html.parser')
  #           soup = BeautifulSoup(r.content, 'html.parser')
  #       except Exception as e:
  #           print(f"❌ Exception fetching HTML: {e}")
  #           return BeautifulSoup("", 'html.parser')
  #   else:
  #       with open(self.htmlFile) as fp:
  #           soup = BeautifulSoup(fp, 'html.parser')
  #   return soup
  from lxml.etree import ParserError, XMLSyntaxError
  
  def openHTMLFile(self):
      """Load the document and return it as a BeautifulSoup tree.

      When ``self.htmlLink`` is set (and is not the string "None") the page is
      downloaded; otherwise ``self.htmlFile`` is read from disk as UTF-8.

      Returns:
          The parsed document, or an empty BeautifulSoup object when the link
          is on the skip list, the response is not HTTP 200, the page body is
          blank, or any error occurs while fetching/reading/parsing.
      """
      # Exact URLs (tool / database landing pages) that are never fetched.
      not_need_domain = ['https://broadinstitute.github.io/picard/',
              'https://software.broadinstitute.org/gatk/best-practices/',
              'https://www.ncbi.nlm.nih.gov/genbank/',
              'https://www.mitomap.org/']
      if self.htmlLink in not_need_domain:
          return BeautifulSoup("", 'html.parser')

      try:
          if self.htmlLink and self.htmlLink != "None":
              # Browser-like headers for the request.
              headers = {
                  "User-Agent": (
                      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/114.0.0.0 Safari/537.36"
                  ),
                  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                  "Referer": self.htmlLink,
                  "Connection": "keep-alive"
              }
              # The context manager closes the session's connections once the
              # response body has been downloaded.
              with requests.Session() as session:
                  session.headers.update(headers)
                  r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
              if r.status_code != 200 or not r.text.strip():
                  print(f"❌ HTML GET failed ({r.status_code}) or empty page: {self.htmlLink}")
                  return BeautifulSoup("", 'html.parser')
              return BeautifulSoup(r.content, 'html.parser')

          with open(self.htmlFile, encoding='utf-8') as fp:
              return BeautifulSoup(fp, 'html.parser')
      # ParserError / XMLSyntaxError must be importable at module level: names
      # bound in the class body are not visible from inside a method, and an
      # unresolved name here would replace the real error with a NameError.
      # requests' exceptions subclass OSError, so network failures also land here.
      except (ParserError, XMLSyntaxError, OSError) as e:
          print(f"🚫 HTML parse error for {self.htmlLink}: {type(e).__name__}")
          return BeautifulSoup("", 'html.parser')
      except Exception as e:
          print(f"❌ General exception for {self.htmlLink}: {e}")
          return BeautifulSoup("", 'html.parser')

  def getText(self):
    """Return the document's visible text with extra spaces between words removed.

    The text is taken from the <html> element found by ``openHTMLFile()``;
    an empty string is cleaned and returned when there is no such element.
    """
    html_nodes = self.openHTMLFile().find_all("html")
    # NOTE(review): when several <html> elements are present only the text of
    # the last one is kept -- confirm whether concatenation was intended.
    raw_text = html_nodes[-1].get_text() if html_nodes else ""
    cleaner = cleanText.cleanGenText()
    return cleaner.removeExtraSpaceBetweenWords(raw_text)
  def getListSection(self, scienceDirect=None):
    """Return the article text, choosing the longer of two extractions.

    1. Section-based: paragraphs grouped under each <h2> heading, or -- when
       ``scienceDirect`` is given or the page has no <h2> sections -- the
       payload of the Elsevier article API. The groups are merged with
       ``mergeTextInJson``.
    2. Whole page: ``getText()``.

    Args:
        scienceDirect: any non-None value skips the <h2> scan and goes
            straight to the Elsevier API.

    Returns:
        The longer of the two texts (the section-based text on a tie), or ""
        if an unexpected error occurs.
    """
    try:
      sections = {}
      if scienceDirect is None:
        sections = self._collectH2Sections(self.openHTMLFile())
      if scienceDirect is not None or not sections:
        sections = self._fetchScienceDirectSections()
      textJson = self.mergeTextInJson(sections)
      textHTML = self.getText()
      return textHTML if len(textHTML) > len(textJson) else textJson
    except Exception as e:
      # Best-effort API: callers receive "" instead of an exception.
      print(f"failed all: {e!r}")
      return ""

  def _collectH2Sections(self, soup):
    """Map each <h2> heading's text to the texts of the <p> elements in its section.

    Keys are the heading texts as they appear in the page (e.g. 'Abstract',
    'Introduction', 'References').
    """
    sections = {}
    headings = soup.find_all('h2')  # scanned once instead of on every access
    for pos, heading in enumerate(headings):
      title = heading.text
      if pos + 1 < len(headings):
        paragraphs = sections.setdefault(title, [])
        # The first <p> after the next heading marks the end of this section.
        stop = headings[pos + 1].find_next("p")
        para = heading.find_next("p")
        # Compare elements, not their text, so a repeated paragraph cannot end
        # the section early; the None check covers documents that run out of
        # <p> elements.
        while para is not None and para is not stop:
          paragraphs.append(para.text)
          para = para.find_next("p")
      else:
        # Last heading: everything that follows belongs to it.
        sections[title] = [p.text for p in heading.find_all_next("p", string=True)]
    return sections

  def _fetchScienceDirectSections(self):
    """Fetch the article through the Elsevier API and return {key: [text, ...]}.

    Returns an empty dict when no API key (env var SCIENCE_DIRECT_API) or link
    is available, when the request fails, or when the payload has an
    unexpected shape.
    """
    api_key = os.environ.get("SCIENCE_DIRECT_API")
    if not api_key or not self.htmlLink or self.htmlLink == "None":
      return {}
    # ScienceDirect article DOI, e.g. "10.1016/j.ajhg.2011.01.009"
    doi = self.htmlLink.split("https://doi.org/")[-1]
    base_url = "https://api.elsevier.com/content/article/doi/"
    headers = {
        "Accept": "application/json",
        "X-ELS-APIKey": api_key
    }
    try:
      response = requests.get(base_url + doi, headers=headers, timeout=15)
      if response.status_code != 200:
        return {}
      data = response.json()
    except (requests.RequestException, ValueError) as e:
      print(f"❌ ScienceDirect request failed: {e}")
      return {}

    payload = data.get("full-text-retrieval-response") if isinstance(data, dict) else None
    if not isinstance(payload, dict):
      return {}
    # Only string values are kept: mergeTextInJson concatenates every entry
    # onto a string, so nested objects would make the whole merge fail.
    if "originalText" in payload:
      original = payload["originalText"]
      if isinstance(original, str):
        return {"originalText": [original]}
      if isinstance(original, dict):
        return {"originalText": [v for v in original.values() if isinstance(v, str)]}
      return {}
    return {key: [value] for key, value in payload.items() if isinstance(value, str)}
  def getReference(self):
    # get reference to collect more next data
    ref = []
    json = self.getListSection()
    for key in json["References"]:
      ct = cleanText.cleanGenText(key)
      cleanText, filteredWord = ct.cleanText()
      if cleanText not in ref:
        ref.append(cleanText)
    return ref
  def getSupMaterial(self):
    """Collect the links listed under supplementary-material style headings.

    A heading qualifies when its lower-cased text contains "supplementary",
    "material", "additional" or "support". For each such <h2>, the hrefs of
    the <a> elements that follow it -- up to the first link after the next
    <h2> -- are gathered, keeping only those containing "https".

    Returns:
        dict mapping heading text to the list of kept links.
    """
    keywords = ("supplementary", "material", "additional", "support")
    materials = {}
    headings = self.openHTMLFile().find_all('h2')  # scanned once
    for pos, heading in enumerate(headings):
      title = heading.text
      lowered = title.lower()
      if not any(word in lowered for word in keywords):
        continue
      # First link belonging to the next section; None when this is the last
      # heading or no link follows the next heading.
      stop = None
      if pos + 1 < len(headings):
        stop = headings[pos + 1].find_next("a", href=True)
      links = []
      for anchor in heading.find_all_next("a", href=True):
        # Stop at that exact element, not merely at an equal href, so a link
        # repeated inside this section does not cut the list short.
        if stop is not None and anchor is stop:
          break
        links.append(anchor["href"])
      # only take links having "https" in that
      materials.setdefault(title, []).extend(href for href in links if "https" in href)
    return materials
  def extractTable(self):
    """Return the tables found in the document as a list of DataFrames.

    Returns an empty list when the parsed document is empty or when
    ``pandas.read_html`` raises ValueError.
    """
    soup = self.openHTMLFile()
    if len(soup) == 0:
      return []
    try:
      # Literal HTML strings passed to read_html are deprecated in recent
      # pandas; a file-like wrapper is accepted by old and new versions.
      return pd.read_html(StringIO(str(soup)))
    except ValueError:
      print("No tables found in HTML file")
      return []
  def mergeTextInJson(self,jsonHTML):
    """Flatten a {section: [text, ...]} mapping into one string.

    Every non-empty entry is run through ``textPreprocessing(..., keepPeriod=True)``
    and written back into ``jsonHTML`` (the argument is modified in place).
    Within a section, ". " is inserted before an entry when the previous
    entry is non-empty and does not end with a period; each non-empty section
    is closed with "." (if its last entry is non-empty and lacks one) and
    "\\n\\n". Sections with no entries contribute nothing.
    """
    cleaner = cleanText.cleanGenText()
    merged = ""
    for section in jsonHTML:
      entries = jsonHTML[section]
      if len(entries) == 0:
        continue
      for idx, piece in enumerate(entries):
        if len(piece) > 0:
          piece, _filteredWord = cleaner.textPreprocessing(piece, keepPeriod=True)
        entries[idx] = piece
        if idx >= 1:
          previous = entries[idx - 1]
          # entries of the same section are separated by a period
          if len(previous) > 0 and previous[-1] != ".":
            merged += ". "
        merged += piece
      closing = entries[len(entries) - 1]
      if len(closing) > 0 and closing[-1] != ".":
        merged += "."
      # sections are separated by a blank line
      merged += "\n\n"
    return merged
  def removeHeaders(self):
    pass
  def removeFooters(self):
    pass
  def removeReferences(self):
    pass