VyLala committed (verified)
Commit f412fc0 · 1 Parent(s): fcceb43

Update NER/html/extractHTML.py

Files changed (1)
  1. NER/html/extractHTML.py +225 -221
NER/html/extractHTML.py CHANGED
@@ -1,222 +1,226 @@
- # reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
- from bs4 import BeautifulSoup
- import requests
- from DefaultPackages import openFile, saveFile
- from NER import cleanText
- import pandas as pd
- class HTML():
- def __init__(self, htmlFile, htmlLink):
- self.htmlLink = htmlLink
- self.htmlFile = htmlFile
- # def openHTMLFile(self):
- # headers = {
- # "User-Agent": (
- # "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
- # "AppleWebKit/537.36 (KHTML, like Gecko) "
- # "Chrome/114.0.0.0 Safari/537.36"
- # ),
- # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
- # "Referer": self.htmlLink,
- # "Connection": "keep-alive"
- # }
-
- # session = requests.Session()
- # session.headers.update(headers)
-
- # if self.htmlLink != "None":
- # try:
- # r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
- # if r.status_code != 200:
- # print(f"❌ HTML GET failed: {r.status_code} — {self.htmlLink}")
- # return BeautifulSoup("", 'html.parser')
- # soup = BeautifulSoup(r.content, 'html.parser')
- # except Exception as e:
- # print(f"❌ Exception fetching HTML: {e}")
- # return BeautifulSoup("", 'html.parser')
- # else:
- # with open(self.htmlFile) as fp:
- # soup = BeautifulSoup(fp, 'html.parser')
- # return soup
- from lxml.etree import ParserError, XMLSyntaxError
-
- def openHTMLFile(self):
- not_need_domain = ['https://broadinstitute.github.io/picard/',
- 'https://software.broadinstitute.org/gatk/best-practices/',
- 'https://www.ncbi.nlm.nih.gov/genbank/',
- 'https://www.mitomap.org/']
- headers = {
- "User-Agent": (
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
- "AppleWebKit/537.36 (KHTML, like Gecko) "
- "Chrome/114.0.0.0 Safari/537.36"
- ),
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
- "Referer": self.htmlLink,
- "Connection": "keep-alive"
- }
-
- session = requests.Session()
- session.headers.update(headers)
- if self.htmlLink in not_need_domain:
- return BeautifulSoup("", 'html.parser')
- try:
- if self.htmlLink and self.htmlLink != "None":
- r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
- if r.status_code != 200 or not r.text.strip():
- print(f"❌ HTML GET failed ({r.status_code}) or empty page: {self.htmlLink}")
- return BeautifulSoup("", 'html.parser')
- soup = BeautifulSoup(r.content, 'html.parser')
- else:
- with open(self.htmlFile, encoding='utf-8') as fp:
- soup = BeautifulSoup(fp, 'html.parser')
- except (ParserError, XMLSyntaxError, OSError) as e:
- print(f"🚫 HTML parse error for {self.htmlLink}: {type(e).__name__}")
- return BeautifulSoup("", 'html.parser')
- except Exception as e:
- print(f"❌ General exception for {self.htmlLink}: {e}")
- return BeautifulSoup("", 'html.parser')
-
- return soup
-
- def getText(self):
- soup = self.openHTMLFile()
- s = soup.find_all("html")
- text = ""
- if s:
- for t in range(len(s)):
- text = s[t].get_text()
- cl = cleanText.cleanGenText()
- text = cl.removeExtraSpaceBetweenWords(text)
- return text
- def getListSection(self, scienceDirect=None):
- json = {}
- text = ""
- textJson, textHTML = "",""
- if scienceDirect == None:
- soup = self.openHTMLFile()
- # get list of section
- json = {}
- for h2Pos in range(len(soup.find_all('h2'))):
- if soup.find_all('h2')[h2Pos].text not in json:
- json[soup.find_all('h2')[h2Pos].text] = []
- if h2Pos + 1 < len(soup.find_all('h2')):
- content = soup.find_all('h2')[h2Pos].find_next("p")
- nexth2Content = soup.find_all('h2')[h2Pos+1].find_next("p")
- while content.text != nexth2Content.text:
- json[soup.find_all('h2')[h2Pos].text].append(content.text)
- content = content.find_next("p")
- else:
- content = soup.find_all('h2')[h2Pos].find_all_next("p",string=True)
- json[soup.find_all('h2')[h2Pos].text] = list(i.text for i in content)
- # format
- '''json = {'Abstract':[], 'Introduction':[], 'Methods'[],
- 'Results':[], 'Discussion':[], 'References':[],
- 'Acknowledgements':[], 'Author information':[], 'Ethics declarations':[],
- 'Additional information':[], 'Electronic supplementary material':[],
- 'Rights and permissions':[], 'About this article':[], 'Search':[], 'Navigation':[]}'''
- if scienceDirect!= None or len(json)==0:
- # Replace with your actual Elsevier API key
- api_key = "d0f25e6ae2b275e0d2b68e0e98f68d70"
- # ScienceDirect article DOI or PI (Example DOI)
- doi = self.htmlLink.split("https://doi.org/")[-1] #"10.1016/j.ajhg.2011.01.009"
- # Base URL for the Elsevier API
- base_url = "https://api.elsevier.com/content/article/doi/"
- # Set headers with API key
- headers = {
- "Accept": "application/json",
- "X-ELS-APIKey": api_key
- }
- # Make the API request
- response = requests.get(base_url + doi, headers=headers)
- # Check if the request was successful
- if response.status_code == 200:
- data = response.json()
- supp_data = data["full-text-retrieval-response"]#["coredata"]["link"]
- if "originalText" in list(supp_data.keys()):
- if type(supp_data["originalText"])==str:
- json["originalText"] = [supp_data["originalText"]]
- if type(supp_data["originalText"])==dict:
- json["originalText"] = [supp_data["originalText"][key] for key in supp_data["originalText"]]
- else:
- if type(supp_data)==dict:
- for key in supp_data:
- json[key] = [supp_data[key]]
-
- textJson = self.mergeTextInJson(json)
- textHTML = self.getText()
- if len(textHTML) > len(textJson):
- text = textHTML
- else: text = textJson
- return text #json
- def getReference(self):
- # get reference to collect more next data
- ref = []
- json = self.getListSection()
- for key in json["References"]:
- ct = cleanText.cleanGenText(key)
- cleanText, filteredWord = ct.cleanText()
- if cleanText not in ref:
- ref.append(cleanText)
- return ref
- def getSupMaterial(self):
- # check if there is material or not
- json = {}
- soup = self.openHTMLFile()
- for h2Pos in range(len(soup.find_all('h2'))):
- if "supplementary" in soup.find_all('h2')[h2Pos].text.lower() or "material" in soup.find_all('h2')[h2Pos].text.lower() or "additional" in soup.find_all('h2')[h2Pos].text.lower() or "support" in soup.find_all('h2')[h2Pos].text.lower():
- #print(soup.find_all('h2')[h2Pos].find_next("a").get("href"))
- link, output = [],[]
- if soup.find_all('h2')[h2Pos].text not in json:
- json[soup.find_all('h2')[h2Pos].text] = []
- for l in soup.find_all('h2')[h2Pos].find_all_next("a",href=True):
- link.append(l["href"])
- if h2Pos + 1 < len(soup.find_all('h2')):
- nexth2Link = soup.find_all('h2')[h2Pos+1].find_next("a",href=True)["href"]
- if nexth2Link in link:
- link = link[:link.index(nexth2Link)]
- # only take links having "https" in that
- for i in link:
- if "https" in i: output.append(i)
- json[soup.find_all('h2')[h2Pos].text].extend(output)
- return json
- def extractTable(self):
- soup = self.openHTMLFile()
- df = []
- if len(soup)>0:
- try:
- df = pd.read_html(str(soup))
- except ValueError:
- df = []
- print("No tables found in HTML file")
- return df
- def mergeTextInJson(self,jsonHTML):
- cl = cleanText.cleanGenText()
- #cl = cleanGenText()
- htmlText = ""
- for sec in jsonHTML:
- # section is "\n\n"
- if len(jsonHTML[sec]) > 0:
- for i in range(len(jsonHTML[sec])):
- # same section is just a dot.
- text = jsonHTML[sec][i]
- if len(text)>0:
- #text = cl.removeTabWhiteSpaceNewLine(text)
- #text = cl.removeExtraSpaceBetweenWords(text)
- text, filteredWord = cl.textPreprocessing(text, keepPeriod=True)
- jsonHTML[sec][i] = text
- if i-1 >= 0:
- if len(jsonHTML[sec][i-1])>0:
- if jsonHTML[sec][i-1][-1] != ".":
- htmlText += ". "
- htmlText += jsonHTML[sec][i]
- if len(jsonHTML[sec][i]) > 0:
- if jsonHTML[sec][i][-1]!=".":
- htmlText += "."
- htmlText += "\n\n"
- return htmlText
- def removeHeaders(self):
- pass
- def removeFooters(self):
- pass
- def removeReferences(self):
- pass
 
+ # reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
+ from bs4 import BeautifulSoup
+ import requests
+ from DefaultPackages import openFile, saveFile
+ from NER import cleanText
+ import pandas as pd
+ class HTML():
+ def __init__(self, htmlFile, htmlLink):
+ self.htmlLink = htmlLink
+ self.htmlFile = htmlFile
+ # def openHTMLFile(self):
+ # headers = {
+ # "User-Agent": (
+ # "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+ # "AppleWebKit/537.36 (KHTML, like Gecko) "
+ # "Chrome/114.0.0.0 Safari/537.36"
+ # ),
+ # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+ # "Referer": self.htmlLink,
+ # "Connection": "keep-alive"
+ # }
+
+ # session = requests.Session()
+ # session.headers.update(headers)
+
+ # if self.htmlLink != "None":
+ # try:
+ # r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
+ # if r.status_code != 200:
+ # print(f"❌ HTML GET failed: {r.status_code} — {self.htmlLink}")
+ # return BeautifulSoup("", 'html.parser')
+ # soup = BeautifulSoup(r.content, 'html.parser')
+ # except Exception as e:
+ # print(f"❌ Exception fetching HTML: {e}")
+ # return BeautifulSoup("", 'html.parser')
+ # else:
+ # with open(self.htmlFile) as fp:
+ # soup = BeautifulSoup(fp, 'html.parser')
+ # return soup
+ from lxml.etree import ParserError, XMLSyntaxError
+
+ def openHTMLFile(self):
+ not_need_domain = ['https://broadinstitute.github.io/picard/',
+ 'https://software.broadinstitute.org/gatk/best-practices/',
+ 'https://www.ncbi.nlm.nih.gov/genbank/',
+ 'https://www.mitomap.org/']
+ headers = {
+ "User-Agent": (
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
+ "Chrome/114.0.0.0 Safari/537.36"
+ ),
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+ "Referer": self.htmlLink,
+ "Connection": "keep-alive"
+ }
+
+ session = requests.Session()
+ session.headers.update(headers)
+ if self.htmlLink in not_need_domain:
+ return BeautifulSoup("", 'html.parser')
+ try:
+ if self.htmlLink and self.htmlLink != "None":
+ r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
+ if r.status_code != 200 or not r.text.strip():
+ print(f"❌ HTML GET failed ({r.status_code}) or empty page: {self.htmlLink}")
+ return BeautifulSoup("", 'html.parser')
+ soup = BeautifulSoup(r.content, 'html.parser')
+ else:
+ with open(self.htmlFile, encoding='utf-8') as fp:
+ soup = BeautifulSoup(fp, 'html.parser')
+ except (ParserError, XMLSyntaxError, OSError) as e:
+ print(f"🚫 HTML parse error for {self.htmlLink}: {type(e).__name__}")
+ return BeautifulSoup("", 'html.parser')
+ except Exception as e:
+ print(f"❌ General exception for {self.htmlLink}: {e}")
+ return BeautifulSoup("", 'html.parser')
+
+ return soup
+
+ def getText(self):
+ soup = self.openHTMLFile()
+ s = soup.find_all("html")
+ text = ""
+ if s:
+ for t in range(len(s)):
+ text = s[t].get_text()
+ cl = cleanText.cleanGenText()
+ text = cl.removeExtraSpaceBetweenWords(text)
+ return text
+ def getListSection(self, scienceDirect=None):
+ try:
+ json = {}
+ text = ""
+ textJson, textHTML = "",""
+ if scienceDirect == None:
+ soup = self.openHTMLFile()
+ # get list of section
+ json = {}
+ for h2Pos in range(len(soup.find_all('h2'))):
+ if soup.find_all('h2')[h2Pos].text not in json:
+ json[soup.find_all('h2')[h2Pos].text] = []
+ if h2Pos + 1 < len(soup.find_all('h2')):
+ content = soup.find_all('h2')[h2Pos].find_next("p")
+ nexth2Content = soup.find_all('h2')[h2Pos+1].find_next("p")
+ while content.text != nexth2Content.text:
+ json[soup.find_all('h2')[h2Pos].text].append(content.text)
+ content = content.find_next("p")
+ else:
+ content = soup.find_all('h2')[h2Pos].find_all_next("p",string=True)
+ json[soup.find_all('h2')[h2Pos].text] = list(i.text for i in content)
+ # format
+ '''json = {'Abstract':[], 'Introduction':[], 'Methods'[],
+ 'Results':[], 'Discussion':[], 'References':[],
+ 'Acknowledgements':[], 'Author information':[], 'Ethics declarations':[],
+ 'Additional information':[], 'Electronic supplementary material':[],
+ 'Rights and permissions':[], 'About this article':[], 'Search':[], 'Navigation':[]}'''
+ if scienceDirect!= None or len(json)==0:
+ # Replace with your actual Elsevier API key
+ api_key = "d0f25e6ae2b275e0d2b68e0e98f68d70"
+ # ScienceDirect article DOI or PI (Example DOI)
+ doi = self.htmlLink.split("https://doi.org/")[-1] #"10.1016/j.ajhg.2011.01.009"
+ # Base URL for the Elsevier API
+ base_url = "https://api.elsevier.com/content/article/doi/"
+ # Set headers with API key
+ headers = {
+ "Accept": "application/json",
+ "X-ELS-APIKey": api_key
+ }
+ # Make the API request
+ response = requests.get(base_url + doi, headers=headers)
+ # Check if the request was successful
+ if response.status_code == 200:
+ data = response.json()
+ supp_data = data["full-text-retrieval-response"]#["coredata"]["link"]
+ if "originalText" in list(supp_data.keys()):
+ if type(supp_data["originalText"])==str:
+ json["originalText"] = [supp_data["originalText"]]
+ if type(supp_data["originalText"])==dict:
+ json["originalText"] = [supp_data["originalText"][key] for key in supp_data["originalText"]]
+ else:
+ if type(supp_data)==dict:
+ for key in supp_data:
+ json[key] = [supp_data[key]]
+
+ textJson = self.mergeTextInJson(json)
+ textHTML = self.getText()
+ if len(textHTML) > len(textJson):
+ text = textHTML
+ else: text = textJson
+ return text #json
+ except:
+ print("failed all")
+ return ""
+ def getReference(self):
+ # get reference to collect more next data
+ ref = []
+ json = self.getListSection()
+ for key in json["References"]:
+ ct = cleanText.cleanGenText(key)
+ cleanText, filteredWord = ct.cleanText()
+ if cleanText not in ref:
+ ref.append(cleanText)
+ return ref
+ def getSupMaterial(self):
+ # check if there is material or not
+ json = {}
+ soup = self.openHTMLFile()
+ for h2Pos in range(len(soup.find_all('h2'))):
+ if "supplementary" in soup.find_all('h2')[h2Pos].text.lower() or "material" in soup.find_all('h2')[h2Pos].text.lower() or "additional" in soup.find_all('h2')[h2Pos].text.lower() or "support" in soup.find_all('h2')[h2Pos].text.lower():
+ #print(soup.find_all('h2')[h2Pos].find_next("a").get("href"))
+ link, output = [],[]
+ if soup.find_all('h2')[h2Pos].text not in json:
+ json[soup.find_all('h2')[h2Pos].text] = []
+ for l in soup.find_all('h2')[h2Pos].find_all_next("a",href=True):
+ link.append(l["href"])
+ if h2Pos + 1 < len(soup.find_all('h2')):
+ nexth2Link = soup.find_all('h2')[h2Pos+1].find_next("a",href=True)["href"]
+ if nexth2Link in link:
+ link = link[:link.index(nexth2Link)]
+ # only take links having "https" in that
+ for i in link:
+ if "https" in i: output.append(i)
+ json[soup.find_all('h2')[h2Pos].text].extend(output)
+ return json
+ def extractTable(self):
+ soup = self.openHTMLFile()
+ df = []
+ if len(soup)>0:
+ try:
+ df = pd.read_html(str(soup))
+ except ValueError:
+ df = []
+ print("No tables found in HTML file")
+ return df
+ def mergeTextInJson(self,jsonHTML):
+ cl = cleanText.cleanGenText()
+ #cl = cleanGenText()
+ htmlText = ""
+ for sec in jsonHTML:
+ # section is "\n\n"
+ if len(jsonHTML[sec]) > 0:
+ for i in range(len(jsonHTML[sec])):
+ # same section is just a dot.
+ text = jsonHTML[sec][i]
+ if len(text)>0:
+ #text = cl.removeTabWhiteSpaceNewLine(text)
+ #text = cl.removeExtraSpaceBetweenWords(text)
+ text, filteredWord = cl.textPreprocessing(text, keepPeriod=True)
+ jsonHTML[sec][i] = text
+ if i-1 >= 0:
+ if len(jsonHTML[sec][i-1])>0:
+ if jsonHTML[sec][i-1][-1] != ".":
+ htmlText += ". "
+ htmlText += jsonHTML[sec][i]
+ if len(jsonHTML[sec][i]) > 0:
+ if jsonHTML[sec][i][-1]!=".":
+ htmlText += "."
+ htmlText += "\n\n"
+ return htmlText
+ def removeHeaders(self):
+ pass
+ def removeFooters(self):
+ pass
+ def removeReferences(self):
+ pass
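
A minimal usage sketch of the class in this file, assuming the repository root is on PYTHONPATH so that NER.html.extractHTML, DefaultPackages, and NER.cleanText are importable; the local file name and URL are placeholders (the DOI is the example already cited in the code comments). With this commit, getListSection catches any exception and returns an empty string instead of raising:

# Sketch only: import path and inputs are assumptions, not part of this commit.
from NER.html.extractHTML import HTML

# htmlFile is read only when htmlLink is falsy or the literal string "None";
# otherwise openHTMLFile() fetches htmlLink over HTTP.
page = HTML(htmlFile="local_copy.html",
            htmlLink="https://doi.org/10.1016/j.ajhg.2011.01.009")

text = page.getListSection()   # after this commit: "" on any failure, instead of an exception
tables = page.extractTable()   # list of DataFrames from pandas.read_html, [] if no tables found
supp = page.getSupMaterial()   # {h2 heading: [https links]} for supplementary-material sections

print(len(text), len(tables), list(supp.keys()))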