Spaces: Running
Upload 19 files
Browse files
- DefaultPackages/__init__.py +4 -0
- DefaultPackages/__pycache__/__init__.cpython-310.pyc +0 -0
- DefaultPackages/__pycache__/__init__.cpython-311.pyc +0 -0
- DefaultPackages/__pycache__/openFile.cpython-310.pyc +0 -0
- DefaultPackages/__pycache__/openFile.cpython-311.pyc +0 -0
- DefaultPackages/__pycache__/saveFile.cpython-310.pyc +0 -0
- DefaultPackages/__pycache__/saveFile.cpython-311.pyc +0 -0
- DefaultPackages/openFile.py +12 -0
- DefaultPackages/saveFile.py +11 -0
- NER/PDF/pdf.py +142 -0
- NER/WordDoc/wordDoc.py +149 -0
- NER/cleanText.py +116 -0
- NER/html/extractHTML.py +158 -0
- NER/word2Vec/word2vec.py +364 -0
- app.py +132 -0
- data/haplogroup_regions_extended.csv +51 -0
- mtdna_classifier.py +242 -0
- requirements.txt +19 -0
- setup.sh +8 -0
DefaultPackages/__init__.py
ADDED
@@ -0,0 +1,4 @@
__all__ = [
    'openFile',
    'saveFile',
]
DefaultPackages/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (193 Bytes). View file
DefaultPackages/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (212 Bytes). View file
DefaultPackages/__pycache__/openFile.cpython-310.pyc
ADDED
Binary file (566 Bytes). View file
DefaultPackages/__pycache__/openFile.cpython-311.pyc
ADDED
Binary file (989 Bytes). View file
DefaultPackages/__pycache__/saveFile.cpython-310.pyc
ADDED
Binary file (590 Bytes). View file
DefaultPackages/__pycache__/saveFile.cpython-311.pyc
ADDED
Binary file (1.02 kB). View file
DefaultPackages/openFile.py
ADDED
@@ -0,0 +1,12 @@
def openFile(file):
    with open(file) as f:
        openFile = f.read()
    return openFile

def openJsonFile(file):
    import json
    # Opening JSON file
    with open(file, 'r') as openfile:
        # Reading from json file
        json_object = json.load(openfile)
    return json_object
DefaultPackages/saveFile.py
ADDED
@@ -0,0 +1,11 @@
import json
def saveFile(name, content):
    # Overwrite the saved file with the new content
    with open(name, "w") as external_file:
        print(content, file=external_file)
def saveJsonFile(name, content):
    saveFile(name, json.dumps(content))
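A quick usage sketch of these two helpers; the JSON file name below is a placeholder, not part of the upload:

# Round-trip a small dict through saveJsonFile/openJsonFile (hypothetical path)
from DefaultPackages import openFile, saveFile

saveFile.saveJsonFile("example_corpus.json", {"Paragraph 0": [["tom", "cat"]]})
data = openFile.openJsonFile("example_corpus.json")   # -> {'Paragraph 0': [['tom', 'cat']]}
raw = openFile.openFile("example_corpus.json")        # plain-text read of the same file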
NER/PDF/pdf.py
ADDED
@@ -0,0 +1,142 @@
#!pip install pdfreader
import pdfreader
from pdfreader import PDFDocument, SimplePDFViewer
#!pip install bs4
from bs4 import BeautifulSoup
import requests
from NER import cleanText
#!pip install tabula-py

import tabula
class PDF(): # using pdfreader and tabula
    def __init__(self, pdf, saveFolder, doi=None):
        self.pdf = pdf
        self.doi = doi
        self.saveFolder = saveFolder
    def openPDFFile(self):
        if "https" in self.pdf:
            name = self.pdf.split("/")[-1]
            name = self.downloadPDF(self.saveFolder)
            if name != "no pdfLink to download":
                fileToOpen = self.saveFolder + "/" + name
            else: fileToOpen = self.pdf
        else: fileToOpen = self.pdf
        return open(fileToOpen, "rb")
    def downloadPDF(self, saveFolder):
        pdfLink = ''
        if ".pdf" not in self.pdf and "https" not in self.pdf: # the download link is a general URL, not a direct pdf link
            r = requests.get(self.pdf)
            soup = BeautifulSoup(r.content, 'html.parser')
            links = soup.find_all("a")
            for link in links:
                if ".pdf" in link.get("href"):
                    if self.doi in link.get("href"):
                        pdfLink = link.get("href")
                        break
        else:
            pdfLink = self.pdf
        if pdfLink != '':
            response = requests.get(pdfLink)
            name = pdfLink.split("/")[-1]
            pdf = open(saveFolder+"/"+name, 'wb')
            pdf.write(response.content)
            pdf.close()
            print("pdf downloaded")
            return name
        else:
            return "no pdfLink to download"
    def extractText(self):
        jsonPage = {}
        pdf = self.openPDFFile()
        doc = PDFDocument(pdf)
        viewer = SimplePDFViewer(pdf)
        all_pages = [p for p in doc.pages()]
        cl = cleanText.cleanGenText()
        for page in range(1, len(all_pages)+1): # viewer.navigate is 1-based, so include the last page
            viewer.navigate(page)
            viewer.render()
            if str(page) not in jsonPage:
                jsonPage[str(page)] = {}
            # text
            text = "".join(viewer.canvas.strings)
            clean, filteredWord = cl.textPreprocessing(text) #cleanText.cleanGenText(text).cleanText()
            # save the text of filtered words which removes "a", "the", "an", "is", etc.
            jsonPage[str(page)]["normalText"] = [text]
            jsonPage[str(page)]["cleanText"] = [' '.join(filteredWord)]
            # image
            image = viewer.canvas.images
            jsonPage[str(page)]["image"] = [image]
            # form
            form = viewer.canvas.forms
            jsonPage[str(page)]["form"] = [form]
            # content based on the PDF (Adobe) content stream
            content = viewer.canvas.text_content
            jsonPage[str(page)]["content"] = [content]
            # inline_image: inline images are aligned with the text,
            # and are usually content images like photos, charts, or graphs.
            inline_image = viewer.canvas.inline_images
            jsonPage[str(page)]["inline_image"] = [inline_image]
        pdf.close()
        '''Output Format:
        jsonPage[str(page)]["normalText"]
        jsonPage[str(page)]["cleanText"]
        jsonPage[str(page)]["image"]
        jsonPage[str(page)]["form"]
        jsonPage[str(page)]["content"]'''
        return jsonPage
    def extractTable(self, pages, saveFile=None, outputFormat=None):
        '''pages (str, int, iterable of int, optional) -
        Optional value specifying pages to extract from. It allows str, int, or an iterable of int. Default: 1
        Examples: '1-2,3', 'all', [1,2]'''
        df = []
        if "https" in self.pdf:
            name = self.pdf.split("/")[-1]
            name = self.downloadPDF(self.saveFolder)
            if name != "no pdfLink to download":
                fileToOpen = self.saveFolder + "/" + name
            else: fileToOpen = self.pdf
        else: fileToOpen = self.pdf
        try:
            df = tabula.read_pdf(fileToOpen, pages=pages)
            # saveFile: "/content/drive/MyDrive/CollectData/NER/PDF/tableS1.csv"
            # outputFormat: "csv"
            #tabula.convert_into(self.pdf, saveFile, output_format=outputFormat, pages=pages)
        except: # e.g. ValueError when no tables are present
            df = []
            print("No tables found in PDF file")
        return df
    def mergeTextinJson(self, jsonPDF):
        #cl = cleanGenText()
        cl = cleanText.cleanGenText()
        pdfText = ""
        for page in jsonPDF:
            # pages are separated by "\n\n"
            if len(jsonPDF[page]["normalText"]) > 0:
                for i in range(len(jsonPDF[page]["normalText"])):
                    text = jsonPDF[page]["normalText"][i]
                    if len(text) > 0:
                        text = cl.removeTabWhiteSpaceNewLine(text)
                        text = cl.removeExtraSpaceBetweenWords(text)
                        jsonPDF[page]["normalText"][i] = text
                    # chunks on the same page are separated by just a dot.
                    if i-1 > 0:
                        if jsonPDF[page]["normalText"][i-1][-1] != ".":
                            pdfText += ". "
                    pdfText += jsonPDF[page]["normalText"][i]
                    if len(jsonPDF[page]["normalText"][i]) > 0:
                        if jsonPDF[page]["normalText"][i][-1] != ".":
                            pdfText += "."
            pdfText += "\n\n"
        return pdfText
    def getReference(self):
        pass
    def getSupMaterial(self):
        pass
    def removeHeaders(self):
        pass
    def removeFooters(self):
        pass
    def removeReference(self):
        pass
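A minimal usage sketch of the PDF class above, assuming a direct PDF link, a writable save folder, and a DOI, all placeholders (tabula also needs a local Java runtime):

# Hypothetical inputs: a supplementary-material PDF link, a temp folder, and the paper's DOI
from NER.PDF import pdf

p = pdf.PDF("https://example.org/supplement/tableS1.pdf", "/tmp/pdfs", doi="10.1000/xyz123")
pages = p.extractText()                  # {"1": {"normalText": [...], "cleanText": [...], ...}, ...}
tables = p.extractTable(pages="all")     # list of pandas DataFrames via tabula
print(p.mergeTextinJson(pages)[:300])    # merged, lightly cleaned running text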
NER/WordDoc/wordDoc.py
ADDED
@@ -0,0 +1,149 @@
#! pip install spire.doc
#! pip install Spire.XLS
import pandas as pd
from spire.doc import *
from spire.doc.common import *
from spire.xls import *
from spire.xls.common import *
from NER import cleanText
import requests
class wordDoc(): # using Spire.Doc / Spire.XLS
    def __init__(self, wordDoc, saveFolder):
        self.wordDoc = wordDoc
        self.saveFolder = saveFolder
    def openFile(self):
        document = Document()
        return document.LoadFromFile(self.wordDoc)
    def extractTextByPage(self):
        # reference: https://medium.com/@alice.yang_10652/extract-text-from-word-documents-with-python-a-comprehensive-guide-95a67e23c35c#:~:text=containing%20specific%20content.-,Spire.,each%20paragraph%20using%20the%20Paragraph.
        json = {}
        #doc = self.openFile()
        # Load the document; if the path is a URL, download it first and load the local copy
        try:
            doc = Document()
            doc.LoadFromFile(self.wordDoc)
        except:
            response = requests.get(self.wordDoc)
            name = self.wordDoc.split("/")[-1]
            with open(self.saveFolder+"/" + name, "wb") as temp_file: # Create a temporary file to store the downloaded data
                temp_file.write(response.content)
            doc = Document()
            doc.LoadFromFile(self.saveFolder+"/" + name)
        text = doc.GetText()
        return text
    def extractTableAsText(self):
        getDoc = ''
        try:
            # reference:
            # https://www.e-iceblue.com/Tutorials/Python/Spire.Doc-for-Python/Program-Guide/Table/Python-Extract-Tables-from-Word-Documents.html?gad_source=1&gclid=Cj0KCQiA6Ou5BhCrARIsAPoTxrCj3XSsQsDziwqE8BmVlOs12KneOlvtKnn5YsDruxK_2T_UUhjw6NYaAtJhEALw_wcB
            doc = Document()
            doc.LoadFromFile(self.wordDoc)
            getDoc = "have document"
        except:
            response = requests.get(self.wordDoc)
            name = self.wordDoc.split("/")[-1]
            with open(self.saveFolder+"/" + name, "wb") as temp_file: # Create a temporary file to store the downloaded data
                temp_file.write(response.content)
            doc = Document()
            doc.LoadFromFile(self.saveFolder+"/" + name)
            getDoc = "have document"
        json = {}
        if len(getDoc) > 0:
            # Loop through the sections
            for s in range(doc.Sections.Count):
                # Get a section
                section = doc.Sections.get_Item(s)
                # Get the tables in the section
                json["Section" + str(s)] = {}
                tables = section.Tables
                # Loop through the tables
                for i in range(0, tables.Count):
                    # Get a table
                    table = tables.get_Item(i)
                    # Initialize a string to store the table data
                    tableData = ''
                    # Loop through the rows of the table
                    for j in range(0, table.Rows.Count):
                        # Loop through the cells of the row
                        for k in range(0, table.Rows.get_Item(j).Cells.Count):
                            # Get a cell
                            cell = table.Rows.get_Item(j).Cells.get_Item(k)
                            # Get the text in the cell
                            cellText = ''
                            for para in range(cell.Paragraphs.Count):
                                paragraphText = cell.Paragraphs.get_Item(para).Text
                                cellText += (paragraphText + ' ')
                            # Add the text to the string
                            tableData += cellText
                            if k < table.Rows.get_Item(j).Cells.Count - 1:
                                tableData += '\t'
                        # Add a new line
                        tableData += '\n'
                    json["Section" + str(s)]["Table"+str(i)] = tableData
        return json
    def extractTableAsExcel(self):
        getDoc = ''
        try:
            # reference:
            # https://www.e-iceblue.com/Tutorials/Python/Spire.Doc-for-Python/Program-Guide/Table/Python-Extract-Tables-from-Word-Documents.html?gad_source=1&gclid=Cj0KCQiA6Ou5BhCrARIsAPoTxrCj3XSsQsDziwqE8BmVlOs12KneOlvtKnn5YsDruxK_2T_UUhjw6NYaAtJhEALw_wcB
            doc = Document()
            doc.LoadFromFile(self.wordDoc)
            getDoc = "have document"
        except:
            response = requests.get(self.wordDoc)
            name = self.wordDoc.split("/")[-1]
            with open(self.saveFolder+"/" + name, "wb") as temp_file: # Create a temporary file to store the downloaded data
                temp_file.write(response.content)
            doc = Document()
            doc.LoadFromFile(self.saveFolder+"/" + name)
            getDoc = "have document"
        if len(getDoc) > 0:
            try:
                # Create an instance of Workbook
                wb = Workbook()
                wb.Worksheets.Clear()

                # Loop through sections in the document
                for i in range(doc.Sections.Count):
                    # Get a section
                    section = doc.Sections.get_Item(i)
                    # Loop through tables in the section
                    for j in range(section.Tables.Count):
                        # Get a table
                        table = section.Tables.get_Item(j)
                        # Create a worksheet
                        ws = wb.Worksheets.Add(f'Table_{i+1}_{j+1}')
                        # Write the table to the worksheet
                        for row in range(table.Rows.Count):
                            # Get a row
                            tableRow = table.Rows.get_Item(row)
                            # Loop through cells in the row
                            for cell in range(tableRow.Cells.Count):
                                # Get a cell
                                tableCell = tableRow.Cells.get_Item(cell)
                                # Get the text in the cell
                                cellText = ''
                                for paragraph in range(tableCell.Paragraphs.Count):
                                    paragraph = tableCell.Paragraphs.get_Item(paragraph)
                                    cellText = cellText + (paragraph.Text + ' ')
                                # Write the cell text to the worksheet
                                ws.SetCellValue(row + 1, cell + 1, cellText)

                # Save the workbook
                name = self.wordDoc.split("/")[-1]
                if self.saveFolder == None:
                    wb.SaveToFile('/content/drive/MyDrive/CollectData/NER/excel/TestExamples/output/'+name+".xlsx", FileFormat.Version2016)
                    nameFile = '/content/drive/MyDrive/CollectData/NER/excel/TestExamples/output/'+name+".xlsx"
                else:
                    wb.SaveToFile(self.saveFolder+'/'+name+".xlsx", FileFormat.Version2016)
                    nameFile = self.saveFolder+'/'+name + ".xlsx"
                doc.Close()
                wb.Dispose()
                return nameFile
            except: return "No table found on word doc"
        else:
            return "No table found on word doc"
    def getReference(self):
        pass
    def getSupMaterial(self):
        pass
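A minimal usage sketch of the wordDoc class above; the .docx URL and save folder are placeholders:

# Hypothetical .docx URL; remote files are downloaded into saveFolder as in the class above
from NER.WordDoc import wordDoc

d = wordDoc.wordDoc("https://example.org/supplement/tableS2.docx", "/tmp/docs")
full_text = d.extractTextByPage()        # whole-document text via Document.GetText()
tables = d.extractTableAsText()          # {"Section0": {"Table0": "cell\tcell\n..."}}
xlsx_path = d.extractTableAsExcel()      # path of the generated .xlsx, or a "No table found..." message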
NER/cleanText.py
ADDED
@@ -0,0 +1,116 @@
# reference:
# https://ayselaydin.medium.com/1-text-preprocessing-techniques-for-nlp-37544483c007
import re
import nltk
#nltk.download('stopwords')
#nltk.download()
from DefaultPackages import openFile, saveFile
import json
from nltk.corpus import stopwords
from nltk.corpus.reader.api import wordpunct_tokenize
from nltk.tokenize import word_tokenize
#from wordsegment import load, segment
from wordsegment import load, segment
class cleanGenText():
    def __init__(self):
        #self.text = text
        load()
        pass
    def removePunct(self, text, KeepPeriod=False):
        punctuation = r'[^\w\s]'
        if KeepPeriod==True:
            punctuation = r'[^\w\s\.]'
        return re.sub(punctuation, '', text)
    def removeURL(self, text):
        url_pattern = re.compile(r'https?://\S+|www\.\S+')
        return url_pattern.sub(r'', text)
    def removeHTMLTag(self, text):
        html_tags_pattern = r'<.*?>'
        return re.sub(html_tags_pattern, '', text)
    def removeTabWhiteSpaceNewLine(self, text):
        # remove \n or \t and unnecessary white space
        cleanText = text.replace("\n\n", "")
        cleanText = cleanText.replace("\n", "")  # chain from cleanText so the first replacement is kept
        cleanText = cleanText.replace("\t", "")
        cleanText = cleanText.strip()
        return cleanText
    def removeExtraSpaceBetweenWords(self, text):
        return re.sub(r'\s+', ' ', text).strip()
    def removeStopWords(self, text):
        #extraUnwantedWords = ["resource","groups","https","table","online","figure","frequency","aslo","fig","shows","respectively"]
        filteredWord = []
        stopWords = set(list(set(stopwords.words('english'))))# + extraUnwantedWords)
        textWords = word_tokenize(text)
        for word in textWords:
            if word.lower() not in stopWords:
                filteredWord.append(word) # and w.isalpha()==True]
        return filteredWord
    def removeLowercaseBetweenUppercase(self, segment):
        # segment such as "Myanmar (formerly Burma)"
        # but not change anything for "Viet Nam"
        # for special cases:
        # the capital letter:
        # When there is a lowercase word between:
        # e.g: "Myanmar (formerly Burma)" can be "Myanmar", "Burma" instead of "myanmar formerly burma"
        # When there is no lowercase word or uppercase words in a row:
        # e.g: "Viet Nam" can be "Viet Nam" or "viet nam", instead of "Viet", "Nam"
        outputUp = []
        segment = self.removeTabWhiteSpaceNewLine(segment)
        segments = segment.split(" ")
        for w in range(len(segments)):
            word = segments[w]
            cleanWord = self.removePunct(word)
            cleanWord = self.removeTabWhiteSpaceNewLine(cleanWord)
            prevWord = ""
            if w > 0:
                prevWord = segments[w-1]
                cleanPreWord = self.removePunct(prevWord)
                cleanPreWord = self.removeTabWhiteSpaceNewLine(cleanPreWord)
            if len(cleanWord) > 0 and cleanWord[0].isupper() == True: # check isupper of first letter of capital word
                if len(prevWord) > 0 and prevWord[0].isupper() == True:
                    outputUp[-1] += " " + cleanWord
                else:
                    outputUp.append(cleanWord)
        return outputUp
    def textPreprocessing(self, text, keepPeriod=False):
        # lowercase
        #lowerText = self.text.lower()
        # remove punctuation & special characters
        cleanText = self.removePunct(text, KeepPeriod=keepPeriod)
        # removal of URLs in text
        cleanText = self.removeURL(cleanText)
        # removal of HTML Tags
        cleanText = self.removeHTMLTag(cleanText)
        # remove \n or \t and unnecessary white space
        cleanText = self.removeTabWhiteSpaceNewLine(cleanText)
        # stop-words removal
        filteredWord = self.removeStopWords(cleanText)
        # a sentence or the capital word behind a period "."
        return cleanText, filteredWord
    #generateNewChar = textPreprocessing("/content/drive/MyDrive/CollectData/NER/CountriesNameNCBI.json")
    #saveFile.saveFile("/content/drive/MyDrive/CollectData/NER/NewCharCountriesNameNCBI.json", json.dumps(generateNewChar))
    def splitStickWords(self, word):
        #output = []
        split_words = segment(word)
        '''for w in split_words:
            pos = word.lower().find(w)
            if word[pos].isupper() == True:
                output.append(w[0].upper() + w[1:])
            else:
                output.append(w)
            if pos >= 0:
                if pos+len(w) < len(word):
                    if word[pos+len(w)] == ".":
                        output[-1] = output[-1] + "." '''
        return " ".join(split_words)
    def removeDOI(self, word, doiLink=None):
        # if they have the word DOI in that: ex: 1368598DOI after general clean
        if "DOI" in word:
            word = word.replace(word, "")
        # if they have the link DOI in that: ex: 10.1007s004390161742yORIGINAL, but we still split the word
        if doiLink != None:
            w = self.splitStickWords(word)
            cleanDOI = self.removePunct(doiLink)
            if cleanDOI in w:
                word = w.replace(cleanDOI, "")
        return word
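A small sketch of the main cleaning entry point above; it assumes NLTK's stopwords and punkt data have already been downloaded:

# Clean a sentence and inspect the two return values of textPreprocessing
from NER import cleanText

cl = cleanText.cleanGenText()
clean, filtered = cl.textPreprocessing("Samples were collected in Viet Nam (see https://example.org).")
print(clean)      # punctuation, URL, and HTML tags stripped
print(filtered)   # tokens with English stopwords removed
print(cl.splitStickWords("southeastasia"))   # splits concatenated words via wordsegment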
NER/html/extractHTML.py
ADDED
@@ -0,0 +1,158 @@
#!pip install bs4
# reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
from bs4 import BeautifulSoup
import requests
from DefaultPackages import openFile, saveFile
from NER import cleanText
import pandas as pd
class HTML():
    def __init__(self, htmlFile, htmlLink):
        self.htmlLink = htmlLink
        self.htmlFile = htmlFile
    def openHTMLFile(self):
        if self.htmlLink != "None":
            r = requests.get(self.htmlLink)
            soup = BeautifulSoup(r.content, 'html.parser')
        else:
            with open(self.htmlFile) as fp:
                soup = BeautifulSoup(fp, 'html.parser')
        return soup
    def getText(self):
        soup = self.openHTMLFile()
        s = soup.find_all("html")
        text = ""
        for t in range(len(s)):
            text = s[t].get_text()
            cl = cleanText.cleanGenText()
            text = cl.removeExtraSpaceBetweenWords(text)
        return text
    def getListSection(self, scienceDirect=None):
        json = {}
        text = ""
        textJson, textHTML = "", ""
        if scienceDirect == None:
            soup = self.openHTMLFile()
            # get list of sections
            json = {}
            for h2Pos in range(len(soup.find_all('h2'))):
                if soup.find_all('h2')[h2Pos].text not in json:
                    json[soup.find_all('h2')[h2Pos].text] = []
                if h2Pos + 1 < len(soup.find_all('h2')):
                    content = soup.find_all('h2')[h2Pos].find_next("p")
                    nexth2Content = soup.find_all('h2')[h2Pos+1].find_next("p")
                    while content.text != nexth2Content.text:
                        json[soup.find_all('h2')[h2Pos].text].append(content.text)
                        content = content.find_next("p")
                else:
                    content = soup.find_all('h2')[h2Pos].find_all_next("p", string=True)
                    json[soup.find_all('h2')[h2Pos].text] = list(i.text for i in content)
            # format
            '''json = {'Abstract':[], 'Introduction':[], 'Methods':[],
            'Results':[], 'Discussion':[], 'References':[],
            'Acknowledgements':[], 'Author information':[], 'Ethics declarations':[],
            'Additional information':[], 'Electronic supplementary material':[],
            'Rights and permissions':[], 'About this article':[], 'Search':[], 'Navigation':[]}'''
        if scienceDirect != None or len(json) == 0:
            # Replace with your actual Elsevier API key
            api_key = "d0f25e6ae2b275e0d2b68e0e98f68d70"
            # ScienceDirect article DOI or PII (Example DOI)
            doi = self.htmlLink.split("https://doi.org/")[-1] #"10.1016/j.ajhg.2011.01.009"
            # Base URL for the Elsevier API
            base_url = "https://api.elsevier.com/content/article/doi/"
            # Set headers with API key
            headers = {
                "Accept": "application/json",
                "X-ELS-APIKey": api_key
            }
            # Make the API request
            response = requests.get(base_url + doi, headers=headers)
            # Check if the request was successful
            if response.status_code == 200:
                data = response.json()
                supp_data = data["full-text-retrieval-response"]#["coredata"]["link"]
                if "originalText" in list(supp_data.keys()):
                    if type(supp_data["originalText"])==str:
                        json["originalText"] = [supp_data["originalText"]]
                    if type(supp_data["originalText"])==dict:
                        json["originalText"] = [supp_data["originalText"][key] for key in supp_data["originalText"]]
                else:
                    if type(supp_data)==dict:
                        for key in supp_data:
                            json[key] = [supp_data[key]]

        textJson = self.mergeTextInJson(json)
        textHTML = self.getText()
        if len(textHTML) > len(textJson):
            text = textHTML
        else: text = textJson
        return text #json
    def getReference(self):
        # get references to collect more data from next
        ref = []
        json = self.getListSection()
        for key in json["References"]:
            ct = cleanText.cleanGenText()
            cleanedRef, filteredWord = ct.textPreprocessing(key) # was ct.cleanText(), which does not exist
            if cleanedRef not in ref:
                ref.append(cleanedRef)
        return ref
    def getSupMaterial(self):
        # check if there is supplementary material or not
        json = {}
        soup = self.openHTMLFile()
        for h2Pos in range(len(soup.find_all('h2'))):
            if "supplementary" in soup.find_all('h2')[h2Pos].text.lower() or "material" in soup.find_all('h2')[h2Pos].text.lower() or "additional" in soup.find_all('h2')[h2Pos].text.lower() or "support" in soup.find_all('h2')[h2Pos].text.lower():
                #print(soup.find_all('h2')[h2Pos].find_next("a").get("href"))
                link, output = [], []
                if soup.find_all('h2')[h2Pos].text not in json:
                    json[soup.find_all('h2')[h2Pos].text] = []
                for l in soup.find_all('h2')[h2Pos].find_all_next("a", href=True):
                    link.append(l["href"])
                if h2Pos + 1 < len(soup.find_all('h2')):
                    nexth2Link = soup.find_all('h2')[h2Pos+1].find_next("a", href=True)["href"]
                    if nexth2Link in link:
                        link = link[:link.index(nexth2Link)]
                # only take links having "https" in them
                for i in link:
                    if "https" in i: output.append(i)
                json[soup.find_all('h2')[h2Pos].text].extend(output)
        return json
    def extractTable(self):
        soup = self.openHTMLFile()
        df = []
        try:
            df = pd.read_html(str(soup))
        except ValueError:
            df = []
            print("No tables found in HTML file")
        return df
    def mergeTextInJson(self, jsonHTML):
        cl = cleanText.cleanGenText() # was cleanGenText(), which is not imported under that name
        htmlText = ""
        for sec in jsonHTML:
            # sections are separated by "\n\n"
            if len(jsonHTML[sec]) > 0:
                for i in range(len(jsonHTML[sec])):
                    # entries in the same section are separated by just a dot.
                    text = jsonHTML[sec][i]
                    if len(text) > 0:
                        #text = cl.removeTabWhiteSpaceNewLine(text)
                        #text = cl.removeExtraSpaceBetweenWords(text)
                        text, filteredWord = cl.textPreprocessing(text, keepPeriod=True)
                        jsonHTML[sec][i] = text
                    if i-1 >= 0:
                        if len(jsonHTML[sec][i-1]) > 0:
                            if jsonHTML[sec][i-1][-1] != ".":
                                htmlText += ". "
                    htmlText += jsonHTML[sec][i]
                    if len(jsonHTML[sec][i]) > 0:
                        if jsonHTML[sec][i][-1] != ".":
                            htmlText += "."
            htmlText += "\n\n"
        return htmlText
    def removeHeaders(self):
        pass
    def removeFooters(self):
        pass
    def removeReferences(self):
        pass
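A brief usage sketch of the HTML class above; the DOI link is a placeholder, and passing the string "None" as the link switches to a local HTML file:

# Hypothetical article link; network access is required for the remote path
from NER.html import extractHTML

html = extractHTML.HTML("", "https://doi.org/10.1000/xyz123")
text = html.getListSection()     # section text from <h2>/<p> pairs, or the Elsevier API fallback
sup = html.getSupMaterial()      # {"Supplementary information": ["https://...", ...]}
tables = html.extractTable()     # list of DataFrames from pandas.read_html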
NER/word2Vec/word2vec.py
ADDED
@@ -0,0 +1,364 @@
'''WORD TO VECTOR'''
import pandas as pd
import json
import gensim
import spacy
from DefaultPackages import openFile, saveFile
from NER import cleanText
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import common_texts
from gensim.models.word2vec import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
import sys
import subprocess
# can try multiprocessing to run quicker
import multiprocessing
import copy
sys.setrecursionlimit(1000)
# create folder word2Vec
#! mkdir /content/drive/MyDrive/CollectData/NER/word2Vec
# create word2vec model
#model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/CollectData/NER/word2Vec', binary=True)
'''Some notes for this model
sometimes when we do the corpus, there are some adverbs which are unnecessary but might be seen as
a similar word to the word we are finding, so can we try to preprocess text so that
we make the corpus more effective and only contains the important words. Then when we
train the model, the important words will be seen as important. Or
when we already have the similar list of words, we can remove the words in there
that are stopwords/unnecessary words.'''
### For more complex analysis, consider using sentence embedding models like "Doc2Vec" to represent the meaning of entire sentences instead of just individual words
class word2Vec():
    def __init__(self, nameFile=None, modelName=None):
        self.nameFile = nameFile
        self.modelName = modelName
    def spacy_similarity(self, word):
        # when using word2vec, the medium or large spaCy model is better
        # maybe try doc similarity?
        nlp = spacy.load("en_core_web_lg")
        doc = nlp(word)
        for token1 in doc:
            for token2 in doc:
                print(token1.text, token2.text, token1.similarity(token2))
        pass
    # clean text before transforming it into a corpus
    def cleanTextBeforeCorpus(self, oriText, doi=None):
        cl = cleanText.cleanGenText()
        #cl = cleanGenText()
        output = ""
        alreadyRemoveDoi = False
        for word in oriText.split(" "):
            # remove DOI
            if doi != None and doi in oriText:
                if alreadyRemoveDoi == False:
                    newWord = cl.removeDOI(word, doi)
                    if len(newWord) > 0 and newWord != word:
                        alreadyRemoveDoi = True
                        word = newWord
            # split the stuck-together words
            #word = cl.splitStickWords(word)
            # remove punctuation
            word = cl.removePunct(word, True)
            # remove URL
            word = cl.removeURL(word)
            # remove HTML tags
            word = cl.removeHTMLTag(word)
            # remove tab, white space, newline
            word = cl.removeTabWhiteSpaceNewLine(word)
            # optional: remove stopwords
            #word = cl.removeStopWords(word)
            if len(word) > 0:
                output += word + " "
        return output
    def cleanAllTextBeforeCorpus(self, allText, doi=None):
        cleanOutput = ""
        remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
        if len(allText) > 0:
            corpusText = allText
            for pos in range(len(corpusText.split("\n\n"))):
                if len(corpusText.split("\n\n")[pos]) > 0:
                    lines = corpusText.split("\n\n")[pos]
                    for line in lines.split("\n"):
                        if remove in line: line = line.replace(remove, "")
                        clean_text = self.cleanTextBeforeCorpus(line, doi)
                        cleanOutput += clean_text + "\n"
                cleanOutput += "\n\n"
        return cleanOutput
    def tableTransformToCorpusText(self, df, excelFile=None):
        # PDF, Excel, WordDoc
        #cl = cleanText.cleanGenText()
        corpus = {}
        # PDF or df
        if excelFile == None:
            if len(df) > 0:
                try:
                    for i in range(len(df)):
                        # each new dimension/page is considered to be a sentence which ends with the period.
                        # each new line is a new list, and each new df is a new corpus
                        outputDF = []
                        text = df[i].values.tolist()
                        if len(text) > 0:
                            outputRowDF = self.helperRowTableToCorpus(text)
                            #outputColDF = self.helperColTableToCorpus(text)
                            outputDF.extend(outputRowDF)
                            #outputDF.extend(outputColDF)
                        if len(outputDF) > 0:
                            corpus["corpus" + str(i)] = outputDF
                except:
                    outputDF = []
                    text = df.values.tolist()
                    if len(text) > 0:
                        outputRowDF = self.helperRowTableToCorpus(text)
                        #outputColDF = self.helperColTableToCorpus(text)
                        outputDF.extend(outputRowDF)
                        #outputDF.extend(outputColDF)
                    if len(outputDF) > 0:
                        corpus["corpus0"] = outputDF
        else:
            df = pd.ExcelFile(excelFile)
            sheetNames = df.sheet_names
            output = []
            if len(sheetNames) > 0:
                for s in range(len(sheetNames)):
                    outputDF = []
                    with pd.ExcelFile(excelFile) as xls:
                        data = pd.read_excel(xls, sheetNames[s])
                    if sheetNames[s] != 'Evaluation Warning':
                        text = data.values.tolist()
                        if len(text) > 0:
                            outputRowDF = self.helperRowTableToCorpus(text)
                            #outputColDF = self.helperColTableToCorpus(text)
                            outputDF.extend(outputRowDF)
                            #outputDF.extend(outputColDF)
                    if len(outputDF) > 0:
                        corpus["corpus" + str(s)] = outputDF
        return corpus
    def helperRowTableToCorpus(self, textList):
        #cl = cleanGenText()
        cl = cleanText.cleanGenText()
        stopWords = ["NaN","Unnamed:","nan"]
        outputDF = []
        for line in textList:
            outputLine = []
            for words in line:
                words = str(words)
                if len(words) > 0:
                    for word in words.split(" "):
                        # remove specific stopwords for tables: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
                        if str(word) not in stopWords: # remove "NaN", "Unnamed:", "nan"
                            #word = cl.splitStickWords(word)
                            word = cl.removePunct(word)
                            word = " ".join(cl.removeStopWords(word))
                            word = cl.removeTabWhiteSpaceNewLine(word)
                            if len(word) > 1:
                                if len(word.split(" ")) > 1:
                                    for x in word.split(" "):
                                        if len(x) > 1 and x.isnumeric()==False:
                                            outputLine.append(x.lower())
                                else:
                                    if word.isnumeric() == False:
                                        outputLine.append(word.lower())
            if len(outputLine) > 0:
                outputDF.append(outputLine)
        return outputDF
    def helperColTableToCorpus(self, dfList):
        #cl = cleanGenText()
        cl = cleanText.cleanGenText()
        stopWords = ["NaN","Unnamed:","nan"]
        outputDF = []
        # use the first line's length as the column reference
        for pos in range(len(dfList[0])):
            outputLine = []
            for line in dfList:
                if pos < len(line):
                    words = line[pos]
                    words = str(words)
                else: words = ""
                if len(words) > 0:
                    for word in words.split(" "):
                        # remove specific stopwords for tables: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
                        if str(word) not in stopWords: # remove "NaN", "Unnamed:", "nan"
                            #word = cl.splitStickWords(word)
                            word = cl.removePunct(word)
                            word = " ".join(cl.removeStopWords(word))
                            word = cl.removeTabWhiteSpaceNewLine(word)
                            if len(word) > 1:
                                if len(word.split(" ")) > 1:
                                    for x in word.split(" "):
                                        if len(x) > 1 and x.isnumeric()==False:
                                            outputLine.append(x.lower())
                                else:
                                    if word.isnumeric() == False:
                                        outputLine.append(word.lower())
            if len(outputLine) > 0:
                outputDF.append(outputLine)
        return outputDF
    # create a corpus
    def createCorpusText(self, corpusText):
        '''ex: "Tom is cat. Jerry is mouse."
        corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
        # the output should be like this:
        '''texts = {
        "Paragraph 1": [["Cat", "is", "an", "animal"], ["Tom", "is", "cat"]],
        "Paragraph 2": [["Mouse", "is", "an", "animal"], ["Jerry", "is", "mouse"]]
        }
        '''
        # separate paragraphs
        '''Ex: Cat is an animal. Tom is cat.

        Mouse is an animal.
        Jerry is mouse.'''
        texts = {}
        cl = cleanText.cleanGenText()
        #cl = cleanGenText()
        for pos in range(len(corpusText.split("\n\n"))):
            if len(corpusText.split("\n\n")[pos]) > 0:
                texts["Paragraph "+str(pos)] = []
                lines = corpusText.split("\n\n")[pos]
                for line in lines.split("\n"):
                    for l in line.split("."):
                        if len(l) > 0:
                            cl.removeTabWhiteSpaceNewLine(l)
                            l = l.lower()
                            newL = []
                            for word in l.split(" "):
                                if len(word) > 0:
                                    word = cl.removeStopWords(word)
                                    for w in word:
                                        if len(w) > 0 and w.isnumeric()==False:
                                            newL.append(w)
                            if len(newL) > 0:
                                texts["Paragraph "+str(pos)].append(newL)
                if len(texts["Paragraph "+str(pos)]) == 0:
                    del texts["Paragraph "+str(pos)]
        return texts
    def selectParaForWC(self, corpus):
        ''' corpus should be in the format:
        corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
        corSize, window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None, None
        corSize = len(corpus)
        # less than 2000
        if 0 < corSize < 2000:
            window=3.5
            vector_size=75
            sample=1e-3
            negative=10
            epochs=10
            sg=1
        # 2000 - 100000
        elif 2000 <= corSize < 100000:
            window=3.5
            vector_size=75
            sample=1e-5
            negative=10
            epochs=10
            sg=1
        elif 100000 <= corSize < 1000000:
            window=7.5
            vector_size=150
            sample=1e-5
            negative=10
            epochs=6
            sg=0
        return window, vector_size, sample, negative, epochs, sg
    def trainWord2Vec(self,nameFile,modelName,saveFolder,window=3.5,
                      vector_size=75,sample=1e-3,negative=10,epochs=10,sg=1):
        # if you don't have a backup file, you can use the nameFile again just to increase the length of the corpus
        jsonFile = ""
        jsonFile = openFile.openJsonFile(nameFile) # this is a corpus json file from an article
        cores = multiprocessing.cpu_count()
        combinedCorpus = []
        window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None
        if len(jsonFile) > 0:
            for key in jsonFile:
                combinedCorpus.extend(jsonFile[key])
            window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
            # min_count=1 ensures all words are included
            '''w2vModel = Word2Vec(
                min_count=1,
                window=window,
                vector_size=vector_size,
                sample=sample,
                alpha=0.03,
                min_alpha=0.0007,
                negative=negative,
                workers=cores-1,
                epochs = epochs,
                sg=sg)'''
            #w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
            accept = False
            while not accept:
                if window!=None and vector_size!=None and sample!=None and negative!=None and epochs!=None and sg!=None:
                    try:
                        w2vModel = Word2Vec(
                            min_count=1,
                            window=window,
                            vector_size=vector_size,
                            sample=sample,
                            alpha=0.03,
                            min_alpha=0.0007,
                            negative=negative,
                            workers=cores-1,
                            epochs = epochs,
                            sg=sg)
                        w2vModel.build_vocab(combinedCorpus)
                        w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
                        accept = True
                    except:
                        for key in jsonFile:
                            combinedCorpus.extend(jsonFile[key])
                        window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
                        print("next is " + str(len(combinedCorpus)))
                else:
                    print("no parameter to train")
                    break
            #w2vModel.build_vocab(combinedCorpus)
            #w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
            #w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
            #w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
            w2vModel.save(saveFolder+"/"+modelName+".model")
            w2vModel.wv.save_word2vec_format(saveFolder+"/"+modelName+".txt")
            print("done w2v")
        else: print("no corpus to train")
        #return combinedCorpus
    def genSimilar(self, word, modelFile, n=10, cos_thres=0.7):
        # might not be a meaningful keyword
        #stopWords = ["show"]
        # same word but just plural nouns, tense
        simWords = [word+"s",word+"es",word+"ing",word+"ed"]
        model = KeyedVectors.load_word2vec_format(modelFile, binary=False) # model file in txt format
        results = model.most_similar(positive=[word],topn=n)
        #removeIndex = []
        #currN = copy.deepcopy(n)
        '''for r in range(len(results)):
            if len(results[r][0]) < 2:
                removeIndex.append(results[r])
            # remove the same word but just plural and singular noun and lower than the cos_thres
            elif results[r][0] == word:
                removeIndex.append(results[r])
            elif results[r][0] in simWords or float(results[r][1]) < cos_thres or results[r][0] in stopWords:
                removeIndex.append(results[r])
        for rem in removeIndex:
            results.remove(rem)
        while len(results)!=n and len(results) != 0:
            moreNewResult = model.most_similar(positive=[word],topn=currN+1)[-1]
            if moreNewResult not in results and len(moreNewResult[0])>1:
                if moreNewResult[0] not in stopWords and results[0] != word:
                    results.append(moreNewResult)
                    currN +=1'''
        return results
    # adding our model into spacy
    # this deals with the command line; but instead of using it directly, we write a python script to run the command line
    def loadWordVec(self, modelName, wordVec):
        # modelName is the name you want to save into spacy
        # wordVec is the trained word2vec in txt format
        subprocess.run([sys.executable,
                        "-m",
                        "spacy",
                        "init-model",
                        "en",
                        modelName, # this modelName comes from the saved modelName of function trainWord2Vec
                        "--vectors-loc",
                        wordVec])
        print("done")
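A minimal end-to-end sketch of the word2Vec helpers above; the corpus file name, model name, and output folder are placeholders, and the output folder must already exist:

# Build a tiny corpus, persist it, train, then query the saved vectors
from DefaultPackages import saveFile
from NER.word2Vec import word2vec

wv = word2vec.word2Vec()
corpus = wv.createCorpusText("Cat is an animal. Tom is cat.\n\nMouse is an animal.\nJerry is mouse.")
saveFile.saveJsonFile("corpus.json", corpus)            # the dict-of-token-lists format trainWord2Vec expects
wv.trainWord2Vec("corpus.json", "demoModel", "/tmp/models")
print(wv.genSimilar("cat", "/tmp/models/demoModel.txt", n=5))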
app.py
ADDED
@@ -0,0 +1,132 @@
# ✅ Optimized mtDNA MVP UI with Faster Pipeline & Required Feedback

import gradio as gr
from collections import Counter
import csv
import os
from functools import lru_cache
from mtdna_classifier import classify_sample_location
@lru_cache(maxsize=128)
def classify_sample_location_cached(accession):
    return classify_sample_location(accession)

# Count and suggest final location
def compute_final_suggested_location(rows):
    candidates = [
        row.get("Predicted Location", "").strip()
        for row in rows
        if row.get("Predicted Location", "").strip().lower() not in ["", "sample id not found"]
    ] + [
        row.get("Inferred Region", "").strip()
        for row in rows
        if row.get("Inferred Region", "").strip().lower() not in ["", "unknown"]
    ]

    if not candidates:
        return Counter(), ("Unknown", 0)

    counts = Counter(candidates)
    top_location, count = counts.most_common(1)[0]
    return counts, (top_location, count)

# Store feedback (with required fields)
def store_feedback_to_drive(accession, answer1, answer2, contact=""):
    if not answer1.strip() or not answer2.strip():
        return "⚠️ Please answer both questions before submitting."

    feedback_file = "/content/drive/MyDrive/Customers/feedback_mtdna.csv"
    header = ["accession", "helpful", "improvement", "contact"]
    row = [accession, answer1, answer2, contact]
    file_exists = os.path.isfile(feedback_file)
    with open(feedback_file, "a", newline="") as f:
        writer = csv.writer(f)
        if not file_exists:
            writer.writerow(header)
        writer.writerow(row)
    return "✅ Feedback submitted. Thank you!"

def summarize_results(accession):
    try:
        output = classify_sample_location_cached(accession)
    except Exception as e:
        return [], f"❌ Error: {e}"

    if accession not in output:
        return [], "❌ Accession not found in results."

    isolate = next((k for k in output if k != accession), None)
    row_score = []
    rows = []

    for key in [accession, isolate]:
        if key not in output:
            continue
        sample_id_label = f"{key} ({'accession number' if key == accession else 'isolate of accession'})"
        for section, techniques in output[key].items():
            for technique, content in techniques.items():
                source = content.get("source", "")
                predicted = content.get("predicted_location", "")
                haplogroup = content.get("haplogroup", "")
                inferred = content.get("inferred_location", "")
                context = content.get("context_snippet", "")[:300] if "context_snippet" in content else ""

                row = {
                    "Sample ID": sample_id_label,
                    "Technique": technique,
                    "Source": f"The region of haplogroup is inferred\nby using this source: {source}" if technique == "haplogroup" else source,
                    "Predicted Location": "" if technique == "haplogroup" else predicted,
                    "Haplogroup": haplogroup if technique == "haplogroup" else "",
                    "Inferred Region": inferred if technique == "haplogroup" else "",
                    "Context Snippet": context
                }

                row_score.append(row)
                rows.append(list(row.values()))

    location_counts, (final_location, count) = compute_final_suggested_location(row_score)
    summary_lines = [f"### 🧭 Location Frequency Summary", "After counting all predicted and inferred locations:\n"]
    summary_lines += [f"- **{loc}**: {cnt} times" for loc, cnt in location_counts.items()]
    summary_lines.append(f"\n**Final Suggested Location:** 🗺️ **{final_location}** (mentioned {count} times)")
    summary = "\n".join(summary_lines)

    return rows, summary

# Gradio UI
with gr.Blocks() as interface:
    gr.Markdown("# 🧬 mtDNA Location Classifier (MVP)")
    gr.Markdown("Enter an accession number to infer geographic origin. You'll see predictions, confidence scores, and can submit feedback.")

    with gr.Row():
        accession = gr.Textbox(label="Enter Accession Number (e.g., KU131308)")
        run_button = gr.Button("🔍 Submit and Classify")
        reset_button = gr.Button("🔄 Reset")

    status = gr.Markdown(visible=False)
    headers = ["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"]
    output_table = gr.Dataframe(headers=headers, interactive=False)
    output_summary = gr.Markdown()

    gr.Markdown("---")
    gr.Markdown("### 💬 Feedback (required)")
    q1 = gr.Textbox(label="1️⃣ Was the inferred location accurate or helpful? Please explain.")
    q2 = gr.Textbox(label="2️⃣ What would improve your experience with this tool?")
    contact = gr.Textbox(label="📧 Your email or institution (optional)")
    submit_feedback = gr.Button("✅ Submit Feedback")
    feedback_status = gr.Markdown()

    def classify_with_loading(accession):
        return gr.update(value="⏳ Please wait... processing...", visible=True)

    def classify_main(accession):
        table, summary = summarize_results(accession)
        return table, summary, gr.update(visible=False)

    def reset_fields():
        return "", "", "", "", "", [], "", gr.update(visible=False)

    run_button.click(fn=classify_with_loading, inputs=accession, outputs=status)
    run_button.click(fn=classify_main, inputs=accession, outputs=[output_table, output_summary, status])
    submit_feedback.click(fn=store_feedback_to_drive, inputs=[accession, q1, q2, contact], outputs=feedback_status)
    reset_button.click(fn=reset_fields, inputs=[], outputs=[accession, q1, q2, contact, feedback_status, output_table, output_summary, status])

interface.launch()
data/haplogroup_regions_extended.csv
ADDED
@@ -0,0 +1,51 @@
haplogroup,region,source
H,Western Europe,PhyloTree
U,Eurasia,PhyloTree
L0,Southern Africa,EMPOP
L1,Central Africa,EMPOP
L2,West Africa,EMPOP
L3,East Africa,EMPOP
B4,Southeast Asia,EMPOP
A2,Native North America,PhyloTree
C1,Siberia and Americas,PhyloTree
D4,East Asia,PhyloTree
X,Western Eurasia / North America,PhyloTree
J,Europe and Near East,PhyloTree
K,Europe,PhyloTree
T,Europe and Central Asia,PhyloTree
M,Asia,EMPOP
N,Worldwide (basal),PhyloTree
I,Europe,PhyloTree
W,Eurasia,PhyloTree
Z,North and East Asia,PhyloTree
Y,Southeast Asia,EMPOP
E,Oceania and Southeast Asia,PhyloTree
F,East and Southeast Asia,EMPOP
B2,Native South America,EMPOP
A1,Central Asia,EMPOP
C4,Siberia,PhyloTree
D1,South America,PhyloTree
M7,East Asia,EMPOP
M8,Japan,EMPOP
G,Siberia,PhyloTree
HV,Europe and Middle East,PhyloTree
U5,Northern Europe,PhyloTree
U6,North Africa,PhyloTree
U7,South Asia,PhyloTree
U8,Central Europe,PhyloTree
R0,Arabian Peninsula,PhyloTree
R9,Southeast Asia,PhyloTree
H1,Iberian Peninsula,PhyloTree
H2,Eastern Europe,PhyloTree
H3,Western Europe,PhyloTree
H5,Balkans,PhyloTree
J1,Europe,PhyloTree
J2,Middle East,PhyloTree
T1,Eastern Europe,PhyloTree
T2,Near East,PhyloTree
M1,North Africa,PhyloTree
M2,South Asia,PhyloTree
M3,South Asia,PhyloTree
M4,South Asia,PhyloTree
M5,South Asia,PhyloTree
M6,South Asia,PhyloTree
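This CSV is a plain haplogroup-to-region lookup; a hypothetical sketch of reading it with pandas follows (the infer_region helper is illustrative, not part of the repo):

# Map a haplogroup label to its region using the CSV above
import pandas as pd

regions = pd.read_csv("data/haplogroup_regions_extended.csv")

def infer_region(haplogroup):
    # longest-prefix match so e.g. "H1a2" resolves to "H1" before falling back to "H"
    hits = regions[regions["haplogroup"].apply(lambda h: haplogroup.startswith(h))]
    if hits.empty:
        return "Unknown"
    return hits.loc[hits["haplogroup"].str.len().idxmax(), "region"]

print(infer_region("H1a2"))   # Iberian Peninsula
print(infer_region("L3e"))    # East Africa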
mtdna_classifier.py
ADDED
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# mtDNA Location Classifier MVP (Google Colab)
# Accepts accession number → fetches PubMed ID + isolate name → gets paper text → predicts location
import os
import subprocess
import re
import csv
from Bio import Entrez
import fitz
import spacy
from NER.PDF import pdf
from NER.WordDoc import wordDoc
from NER.html import extractHTML
from NER.word2Vec import word2vec
from transformers import pipeline

# Set your email (required by NCBI Entrez)
#Entrez.email = "[email protected]"

# Step 1: Get PubMed ID and isolate name from an accession using EDirect
def get_info_from_accession(accession):
    cmd = f'{os.environ["HOME"]}/edirect/esummary -db nuccore -id {accession} -format medline | egrep "PUBMED|isolate"'
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    output = result.stdout
    pubmedID, isolate = "", ""
    for line in output.split("\n"):
        if len(line) > 0:
            if "PUBMED" in line:
                pubmedID = line.split()[-1]
            if "isolate" in line:  # check for isolate information
                # Try direct GenBank annotation: /isolate="XXX"
                match1 = re.search(r'/isolate\s*=\s*"([^"]+)"', line)  # search on current line
                if match1:
                    isolate = match1.group(1)
                else:
                    # Try the DEFINITION line: ...isolate XXX...
                    match2 = re.search(r'isolate\s+([A-Za-z0-9_-]+)', line)  # search on current line
                    if match2:
                        isolate = match2.group(1)
    # Return the values, even if they are empty strings
    return pubmedID, isolate

# Step 2: Get the DOI needed to access the paper
def get_doi_from_pubmed_id(id):
    cmd = f'{os.environ["HOME"]}/edirect/esummary -db pubmed -id {id} -format medline | grep -i "AID"'
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    output = result.stdout
    doi_pattern = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+(?=\s*\[doi\])'
    match = re.search(doi_pattern, output, re.IGNORECASE)
    return match.group(0) if match else ""  # empty string if no DOI is listed

# Step 3: Extract text: get the paper (HTML), supplementary materials (pdf, doc, excel) and do text preprocessing
# Step 3.1: Extract text
def get_paper_text(doi, id):
    # Create a temporary folder to hold the downloaded files
    saveLinkFolder = "/mtDNALocation/data/" + id
    cmd = f'mkdir -p {saveLinkFolder}'
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    link = 'https://doi.org/' + doi
    # textsToExtract = { "doiLink": "paperText",
    #                    "file1.pdf": "text1",
    #                    "file2.doc": "text2",
    #                    "file3.xlsx": excelText3 }
    textsToExtract = {}
    # Collect the list of files (main paper + supplementary materials) for this id
    html = extractHTML.HTML("", link)
    jsonSM = html.getSupMaterial()
    text = ""
    links = [link] + sum((jsonSM[key] for key in jsonSM), [])
    for l in links:
        if l == link:
            # The main paper
            text = html.getListSection()
            textsToExtract[link] = text
        elif l.endswith(".pdf"):
            p = pdf.PDF(l, saveLinkFolder, doi)
            f = p.openPDFFile()  # downloads the PDF into saveLinkFolder
            pdf_path = saveLinkFolder + "/" + l.split("/")[-1]
            doc = fitz.open(pdf_path)
            text = "\n".join([page.get_text() for page in doc])
            textsToExtract[l] = text
        elif l.endswith(".doc") or l.endswith(".docx"):
            d = wordDoc.wordDoc(l, saveLinkFolder)
            text = d.extractTextByPage()
            textsToExtract[l] = text
        elif l.split(".")[-1].lower() in ("xls", "xlsx"):
            wc = word2vec.word2Vec()
            corpus = wc.tableTransformToCorpusText([], l)
            text = ''
            for c in corpus:
                para = corpus[c]
                for words in para:
                    text += " ".join(words)
            textsToExtract[l] = text
    # Delete the temporary folder after all texts have been extracted
    cmd = f'rm -r {saveLinkFolder}'
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    return textsToExtract

# Step 3.2: Extract the context window around a keyword
def extract_context(text, keyword, window=500):
    idx = text.find(keyword)
    if idx == -1:
        return "Sample ID not found."
    return text[max(0, idx - window): idx + window]

# Step 4: Classification (demo purposes for now)
# 4.1: Using a HuggingFace model (question answering)
def infer_location_fromQAModel(context, question="Where is the mtDNA sample from?"):
    qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
    result = qa({"context": context, "question": question})
    return result["answer"]

# 4.2: Infer from haplogroup
# Load pre-trained spaCy model for NER
nlp = spacy.load("en_core_web_sm")

# Load the haplogroup-to-region mapping (simple rule-based lookup)
def load_haplogroup_mapping(csv_path):
    mapping = {}
    with open(csv_path) as f:
        reader = csv.DictReader(f)
        for row in reader:
            mapping[row["haplogroup"]] = [row["region"], row["source"]]
    return mapping

# Extract the haplogroup code from the text
def extract_haplogroup(text):
    # 1. Try to find a haplogroup preceded by the word "haplogroup"
    match = re.search(r'\bhaplogroup\s+([A-Z][0-9a-z]*)\b', text)
    if match:
        # Keep only the top-level code (e.g. "U5" from "U5b1")
        return re.match(r'^[A-Z][0-9]*', match.group(1)).group(0)
    # 2. Fallback: try to find an isolated uppercase-letter haplogroup code
    fallback = re.search(r'\b([A-Z][0-9a-z]{1,5})\b', text)
    if fallback:
        return fallback.group(1)
    return None  # nothing found

# Extract locations from the text with NER
def extract_location(text):
    doc = nlp(text)
    locations = []
    for ent in doc.ents:
        if ent.label_ == "GPE":  # GPE = Geopolitical Entity (location)
            locations.append(ent.text)
    return locations

# Infer a region from the haplogroup
def infer_location_from_haplogroup(haplogroup):
    haplo_map = load_haplogroup_mapping("data/haplogroup_regions_extended.csv")
    return haplo_map.get(haplogroup, ["Unknown", "Unknown"])

# Classify the mtDNA sample from its surrounding context
def classify_mtDNA_sample_from_haplo(text):
    # Extract haplogroup
    haplogroup = extract_haplogroup(text)
    # Extract locations mentioned in the context
    locations = extract_location(text)
    # Infer the region from the haplogroup
    inferred_location, sourceHaplo = infer_location_from_haplogroup(haplogroup)
    return {
        "source": sourceHaplo,
        "locations_found_in_context": locations,
        "haplogroup": haplogroup,
        "inferred_location": inferred_location
    }

# 4.3: Get the location directly from NCBI metadata when available
def infer_location_fromNCBI(accession):
    cmd = f'{os.environ["HOME"]}/edirect/esummary -db nuccore -id {accession} -format medline | egrep "location|country|geo"'
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    output, location = "", ""
    output = result.stdout
    if "location" in output or "country" in output or "geo" in output:
        location = output.split('"')[1]
        output = output.split()[0]
    else:
        location = "Unknown"
        output = "No location information found in NCBI."
    return location, output

# Step 5: Main pipeline: accession -> 1. get PubMed ID and isolate -> 2. get DOI -> 3. get text
#         -> 4. prediction -> 5. output: inferred location + explanation + confidence score
def classify_sample_location(accession):
    outputs = {}
    keyword, context, location, qa_result, haplo_result = "", "", "", "", ""
    # Step 1: get PubMed ID and isolate
    pubmedID, isolate = get_info_from_accession(accession)
    # Step 2: get DOI
    doi = get_doi_from_pubmed_id(pubmedID)
    # Step 3: get text
    # textsToExtract = { "doiLink": "paperText",
    #                    "file1.pdf": "text1",
    #                    "file2.doc": "text2",
    #                    "file3.xlsx": excelText3 }
    textsToExtract = get_paper_text(doi, pubmedID)
    # Step 4: prediction
    outputs[accession] = {}
    outputs[isolate] = {}
    # 4.0: infer from NCBI metadata
    location, outputNCBI = infer_location_fromNCBI(accession)
    NCBI_result = {
        "source": "NCBI",
        "sample_id": accession,
        "predicted_location": location,
        "context_snippet": outputNCBI}
    outputs[accession]["NCBI"] = {"NCBI": NCBI_result}
    for key in textsToExtract:
        text = textsToExtract[key]
        # Try the accession number first
        outputs[accession][key] = {}
        keyword = accession
        context = extract_context(text, keyword, window=500)
        # 4.1: HuggingFace question-answering model
        location = infer_location_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
        qa_result = {
            "source": key,
            "sample_id": keyword,
            "predicted_location": location,
            "context_snippet": context
        }
        outputs[keyword][key]["QAModel"] = qa_result
        # 4.2: infer from haplogroup
        haplo_result = classify_mtDNA_sample_from_haplo(context)
        outputs[keyword][key]["haplogroup"] = haplo_result
        # Then try the isolate name
        keyword = isolate
        outputs[isolate][key] = {}
        context = extract_context(text, keyword, window=500)
        # 4.1.1: HuggingFace question-answering model
        location = infer_location_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
        qa_result = {
            "source": key,
            "sample_id": keyword,
            "predicted_location": location,
            "context_snippet": context
        }
        outputs[keyword][key]["QAModel"] = qa_result
        # 4.2.1: infer from haplogroup
        haplo_result = classify_mtDNA_sample_from_haplo(context)
        outputs[keyword][key]["haplogroup"] = haplo_result
    return outputs
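For orientation, a minimal usage sketch (not part of the upload): it assumes EDirect is installed under $HOME/edirect as done by setup.sh below, that data/haplogroup_regions_extended.csv is present, and that the accession string is only a placeholder.

    # Illustrative only: run the full pipeline for one accession and pretty-print the nested result.
    import json
    from mtdna_classifier import classify_sample_location

    results = classify_sample_location("PLACEHOLDER_ACCESSION")  # replace with a real GenBank accession
    print(json.dumps(results, indent=2, default=str))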
requirements.txt
ADDED
@@ -0,0 +1,19 @@
gradio
transformers
torch
pandas
scikit-learn
spacy
pymupdf
requests
biopython
openpyxl
bs4
pdfreader
tabula-py
python-docx
thefuzz
wordsegment
spacy-lookups-data
gensim
nltk
setup.sh
ADDED
@@ -0,0 +1,8 @@
#!/bin/bash

# Install Entrez Direct automatically with yes
yes | sh -c "$(wget -q https://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh -O -)"

# Add EDirect to PATH for the current session
echo 'export PATH=$HOME/edirect:$PATH' >> ~/.bashrc
source ~/.bashrc
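Because mtdna_classifier.py shells out to $HOME/edirect/esummary directly, a quick sanity check after running setup.sh can save debugging time. A minimal sketch, not part of the upload:

    # Illustrative only: confirm the EDirect binary is where the classifier expects it.
    import os

    esummary = os.path.join(os.environ["HOME"], "edirect", "esummary")
    if not os.path.exists(esummary):
        raise RuntimeError("EDirect not found; run setup.sh before using the classifier.")
    print("EDirect found at", esummary)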