Spaces: Running
Upload 19 files
Browse files
- DefaultPackages/__init__.py +4 -0
- DefaultPackages/__pycache__/__init__.cpython-310.pyc +0 -0
- DefaultPackages/__pycache__/__init__.cpython-311.pyc +0 -0
- DefaultPackages/__pycache__/openFile.cpython-310.pyc +0 -0
- DefaultPackages/__pycache__/openFile.cpython-311.pyc +0 -0
- DefaultPackages/__pycache__/saveFile.cpython-310.pyc +0 -0
- DefaultPackages/__pycache__/saveFile.cpython-311.pyc +0 -0
- DefaultPackages/openFile.py +12 -0
- DefaultPackages/saveFile.py +11 -0
- NER/PDF/pdf.py +142 -0
- NER/WordDoc/wordDoc.py +149 -0
- NER/cleanText.py +116 -0
- NER/html/extractHTML.py +158 -0
- NER/word2Vec/word2vec.py +364 -0
- app.py +132 -0
- data/haplogroup_regions_extended.csv +51 -0
- mtdna_classifier.py +242 -0
- requirements.txt +19 -0
- setup.sh +8 -0
DefaultPackages/__init__.py
ADDED
@@ -0,0 +1,4 @@
__all__ = [
    'openFile',
    'saveFile',
]
DefaultPackages/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (193 Bytes). View file
DefaultPackages/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (212 Bytes). View file
DefaultPackages/__pycache__/openFile.cpython-310.pyc
ADDED
Binary file (566 Bytes). View file
DefaultPackages/__pycache__/openFile.cpython-311.pyc
ADDED
Binary file (989 Bytes). View file
DefaultPackages/__pycache__/saveFile.cpython-310.pyc
ADDED
Binary file (590 Bytes). View file
DefaultPackages/__pycache__/saveFile.cpython-311.pyc
ADDED
Binary file (1.02 kB). View file
DefaultPackages/openFile.py
ADDED
@@ -0,0 +1,12 @@
def openFile(file):
    with open(file) as f:
        openFile = f.read()
    return openFile

def openJsonFile(file):
    import json
    # Opening JSON file
    with open(file, 'r') as openfile:
        # Reading from json file
        json_object = json.load(openfile)
    return json_object
DefaultPackages/saveFile.py
ADDED
@@ -0,0 +1,11 @@
import json
def saveFile(name, content):
    # Overwrite the saved file with the new content
    with open(name, "w") as external_file:
        print(content, file=external_file)
def saveJsonFile(name, content):
    saveFile(name, json.dumps(content))
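A quick usage sketch of these two helpers; the JSON file name below is a placeholder, not part of the upload:

# Round-trip a small dict through saveJsonFile/openJsonFile (hypothetical path)
from DefaultPackages import openFile, saveFile

saveFile.saveJsonFile("example_corpus.json", {"Paragraph 0": [["tom", "cat"]]})
data = openFile.openJsonFile("example_corpus.json")   # -> {'Paragraph 0': [['tom', 'cat']]}
raw = openFile.openFile("example_corpus.json")        # plain-text read of the same file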
NER/PDF/pdf.py
ADDED
@@ -0,0 +1,142 @@
#!pip install pdfreader
import pdfreader
from pdfreader import PDFDocument, SimplePDFViewer
#!pip install bs4
from bs4 import BeautifulSoup
import requests
from NER import cleanText
#!pip install tabula-py

import tabula
class PDF(): # using pdfreader and tabula
    def __init__(self, pdf, saveFolder, doi=None):
        self.pdf = pdf
        self.doi = doi
        self.saveFolder = saveFolder
    def openPDFFile(self):
        if "https" in self.pdf:
            name = self.pdf.split("/")[-1]
            name = self.downloadPDF(self.saveFolder)
            if name != "no pdfLink to download":
                fileToOpen = self.saveFolder + "/" + name
            else: fileToOpen = self.pdf
        else: fileToOpen = self.pdf
        return open(fileToOpen, "rb")
    def downloadPDF(self, saveFolder):
        pdfLink = ''
        if ".pdf" not in self.pdf and "https" not in self.pdf: # the download link is a general URL, not a direct pdf link
            r = requests.get(self.pdf)
            soup = BeautifulSoup(r.content, 'html.parser')
            links = soup.find_all("a")
            for link in links:
                if ".pdf" in link.get("href"):
                    if self.doi in link.get("href"):
                        pdfLink = link.get("href")
                        break
        else:
            pdfLink = self.pdf
        if pdfLink != '':
            response = requests.get(pdfLink)
            name = pdfLink.split("/")[-1]
            pdf = open(saveFolder+"/"+name, 'wb')
            pdf.write(response.content)
            pdf.close()
            print("pdf downloaded")
            return name
        else:
            return "no pdfLink to download"
    def extractText(self):
        jsonPage = {}
        pdf = self.openPDFFile()
        doc = PDFDocument(pdf)
        viewer = SimplePDFViewer(pdf)
        all_pages = [p for p in doc.pages()]
        cl = cleanText.cleanGenText()
        for page in range(1, len(all_pages)+1): # viewer.navigate is 1-based, so include the last page
            viewer.navigate(page)
            viewer.render()
            if str(page) not in jsonPage:
                jsonPage[str(page)] = {}
            # text
            text = "".join(viewer.canvas.strings)
            clean, filteredWord = cl.textPreprocessing(text) #cleanText.cleanGenText(text).cleanText()
            # save the text of filtered words which removes "a", "the", "an", "is", etc.
            jsonPage[str(page)]["normalText"] = [text]
            jsonPage[str(page)]["cleanText"] = [' '.join(filteredWord)]
            # image
            image = viewer.canvas.images
            jsonPage[str(page)]["image"] = [image]
            # form
            form = viewer.canvas.forms
            jsonPage[str(page)]["form"] = [form]
            # content based on the PDF (Adobe) content stream
            content = viewer.canvas.text_content
            jsonPage[str(page)]["content"] = [content]
            # inline_image: inline images are aligned with the text,
            # and are usually content images like photos, charts, or graphs.
            inline_image = viewer.canvas.inline_images
            jsonPage[str(page)]["inline_image"] = [inline_image]
        pdf.close()
        '''Output Format:
        jsonPage[str(page)]["normalText"]
        jsonPage[str(page)]["cleanText"]
        jsonPage[str(page)]["image"]
        jsonPage[str(page)]["form"]
        jsonPage[str(page)]["content"]'''
        return jsonPage
    def extractTable(self, pages, saveFile=None, outputFormat=None):
        '''pages (str, int, iterable of int, optional) -
        Optional value specifying pages to extract from. It allows str, int, or an iterable of int. Default: 1
        Examples: '1-2,3', 'all', [1,2]'''
        df = []
        if "https" in self.pdf:
            name = self.pdf.split("/")[-1]
            name = self.downloadPDF(self.saveFolder)
            if name != "no pdfLink to download":
                fileToOpen = self.saveFolder + "/" + name
            else: fileToOpen = self.pdf
        else: fileToOpen = self.pdf
        try:
            df = tabula.read_pdf(fileToOpen, pages=pages)
            # saveFile: "/content/drive/MyDrive/CollectData/NER/PDF/tableS1.csv"
            # outputFormat: "csv"
            #tabula.convert_into(self.pdf, saveFile, output_format=outputFormat, pages=pages)
        except: # e.g. ValueError when no tables are present
            df = []
            print("No tables found in PDF file")
        return df
    def mergeTextinJson(self, jsonPDF):
        #cl = cleanGenText()
        cl = cleanText.cleanGenText()
        pdfText = ""
        for page in jsonPDF:
            # pages are separated by "\n\n"
            if len(jsonPDF[page]["normalText"]) > 0:
                for i in range(len(jsonPDF[page]["normalText"])):
                    text = jsonPDF[page]["normalText"][i]
                    if len(text) > 0:
                        text = cl.removeTabWhiteSpaceNewLine(text)
                        text = cl.removeExtraSpaceBetweenWords(text)
                        jsonPDF[page]["normalText"][i] = text
                    # chunks on the same page are separated by just a dot.
                    if i-1 > 0:
                        if jsonPDF[page]["normalText"][i-1][-1] != ".":
                            pdfText += ". "
                    pdfText += jsonPDF[page]["normalText"][i]
                    if len(jsonPDF[page]["normalText"][i]) > 0:
                        if jsonPDF[page]["normalText"][i][-1] != ".":
                            pdfText += "."
            pdfText += "\n\n"
        return pdfText
    def getReference(self):
        pass
    def getSupMaterial(self):
        pass
    def removeHeaders(self):
        pass
    def removeFooters(self):
        pass
    def removeReference(self):
        pass
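A minimal usage sketch of the PDF class above, assuming a direct PDF link, a writable save folder, and a DOI, all placeholders (tabula also needs a local Java runtime):

# Hypothetical inputs: a supplementary-material PDF link, a temp folder, and the paper's DOI
from NER.PDF import pdf

p = pdf.PDF("https://example.org/supplement/tableS1.pdf", "/tmp/pdfs", doi="10.1000/xyz123")
pages = p.extractText()                  # {"1": {"normalText": [...], "cleanText": [...], ...}, ...}
tables = p.extractTable(pages="all")     # list of pandas DataFrames via tabula
print(p.mergeTextinJson(pages)[:300])    # merged, lightly cleaned running text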
NER/WordDoc/wordDoc.py
ADDED
@@ -0,0 +1,149 @@
#! pip install spire.doc
#! pip install Spire.XLS
import pandas as pd
from spire.doc import *
from spire.doc.common import *
from spire.xls import *
from spire.xls.common import *
from NER import cleanText
import requests
class wordDoc(): # using Spire.Doc / Spire.XLS
    def __init__(self, wordDoc, saveFolder):
        self.wordDoc = wordDoc
        self.saveFolder = saveFolder
    def openFile(self):
        document = Document()
        return document.LoadFromFile(self.wordDoc)
    def extractTextByPage(self):
        # reference: https://medium.com/@alice.yang_10652/extract-text-from-word-documents-with-python-a-comprehensive-guide-95a67e23c35c#:~:text=containing%20specific%20content.-,Spire.,each%20paragraph%20using%20the%20Paragraph.
        json = {}
        #doc = self.openFile()
        # Load the document; if the path is a URL, download it first and load the local copy
        try:
            doc = Document()
            doc.LoadFromFile(self.wordDoc)
        except:
            response = requests.get(self.wordDoc)
            name = self.wordDoc.split("/")[-1]
            with open(self.saveFolder+"/" + name, "wb") as temp_file: # Create a temporary file to store the downloaded data
                temp_file.write(response.content)
            doc = Document()
            doc.LoadFromFile(self.saveFolder+"/" + name)
        text = doc.GetText()
        return text
    def extractTableAsText(self):
        getDoc = ''
        try:
            # reference:
            # https://www.e-iceblue.com/Tutorials/Python/Spire.Doc-for-Python/Program-Guide/Table/Python-Extract-Tables-from-Word-Documents.html?gad_source=1&gclid=Cj0KCQiA6Ou5BhCrARIsAPoTxrCj3XSsQsDziwqE8BmVlOs12KneOlvtKnn5YsDruxK_2T_UUhjw6NYaAtJhEALw_wcB
            doc = Document()
            doc.LoadFromFile(self.wordDoc)
            getDoc = "have document"
        except:
            response = requests.get(self.wordDoc)
            name = self.wordDoc.split("/")[-1]
            with open(self.saveFolder+"/" + name, "wb") as temp_file: # Create a temporary file to store the downloaded data
                temp_file.write(response.content)
            doc = Document()
            doc.LoadFromFile(self.saveFolder+"/" + name)
            getDoc = "have document"
        json = {}
        if len(getDoc) > 0:
            # Loop through the sections
            for s in range(doc.Sections.Count):
                # Get a section
                section = doc.Sections.get_Item(s)
                # Get the tables in the section
                json["Section" + str(s)] = {}
                tables = section.Tables
                # Loop through the tables
                for i in range(0, tables.Count):
                    # Get a table
                    table = tables.get_Item(i)
                    # Initialize a string to store the table data
                    tableData = ''
                    # Loop through the rows of the table
                    for j in range(0, table.Rows.Count):
                        # Loop through the cells of the row
                        for k in range(0, table.Rows.get_Item(j).Cells.Count):
                            # Get a cell
                            cell = table.Rows.get_Item(j).Cells.get_Item(k)
                            # Get the text in the cell
                            cellText = ''
                            for para in range(cell.Paragraphs.Count):
                                paragraphText = cell.Paragraphs.get_Item(para).Text
                                cellText += (paragraphText + ' ')
                            # Add the text to the string
                            tableData += cellText
                            if k < table.Rows.get_Item(j).Cells.Count - 1:
                                tableData += '\t'
                        # Add a new line
                        tableData += '\n'
                    json["Section" + str(s)]["Table"+str(i)] = tableData
        return json
    def extractTableAsExcel(self):
        getDoc = ''
        try:
            # reference:
            # https://www.e-iceblue.com/Tutorials/Python/Spire.Doc-for-Python/Program-Guide/Table/Python-Extract-Tables-from-Word-Documents.html?gad_source=1&gclid=Cj0KCQiA6Ou5BhCrARIsAPoTxrCj3XSsQsDziwqE8BmVlOs12KneOlvtKnn5YsDruxK_2T_UUhjw6NYaAtJhEALw_wcB
            doc = Document()
            doc.LoadFromFile(self.wordDoc)
            getDoc = "have document"
        except:
            response = requests.get(self.wordDoc)
            name = self.wordDoc.split("/")[-1]
            with open(self.saveFolder+"/" + name, "wb") as temp_file: # Create a temporary file to store the downloaded data
                temp_file.write(response.content)
            doc = Document()
            doc.LoadFromFile(self.saveFolder+"/" + name)
            getDoc = "have document"
        if len(getDoc) > 0:
            try:
                # Create an instance of Workbook
                wb = Workbook()
                wb.Worksheets.Clear()

                # Loop through sections in the document
                for i in range(doc.Sections.Count):
                    # Get a section
                    section = doc.Sections.get_Item(i)
                    # Loop through tables in the section
                    for j in range(section.Tables.Count):
                        # Get a table
                        table = section.Tables.get_Item(j)
                        # Create a worksheet
                        ws = wb.Worksheets.Add(f'Table_{i+1}_{j+1}')
                        # Write the table to the worksheet
                        for row in range(table.Rows.Count):
                            # Get a row
                            tableRow = table.Rows.get_Item(row)
                            # Loop through cells in the row
                            for cell in range(tableRow.Cells.Count):
                                # Get a cell
                                tableCell = tableRow.Cells.get_Item(cell)
                                # Get the text in the cell
                                cellText = ''
                                for paragraph in range(tableCell.Paragraphs.Count):
                                    paragraph = tableCell.Paragraphs.get_Item(paragraph)
                                    cellText = cellText + (paragraph.Text + ' ')
                                # Write the cell text to the worksheet
                                ws.SetCellValue(row + 1, cell + 1, cellText)

                # Save the workbook
                name = self.wordDoc.split("/")[-1]
                if self.saveFolder == None:
                    wb.SaveToFile('/content/drive/MyDrive/CollectData/NER/excel/TestExamples/output/'+name+".xlsx", FileFormat.Version2016)
                    nameFile = '/content/drive/MyDrive/CollectData/NER/excel/TestExamples/output/'+name+".xlsx"
                else:
                    wb.SaveToFile(self.saveFolder+'/'+name+".xlsx", FileFormat.Version2016)
                    nameFile = self.saveFolder+'/'+name + ".xlsx"
                doc.Close()
                wb.Dispose()
                return nameFile
            except: return "No table found on word doc"
        else:
            return "No table found on word doc"
    def getReference(self):
        pass
    def getSupMaterial(self):
        pass
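A minimal usage sketch of the wordDoc class above; the .docx URL and save folder are placeholders:

# Hypothetical .docx URL; remote files are downloaded into saveFolder as in the class above
from NER.WordDoc import wordDoc

d = wordDoc.wordDoc("https://example.org/supplement/tableS2.docx", "/tmp/docs")
full_text = d.extractTextByPage()        # whole-document text via Document.GetText()
tables = d.extractTableAsText()          # {"Section0": {"Table0": "cell\tcell\n..."}}
xlsx_path = d.extractTableAsExcel()      # path of the generated .xlsx, or a "No table found..." message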
NER/cleanText.py
ADDED
@@ -0,0 +1,116 @@
# reference:
# https://ayselaydin.medium.com/1-text-preprocessing-techniques-for-nlp-37544483c007
import re
import nltk
#nltk.download('stopwords')
#nltk.download()
from DefaultPackages import openFile, saveFile
import json
from nltk.corpus import stopwords
from nltk.corpus.reader.api import wordpunct_tokenize
from nltk.tokenize import word_tokenize
#from wordsegment import load, segment
from wordsegment import load, segment
class cleanGenText():
    def __init__(self):
        #self.text = text
        load()
        pass
    def removePunct(self, text, KeepPeriod=False):
        punctuation = r'[^\w\s]'
        if KeepPeriod==True:
            punctuation = r'[^\w\s\.]'
        return re.sub(punctuation, '', text)
    def removeURL(self, text):
        url_pattern = re.compile(r'https?://\S+|www\.\S+')
        return url_pattern.sub(r'', text)
    def removeHTMLTag(self, text):
        html_tags_pattern = r'<.*?>'
        return re.sub(html_tags_pattern, '', text)
    def removeTabWhiteSpaceNewLine(self, text):
        # remove \n or \t and unnecessary white space
        cleanText = text.replace("\n\n", "")
        cleanText = cleanText.replace("\n", "")  # chain from cleanText so the first replacement is kept
        cleanText = cleanText.replace("\t", "")
        cleanText = cleanText.strip()
        return cleanText
    def removeExtraSpaceBetweenWords(self, text):
        return re.sub(r'\s+', ' ', text).strip()
    def removeStopWords(self, text):
        #extraUnwantedWords = ["resource","groups","https","table","online","figure","frequency","aslo","fig","shows","respectively"]
        filteredWord = []
        stopWords = set(list(set(stopwords.words('english'))))# + extraUnwantedWords)
        textWords = word_tokenize(text)
        for word in textWords:
            if word.lower() not in stopWords:
                filteredWord.append(word) # and w.isalpha()==True]
        return filteredWord
    def removeLowercaseBetweenUppercase(self, segment):
        # segment such as "Myanmar (formerly Burma)"
        # but not change anything for "Viet Nam"
        # for special cases:
        # the capital letter:
        # When there is a lowercase word between:
        # e.g: "Myanmar (formerly Burma)" can be "Myanmar", "Burma" instead of "myanmar formerly burma"
        # When there is no lowercase word or uppercase words in a row:
        # e.g: "Viet Nam" can be "Viet Nam" or "viet nam", instead of "Viet", "Nam"
        outputUp = []
        segment = self.removeTabWhiteSpaceNewLine(segment)
        segments = segment.split(" ")
        for w in range(len(segments)):
            word = segments[w]
            cleanWord = self.removePunct(word)
            cleanWord = self.removeTabWhiteSpaceNewLine(cleanWord)
            prevWord = ""
            if w > 0:
                prevWord = segments[w-1]
                cleanPreWord = self.removePunct(prevWord)
                cleanPreWord = self.removeTabWhiteSpaceNewLine(cleanPreWord)
            if len(cleanWord) > 0 and cleanWord[0].isupper() == True: # check isupper of first letter of capital word
                if len(prevWord) > 0 and prevWord[0].isupper() == True:
                    outputUp[-1] += " " + cleanWord
                else:
                    outputUp.append(cleanWord)
        return outputUp
    def textPreprocessing(self, text, keepPeriod=False):
        # lowercase
        #lowerText = self.text.lower()
        # remove punctuation & special characters
        cleanText = self.removePunct(text, KeepPeriod=keepPeriod)
        # removal of URLs in text
        cleanText = self.removeURL(cleanText)
        # removal of HTML Tags
        cleanText = self.removeHTMLTag(cleanText)
        # remove \n or \t and unnecessary white space
        cleanText = self.removeTabWhiteSpaceNewLine(cleanText)
        # stop-words removal
        filteredWord = self.removeStopWords(cleanText)
        # a sentence or the capital word behind a period "."
        return cleanText, filteredWord
    #generateNewChar = textPreprocessing("/content/drive/MyDrive/CollectData/NER/CountriesNameNCBI.json")
    #saveFile.saveFile("/content/drive/MyDrive/CollectData/NER/NewCharCountriesNameNCBI.json", json.dumps(generateNewChar))
    def splitStickWords(self, word):
        #output = []
        split_words = segment(word)
        '''for w in split_words:
            pos = word.lower().find(w)
            if word[pos].isupper() == True:
                output.append(w[0].upper() + w[1:])
            else:
                output.append(w)
            if pos >= 0:
                if pos+len(w) < len(word):
                    if word[pos+len(w)] == ".":
                        output[-1] = output[-1] + "." '''
        return " ".join(split_words)
    def removeDOI(self, word, doiLink=None):
        # if they have the word DOI in that: ex: 1368598DOI after general clean
        if "DOI" in word:
            word = word.replace(word, "")
        # if they have the link DOI in that: ex: 10.1007s004390161742yORIGINAL, but we still split the word
        if doiLink != None:
            w = self.splitStickWords(word)
            cleanDOI = self.removePunct(doiLink)
            if cleanDOI in w:
                word = w.replace(cleanDOI, "")
        return word
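A small sketch of the main cleaning entry point above; it assumes NLTK's stopwords and punkt data have already been downloaded:

# Clean a sentence and inspect the two return values of textPreprocessing
from NER import cleanText

cl = cleanText.cleanGenText()
clean, filtered = cl.textPreprocessing("Samples were collected in Viet Nam (see https://example.org).")
print(clean)      # punctuation, URL, and HTML tags stripped
print(filtered)   # tokens with English stopwords removed
print(cl.splitStickWords("southeastasia"))   # splits concatenated words via wordsegment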
NER/html/extractHTML.py
ADDED
@@ -0,0 +1,158 @@
#!pip install bs4
# reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
from bs4 import BeautifulSoup
import requests
from DefaultPackages import openFile, saveFile
from NER import cleanText
import pandas as pd
class HTML():
    def __init__(self, htmlFile, htmlLink):
        self.htmlLink = htmlLink
        self.htmlFile = htmlFile
    def openHTMLFile(self):
        if self.htmlLink != "None":
            r = requests.get(self.htmlLink)
            soup = BeautifulSoup(r.content, 'html.parser')
        else:
            with open(self.htmlFile) as fp:
                soup = BeautifulSoup(fp, 'html.parser')
        return soup
    def getText(self):
        soup = self.openHTMLFile()
        s = soup.find_all("html")
        text = ""
        for t in range(len(s)):
            text = s[t].get_text()
            cl = cleanText.cleanGenText()
            text = cl.removeExtraSpaceBetweenWords(text)
        return text
    def getListSection(self, scienceDirect=None):
        json = {}
        text = ""
        textJson, textHTML = "", ""
        if scienceDirect == None:
            soup = self.openHTMLFile()
            # get list of sections
            json = {}
            for h2Pos in range(len(soup.find_all('h2'))):
                if soup.find_all('h2')[h2Pos].text not in json:
                    json[soup.find_all('h2')[h2Pos].text] = []
                if h2Pos + 1 < len(soup.find_all('h2')):
                    content = soup.find_all('h2')[h2Pos].find_next("p")
                    nexth2Content = soup.find_all('h2')[h2Pos+1].find_next("p")
                    while content.text != nexth2Content.text:
                        json[soup.find_all('h2')[h2Pos].text].append(content.text)
                        content = content.find_next("p")
                else:
                    content = soup.find_all('h2')[h2Pos].find_all_next("p", string=True)
                    json[soup.find_all('h2')[h2Pos].text] = list(i.text for i in content)
            # format
            '''json = {'Abstract':[], 'Introduction':[], 'Methods':[],
            'Results':[], 'Discussion':[], 'References':[],
            'Acknowledgements':[], 'Author information':[], 'Ethics declarations':[],
            'Additional information':[], 'Electronic supplementary material':[],
            'Rights and permissions':[], 'About this article':[], 'Search':[], 'Navigation':[]}'''
        if scienceDirect != None or len(json) == 0:
            # Replace with your actual Elsevier API key
            api_key = "d0f25e6ae2b275e0d2b68e0e98f68d70"
            # ScienceDirect article DOI or PII (Example DOI)
            doi = self.htmlLink.split("https://doi.org/")[-1] #"10.1016/j.ajhg.2011.01.009"
            # Base URL for the Elsevier API
            base_url = "https://api.elsevier.com/content/article/doi/"
            # Set headers with API key
            headers = {
                "Accept": "application/json",
                "X-ELS-APIKey": api_key
            }
            # Make the API request
            response = requests.get(base_url + doi, headers=headers)
            # Check if the request was successful
            if response.status_code == 200:
                data = response.json()
                supp_data = data["full-text-retrieval-response"]#["coredata"]["link"]
                if "originalText" in list(supp_data.keys()):
                    if type(supp_data["originalText"])==str:
                        json["originalText"] = [supp_data["originalText"]]
                    if type(supp_data["originalText"])==dict:
                        json["originalText"] = [supp_data["originalText"][key] for key in supp_data["originalText"]]
                else:
                    if type(supp_data)==dict:
                        for key in supp_data:
                            json[key] = [supp_data[key]]

        textJson = self.mergeTextInJson(json)
        textHTML = self.getText()
        if len(textHTML) > len(textJson):
            text = textHTML
        else: text = textJson
        return text #json
    def getReference(self):
        # get references to collect more data from next
        ref = []
        json = self.getListSection()
        for key in json["References"]:
            ct = cleanText.cleanGenText()
            cleanedRef, filteredWord = ct.textPreprocessing(key) # was ct.cleanText(), which does not exist
            if cleanedRef not in ref:
                ref.append(cleanedRef)
        return ref
    def getSupMaterial(self):
        # check if there is supplementary material or not
        json = {}
        soup = self.openHTMLFile()
        for h2Pos in range(len(soup.find_all('h2'))):
            if "supplementary" in soup.find_all('h2')[h2Pos].text.lower() or "material" in soup.find_all('h2')[h2Pos].text.lower() or "additional" in soup.find_all('h2')[h2Pos].text.lower() or "support" in soup.find_all('h2')[h2Pos].text.lower():
                #print(soup.find_all('h2')[h2Pos].find_next("a").get("href"))
                link, output = [], []
                if soup.find_all('h2')[h2Pos].text not in json:
                    json[soup.find_all('h2')[h2Pos].text] = []
                for l in soup.find_all('h2')[h2Pos].find_all_next("a", href=True):
                    link.append(l["href"])
                if h2Pos + 1 < len(soup.find_all('h2')):
                    nexth2Link = soup.find_all('h2')[h2Pos+1].find_next("a", href=True)["href"]
                    if nexth2Link in link:
                        link = link[:link.index(nexth2Link)]
                # only take links having "https" in them
                for i in link:
                    if "https" in i: output.append(i)
                json[soup.find_all('h2')[h2Pos].text].extend(output)
        return json
    def extractTable(self):
        soup = self.openHTMLFile()
        df = []
        try:
            df = pd.read_html(str(soup))
        except ValueError:
            df = []
            print("No tables found in HTML file")
        return df
    def mergeTextInJson(self, jsonHTML):
        cl = cleanText.cleanGenText() # was cleanGenText(), which is not imported under that name
        htmlText = ""
        for sec in jsonHTML:
            # sections are separated by "\n\n"
            if len(jsonHTML[sec]) > 0:
                for i in range(len(jsonHTML[sec])):
                    # entries in the same section are separated by just a dot.
                    text = jsonHTML[sec][i]
                    if len(text) > 0:
                        #text = cl.removeTabWhiteSpaceNewLine(text)
                        #text = cl.removeExtraSpaceBetweenWords(text)
                        text, filteredWord = cl.textPreprocessing(text, keepPeriod=True)
                        jsonHTML[sec][i] = text
                    if i-1 >= 0:
                        if len(jsonHTML[sec][i-1]) > 0:
                            if jsonHTML[sec][i-1][-1] != ".":
                                htmlText += ". "
                    htmlText += jsonHTML[sec][i]
                    if len(jsonHTML[sec][i]) > 0:
                        if jsonHTML[sec][i][-1] != ".":
                            htmlText += "."
            htmlText += "\n\n"
        return htmlText
    def removeHeaders(self):
        pass
    def removeFooters(self):
        pass
    def removeReferences(self):
        pass
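A brief usage sketch of the HTML class above; the DOI link is a placeholder, and passing the string "None" as the link switches to a local HTML file:

# Hypothetical article link; network access is required for the remote path
from NER.html import extractHTML

html = extractHTML.HTML("", "https://doi.org/10.1000/xyz123")
text = html.getListSection()     # section text from <h2>/<p> pairs, or the Elsevier API fallback
sup = html.getSupMaterial()      # {"Supplementary information": ["https://...", ...]}
tables = html.extractTable()     # list of DataFrames from pandas.read_html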
NER/word2Vec/word2vec.py
ADDED
@@ -0,0 +1,364 @@
'''WORD TO VECTOR'''
import pandas as pd
import json
import gensim
import spacy
from DefaultPackages import openFile, saveFile
from NER import cleanText
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import common_texts
from gensim.models.word2vec import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
import sys
import subprocess
# can try multiprocessing to run quicker
import multiprocessing
import copy
sys.setrecursionlimit(1000)
# create folder word2Vec
#! mkdir /content/drive/MyDrive/CollectData/NER/word2Vec
# create word2vec model
#model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/CollectData/NER/word2Vec', binary=True)
'''Some notes for this model
sometimes when we do the corpus, there are some adverbs which are unnecessary but might be seen as
a similar word to the word we are finding, so can we try to preprocess text so that
we make the corpus more effective and only contains the important words. Then when we
train the model, the important words will be seen as important. Or
when we already have the similar list of words, we can remove the words in there
that are stopwords/unnecessary words.'''
### For more complex analysis, consider using sentence embedding models like "Doc2Vec" to represent the meaning of entire sentences instead of just individual words
class word2Vec():
    def __init__(self, nameFile=None, modelName=None):
        self.nameFile = nameFile
        self.modelName = modelName
    def spacy_similarity(self, word):
        # when using word2vec, the medium or large spaCy model is better
        # maybe try doc similarity?
        nlp = spacy.load("en_core_web_lg")
        doc = nlp(word)
        for token1 in doc:
            for token2 in doc:
                print(token1.text, token2.text, token1.similarity(token2))
        pass
    # clean text before transforming it into a corpus
    def cleanTextBeforeCorpus(self, oriText, doi=None):
        cl = cleanText.cleanGenText()
        #cl = cleanGenText()
        output = ""
        alreadyRemoveDoi = False
        for word in oriText.split(" "):
            # remove DOI
            if doi != None and doi in oriText:
                if alreadyRemoveDoi == False:
                    newWord = cl.removeDOI(word, doi)
                    if len(newWord) > 0 and newWord != word:
                        alreadyRemoveDoi = True
                        word = newWord
            # split the stuck-together words
            #word = cl.splitStickWords(word)
            # remove punctuation
            word = cl.removePunct(word, True)
            # remove URL
            word = cl.removeURL(word)
            # remove HTML tags
            word = cl.removeHTMLTag(word)
            # remove tab, white space, newline
            word = cl.removeTabWhiteSpaceNewLine(word)
            # optional: remove stopwords
            #word = cl.removeStopWords(word)
            if len(word) > 0:
                output += word + " "
        return output
    def cleanAllTextBeforeCorpus(self, allText, doi=None):
        cleanOutput = ""
        remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
        if len(allText) > 0:
            corpusText = allText
            for pos in range(len(corpusText.split("\n\n"))):
                if len(corpusText.split("\n\n")[pos]) > 0:
                    lines = corpusText.split("\n\n")[pos]
                    for line in lines.split("\n"):
                        if remove in line: line = line.replace(remove, "")
                        clean_text = self.cleanTextBeforeCorpus(line, doi)
                        cleanOutput += clean_text + "\n"
                cleanOutput += "\n\n"
        return cleanOutput
    def tableTransformToCorpusText(self, df, excelFile=None):
        # PDF, Excel, WordDoc
        #cl = cleanText.cleanGenText()
        corpus = {}
        # PDF or df
        if excelFile == None:
            if len(df) > 0:
                try:
                    for i in range(len(df)):
                        # each new dimension/page is considered to be a sentence which ends with the period.
                        # each new line is a new list, and each new df is a new corpus
                        outputDF = []
                        text = df[i].values.tolist()
                        if len(text) > 0:
                            outputRowDF = self.helperRowTableToCorpus(text)
                            #outputColDF = self.helperColTableToCorpus(text)
                            outputDF.extend(outputRowDF)
                            #outputDF.extend(outputColDF)
                        if len(outputDF) > 0:
                            corpus["corpus" + str(i)] = outputDF
                except:
                    outputDF = []
                    text = df.values.tolist()
                    if len(text) > 0:
                        outputRowDF = self.helperRowTableToCorpus(text)
                        #outputColDF = self.helperColTableToCorpus(text)
                        outputDF.extend(outputRowDF)
                        #outputDF.extend(outputColDF)
                    if len(outputDF) > 0:
                        corpus["corpus0"] = outputDF
        else:
            df = pd.ExcelFile(excelFile)
            sheetNames = df.sheet_names
            output = []
            if len(sheetNames) > 0:
                for s in range(len(sheetNames)):
                    outputDF = []
                    with pd.ExcelFile(excelFile) as xls:
                        data = pd.read_excel(xls, sheetNames[s])
                    if sheetNames[s] != 'Evaluation Warning':
                        text = data.values.tolist()
                        if len(text) > 0:
                            outputRowDF = self.helperRowTableToCorpus(text)
                            #outputColDF = self.helperColTableToCorpus(text)
                            outputDF.extend(outputRowDF)
                            #outputDF.extend(outputColDF)
                    if len(outputDF) > 0:
                        corpus["corpus" + str(s)] = outputDF
        return corpus
    def helperRowTableToCorpus(self, textList):
        #cl = cleanGenText()
        cl = cleanText.cleanGenText()
        stopWords = ["NaN","Unnamed:","nan"]
        outputDF = []
        for line in textList:
            outputLine = []
            for words in line:
                words = str(words)
                if len(words) > 0:
                    for word in words.split(" "):
                        # remove specific stopwords for tables: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
                        if str(word) not in stopWords: # remove "NaN", "Unnamed:", "nan"
                            #word = cl.splitStickWords(word)
                            word = cl.removePunct(word)
                            word = " ".join(cl.removeStopWords(word))
                            word = cl.removeTabWhiteSpaceNewLine(word)
                            if len(word) > 1:
                                if len(word.split(" ")) > 1:
                                    for x in word.split(" "):
                                        if len(x) > 1 and x.isnumeric()==False:
                                            outputLine.append(x.lower())
                                else:
                                    if word.isnumeric() == False:
                                        outputLine.append(word.lower())
            if len(outputLine) > 0:
                outputDF.append(outputLine)
        return outputDF
    def helperColTableToCorpus(self, dfList):
        #cl = cleanGenText()
        cl = cleanText.cleanGenText()
        stopWords = ["NaN","Unnamed:","nan"]
        outputDF = []
        # use the first line's length as the column reference
        for pos in range(len(dfList[0])):
            outputLine = []
            for line in dfList:
                if pos < len(line):
                    words = line[pos]
                    words = str(words)
                else: words = ""
                if len(words) > 0:
                    for word in words.split(" "):
                        # remove specific stopwords for tables: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
                        if str(word) not in stopWords: # remove "NaN", "Unnamed:", "nan"
                            #word = cl.splitStickWords(word)
                            word = cl.removePunct(word)
                            word = " ".join(cl.removeStopWords(word))
                            word = cl.removeTabWhiteSpaceNewLine(word)
                            if len(word) > 1:
                                if len(word.split(" ")) > 1:
                                    for x in word.split(" "):
                                        if len(x) > 1 and x.isnumeric()==False:
                                            outputLine.append(x.lower())
                                else:
                                    if word.isnumeric() == False:
                                        outputLine.append(word.lower())
            if len(outputLine) > 0:
                outputDF.append(outputLine)
        return outputDF
    # create a corpus
    def createCorpusText(self, corpusText):
        '''ex: "Tom is cat. Jerry is mouse."
        corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
        # the output should be like this:
        '''texts = {
        "Paragraph 1": [["Cat", "is", "an", "animal"], ["Tom", "is", "cat"]],
        "Paragraph 2": [["Mouse", "is", "an", "animal"], ["Jerry", "is", "mouse"]]
        }
        '''
        # separate paragraphs
        '''Ex: Cat is an animal. Tom is cat.

        Mouse is an animal.
        Jerry is mouse.'''
        texts = {}
        cl = cleanText.cleanGenText()
        #cl = cleanGenText()
        for pos in range(len(corpusText.split("\n\n"))):
            if len(corpusText.split("\n\n")[pos]) > 0:
                texts["Paragraph "+str(pos)] = []
                lines = corpusText.split("\n\n")[pos]
                for line in lines.split("\n"):
                    for l in line.split("."):
                        if len(l) > 0:
                            cl.removeTabWhiteSpaceNewLine(l)
                            l = l.lower()
                            newL = []
                            for word in l.split(" "):
                                if len(word) > 0:
                                    word = cl.removeStopWords(word)
                                    for w in word:
                                        if len(w) > 0 and w.isnumeric()==False:
                                            newL.append(w)
                            if len(newL) > 0:
                                texts["Paragraph "+str(pos)].append(newL)
                if len(texts["Paragraph "+str(pos)]) == 0:
                    del texts["Paragraph "+str(pos)]
        return texts
    def selectParaForWC(self, corpus):
        ''' corpus should be in the format:
        corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
        corSize, window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None, None
        corSize = len(corpus)
        # less than 2000
        if 0 < corSize < 2000:
            window=3.5
            vector_size=75
            sample=1e-3
            negative=10
            epochs=10
            sg=1
        # 2000 - 100000
        elif 2000 <= corSize < 100000:
            window=3.5
            vector_size=75
            sample=1e-5
            negative=10
            epochs=10
            sg=1
        elif 100000 <= corSize < 1000000:
            window=7.5
            vector_size=150
            sample=1e-5
            negative=10
            epochs=6
            sg=0
        return window, vector_size, sample, negative, epochs, sg
    def trainWord2Vec(self,nameFile,modelName,saveFolder,window=3.5,
                      vector_size=75,sample=1e-3,negative=10,epochs=10,sg=1):
        # if you don't have a backup file, you can use the nameFile again just to increase the length of the corpus
        jsonFile = ""
        jsonFile = openFile.openJsonFile(nameFile) # this is a corpus json file from an article
        cores = multiprocessing.cpu_count()
        combinedCorpus = []
        window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None
        if len(jsonFile) > 0:
            for key in jsonFile:
                combinedCorpus.extend(jsonFile[key])
            window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
            # min_count=1 ensures all words are included
            '''w2vModel = Word2Vec(
                min_count=1,
                window=window,
                vector_size=vector_size,
                sample=sample,
                alpha=0.03,
                min_alpha=0.0007,
                negative=negative,
                workers=cores-1,
                epochs = epochs,
                sg=sg)'''
            #w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
            accept = False
            while not accept:
                if window!=None and vector_size!=None and sample!=None and negative!=None and epochs!=None and sg!=None:
                    try:
                        w2vModel = Word2Vec(
                            min_count=1,
                            window=window,
                            vector_size=vector_size,
                            sample=sample,
                            alpha=0.03,
                            min_alpha=0.0007,
                            negative=negative,
                            workers=cores-1,
                            epochs = epochs,
                            sg=sg)
                        w2vModel.build_vocab(combinedCorpus)
                        w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
                        accept = True
                    except:
                        for key in jsonFile:
                            combinedCorpus.extend(jsonFile[key])
                        window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
                        print("next is " + str(len(combinedCorpus)))
                else:
                    print("no parameter to train")
                    break
            #w2vModel.build_vocab(combinedCorpus)
            #w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
            #w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
            #w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
            w2vModel.save(saveFolder+"/"+modelName+".model")
            w2vModel.wv.save_word2vec_format(saveFolder+"/"+modelName+".txt")
            print("done w2v")
        else: print("no corpus to train")
        #return combinedCorpus
    def genSimilar(self, word, modelFile, n=10, cos_thres=0.7):
        # might not be a meaningful keyword
        #stopWords = ["show"]
        # same word but just plural nouns, tense
        simWords = [word+"s",word+"es",word+"ing",word+"ed"]
        model = KeyedVectors.load_word2vec_format(modelFile, binary=False) # model file in txt format
        results = model.most_similar(positive=[word],topn=n)
        #removeIndex = []
        #currN = copy.deepcopy(n)
        '''for r in range(len(results)):
            if len(results[r][0]) < 2:
                removeIndex.append(results[r])
            # remove the same word but just plural and singular noun and lower than the cos_thres
            elif results[r][0] == word:
                removeIndex.append(results[r])
            elif results[r][0] in simWords or float(results[r][1]) < cos_thres or results[r][0] in stopWords:
                removeIndex.append(results[r])
        for rem in removeIndex:
            results.remove(rem)
        while len(results)!=n and len(results) != 0:
            moreNewResult = model.most_similar(positive=[word],topn=currN+1)[-1]
            if moreNewResult not in results and len(moreNewResult[0])>1:
                if moreNewResult[0] not in stopWords and results[0] != word:
                    results.append(moreNewResult)
                    currN +=1'''
        return results
    # adding our model into spacy
    # this deals with the command line; but instead of using it directly, we write a python script to run the command line
    def loadWordVec(self, modelName, wordVec):
        # modelName is the name you want to save into spacy
        # wordVec is the trained word2vec in txt format
        subprocess.run([sys.executable,
                        "-m",
                        "spacy",
                        "init-model",
                        "en",
                        modelName, # this modelName comes from the saved modelName of function trainWord2Vec
                        "--vectors-loc",
                        wordVec])
        print("done")
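A minimal end-to-end sketch of the word2Vec helpers above; the corpus file name, model name, and output folder are placeholders, and the output folder must already exist:

# Build a tiny corpus, persist it, train, then query the saved vectors
from DefaultPackages import saveFile
from NER.word2Vec import word2vec

wv = word2vec.word2Vec()
corpus = wv.createCorpusText("Cat is an animal. Tom is cat.\n\nMouse is an animal.\nJerry is mouse.")
saveFile.saveJsonFile("corpus.json", corpus)            # the dict-of-token-lists format trainWord2Vec expects
wv.trainWord2Vec("corpus.json", "demoModel", "/tmp/models")
print(wv.genSimilar("cat", "/tmp/models/demoModel.txt", n=5))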
app.py
ADDED
@@ -0,0 +1,132 @@
# ✅ Optimized mtDNA MVP UI with Faster Pipeline & Required Feedback

import gradio as gr
from collections import Counter
import csv
import os
from functools import lru_cache
from mtdna_classifier import classify_sample_location
@lru_cache(maxsize=128)
def classify_sample_location_cached(accession):
    return classify_sample_location(accession)

# Count and suggest final location
def compute_final_suggested_location(rows):
    candidates = [
        row.get("Predicted Location", "").strip()
        for row in rows
        if row.get("Predicted Location", "").strip().lower() not in ["", "sample id not found"]
    ] + [
        row.get("Inferred Region", "").strip()
        for row in rows
        if row.get("Inferred Region", "").strip().lower() not in ["", "unknown"]
    ]

    if not candidates:
        return Counter(), ("Unknown", 0)

    counts = Counter(candidates)
    top_location, count = counts.most_common(1)[0]
    return counts, (top_location, count)

# Store feedback (with required fields)
def store_feedback_to_drive(accession, answer1, answer2, contact=""):
    if not answer1.strip() or not answer2.strip():
        return "⚠️ Please answer both questions before submitting."

    feedback_file = "/content/drive/MyDrive/Customers/feedback_mtdna.csv"
    header = ["accession", "helpful", "improvement", "contact"]
    row = [accession, answer1, answer2, contact]
    file_exists = os.path.isfile(feedback_file)
    with open(feedback_file, "a", newline="") as f:
        writer = csv.writer(f)
        if not file_exists:
            writer.writerow(header)
        writer.writerow(row)
    return "✅ Feedback submitted. Thank you!"

def summarize_results(accession):
    try:
        output = classify_sample_location_cached(accession)
    except Exception as e:
        return [], f"❌ Error: {e}"

    if accession not in output:
        return [], "❌ Accession not found in results."

    isolate = next((k for k in output if k != accession), None)
    row_score = []
    rows = []

    for key in [accession, isolate]:
        if key not in output:
            continue
        sample_id_label = f"{key} ({'accession number' if key == accession else 'isolate of accession'})"
        for section, techniques in output[key].items():
            for technique, content in techniques.items():
                source = content.get("source", "")
                predicted = content.get("predicted_location", "")
                haplogroup = content.get("haplogroup", "")
                inferred = content.get("inferred_location", "")
                context = content.get("context_snippet", "")[:300] if "context_snippet" in content else ""

                row = {
                    "Sample ID": sample_id_label,
                    "Technique": technique,
                    "Source": f"The region of haplogroup is inferred\nby using this source: {source}" if technique == "haplogroup" else source,
                    "Predicted Location": "" if technique == "haplogroup" else predicted,
                    "Haplogroup": haplogroup if technique == "haplogroup" else "",
                    "Inferred Region": inferred if technique == "haplogroup" else "",
                    "Context Snippet": context
                }

                row_score.append(row)
                rows.append(list(row.values()))

    location_counts, (final_location, count) = compute_final_suggested_location(row_score)
    summary_lines = [f"### 🧭 Location Frequency Summary", "After counting all predicted and inferred locations:\n"]
    summary_lines += [f"- **{loc}**: {cnt} times" for loc, cnt in location_counts.items()]
    summary_lines.append(f"\n**Final Suggested Location:** 🗺️ **{final_location}** (mentioned {count} times)")
    summary = "\n".join(summary_lines)

    return rows, summary

# Gradio UI
with gr.Blocks() as interface:
    gr.Markdown("# 🧬 mtDNA Location Classifier (MVP)")
    gr.Markdown("Enter an accession number to infer geographic origin. You'll see predictions, confidence scores, and can submit feedback.")

    with gr.Row():
        accession = gr.Textbox(label="Enter Accession Number (e.g., KU131308)")
        run_button = gr.Button("🔍 Submit and Classify")
        reset_button = gr.Button("🔄 Reset")

    status = gr.Markdown(visible=False)
    headers = ["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"]
    output_table = gr.Dataframe(headers=headers, interactive=False)
    output_summary = gr.Markdown()

    gr.Markdown("---")
    gr.Markdown("### 💬 Feedback (required)")
    q1 = gr.Textbox(label="1️⃣ Was the inferred location accurate or helpful? Please explain.")
    q2 = gr.Textbox(label="2️⃣ What would improve your experience with this tool?")
    contact = gr.Textbox(label="📧 Your email or institution (optional)")
    submit_feedback = gr.Button("✅ Submit Feedback")
    feedback_status = gr.Markdown()

    def classify_with_loading(accession):
        return gr.update(value="⏳ Please wait... processing...", visible=True)

    def classify_main(accession):
        table, summary = summarize_results(accession)
        return table, summary, gr.update(visible=False)

    def reset_fields():
        return "", "", "", "", "", [], "", gr.update(visible=False)

    run_button.click(fn=classify_with_loading, inputs=accession, outputs=status)
    run_button.click(fn=classify_main, inputs=accession, outputs=[output_table, output_summary, status])
    submit_feedback.click(fn=store_feedback_to_drive, inputs=[accession, q1, q2, contact], outputs=feedback_status)
    reset_button.click(fn=reset_fields, inputs=[], outputs=[accession, q1, q2, contact, feedback_status, output_table, output_summary, status])

interface.launch()
data/haplogroup_regions_extended.csv
ADDED
@@ -0,0 +1,51 @@
haplogroup,region,source
H,Western Europe,PhyloTree
U,Eurasia,PhyloTree
L0,Southern Africa,EMPOP
L1,Central Africa,EMPOP
L2,West Africa,EMPOP
L3,East Africa,EMPOP
B4,Southeast Asia,EMPOP
A2,Native North America,PhyloTree
C1,Siberia and Americas,PhyloTree
D4,East Asia,PhyloTree
X,Western Eurasia / North America,PhyloTree
J,Europe and Near East,PhyloTree
K,Europe,PhyloTree
T,Europe and Central Asia,PhyloTree
M,Asia,EMPOP
N,Worldwide (basal),PhyloTree
I,Europe,PhyloTree
W,Eurasia,PhyloTree
Z,North and East Asia,PhyloTree
Y,Southeast Asia,EMPOP
E,Oceania and Southeast Asia,PhyloTree
F,East and Southeast Asia,EMPOP
B2,Native South America,EMPOP
A1,Central Asia,EMPOP
C4,Siberia,PhyloTree
D1,South America,PhyloTree
M7,East Asia,EMPOP
M8,Japan,EMPOP
G,Siberia,PhyloTree
HV,Europe and Middle East,PhyloTree
U5,Northern Europe,PhyloTree
U6,North Africa,PhyloTree
U7,South Asia,PhyloTree
U8,Central Europe,PhyloTree
R0,Arabian Peninsula,PhyloTree
R9,Southeast Asia,PhyloTree
H1,Iberian Peninsula,PhyloTree
H2,Eastern Europe,PhyloTree
H3,Western Europe,PhyloTree
H5,Balkans,PhyloTree
J1,Europe,PhyloTree
J2,Middle East,PhyloTree
T1,Eastern Europe,PhyloTree
T2,Near East,PhyloTree
M1,North Africa,PhyloTree
M2,South Asia,PhyloTree
M3,South Asia,PhyloTree
M4,South Asia,PhyloTree
M5,South Asia,PhyloTree
M6,South Asia,PhyloTree
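This CSV is a plain haplogroup-to-region lookup; a hypothetical sketch of reading it with pandas follows (the infer_region helper is illustrative, not part of the repo):

# Map a haplogroup label to its region using the CSV above
import pandas as pd

regions = pd.read_csv("data/haplogroup_regions_extended.csv")

def infer_region(haplogroup):
    # longest-prefix match so e.g. "H1a2" resolves to "H1" before falling back to "H"
    hits = regions[regions["haplogroup"].apply(lambda h: haplogroup.startswith(h))]
    if hits.empty:
        return "Unknown"
    return hits.loc[hits["haplogroup"].str.len().idxmax(), "region"]

print(infer_region("H1a2"))   # Iberian Peninsula
print(infer_region("L3e"))    # East Africa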
mtdna_classifier.py
ADDED
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# mtDNA Location Classifier MVP (Google Colab)
# Accepts accession number → fetches PubMed ID + isolate name → gets paper text → predicts location
import os
import subprocess
import re
import csv
from Bio import Entrez
import fitz
import spacy
from NER.PDF import pdf
from NER.WordDoc import wordDoc
from NER.html import extractHTML
from NER.word2Vec import word2vec
from transformers import pipeline

# Set your email (required by NCBI Entrez)
#Entrez.email = "[email protected]"

# Step 1: Get PubMed ID and isolate name from an accession using EDirect
def get_info_from_accession(accession):
    cmd = f'{os.environ["HOME"]}/edirect/esummary -db nuccore -id {accession} -format medline | egrep "PUBMED|isolate"'
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    output = result.stdout
    pubmedID, isolate = "", ""
    for line in output.split("\n"):
        if len(line) > 0:
            if "PUBMED" in line:
                pubmedID = line.split()[-1]
            if "isolate" in line:  # check for isolate information
                # Try direct GenBank annotation: /isolate="XXX"
                match1 = re.search(r'/isolate\s*=\s*"([^"]+)"', line)  # search on current line
                if match1:
                    isolate = match1.group(1)
                else:
                    # Try the DEFINITION line: ...isolate XXX...
                    match2 = re.search(r'isolate\s+([A-Za-z0-9_-]+)', line)  # search on current line
                    if match2:
                        isolate = match2.group(1)
    # Return the values, even if they are empty strings
    return pubmedID, isolate

# Step 2: Get the DOI needed to access the paper
def get_doi_from_pubmed_id(id):
    cmd = f'{os.environ["HOME"]}/edirect/esummary -db pubmed -id {id} -format medline | grep -i "AID"'
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    output = result.stdout
    doi_pattern = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+(?=\s*\[doi\])'
    match = re.search(doi_pattern, output, re.IGNORECASE)
    return match.group(0) if match else ""  # empty string if no DOI is listed

# Step 3: Extract text: get the paper (HTML), supplementary materials (pdf, doc, excel) and do text preprocessing
# Step 3.1: Extract text
def get_paper_text(doi, id):
    # Create a temporary folder to hold the downloaded files
    saveLinkFolder = "/mtDNALocation/data/" + id
    cmd = f'mkdir -p {saveLinkFolder}'
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    link = 'https://doi.org/' + doi
    # textsToExtract = { "doiLink": "paperText",
    #                    "file1.pdf": "text1",
    #                    "file2.doc": "text2",
    #                    "file3.xlsx": excelText3 }
    textsToExtract = {}
    # Collect the list of files (main paper + supplementary materials) for this id
    html = extractHTML.HTML("", link)
    jsonSM = html.getSupMaterial()
    text = ""
    links = [link] + sum((jsonSM[key] for key in jsonSM), [])
    for l in links:
        if l == link:
            # The main paper
            text = html.getListSection()
            textsToExtract[link] = text
        elif l.endswith(".pdf"):
            p = pdf.PDF(l, saveLinkFolder, doi)
            f = p.openPDFFile()  # downloads the PDF into saveLinkFolder
            pdf_path = saveLinkFolder + "/" + l.split("/")[-1]
            doc = fitz.open(pdf_path)
            text = "\n".join([page.get_text() for page in doc])
            textsToExtract[l] = text
        elif l.endswith(".doc") or l.endswith(".docx"):
            d = wordDoc.wordDoc(l, saveLinkFolder)
            text = d.extractTextByPage()
            textsToExtract[l] = text
        elif l.split(".")[-1].lower() in ("xls", "xlsx"):
            wc = word2vec.word2Vec()
            corpus = wc.tableTransformToCorpusText([], l)
            text = ''
            for c in corpus:
                para = corpus[c]
                for words in para:
                    text += " ".join(words)
            textsToExtract[l] = text
    # Delete the temporary folder after all texts have been extracted
    cmd = f'rm -r {saveLinkFolder}'
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    return textsToExtract

# Step 3.2: Extract the context window around a keyword
def extract_context(text, keyword, window=500):
    idx = text.find(keyword)
    if idx == -1:
        return "Sample ID not found."
    return text[max(0, idx - window): idx + window]

# Step 4: Classification (demo purposes for now)
# 4.1: Using a HuggingFace model (question answering)
def infer_location_fromQAModel(context, question="Where is the mtDNA sample from?"):
    qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
    result = qa({"context": context, "question": question})
    return result["answer"]

# 4.2: Infer from haplogroup
# Load pre-trained spaCy model for NER
nlp = spacy.load("en_core_web_sm")

# Load the haplogroup-to-region mapping (simple rule-based lookup)
def load_haplogroup_mapping(csv_path):
    mapping = {}
    with open(csv_path) as f:
        reader = csv.DictReader(f)
        for row in reader:
            mapping[row["haplogroup"]] = [row["region"], row["source"]]
    return mapping

# Extract the haplogroup code from the text
def extract_haplogroup(text):
    # 1. Try to find a haplogroup preceded by the word "haplogroup"
    match = re.search(r'\bhaplogroup\s+([A-Z][0-9a-z]*)\b', text)
    if match:
        # Keep only the top-level code (e.g. "U5" from "U5b1")
        return re.match(r'^[A-Z][0-9]*', match.group(1)).group(0)
    # 2. Fallback: try to find an isolated uppercase-letter haplogroup code
    fallback = re.search(r'\b([A-Z][0-9a-z]{1,5})\b', text)
    if fallback:
        return fallback.group(1)
    return None  # nothing found

# Extract locations from the text with NER
def extract_location(text):
    doc = nlp(text)
    locations = []
    for ent in doc.ents:
        if ent.label_ == "GPE":  # GPE = Geopolitical Entity (location)
            locations.append(ent.text)
    return locations

# Infer a region from the haplogroup
def infer_location_from_haplogroup(haplogroup):
    haplo_map = load_haplogroup_mapping("data/haplogroup_regions_extended.csv")
    return haplo_map.get(haplogroup, ["Unknown", "Unknown"])

# Classify the mtDNA sample from its surrounding context
def classify_mtDNA_sample_from_haplo(text):
    # Extract haplogroup
    haplogroup = extract_haplogroup(text)
    # Extract locations mentioned in the context
    locations = extract_location(text)
    # Infer the region from the haplogroup
    inferred_location, sourceHaplo = infer_location_from_haplogroup(haplogroup)
    return {
        "source": sourceHaplo,
        "locations_found_in_context": locations,
        "haplogroup": haplogroup,
        "inferred_location": inferred_location
    }

# 4.3: Get the location directly from NCBI metadata when available
def infer_location_fromNCBI(accession):
    cmd = f'{os.environ["HOME"]}/edirect/esummary -db nuccore -id {accession} -format medline | egrep "location|country|geo"'
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    output, location = "", ""
    output = result.stdout
    if "location" in output or "country" in output or "geo" in output:
        location = output.split('"')[1]
        output = output.split()[0]
    else:
        location = "Unknown"
        output = "No location information found in NCBI."
    return location, output

# Step 5: Main pipeline: accession -> 1. get PubMed ID and isolate -> 2. get DOI -> 3. get text
#         -> 4. prediction -> 5. output: inferred location + explanation + confidence score
def classify_sample_location(accession):
    outputs = {}
    keyword, context, location, qa_result, haplo_result = "", "", "", "", ""
    # Step 1: get PubMed ID and isolate
    pubmedID, isolate = get_info_from_accession(accession)
    # Step 2: get DOI
    doi = get_doi_from_pubmed_id(pubmedID)
    # Step 3: get text
    # textsToExtract = { "doiLink": "paperText",
    #                    "file1.pdf": "text1",
    #                    "file2.doc": "text2",
    #                    "file3.xlsx": excelText3 }
    textsToExtract = get_paper_text(doi, pubmedID)
    # Step 4: prediction
    outputs[accession] = {}
    outputs[isolate] = {}
    # 4.0: infer from NCBI metadata
    location, outputNCBI = infer_location_fromNCBI(accession)
    NCBI_result = {
        "source": "NCBI",
        "sample_id": accession,
        "predicted_location": location,
        "context_snippet": outputNCBI}
    outputs[accession]["NCBI"] = {"NCBI": NCBI_result}
    for key in textsToExtract:
        text = textsToExtract[key]
        # Try the accession number first
        outputs[accession][key] = {}
        keyword = accession
        context = extract_context(text, keyword, window=500)
        # 4.1: HuggingFace question-answering model
        location = infer_location_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
        qa_result = {
            "source": key,
            "sample_id": keyword,
            "predicted_location": location,
            "context_snippet": context
        }
        outputs[keyword][key]["QAModel"] = qa_result
        # 4.2: infer from haplogroup
        haplo_result = classify_mtDNA_sample_from_haplo(context)
        outputs[keyword][key]["haplogroup"] = haplo_result
        # Then try the isolate name
        keyword = isolate
        outputs[isolate][key] = {}
        context = extract_context(text, keyword, window=500)
        # 4.1.1: HuggingFace question-answering model
        location = infer_location_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
        qa_result = {
            "source": key,
            "sample_id": keyword,
            "predicted_location": location,
            "context_snippet": context
        }
        outputs[keyword][key]["QAModel"] = qa_result
        # 4.2.1: infer from haplogroup
        haplo_result = classify_mtDNA_sample_from_haplo(context)
        outputs[keyword][key]["haplogroup"] = haplo_result
    return outputs
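For orientation, a minimal usage sketch (not part of the upload): it assumes EDirect is installed under $HOME/edirect as done by setup.sh below, that data/haplogroup_regions_extended.csv is present, and that the accession string is only a placeholder.

    # Illustrative only: run the full pipeline for one accession and pretty-print the nested result.
    import json
    from mtdna_classifier import classify_sample_location

    results = classify_sample_location("PLACEHOLDER_ACCESSION")  # replace with a real GenBank accession
    print(json.dumps(results, indent=2, default=str))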
requirements.txt
ADDED
@@ -0,0 +1,19 @@
gradio
transformers
torch
pandas
scikit-learn
spacy
pymupdf
requests
biopython
openpyxl
bs4
pdfreader
tabula-py
python-docx
thefuzz
wordsegment
spacy-lookups-data
gensim
nltk
setup.sh
ADDED
@@ -0,0 +1,8 @@
#!/bin/bash

# Install Entrez Direct automatically with yes
yes | sh -c "$(wget -q https://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh -O -)"

# Add EDirect to PATH for the current session
echo 'export PATH=$HOME/edirect:$PATH' >> ~/.bashrc
source ~/.bashrc
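Because mtdna_classifier.py shells out to $HOME/edirect/esummary directly, a quick sanity check after running setup.sh can save debugging time. A minimal sketch, not part of the upload:

    # Illustrative only: confirm the EDirect binary is where the classifier expects it.
    import os

    esummary = os.path.join(os.environ["HOME"], "edirect", "esummary")
    if not os.path.exists(esummary):
        raise RuntimeError("EDirect not found; run setup.sh before using the classifier.")
    print("EDirect found at", esummary)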