VyLala committed on
Commit 2621d77 · verified · 1 Parent(s): f758879

Upload 19 files

DefaultPackages/__init__.py ADDED
@@ -0,0 +1,4 @@
+ __all__ = [
+     'openFile',
+     'saveFile',
+ ]
DefaultPackages/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (193 Bytes).
DefaultPackages/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (212 Bytes).
DefaultPackages/__pycache__/openFile.cpython-310.pyc ADDED
Binary file (566 Bytes).
DefaultPackages/__pycache__/openFile.cpython-311.pyc ADDED
Binary file (989 Bytes).
DefaultPackages/__pycache__/saveFile.cpython-310.pyc ADDED
Binary file (590 Bytes).
DefaultPackages/__pycache__/saveFile.cpython-311.pyc ADDED
Binary file (1.02 kB).
DefaultPackages/openFile.py ADDED
@@ -0,0 +1,12 @@
+ def openFile(file):
+     with open(file) as f:
+         openFile = f.read()
+     return openFile
+
+ def openJsonFile(file):
+     import json
+     # Opening JSON file
+     with open(file, 'r') as openfile:
+         # Reading from json file
+         json_object = json.load(openfile)
+     return json_object
DefaultPackages/saveFile.py ADDED
@@ -0,0 +1,11 @@
+ import json
+ def saveFile(name, content):
+     Name = name
+     # write the new content into the saved file;
+     # "w" mode truncates the file, so any previous content is replaced
+     with open(Name, "w") as external_file:
+         add_text = content
+         print(add_text, file=external_file)
+         # the file is closed automatically when the "with" block exits
+ def saveJsonFile(name, content):
+     saveFile(name, json.dumps(content))
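
A minimal usage sketch for these two helpers (not part of the commit; the file name notes.json is only an illustration, and it assumes the DefaultPackages package is on the import path):

    from DefaultPackages import openFile, saveFile

    # write a dict out as JSON, then read it back
    saveFile.saveJsonFile("notes.json", {"accession": "KU131308", "region": "East Asia"})
    data = openFile.openJsonFile("notes.json")   # -> {'accession': 'KU131308', 'region': 'East Asia'}
    raw = openFile.openFile("notes.json")        # the same file as plain text
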
NER/PDF/pdf.py ADDED
@@ -0,0 +1,142 @@
+ #!pip install pdfreader
+ import pdfreader
+ from pdfreader import PDFDocument, SimplePDFViewer
+ #!pip install bs4
+ from bs4 import BeautifulSoup
+ import requests
+ from NER import cleanText
+ #!pip install tabula-py
+
+ import tabula
+ class PDF(): # extracts text and tables from a PDF using pdfreader and tabula
+     def __init__(self, pdf, saveFolder, doi=None):
+         self.pdf = pdf
+         self.doi = doi
+         self.saveFolder = saveFolder
+     def openPDFFile(self):
+         if "https" in self.pdf:
+             name = self.pdf.split("/")[-1]
+             name = self.downloadPDF(self.saveFolder)
+             if name != "no pdfLink to download":
+                 fileToOpen = self.saveFolder + "/" + name
+             else: fileToOpen = self.pdf
+         else: fileToOpen = self.pdf
+         return open(fileToOpen, "rb")
+     def downloadPDF(self, saveFolder):
+         pdfLink = ''
+         if ".pdf" not in self.pdf and "https" not in self.pdf: # the download link is a general URL, not a direct PDF link
+             r = requests.get(self.pdf)
+             soup = BeautifulSoup(r.content, 'html.parser')
+             links = soup.find_all("a")
+             for link in links:
+                 if ".pdf" in link.get("href"):
+                     if self.doi in link.get("href"):
+                         pdfLink = link.get("href")
+                         break
+         else:
+             pdfLink = self.pdf
+         if pdfLink != '':
+             response = requests.get(pdfLink)
+             name = pdfLink.split("/")[-1]
+             pdf = open(saveFolder+"/"+name, 'wb')
+             pdf.write(response.content)
+             pdf.close()
+             print("pdf downloaded")
+             return name
+         else:
+             return "no pdfLink to download"
+     def extractText(self):
+         jsonPage = {}
+         pdf = self.openPDFFile()
+         doc = PDFDocument(pdf)
+         viewer = SimplePDFViewer(pdf)
+         all_pages = [p for p in doc.pages()]
+         cl = cleanText.cleanGenText()
+         for page in range(1, len(all_pages)):
+             viewer.navigate(page)
+             viewer.render()
+             if str(page) not in jsonPage:
+                 jsonPage[str(page)] = {}
+             # text
+             text = "".join(viewer.canvas.strings)
+             clean, filteredWord = cl.textPreprocessing(text) #cleanText.cleanGenText(text).cleanText()
+             # keep both the raw text and the filtered text (stopwords such as "a", "the", "an", "is" removed)
+             jsonPage[str(page)]["normalText"] = [text]
+             jsonPage[str(page)]["cleanText"] = [' '.join(filteredWord)]
+             # image
+             image = viewer.canvas.images
+             jsonPage[str(page)]["image"] = [image]
+             # form
+             form = viewer.canvas.forms
+             jsonPage[str(page)]["form"] = [form]
+             # content based on the Adobe PDF content stream
+             content = viewer.canvas.text_content
+             jsonPage[str(page)]["content"] = [content]
+             # inline_image:
+             '''Inline images are aligned with the text,
+             and are usually content images like photos, charts, or graphs.'''
+             inline_image = viewer.canvas.inline_images
+             jsonPage[str(page)]["inline_image"] = [inline_image]
+         pdf.close()
+         '''Output format:
+         jsonPage[str(page)]["normalText"]
+         jsonPage[str(page)]["cleanText"]
+         jsonPage[str(page)]["image"]
+         jsonPage[str(page)]["form"]
+         jsonPage[str(page)]["content"]'''
+         return jsonPage
+     def extractTable(self, pages, saveFile=None, outputFormat=None):
+         '''pages (str, int, iterable of int, optional) –
+         an optional value specifying the pages to extract from; accepts str, int, or an iterable of int. Default: 1
+         Examples: '1-2,3', 'all', [1,2]'''
+         df = []
+         if "https" in self.pdf:
+             name = self.pdf.split("/")[-1]
+             name = self.downloadPDF(self.saveFolder)
+             if name != "no pdfLink to download":
+                 fileToOpen = self.saveFolder + "/" + name
+             else: fileToOpen = self.pdf
+         else: fileToOpen = self.pdf
+         try:
+             df = tabula.read_pdf(fileToOpen, pages=pages)
+             # saveFile: "/content/drive/MyDrive/CollectData/NER/PDF/tableS1.csv"
+             # outputFormat: "csv"
+             #tabula.convert_into(self.pdf, saveFile, output_format=outputFormat, pages=pages)
+         except: # ValueError:
+             df = []
+             print("No tables found in PDF file")
+         return df
+     def mergeTextinJson(self, jsonPDF):
+         # pdf
+         #cl = cleanGenText()
+         cl = cleanText.cleanGenText()
+         pdfText = ""
+         for page in jsonPDF:
+             # pages are separated by "\n\n"
+             if len(jsonPDF[page]["normalText"]) > 0:
+                 for i in range(len(jsonPDF[page]["normalText"])):
+                     text = jsonPDF[page]["normalText"][i]
+                     if len(text) > 0:
+                         text = cl.removeTabWhiteSpaceNewLine(text)
+                         text = cl.removeExtraSpaceBetweenWords(text)
+                         jsonPDF[page]["normalText"][i] = text
+                     # entries on the same page are separated by just a period.
+                     if i-1 > 0:
+                         if jsonPDF[page]["normalText"][i-1][-1] != ".":
+                             pdfText += ". "
+                     pdfText += jsonPDF[page]["normalText"][i]
+                     if len(jsonPDF[page]["normalText"][i]) > 0:
+                         if jsonPDF[page]["normalText"][i][-1] != ".":
+                             pdfText += "."
+             pdfText += "\n\n"
+         return pdfText
+     def getReference(self):
+         pass
+     def getSupMaterial(self):
+         pass
+     def removeHeaders(self):
+         pass
+     def removeFooters(self):
+         pass
+     def removeReference(self):
+         pass
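
A hedged usage sketch for the PDF class (not part of the commit). The URL, folder, and DOI below are placeholders; any downloadable open-access PDF and a writable folder would do, and tabula additionally needs a Java runtime:

    from NER.PDF import pdf

    p = pdf.PDF("https://example.org/paper.pdf", "/tmp/pdfs", doi="10.1000/xyz123")
    pages = p.extractText()                # {"1": {"normalText": [...], "cleanText": [...], ...}, ...}
    text = p.mergeTextinJson(pages)        # one string, pages separated by blank lines
    tables = p.extractTable(pages="all")   # list of DataFrames, or [] if none are found
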
NER/WordDoc/wordDoc.py ADDED
@@ -0,0 +1,149 @@
+ #! pip install spire.doc
+ #! pip install Spire.XLS
+ import pandas as pd
+ from spire.doc import *
+ from spire.doc.common import *
+ from spire.xls import *
+ from spire.xls.common import *
+ from NER import cleanText
+ import requests
+ class wordDoc(): # extracts text and tables from Word documents using Spire.Doc
+     def __init__(self, wordDoc, saveFolder):
+         self.wordDoc = wordDoc
+         self.saveFolder = saveFolder
+     def openFile(self):
+         document = Document()
+         return document.LoadFromFile(self.wordDoc)
+     def extractTextByPage(self):
+         # reference: https://medium.com/@alice.yang_10652/extract-text-from-word-documents-with-python-a-comprehensive-guide-95a67e23c35c
+         json = {}
+         #doc = self.openFile()
+         # load the document; if loading from a local path fails, download the file into saveFolder first
+         try:
+             doc = Document()
+             doc.LoadFromFile(self.wordDoc)
+         except:
+             response = requests.get(self.wordDoc)
+             name = self.wordDoc.split("/")[-1]
+             with open(self.saveFolder+"/" + name, "wb") as temp_file: # create a temporary file to store the downloaded data
+                 temp_file.write(response.content)
+             doc = Document()
+             doc.LoadFromFile(self.saveFolder+"/" + name)
+         text = doc.GetText()
+         return text
+     def extractTableAsText(self):
+         getDoc = ''
+         try:
+             # reference:
+             # https://www.e-iceblue.com/Tutorials/Python/Spire.Doc-for-Python/Program-Guide/Table/Python-Extract-Tables-from-Word-Documents.html
+             doc = Document()
+             doc.LoadFromFile(self.wordDoc)
+             getDoc = "have document"
+         except:
+             response = requests.get(self.wordDoc)
+             name = self.wordDoc.split("/")[-1]
+             with open(self.saveFolder+"/" + name, "wb") as temp_file: # create a temporary file to store the downloaded data
+                 temp_file.write(response.content)
+             doc = Document()
+             doc.LoadFromFile(self.saveFolder+"/" + name)
+             getDoc = "have document"
+         json = {}
+         if len(getDoc) > 0:
+             # Loop through the sections
+             for s in range(doc.Sections.Count):
+                 # Get a section
+                 section = doc.Sections.get_Item(s)
+                 # Get the tables in the section
+                 json["Section" + str(s)] = {}
+                 tables = section.Tables
+                 # Loop through the tables
+                 for i in range(0, tables.Count):
+                     # Get a table
+                     table = tables.get_Item(i)
+                     # Initialize a string to store the table data
+                     tableData = ''
+                     # Loop through the rows of the table
+                     for j in range(0, table.Rows.Count):
+                         # Loop through the cells of the row
+                         for k in range(0, table.Rows.get_Item(j).Cells.Count):
+                             # Get a cell
+                             cell = table.Rows.get_Item(j).Cells.get_Item(k)
+                             # Get the text in the cell
+                             cellText = ''
+                             for para in range(cell.Paragraphs.Count):
+                                 paragraphText = cell.Paragraphs.get_Item(para).Text
+                                 cellText += (paragraphText + ' ')
+                             # Add the text to the string
+                             tableData += cellText
+                             if k < table.Rows.get_Item(j).Cells.Count - 1:
+                                 tableData += '\t'
+                         # Add a new line
+                         tableData += '\n'
+                     json["Section" + str(s)]["Table"+str(i)] = tableData
+         return json
+     def extractTableAsExcel(self):
+         getDoc = ''
+         try:
+             # reference:
+             # https://www.e-iceblue.com/Tutorials/Python/Spire.Doc-for-Python/Program-Guide/Table/Python-Extract-Tables-from-Word-Documents.html
+             doc = Document()
+             doc.LoadFromFile(self.wordDoc)
+             getDoc = "have document"
+         except:
+             response = requests.get(self.wordDoc)
+             name = self.wordDoc.split("/")[-1]
+             with open(self.saveFolder+"/" + name, "wb") as temp_file: # create a temporary file to store the downloaded data
+                 temp_file.write(response.content)
+             doc = Document()
+             doc.LoadFromFile(self.saveFolder+"/" + name)
+             getDoc = "have document"
+         if len(getDoc) > 0:
+             try:
+                 # Create an instance of Workbook
+                 wb = Workbook()
+                 wb.Worksheets.Clear()
+
+                 # Loop through sections in the document
+                 for i in range(doc.Sections.Count):
+                     # Get a section
+                     section = doc.Sections.get_Item(i)
+                     # Loop through tables in the section
+                     for j in range(section.Tables.Count):
+                         # Get a table
+                         table = section.Tables.get_Item(j)
+                         # Create a worksheet
+                         ws = wb.Worksheets.Add(f'Table_{i+1}_{j+1}')
+                         # Write the table to the worksheet
+                         for row in range(table.Rows.Count):
+                             # Get a row
+                             tableRow = table.Rows.get_Item(row)
+                             # Loop through cells in the row
+                             for cell in range(tableRow.Cells.Count):
+                                 # Get a cell
+                                 tableCell = tableRow.Cells.get_Item(cell)
+                                 # Get the text in the cell
+                                 cellText = ''
+                                 for paragraph in range(tableCell.Paragraphs.Count):
+                                     paragraph = tableCell.Paragraphs.get_Item(paragraph)
+                                     cellText = cellText + (paragraph.Text + ' ')
+                                 # Write the cell text to the worksheet
+                                 ws.SetCellValue(row + 1, cell + 1, cellText)
+
+                 # Save the workbook
+                 name = self.wordDoc.split("/")[-1]
+                 if self.saveFolder == None:
+                     wb.SaveToFile('/content/drive/MyDrive/CollectData/NER/excel/TestExamples/output/'+name+".xlsx", FileFormat.Version2016)
+                     nameFile = '/content/drive/MyDrive/CollectData/NER/excel/TestExamples/output/'+name+".xlsx"
+                 else:
+                     wb.SaveToFile(self.saveFolder+'/'+name+".xlsx", FileFormat.Version2016)
+                     nameFile = self.saveFolder+'/'+name + ".xlsx"
+                 doc.Close()
+                 wb.Dispose()
+                 return nameFile
+             except: return "No table found on word doc"
+         else:
+             return "No table found on word doc"
+     def getReference(self):
+         pass
+     def getSupMaterial(self):
+         pass
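
A hedged usage sketch for the wordDoc class (not part of the commit). The document URL and save folder are placeholders; a local .docx path also works, since the class only downloads when LoadFromFile fails:

    from NER.WordDoc import wordDoc

    d = wordDoc.wordDoc("https://example.org/supplement.docx", "/tmp/docs")
    text = d.extractTextByPage()      # full document text via Spire.Doc
    tables = d.extractTableAsText()   # {"Section0": {"Table0": "..."}, ...}
    xlsx = d.extractTableAsExcel()    # path to the saved .xlsx, or "No table found on word doc"
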
NER/cleanText.py ADDED
@@ -0,0 +1,116 @@
+ # reference:
+ # https://ayselaydin.medium.com/1-text-preprocessing-techniques-for-nlp-37544483c007
+ import re
+ import nltk
+ #nltk.download('stopwords')
+ #nltk.download()
+ from DefaultPackages import openFile, saveFile
+ import json
+ from nltk.corpus import stopwords
+ from nltk.corpus.reader.api import wordpunct_tokenize
+ from nltk.tokenize import word_tokenize
+ #from wordsegment import load, segment
+ from wordsegment import load, segment
+ class cleanGenText():
+     def __init__(self):
+         #self.text = text
+         load()
+         pass
+     def removePunct(self, text, KeepPeriod=False):
+         punctuation = r'[^\w\s]'
+         if KeepPeriod == True:
+             punctuation = r'[^\w\s\.]'
+         return re.sub(punctuation, '', text)
+     def removeURL(self, text):
+         url_pattern = re.compile(r'https?://\S+|www\.\S+')
+         return url_pattern.sub(r'', text)
+     def removeHTMLTag(self, text):
+         html_tags_pattern = r'<.*?>'
+         return re.sub(html_tags_pattern, '', text)
+     def removeTabWhiteSpaceNewLine(self, text):
+         # remove \n or \t and unnecessary white space
+         cleanText = text.replace("\n\n", "")
+         cleanText = cleanText.replace("\n", "")
+         cleanText = cleanText.replace("\t", "")
+         cleanText = cleanText.strip()
+         return cleanText
+     def removeExtraSpaceBetweenWords(self, text):
+         return re.sub(r'\s+', ' ', text).strip()
+     def removeStopWords(self, text):
+         #extraUnwantedWords = ["resource","groups","https","table","online","figure","frequency","aslo","fig","shows","respectively"]
+         filteredWord = []
+         stopWords = set(list(set(stopwords.words('english'))))# + extraUnwantedWords)
+         textWords = word_tokenize(text)
+         for word in textWords:
+             if word.lower() not in stopWords:
+                 filteredWord.append(word) # and w.isalpha()==True]
+         return filteredWord
+     def removeLowercaseBetweenUppercase(self, segment):
+         # a segment such as "Myanmar (formerly Burma)"
+         # but do not change anything for "Viet Nam"
+         # special cases for capitalized words:
+         # when there is a lowercase word in between:
+         # e.g. "Myanmar (formerly Burma)" can become "Myanmar", "Burma" instead of "myanmar formerly burma"
+         # when there is no lowercase word, or several uppercase words in a row:
+         # e.g. "Viet Nam" can stay "Viet Nam" or "viet nam", instead of "Viet", "Nam"
+         outputUp = []
+         segment = self.removeTabWhiteSpaceNewLine(segment)
+         segments = segment.split(" ")
+         for w in range(len(segments)):
+             word = segments[w]
+             cleanWord = self.removePunct(word)
+             cleanWord = self.removeTabWhiteSpaceNewLine(cleanWord)
+             prevWord = ""
+             if w > 0:
+                 prevWord = segments[w-1]
+                 cleanPreWord = self.removePunct(prevWord)
+                 cleanPreWord = self.removeTabWhiteSpaceNewLine(cleanPreWord)
+             if len(cleanWord) > 0 and cleanWord[0].isupper() == True: # check whether the first letter of the word is uppercase
+                 if len(prevWord) > 0 and prevWord[0].isupper() == True:
+                     outputUp[-1] += " " + cleanWord
+                 else:
+                     outputUp.append(cleanWord)
+         return outputUp
+     def textPreprocessing(self, text, keepPeriod=False):
+         # lowercase
+         #lowerText = self.text.lower()
+         # remove punctuation & special characters
+         cleanText = self.removePunct(text, KeepPeriod=keepPeriod)
+         # remove URLs in the text
+         cleanText = self.removeURL(cleanText)
+         # remove HTML tags
+         cleanText = self.removeHTMLTag(cleanText)
+         # remove \n or \t and unnecessary white space
+         cleanText = self.removeTabWhiteSpaceNewLine(cleanText)
+         # stop-word removal
+         filteredWord = self.removeStopWords(cleanText)
+         # a sentence or the capital word behind a period "."
+         return cleanText, filteredWord
+     #generateNewChar = textPreprocessing("/content/drive/MyDrive/CollectData/NER/CountriesNameNCBI.json")
+     #saveFile.saveFile("/content/drive/MyDrive/CollectData/NER/NewCharCountriesNameNCBI.json", json.dumps(generateNewChar))
+     def splitStickWords(self, word):
+         #output = []
+         split_words = segment(word)
+         '''for w in split_words:
+             pos = word.lower().find(w)
+             if word[pos].isupper() == True:
+                 output.append(w[0].upper() + w[1:])
+             else:
+                 output.append(w)
+             if pos >= 0:
+                 if pos+len(w) < len(word):
+                     if word[pos+len(w)] == ".":
+                         output[-1] = output[-1] + "." '''
+         return " ".join(split_words)
+     def removeDOI(self, word, doiLink=None):
+         # if the word contains "DOI", e.g. "1368598DOI" after the general clean
+         if "DOI" in word:
+             word = word.replace(word, "")
+         # if the word contains the DOI link, e.g. "10.1007s004390161742yORIGINAL"; we still split the word
+         if doiLink != None:
+             w = self.splitStickWords(word)
+             cleanDOI = self.removePunct(doiLink)
+             if cleanDOI in w:
+                 word = w.replace(cleanDOI, "")
+         return word
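
A small usage sketch for cleanGenText (not part of the commit). It assumes the NLTK stopwords and punkt data have already been downloaded, as hinted by the commented nltk.download calls above:

    import nltk
    nltk.download("stopwords")   # one-time downloads needed by removeStopWords
    nltk.download("punkt")

    from NER.cleanText import cleanGenText

    cl = cleanGenText()
    clean, keywords = cl.textPreprocessing(
        "The mtDNA sample was collected in Viet Nam. See https://example.org", keepPeriod=True)
    # clean    -> the text with punctuation (except periods), URLs, and HTML tags removed
    # keywords -> the remaining words with English stopwords filtered out
    print(cl.splitStickWords("mitochondrialdna"))   # wordsegment splits run-together words
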
NER/html/extractHTML.py ADDED
@@ -0,0 +1,158 @@
+ #!pip install bs4
+ # reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
+ from bs4 import BeautifulSoup
+ import requests
+ from DefaultPackages import openFile, saveFile
+ from NER import cleanText
+ import pandas as pd
+ class HTML():
+     def __init__(self, htmlFile, htmlLink):
+         self.htmlLink = htmlLink
+         self.htmlFile = htmlFile
+     def openHTMLFile(self):
+         if self.htmlLink != "None":
+             r = requests.get(self.htmlLink)
+             soup = BeautifulSoup(r.content, 'html.parser')
+         else:
+             with open(self.htmlFile) as fp:
+                 soup = BeautifulSoup(fp, 'html.parser')
+         return soup
+     def getText(self):
+         soup = self.openHTMLFile()
+         s = soup.find_all("html")
+         for t in range(len(s)):
+             text = s[t].get_text()
+             cl = cleanText.cleanGenText()
+             text = cl.removeExtraSpaceBetweenWords(text)
+         return text
+     def getListSection(self, scienceDirect=None):
+         json = {}
+         text = ""
+         textJson, textHTML = "", ""
+         if scienceDirect == None:
+             soup = self.openHTMLFile()
+             # get the list of sections
+             json = {}
+             for h2Pos in range(len(soup.find_all('h2'))):
+                 if soup.find_all('h2')[h2Pos].text not in json:
+                     json[soup.find_all('h2')[h2Pos].text] = []
+                 if h2Pos + 1 < len(soup.find_all('h2')):
+                     content = soup.find_all('h2')[h2Pos].find_next("p")
+                     nexth2Content = soup.find_all('h2')[h2Pos+1].find_next("p")
+                     while content.text != nexth2Content.text:
+                         json[soup.find_all('h2')[h2Pos].text].append(content.text)
+                         content = content.find_next("p")
+                 else:
+                     content = soup.find_all('h2')[h2Pos].find_all_next("p", string=True)
+                     json[soup.find_all('h2')[h2Pos].text] = list(i.text for i in content)
+             # format
+             '''json = {'Abstract':[], 'Introduction':[], 'Methods':[],
+             'Results':[], 'Discussion':[], 'References':[],
+             'Acknowledgements':[], 'Author information':[], 'Ethics declarations':[],
+             'Additional information':[], 'Electronic supplementary material':[],
+             'Rights and permissions':[], 'About this article':[], 'Search':[], 'Navigation':[]}'''
+         if scienceDirect != None or len(json) == 0:
+             # Replace with your actual Elsevier API key
+             api_key = "d0f25e6ae2b275e0d2b68e0e98f68d70"
+             # ScienceDirect article DOI or PII (example DOI)
+             doi = self.htmlLink.split("https://doi.org/")[-1] #"10.1016/j.ajhg.2011.01.009"
+             # Base URL for the Elsevier API
+             base_url = "https://api.elsevier.com/content/article/doi/"
+             # Set headers with the API key
+             headers = {
+                 "Accept": "application/json",
+                 "X-ELS-APIKey": api_key
+             }
+             # Make the API request
+             response = requests.get(base_url + doi, headers=headers)
+             # Check if the request was successful
+             if response.status_code == 200:
+                 data = response.json()
+                 supp_data = data["full-text-retrieval-response"]#["coredata"]["link"]
+                 if "originalText" in list(supp_data.keys()):
+                     if type(supp_data["originalText"]) == str:
+                         json["originalText"] = [supp_data["originalText"]]
+                     if type(supp_data["originalText"]) == dict:
+                         json["originalText"] = [supp_data["originalText"][key] for key in supp_data["originalText"]]
+                 else:
+                     if type(supp_data) == dict:
+                         for key in supp_data:
+                             json[key] = [supp_data[key]]
+
+         textJson = self.mergeTextInJson(json)
+         textHTML = self.getText()
+         if len(textHTML) > len(textJson):
+             text = textHTML
+         else: text = textJson
+         return text #json
+     def getReference(self):
+         # get the references to collect further data from
+         ref = []
+         json = self.getListSection()
+         for key in json["References"]:
+             ct = cleanText.cleanGenText()
+             clean, filteredWord = ct.textPreprocessing(key)
+             if clean not in ref:
+                 ref.append(clean)
+         return ref
+     def getSupMaterial(self):
+         # check whether there is supplementary material or not
+         json = {}
+         soup = self.openHTMLFile()
+         for h2Pos in range(len(soup.find_all('h2'))):
+             if "supplementary" in soup.find_all('h2')[h2Pos].text.lower() or "material" in soup.find_all('h2')[h2Pos].text.lower() or "additional" in soup.find_all('h2')[h2Pos].text.lower() or "support" in soup.find_all('h2')[h2Pos].text.lower():
+                 #print(soup.find_all('h2')[h2Pos].find_next("a").get("href"))
+                 link, output = [], []
+                 if soup.find_all('h2')[h2Pos].text not in json:
+                     json[soup.find_all('h2')[h2Pos].text] = []
+                 for l in soup.find_all('h2')[h2Pos].find_all_next("a", href=True):
+                     link.append(l["href"])
+                 if h2Pos + 1 < len(soup.find_all('h2')):
+                     nexth2Link = soup.find_all('h2')[h2Pos+1].find_next("a", href=True)["href"]
+                     if nexth2Link in link:
+                         link = link[:link.index(nexth2Link)]
+                 # only take links containing "https"
+                 for i in link:
+                     if "https" in i: output.append(i)
+                 json[soup.find_all('h2')[h2Pos].text].extend(output)
+         return json
+     def extractTable(self):
+         soup = self.openHTMLFile()
+         df = []
+         try:
+             df = pd.read_html(str(soup))
+         except ValueError:
+             df = []
+             print("No tables found in HTML file")
+         return df
+     def mergeTextInJson(self, jsonHTML):
+         #cl = cleanGenText()
+         cl = cleanText.cleanGenText()
+         htmlText = ""
+         for sec in jsonHTML:
+             # sections are separated by "\n\n"
+             if len(jsonHTML[sec]) > 0:
+                 for i in range(len(jsonHTML[sec])):
+                     # entries in the same section are separated by just a period.
+                     text = jsonHTML[sec][i]
+                     if len(text) > 0:
+                         #text = cl.removeTabWhiteSpaceNewLine(text)
+                         #text = cl.removeExtraSpaceBetweenWords(text)
+                         text, filteredWord = cl.textPreprocessing(text, keepPeriod=True)
+                         jsonHTML[sec][i] = text
+                     if i-1 >= 0:
+                         if len(jsonHTML[sec][i-1]) > 0:
+                             if jsonHTML[sec][i-1][-1] != ".":
+                                 htmlText += ". "
+                     htmlText += jsonHTML[sec][i]
+                     if len(jsonHTML[sec][i]) > 0:
+                         if jsonHTML[sec][i][-1] != ".":
+                             htmlText += "."
+             htmlText += "\n\n"
+         return htmlText
+     def removeHeaders(self):
+         pass
+     def removeFooters(self):
+         pass
+     def removeReferences(self):
+         pass
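
A hedged usage sketch for the HTML class (not part of the commit). The DOI landing page is a placeholder; passing "" as htmlFile mirrors how mtdna_classifier.py constructs the class when only a link is available:

    from NER.html import extractHTML

    html = extractHTML.HTML("", "https://doi.org/10.1000/xyz123")
    text = html.getListSection()   # section text of the article (falls back to the Elsevier API when needed)
    sup = html.getSupMaterial()    # {"Supplementary information": ["https://...", ...], ...}
    tables = html.extractTable()   # list of pandas DataFrames found in the page
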
NER/word2Vec/word2vec.py ADDED
@@ -0,0 +1,364 @@
+ '''WORD TO VECTOR'''
+ import pandas as pd
+ import json
+ import gensim
+ import spacy
+ from DefaultPackages import openFile, saveFile
+ from NER import cleanText
+ from gensim.models.keyedvectors import KeyedVectors
+ from gensim.test.utils import common_texts
+ from gensim.models.word2vec import Word2Vec
+ from gensim.scripts.glove2word2vec import glove2word2vec
+ from gensim.test.utils import datapath, get_tmpfile
+ import sys
+ import subprocess
+ # can try multiprocessing to run quicker
+ import multiprocessing
+ import copy
+ sys.setrecursionlimit(1000)
+ # create the word2Vec folder
+ #! mkdir /content/drive/MyDrive/CollectData/NER/word2Vec
+ # create the word2vec model
+ #model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/CollectData/NER/word2Vec', binary=True)
+ '''Some notes for this model:
+ sometimes when we build the corpus there are adverbs which are unnecessary but might be returned as
+ similar words to the word we are looking for, so we can try to preprocess the text so that
+ the corpus is more effective and only contains the important words. Then, when we
+ train the model, the important words will be treated as important. Alternatively,
+ once we have the list of similar words, we can remove the words in it
+ that are stopwords or otherwise unnecessary.'''
+ ### For more complex analysis, consider sentence embedding models such as Doc2Vec to represent the meaning of entire sentences instead of just individual words
+ class word2Vec():
+     def __init__(self, nameFile=None, modelName=None):
+         self.nameFile = nameFile
+         self.modelName = modelName
+     def spacy_similarity(self, word):
+         # when using word vectors, the medium or large spaCy model works better
+         # maybe try doc similarity?
+         nlp = spacy.load("en_core_web_lg")
+         doc = nlp(word)
+         for token1 in doc:
+             for token2 in doc:
+                 print(token1.text, token2.text, token1.similarity(token2))
+         pass
+     # clean text before transforming it into a corpus
+     def cleanTextBeforeCorpus(self, oriText, doi=None):
+         cl = cleanText.cleanGenText()
+         #cl = cleanGenText()
+         output = ""
+         alreadyRemoveDoi = False
+         for word in oriText.split(" "):
+             # remove DOI
+             if doi != None and doi in oriText:
+                 if alreadyRemoveDoi == False:
+                     newWord = cl.removeDOI(word, doi)
+                     if len(newWord) > 0 and newWord != word:
+                         alreadyRemoveDoi = True
+                         word = newWord
+             # split the stuck-together words
+             #word = cl.splitStickWords(word)
+             # remove punctuation
+             word = cl.removePunct(word, True)
+             # remove URLs
+             word = cl.removeURL(word)
+             # remove HTML tags
+             word = cl.removeHTMLTag(word)
+             # remove tabs, white space, newlines
+             word = cl.removeTabWhiteSpaceNewLine(word)
+             # optional: remove stopwords
+             #word = cl.removeStopWords(word)
+             if len(word) > 0:
+                 output += word + " "
+         return output
+     def cleanAllTextBeforeCorpus(self, allText, doi=None):
+         cleanOutput = ""
+         remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
+         if len(allText) > 0:
+             corpusText = allText
+             for pos in range(len(corpusText.split("\n\n"))):
+                 if len(corpusText.split("\n\n")[pos]) > 0:
+                     lines = corpusText.split("\n\n")[pos]
+                     for line in lines.split("\n"):
+                         if remove in line: line = line.replace(remove, "")
+                         clean_text = self.cleanTextBeforeCorpus(line, doi)
+                         cleanOutput += clean_text + "\n"
+                     cleanOutput += "\n\n"
+         return cleanOutput
+     def tableTransformToCorpusText(self, df, excelFile=None):
+         # PDF, Excel, WordDoc
+         #cl = cleanText.cleanGenText()
+         corpus = {}
+         # PDF or df
+         if excelFile == None:
+             if len(df) > 0:
+                 try:
+                     for i in range(len(df)):
+                         # each new dimension/page is considered a sentence that ends with a period;
+                         # each new line is a new list, and each new df is a new corpus
+                         outputDF = []
+                         text = df[i].values.tolist()
+                         if len(text) > 0:
+                             outputRowDF = self.helperRowTableToCorpus(text)
+                             #outputColDF = self.helperColTableToCorpus(text)
+                             outputDF.extend(outputRowDF)
+                             #outputDF.extend(outputColDF)
+                         if len(outputDF) > 0:
+                             corpus["corpus" + str(i)] = outputDF
+                 except:
+                     outputDF = []
+                     text = df.values.tolist()
+                     if len(text) > 0:
+                         outputRowDF = self.helperRowTableToCorpus(text)
+                         #outputColDF = self.helperColTableToCorpus(text)
+                         outputDF.extend(outputRowDF)
+                         #outputDF.extend(outputColDF)
+                     if len(outputDF) > 0:
+                         corpus["corpus0"] = outputDF
+         else:
+             df = pd.ExcelFile(excelFile)
+             sheetNames = df.sheet_names
+             output = []
+             if len(sheetNames) > 0:
+                 for s in range(len(sheetNames)):
+                     outputDF = []
+                     with pd.ExcelFile(excelFile) as xls:
+                         data = pd.read_excel(xls, sheetNames[s])
+                     if sheetNames[s] != 'Evaluation Warning':
+                         text = data.values.tolist()
+                         if len(text) > 0:
+                             outputRowDF = self.helperRowTableToCorpus(text)
+                             #outputColDF = self.helperColTableToCorpus(text)
+                             outputDF.extend(outputRowDF)
+                             #outputDF.extend(outputColDF)
+                         if len(outputDF) > 0:
+                             corpus["corpus" + str(s)] = outputDF
+         return corpus
+     def helperRowTableToCorpus(self, textList):
+         #cl = cleanGenText()
+         cl = cleanText.cleanGenText()
+         stopWords = ["NaN", "Unnamed:", "nan"]
+         outputDF = []
+         for line in textList:
+             outputLine = []
+             for words in line:
+                 words = str(words)
+                 if len(words) > 0:
+                     for word in words.split(" "):
+                         # remove table-specific stopwords ("NaN", "Unnamed: 0", "nan"); a leading number is just a row index, but keep tokens like "KM1"
+                         if str(word) not in stopWords:
+                             #word = cl.splitStickWords(word)
+                             word = cl.removePunct(word)
+                             word = " ".join(cl.removeStopWords(word))
+                             word = cl.removeTabWhiteSpaceNewLine(word)
+                             if len(word) > 1:
+                                 if len(word.split(" ")) > 1:
+                                     for x in word.split(" "):
+                                         if len(x) > 1 and x.isnumeric() == False:
+                                             outputLine.append(x.lower())
+                                 else:
+                                     if word.isnumeric() == False:
+                                         outputLine.append(word.lower())
+             if len(outputLine) > 0:
+                 outputDF.append(outputLine)
+         return outputDF
+     def helperColTableToCorpus(self, dfList):
+         #cl = cleanGenText()
+         cl = cleanText.cleanGenText()
+         stopWords = ["NaN", "Unnamed:", "nan"]
+         outputDF = []
+         # use the length of the first line as the column reference
+         for pos in range(len(dfList[0])):
+             outputLine = []
+             for line in dfList:
+                 if pos < len(line):
+                     words = line[pos]
+                     words = str(words)
+                 else: words = ""
+                 if len(words) > 0:
+                     for word in words.split(" "):
+                         # remove table-specific stopwords ("NaN", "Unnamed: 0", "nan"); a leading number is just a row index, but keep tokens like "KM1"
+                         if str(word) not in stopWords:
+                             #word = cl.splitStickWords(word)
+                             word = cl.removePunct(word)
+                             word = " ".join(cl.removeStopWords(word))
+                             word = cl.removeTabWhiteSpaceNewLine(word)
+                             if len(word) > 1:
+                                 if len(word.split(" ")) > 1:
+                                     for x in word.split(" "):
+                                         if len(x) > 1 and x.isnumeric() == False:
+                                             outputLine.append(x.lower())
+                                 else:
+                                     if word.isnumeric() == False:
+                                         outputLine.append(word.lower())
+             if len(outputLine) > 0:
+                 outputDF.append(outputLine)
+         return outputDF
+     # create a corpus
+     def createCorpusText(self, corpusText):
+         '''ex: "Tom is cat. Jerry is mouse."
+         corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
+         # the output should look like this:
+         '''texts = {
+         "Paragraph 1": [["Cat", "is", "an", "animal"], ["Tom", "is", "cat"]],
+         "Paragraph 2": [["Mouse", "is", "an", "animal"], ["Jerry", "is", "mouse"]]
+         }
+         '''
+         # separate paragraphs
+         '''Ex: Cat is an animal. Tom is cat.
+
+         Mouse is an animal.
+         Jerry is mouse.'''
+         texts = {}
+         cl = cleanText.cleanGenText()
+         #cl = cleanGenText()
+         for pos in range(len(corpusText.split("\n\n"))):
+             if len(corpusText.split("\n\n")[pos]) > 0:
+                 texts["Paragraph " + str(pos)] = []
+                 lines = corpusText.split("\n\n")[pos]
+                 for line in lines.split("\n"):
+                     for l in line.split("."):
+                         if len(l) > 0:
+                             cl.removeTabWhiteSpaceNewLine(l)
+                             l = l.lower()
+                             newL = []
+                             for word in l.split(" "):
+                                 if len(word) > 0:
+                                     word = cl.removeStopWords(word)
+                                     for w in word:
+                                         if len(w) > 0 and w.isnumeric() == False:
+                                             newL.append(w)
+                             if len(newL) > 0:
+                                 texts["Paragraph " + str(pos)].append(newL)
+                 if len(texts["Paragraph " + str(pos)]) == 0:
+                     del texts["Paragraph " + str(pos)]
+         return texts
+     def selectParaForWC(self, corpus):
+         '''corpus should be in the format:
+         corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
+         corSize, window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None, None
+         corSize = len(corpus)
+         # fewer than 2000
+         if 0 < corSize < 2000:
+             window = 3.5
+             vector_size = 75
+             sample = 1e-3
+             negative = 10
+             epochs = 10
+             sg = 1
+         # 2000 - 100000
+         elif 2000 <= corSize < 100000:
+             window = 3.5
+             vector_size = 75
+             sample = 1e-5
+             negative = 10
+             epochs = 10
+             sg = 1
+         elif 100000 <= corSize < 1000000:
+             window = 7.5
+             vector_size = 150
+             sample = 1e-5
+             negative = 10
+             epochs = 6
+             sg = 0
+         return window, vector_size, sample, negative, epochs, sg
+     def trainWord2Vec(self, nameFile, modelName, saveFolder, window=3.5,
+                       vector_size=75, sample=1e-3, negative=10, epochs=10, sg=1):
+         # if you don't have a backup file, you can reuse nameFile just to increase the length of the corpus
+         jsonFile = ""
+         jsonFile = openFile.openJsonFile(nameFile) # this is a corpus json file from an article
+         cores = multiprocessing.cpu_count()
+         combinedCorpus = []
+         window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None
+         if len(jsonFile) > 0:
+             for key in jsonFile:
+                 combinedCorpus.extend(jsonFile[key])
+             window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
+             # min_count=1 ensures all words are included
+             '''w2vModel = Word2Vec(
+                 min_count=1,
+                 window=window,
+                 vector_size=vector_size,
+                 sample=sample,
+                 alpha=0.03,
+                 min_alpha=0.0007,
+                 negative=negative,
+                 workers=cores-1,
+                 epochs=epochs,
+                 sg=sg)'''
+             #w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
+             accept = False
+             while not accept:
+                 if window != None and vector_size != None and sample != None and negative != None and epochs != None and sg != None:
+                     try:
+                         w2vModel = Word2Vec(
+                             min_count=1,
+                             window=window,
+                             vector_size=vector_size,
+                             sample=sample,
+                             alpha=0.03,
+                             min_alpha=0.0007,
+                             negative=negative,
+                             workers=cores-1,
+                             epochs=epochs,
+                             sg=sg)
+                         w2vModel.build_vocab(combinedCorpus)
+                         w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
+                         accept = True
+                     except:
+                         for key in jsonFile:
+                             combinedCorpus.extend(jsonFile[key])
+                         window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
+                         print("next is " + str(len(combinedCorpus)))
+                 else:
+                     print("no parameter to train")
+                     break
+             #w2vModel.build_vocab(combinedCorpus)
+             #w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
+             #w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
+             #w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
+             w2vModel.save(saveFolder+"/"+modelName+".model")
+             w2vModel.wv.save_word2vec_format(saveFolder+"/"+modelName+".txt")
+             print("done w2v")
+         else: print("no corpus to train")
+         #return combinedCorpus
+     def genSimilar(self, word, modelFile, n=10, cos_thres=0.7):
+         # might not be a meaningful keyword
+         #stopWords = ["show"]
+         # the same word, just as a plural noun or in a different tense
+         simWords = [word+"s", word+"es", word+"ing", word+"ed"]
+         model = KeyedVectors.load_word2vec_format(modelFile, binary=False) # model file in txt format
+         results = model.most_similar(positive=[word], topn=n)
+         #removeIndex = []
+         #currN = copy.deepcopy(n)
+         '''for r in range(len(results)):
+             if len(results[r][0]) < 2:
+                 removeIndex.append(results[r])
+             # remove the same word as a plural/singular noun, and anything below cos_thres
+             elif results[r][0] == word:
+                 removeIndex.append(results[r])
+             elif results[r][0] in simWords or float(results[r][1]) < cos_thres or results[r][0] in stopWords:
+                 removeIndex.append(results[r])
+         for rem in removeIndex:
+             results.remove(rem)
+         while len(results) != n and len(results) != 0:
+             moreNewResult = model.most_similar(positive=[word], topn=currN+1)[-1]
+             if moreNewResult not in results and len(moreNewResult[0]) > 1:
+                 if moreNewResult[0] not in stopWords and results[0] != word:
+                     results.append(moreNewResult)
+                 currN += 1'''
+         return results
+     # adding our model into spaCy
+     # this deals with the command line; instead of typing it manually, we call it from a Python script
+     def loadWordVec(self, modelName, wordVec):
+         # modelName is the name you want to save into spaCy
+         # wordVec is the trained word2vec in txt format
+         subprocess.run([sys.executable,
+                         "-m",
+                         "spacy",
+                         "init-model",
+                         "en",
+                         modelName, # this modelName comes from the saved modelName of trainWord2Vec
+                         "--vectors-loc",
+                         wordVec])
+         print("done")
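
A hedged training sketch for the word2Vec class (not part of the commit). corpus.json and /tmp/w2v are placeholders: the JSON file is expected to hold tokenized sentences grouped by paragraph, the shape produced by createCorpusText() and saved with saveFile.saveJsonFile():

    from NER.word2Vec import word2vec

    wc = word2vec.word2Vec()
    # corpus.json is assumed to look like {"Paragraph 0": [["tom", "cat"], ["jerry", "mouse"]], ...}
    wc.trainWord2Vec("corpus.json", "demoModel", "/tmp/w2v")
    print(wc.genSimilar("haplogroup", "/tmp/w2v/demoModel.txt", n=5))
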
app.py ADDED
@@ -0,0 +1,132 @@
+ # ✅ Optimized mtDNA MVP UI with Faster Pipeline & Required Feedback
+
+ import gradio as gr
+ from collections import Counter
+ import csv
+ import os
+ from functools import lru_cache
+ from mtdna_classifier import classify_sample_location
+ @lru_cache(maxsize=128)
+ def classify_sample_location_cached(accession):
+     return classify_sample_location(accession)
+
+ # Count and suggest final location
+ def compute_final_suggested_location(rows):
+     candidates = [
+         row.get("Predicted Location", "").strip()
+         for row in rows
+         if row.get("Predicted Location", "").strip().lower() not in ["", "sample id not found"]
+     ] + [
+         row.get("Inferred Region", "").strip()
+         for row in rows
+         if row.get("Inferred Region", "").strip().lower() not in ["", "unknown"]
+     ]
+
+     if not candidates:
+         return Counter(), ("Unknown", 0)
+
+     counts = Counter(candidates)
+     top_location, count = counts.most_common(1)[0]
+     return counts, (top_location, count)
+
+ # Store feedback (with required fields)
+ def store_feedback_to_drive(accession, answer1, answer2, contact=""):
+     if not answer1.strip() or not answer2.strip():
+         return "⚠️ Please answer both questions before submitting."
+
+     feedback_file = "/content/drive/MyDrive/Customers/feedback_mtdna.csv"
+     header = ["accession", "helpful", "improvement", "contact"]
+     row = [accession, answer1, answer2, contact]
+     file_exists = os.path.isfile(feedback_file)
+     with open(feedback_file, "a", newline="") as f:
+         writer = csv.writer(f)
+         if not file_exists:
+             writer.writerow(header)
+         writer.writerow(row)
+     return "✅ Feedback submitted. Thank you!"
+
+ def summarize_results(accession):
+     try:
+         output = classify_sample_location_cached(accession)
+     except Exception as e:
+         return [], f"❌ Error: {e}"
+
+     if accession not in output:
+         return [], "❌ Accession not found in results."
+
+     isolate = next((k for k in output if k != accession), None)
+     row_score = []
+     rows = []
+
+     for key in [accession, isolate]:
+         if key not in output:
+             continue
+         sample_id_label = f"{key} ({'accession number' if key == accession else 'isolate of accession'})"
+         for section, techniques in output[key].items():
+             for technique, content in techniques.items():
+                 source = content.get("source", "")
+                 predicted = content.get("predicted_location", "")
+                 haplogroup = content.get("haplogroup", "")
+                 inferred = content.get("inferred_location", "")
+                 context = content.get("context_snippet", "")[:300] if "context_snippet" in content else ""
+
+                 row = {
+                     "Sample ID": sample_id_label,
+                     "Technique": technique,
+                     "Source": f"The region of haplogroup is inferred\nby using this source: {source}" if technique == "haplogroup" else source,
+                     "Predicted Location": "" if technique == "haplogroup" else predicted,
+                     "Haplogroup": haplogroup if technique == "haplogroup" else "",
+                     "Inferred Region": inferred if technique == "haplogroup" else "",
+                     "Context Snippet": context
+                 }
+
+                 row_score.append(row)
+                 rows.append(list(row.values()))
+
+     location_counts, (final_location, count) = compute_final_suggested_location(row_score)
+     summary_lines = [f"### 🧭 Location Frequency Summary", "After counting all predicted and inferred locations:\n"]
+     summary_lines += [f"- **{loc}**: {cnt} times" for loc, cnt in location_counts.items()]
+     summary_lines.append(f"\n**Final Suggested Location:** 🗺️ **{final_location}** (mentioned {count} times)")
+     summary = "\n".join(summary_lines)
+
+     return rows, summary
+
+ # Gradio UI
+ with gr.Blocks() as interface:
+     gr.Markdown("# 🧬 mtDNA Location Classifier (MVP)")
+     gr.Markdown("Enter an accession number to infer geographic origin. You'll see predictions, confidence scores, and can submit feedback.")
+
+     with gr.Row():
+         accession = gr.Textbox(label="Enter Accession Number (e.g., KU131308)")
+         run_button = gr.Button("🔍 Submit and Classify")
+         reset_button = gr.Button("🔄 Reset")
+
+     status = gr.Markdown(visible=False)
+     headers = ["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"]
+     output_table = gr.Dataframe(headers=headers, interactive=False)
+     output_summary = gr.Markdown()
+
+     gr.Markdown("---")
+     gr.Markdown("### 💬 Feedback (required)")
+     q1 = gr.Textbox(label="1️⃣ Was the inferred location accurate or helpful? Please explain.")
+     q2 = gr.Textbox(label="2️⃣ What would improve your experience with this tool?")
+     contact = gr.Textbox(label="📧 Your email or institution (optional)")
+     submit_feedback = gr.Button("✅ Submit Feedback")
+     feedback_status = gr.Markdown()
+
+     def classify_with_loading(accession):
+         return gr.update(value="⏳ Please wait... processing...", visible=True)
+
+     def classify_main(accession):
+         table, summary = summarize_results(accession)
+         return table, summary, gr.update(visible=False)
+
+     def reset_fields():
+         return "", "", "", "", "", [], "", gr.update(visible=False)
+
+     run_button.click(fn=classify_with_loading, inputs=accession, outputs=status)
+     run_button.click(fn=classify_main, inputs=accession, outputs=[output_table, output_summary, status])
+     submit_feedback.click(fn=store_feedback_to_drive, inputs=[accession, q1, q2, contact], outputs=feedback_status)
+     reset_button.click(fn=reset_fields, inputs=[], outputs=[accession, q1, q2, contact, feedback_status, output_table, output_summary, status])
+
+ interface.launch()
data/haplogroup_regions_extended.csv ADDED
@@ -0,0 +1,51 @@
+ haplogroup,region,source
+ H,Western Europe,PhyloTree
+ U,Eurasia,PhyloTree
+ L0,Southern Africa,EMPOP
+ L1,Central Africa,EMPOP
+ L2,West Africa,EMPOP
+ L3,East Africa,EMPOP
+ B4,Southeast Asia,EMPOP
+ A2,Native North America,PhyloTree
+ C1,Siberia and Americas,PhyloTree
+ D4,East Asia,PhyloTree
+ X,Western Eurasia / North America,PhyloTree
+ J,Europe and Near East,PhyloTree
+ K,Europe,PhyloTree
+ T,Europe and Central Asia,PhyloTree
+ M,Asia,EMPOP
+ N,Worldwide (basal),PhyloTree
+ I,Europe,PhyloTree
+ W,Eurasia,PhyloTree
+ Z,North and East Asia,PhyloTree
+ Y,Southeast Asia,EMPOP
+ E,Oceania and Southeast Asia,PhyloTree
+ F,East and Southeast Asia,EMPOP
+ B2,Native South America,EMPOP
+ A1,Central Asia,EMPOP
+ C4,Siberia,PhyloTree
+ D1,South America,PhyloTree
+ M7,East Asia,EMPOP
+ M8,Japan,EMPOP
+ G,Siberia,PhyloTree
+ HV,Europe and Middle East,PhyloTree
+ U5,Northern Europe,PhyloTree
+ U6,North Africa,PhyloTree
+ U7,South Asia,PhyloTree
+ U8,Central Europe,PhyloTree
+ R0,Arabian Peninsula,PhyloTree
+ R9,Southeast Asia,PhyloTree
+ H1,Iberian Peninsula,PhyloTree
+ H2,Eastern Europe,PhyloTree
+ H3,Western Europe,PhyloTree
+ H5,Balkans,PhyloTree
+ J1,Europe,PhyloTree
+ J2,Middle East,PhyloTree
+ T1,Eastern Europe,PhyloTree
+ T2,Near East,PhyloTree
+ M1,North Africa,PhyloTree
+ M2,South Asia,PhyloTree
+ M3,South Asia,PhyloTree
+ M4,South Asia,PhyloTree
+ M5,South Asia,PhyloTree
+ M6,South Asia,PhyloTree
mtdna_classifier.py ADDED
@@ -0,0 +1,242 @@
+ # mtDNA Location Classifier MVP (Google Colab)
+ # Accepts an accession number → fetches the PubMed ID + isolate name → gets the paper text → predicts the location
+ import os
+ import subprocess
+ import re
+ from Bio import Entrez
+ import fitz
+ import spacy
+ from NER.PDF import pdf
+ from NER.WordDoc import wordDoc
+ from NER.html import extractHTML
+ from NER.word2Vec import word2vec
+ from transformers import pipeline
+ # Set your email (required by NCBI Entrez)
+ #Entrez.email = "[email protected]"
+
+ # Step 1: Get the PubMed ID from the accession using EDirect
+
+ def get_info_from_accession(accession):
+     cmd = f'{os.environ["HOME"]}/edirect/esummary -db nuccore -id {accession} -format medline | egrep "PUBMED|isolate"'
+     result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+     output = result.stdout
+     pubmedID, isolate = "", ""
+     for line in output.split("\n"):
+         if len(line) > 0:
+             if "PUBMED" in line:
+                 pubmedID = line.split()[-1]
+             if "isolate" in line: # check for isolate information
+                 # try the direct GenBank annotation: /isolate="XXX"
+                 match1 = re.search(r'/isolate\s*=\s*"([^"]+)"', line) # search on the current line
+                 if match1:
+                     isolate = match1.group(1)
+                 else:
+                     # try the DEFINITION line: ...isolate XXX...
+                     match2 = re.search(r'isolate\s+([A-Za-z0-9_-]+)', line) # search on the current line
+                     if match2:
+                         isolate = match2.group(1)
+
+     # return the values, even if they are empty strings
+     return pubmedID, isolate
+ # Step 2: Get the DOI link to access the paper
+ def get_doi_from_pubmed_id(id):
+     cmd = f'{os.environ["HOME"]}/edirect/esummary -db pubmed -id {id} -format medline | grep -i "AID"'
+     result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+     output = result.stdout
+     doi_pattern = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+(?=\s*\[doi\])'
+     match = re.search(doi_pattern, output, re.IGNORECASE)
+     return match.group(0)
+
+ # Step 3: Extract text: get the paper (HTML text) and supplementary materials (pdf, doc, excel), then do text preprocessing
+ # Step 3.1: Extract text
+ def get_paper_text(doi, id):
+     # create a temporary folder to hold the texts
+     cmd = f'mkdir {id}'
+     result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+     saveLinkFolder = "/mtDNALocation/data/"+id
+
+     link = 'https://doi.org/' + doi
+     '''textsToExtract = { "doiLink":"paperText",
+                           "file1.pdf":"text1",
+                           "file2.doc":"text2",
+                           "file3.xlsx":excelText3 }'''
+     textsToExtract = {}
+     # get the files to create listOfFile for each id
+     html = extractHTML.HTML("", link)
+     jsonSM = html.getSupMaterial()
+     text = ""
+     links = [link] + sum((jsonSM[key] for key in jsonSM), [])
+     #print(links)
+     for l in links:
+         # get the main paper
+         if l == link:
+             text = html.getListSection()
+             textsToExtract[link] = text
+         elif l.endswith(".pdf"):
+             p = pdf.PDF(l, saveLinkFolder, doi)
+             f = p.openPDFFile()
+             pdf_path = saveLinkFolder + "/" + l.split("/")[-1]
+             doc = fitz.open(pdf_path)
+             text = "\n".join([page.get_text() for page in doc])
+             textsToExtract[l] = text
+         elif l.endswith(".doc") or l.endswith(".docx"):
+             d = wordDoc.wordDoc(l, saveLinkFolder)
+             text = d.extractTextByPage()
+             textsToExtract[l] = text
+         elif l.split(".")[-1].lower() in "xlsx":
+             wc = word2vec.word2Vec()
+             corpus = wc.tableTransformToCorpusText([], l)
+             text = ''
+             for c in corpus:
+                 para = corpus[c]
+                 for words in para:
+                     text += " ".join(words)
+             textsToExtract[l] = text
+     # delete the folder after finishing getting the text
+     cmd = f'rm -r /mtDNALocation/data/{id}'
+     result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+     return textsToExtract
+ # Step 3.2: Extract context
+ def extract_context(text, keyword, window=500):
+     idx = text.find(keyword)
+     if idx == -1:
+         return "Sample ID not found."
+     return text[max(0, idx-window): idx+window]
+ # Step 4: Classification (for now, for demo purposes)
+ # 4.1: Using a HuggingFace model (question-answering)
+ def infer_location_fromQAModel(context, question="Where is the mtDNA sample from?"):
+     qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
+     result = qa({"context": context, "question": question})
+     return result["answer"]
+ # 4.2: Infer from the haplogroup
+ # Load the pre-trained spaCy model for NER
+ nlp = spacy.load("en_core_web_sm")
+ # Define the haplogroup-to-region mapping (simple rule-based)
+ import csv
+
+ def load_haplogroup_mapping(csv_path):
+     mapping = {}
+     with open(csv_path) as f:
+         reader = csv.DictReader(f)
+         for row in reader:
+             mapping[row["haplogroup"]] = [row["region"], row["source"]]
+     return mapping
+
+ # Function to extract the haplogroup from the text
+ def extract_haplogroup(text):
+     # 1. Try to find a haplogroup preceded by the word "haplogroup"
+     match = re.search(r'\bhaplogroup\s+([A-Z][0-9a-z]*)\b', text)
+     if match:
+         return re.match(r'^[A-Z][0-9]*', match.group(1)).group(0)
+     #return match.group(1) # this is the actual haplogroup code, like U5b1
+
+     # 2. Fallback: try to find isolated uppercase-letter haplogroup codes
+     fallback = re.search(r'\b([A-Z][0-9a-z]{1,5})\b', text)
+     if fallback:
+         return fallback.group(1)
+
+     return None # if nothing is found
+
+ # Function to extract locations based on NER
+ def extract_location(text):
+     doc = nlp(text)
+     locations = []
+     for ent in doc.ents:
+         if ent.label_ == "GPE": # GPE = Geopolitical Entity (location)
+             locations.append(ent.text)
+     return locations
+
+ # Function to infer the location from the haplogroup
+ def infer_location_from_haplogroup(haplogroup):
+     haplo_map = load_haplogroup_mapping("data/haplogroup_regions_extended.csv")
+     return haplo_map.get(haplogroup, ["Unknown", "Unknown"])
+
+ # Function to classify the mtDNA sample
+ def classify_mtDNA_sample_from_haplo(text):
+     # Extract the haplogroup
+     haplogroup = extract_haplogroup(text)
+     # Extract locations based on NER
+     locations = extract_location(text)
+     # Infer the location based on the haplogroup
+     inferred_location, sourceHaplo = infer_location_from_haplogroup(haplogroup)[0], infer_location_from_haplogroup(haplogroup)[1]
+     return {
+         "source": sourceHaplo,
+         "locations_found_in_context": locations,
+         "haplogroup": haplogroup,
+         "inferred_location": inferred_location
+
+     }
+ # 4.3: Get what is available from NCBI
+ def infer_location_fromNCBI(accession):
+     cmd = f'{os.environ["HOME"]}/edirect/esummary -db nuccore -id {accession} -format medline | egrep "location|country|geo"'
+     result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+     output, location = "", ""
+     output = result.stdout
+     if "location" in output or "country" in output or "geo" in output:
+         location = output.split('"')[1]
+         output = output.split()[0]
+     else:
+         location = "Unknown"
+         output = "No location information found in NCBI."
+     return location, output
+
+ # STEP 5: Main pipeline: accession -> 1. get the PubMed ID and isolate -> 2. get the DOI -> 3. get the text -> 4. prediction -> 5. output: inferred location + explanation + confidence score
+ def classify_sample_location(accession):
+     outputs = {}
+     keyword, context, location, qa_result, haplo_result = "", "", "", "", ""
+     # Step 1: get the PubMed ID and isolate
+     pubmedID, isolate = get_info_from_accession(accession)
+     # Step 2: get the DOI
+     doi = get_doi_from_pubmed_id(pubmedID)
+     # Step 3: get the text
+     '''textsToExtract = { "doiLink":"paperText",
+                           "file1.pdf":"text1",
+                           "file2.doc":"text2",
+                           "file3.xlsx":excelText3 }'''
+     textsToExtract = get_paper_text(doi, pubmedID)
+     # Step 4: prediction
+     outputs[accession] = {}
+     outputs[isolate] = {}
+     # 4.0: Infer from NCBI
+     location, outputNCBI = infer_location_fromNCBI(accession)
+     NCBI_result = {
+         "source": "NCBI",
+         "sample_id": accession,
+         "predicted_location": location,
+         "context_snippet": outputNCBI}
+     outputs[accession]["NCBI"] = {"NCBI": NCBI_result}
+     for key in textsToExtract:
+         text = textsToExtract[key]
+         # try the accession number first
+         outputs[accession][key] = {}
+         keyword = accession
+         context = extract_context(text, keyword, window=500)
+         # 4.1: Using a HuggingFace model (question-answering)
+         location = infer_location_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
+         qa_result = {
+             "source": key,
+             "sample_id": keyword,
+             "predicted_location": location,
+             "context_snippet": context
+         }
+         outputs[keyword][key]["QAModel"] = qa_result
+         # 4.2: Infer from the haplogroup
+         haplo_result = classify_mtDNA_sample_from_haplo(context)
+         outputs[keyword][key]["haplogroup"] = haplo_result
+         # try the isolate
+         keyword = isolate
+         outputs[isolate][key] = {}
+         context = extract_context(text, keyword, window=500)
+         # 4.1.1: Using a HuggingFace model (question-answering)
+         location = infer_location_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
+         qa_result = {
+             "source": key,
+             "sample_id": keyword,
+             "predicted_location": location,
+             "context_snippet": context
+         }
+         outputs[keyword][key]["QAModel"] = qa_result
+         # 4.2.1: Infer from the haplogroup
+         haplo_result = classify_mtDNA_sample_from_haplo(context)
+         outputs[keyword][key]["haplogroup"] = haplo_result
+     return outputs
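
A minimal end-to-end sketch (not part of the commit). It assumes Entrez Direct is installed under $HOME/edirect (see setup.sh), the spaCy en_core_web_sm model is available, and the accession resolves to an accessible paper; KU131308 is the example accession used in app.py:

    from mtdna_classifier import classify_sample_location

    results = classify_sample_location("KU131308")
    # results is {sample_id: {source: {technique: info_dict}}}
    for sample_id, sources in results.items():
        for source, techniques in sources.items():
            for technique, info in techniques.items():
                print(sample_id, technique,
                      info.get("predicted_location") or info.get("inferred_location"))
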
requirements.txt ADDED
@@ -0,0 +1,19 @@
+ gradio
+ transformers
+ torch
+ pandas
+ scikit-learn
+ spacy
+ pymupdf
+ requests
+ biopython
+ openpyxl
+ bs4
+ pdfreader
+ tabula-py
+ python-docx
+ thefuzz
+ wordsegment
+ spacy-lookups-data
+ gensim
+ nltk
setup.sh ADDED
@@ -0,0 +1,8 @@
+ #!/bin/bash
+
+ # Install Entrez Direct automatically, answering yes to prompts
+ yes | sh -c "$(wget -q https://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh -O -)"
+
+ # Add EDirect to PATH for the current session
+ echo 'export PATH=$HOME/edirect:$PATH' >> ~/.bashrc
+ source ~/.bashrc