#! pip install spire.doc
#! pip install Spire.XLS
import os
import tempfile

import pandas as pd
from spire.doc import *
from spire.doc.common import *
from spire.xls import *
from spire.xls.common import *
from NER import cleanText
import requests


class wordDoc():
    """Extract text and tables from a Word document using Spire.Doc.

    ``wordDoc`` is either a path that Spire.Doc can open directly or a URL.
    When the direct load fails, the file is downloaded with ``requests``
    into ``saveFolder`` and loaded from there.
    """

    # Output directory used by extractTableAsExcel when saveFolder is None.
    _DEFAULT_EXCEL_DIR = '/content/drive/MyDrive/CollectData/NER/excel/TestExamples/output/'
    # Message returned by extractTableAsExcel when the export fails.
    _NO_TABLE_MESSAGE = "No table found on word doc"
    # Seconds to wait for the HTTP download before giving up.
    _DOWNLOAD_TIMEOUT = 60

    def __init__(self, wordDoc, saveFolder):
        """Store the document location and the folder used for outputs.

        Args:
            wordDoc: Path or URL of the Word document.
            saveFolder: Folder for downloaded copies and generated Excel
                files; may be None.
        """
        self.wordDoc = wordDoc
        self.saveFolder = saveFolder

    def _loadDocument(self):
        """Return a loaded ``Document``, downloading the file if needed.

        The location is first handed to ``Document.LoadFromFile`` as-is.
        If that raises, it is treated as a URL: the content is downloaded,
        written to disk, and loaded from the local copy.

        Raises:
            requests.RequestException: If the fallback download fails or
                the server answers with an HTTP error status.
        """
        try:
            doc = Document()
            doc.LoadFromFile(self.wordDoc)
            return doc
        except Exception:
            # Fall through to the download path below.
            pass

        response = requests.get(self.wordDoc, timeout=self._DOWNLOAD_TIMEOUT)
        # Do not write an HTTP error page to disk and parse it as a document.
        response.raise_for_status()

        name = self.wordDoc.split("/")[-1]
        # saveFolder may be None (extractTableAsExcel supports that), so the
        # downloaded copy then goes to the system temp directory.
        folder = self.saveFolder if self.saveFolder is not None else tempfile.gettempdir()
        localPath = os.path.join(folder, name)
        with open(localPath, "wb") as downloaded:
            downloaded.write(response.content)

        doc = Document()
        doc.LoadFromFile(localPath)
        return doc

    @staticmethod
    def _paragraphTexts(cell):
        """Return the ``Text`` of every paragraph in a table cell, in order."""
        paragraphs = cell.Paragraphs
        return [paragraphs.get_Item(p).Text for p in range(paragraphs.Count)]

    def openFile(self):
        """Load ``self.wordDoc`` from disk and return the ``Document``.

        The document object itself is returned; the earlier version returned
        the result of the ``LoadFromFile`` call instead, which left callers
        without a usable document.
        """
        document = Document()
        document.LoadFromFile(self.wordDoc)
        return document

    def extractTextByPage(self):
        """Return the text of the document as given by ``Document.GetText()``.

        No per-page splitting is done here despite the method name.

        reference: https://medium.com/@alice.yang_10652/extract-text-from-word-documents-with-python-a-comprehensive-guide-95a67e23c35c
        """
        doc = self._loadDocument()
        try:
            return doc.GetText()
        finally:
            doc.Close()

    def extractTableAsText(self):
        """Return every table as tab/newline separated text.

        Returns:
            A dict ``{"Section<s>": {"Table<i>": text}}``. Within ``text``,
            cells are separated by a tab and each row ends with a newline;
            every paragraph of a cell is followed by a single space.

        reference: https://www.e-iceblue.com/Tutorials/Python/Spire.Doc-for-Python/Program-Guide/Table/Python-Extract-Tables-from-Word-Documents.html
        """
        doc = self._loadDocument()
        try:
            result = {}
            for s in range(doc.Sections.Count):
                sectionKey = "Section" + str(s)
                result[sectionKey] = {}
                tables = doc.Sections.get_Item(s).Tables
                for i in range(tables.Count):
                    table = tables.get_Item(i)
                    rowLines = []
                    for j in range(table.Rows.Count):
                        cells = table.Rows.get_Item(j).Cells
                        cellTexts = [
                            ''.join(text + ' ' for text in self._paragraphTexts(cells.get_Item(k)))
                            for k in range(cells.Count)
                        ]
                        rowLines.append('\t'.join(cellTexts) + '\n')
                    result[sectionKey]["Table" + str(i)] = ''.join(rowLines)
            return result
        finally:
            doc.Close()

    def extractTableAsList(self):
        """Return every table as a list of rows of stripped cell strings.

        Returns:
            A flat list with one entry per table (across all sections); each
            entry is a list of rows and each row a list of cell strings.
            Paragraphs inside a cell are stripped and joined with one space.
        """
        doc = self._loadDocument()
        try:
            tables = []
            for s in range(doc.Sections.Count):
                sectionTables = doc.Sections.get_Item(s).Tables
                for i in range(sectionTables.Count):
                    table = sectionTables.get_Item(i)
                    table_data = []
                    for row in range(table.Rows.Count):
                        cells = table.Rows.get_Item(row).Cells
                        row_data = [
                            ' '.join(
                                text.strip()
                                for text in self._paragraphTexts(cells.get_Item(c))
                            ).strip()
                            for c in range(cells.Count)
                        ]
                        table_data.append(row_data)
                    tables.append(table_data)
            return tables
        finally:
            doc.Close()

    def extractTableAsExcel(self):
        """Write every table to its own worksheet and save an .xlsx file.

        Worksheets are named ``Table_<section>_<table>`` (1-based). The
        workbook is saved as ``<document name>.xlsx`` in ``saveFolder``, or
        in the built-in default directory when ``saveFolder`` is None.

        Returns:
            The path of the saved workbook, or the string
            ``"No table found on word doc"`` when building or saving the
            workbook fails.

        reference: https://www.e-iceblue.com/Tutorials/Python/Spire.Doc-for-Python/Program-Guide/Table/Python-Extract-Tables-from-Word-Documents.html
        """
        doc = self._loadDocument()
        wb = None
        try:
            wb = Workbook()
            wb.Worksheets.Clear()

            for i in range(doc.Sections.Count):
                section = doc.Sections.get_Item(i)
                for j in range(section.Tables.Count):
                    table = section.Tables.get_Item(j)
                    ws = wb.Worksheets.Add(f'Table_{i+1}_{j+1}')
                    for row in range(table.Rows.Count):
                        tableRow = table.Rows.get_Item(row)
                        for cell in range(tableRow.Cells.Count):
                            tableCell = tableRow.Cells.get_Item(cell)
                            cellText = ''.join(
                                text + ' ' for text in self._paragraphTexts(tableCell)
                            )
                            # Worksheet rows and columns are addressed 1-based here.
                            ws.SetCellValue(row + 1, cell + 1, cellText)

            name = self.wordDoc.split("/")[-1]
            if self.saveFolder is None:
                nameFile = self._DEFAULT_EXCEL_DIR + name + ".xlsx"
            else:
                nameFile = self.saveFolder + '/' + name + ".xlsx"
            wb.SaveToFile(nameFile, FileFormat.Version2016)
            return nameFile
        except Exception:
            # Callers rely on this sentinel string rather than an exception.
            return self._NO_TABLE_MESSAGE
        finally:
            # Release both objects on the failure path as well.
            doc.Close()
            if wb is not None:
                wb.Dispose()

    def getReference(self):
        """Placeholder; not implemented."""
        pass

    def getSupMaterial(self):
        """Placeholder; not implemented."""
        pass