# Spaces: VyLala / mtDNALocation (Running)
# File size: 7,858 Bytes

#! pip install spire.doc
#! pip install Spire.XLS
import os

import pandas as pd
from spire.doc import *
from spire.doc.common import *
from spire.xls import *
from spire.xls.common import *
import requests

from NER import cleanText
class wordDoc():  # backed by Spire.Doc / Spire.XLS (not python-docx)
    """Extract text and tables from a Word document with Spire.Doc.

    ``wordDoc`` may be a local path or a URL. Every extractor first tries to
    load it as a local file; if that fails for any reason, the value is
    treated as a URL, downloaded into the target folder and loaded from the
    downloaded copy.

    Args:
        wordDoc: Local path or URL of the Word document.
        saveFolder: Folder that receives downloaded copies and generated
            Excel files. ``None`` selects ``_DEFAULT_OUTPUT_DIR``.
    """

    # Folder used when ``saveFolder`` is None (value kept from the original code).
    _DEFAULT_OUTPUT_DIR = '/content/drive/MyDrive/CollectData/NER/excel/TestExamples/output/'
    # Value returned by extractTableAsExcel() when no workbook could be produced.
    _NO_TABLE_MESSAGE = "No table found on word doc"
    # Seconds to wait on the server (connect / between bytes) while downloading.
    _DOWNLOAD_TIMEOUT = 60

    def __init__(self, wordDoc, saveFolder):
        self.wordDoc = wordDoc
        self.saveFolder = saveFolder

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _fileName(self) -> str:
        """Return the last ``/``-separated component of ``self.wordDoc``."""
        return self.wordDoc.split("/")[-1]

    def _targetFolder(self):
        """Return the folder used for downloads and generated files."""
        if self.saveFolder is None:
            return self._DEFAULT_OUTPUT_DIR
        return self.saveFolder

    def _downloadDocument(self) -> str:
        """Download ``self.wordDoc`` into the target folder; return the path.

        Raises:
            requests.RequestException: The request failed or the server
                answered with an HTTP error status. Raising here avoids
                saving an error page and parsing it as a Word document.
        """
        response = requests.get(self.wordDoc, timeout=self._DOWNLOAD_TIMEOUT)
        response.raise_for_status()
        localCopy = os.path.join(self._targetFolder(), self._fileName())
        with open(localCopy, "wb") as downloaded:
            downloaded.write(response.content)
        return localCopy

    def _loadDocument(self):
        """Return a loaded Spire ``Document`` for ``self.wordDoc``.

        The local load is attempted first; on any failure the document is
        downloaded and loaded from the downloaded copy. Errors from the
        download or from the second load propagate to the caller.
        """
        try:
            doc = Document()
            doc.LoadFromFile(self.wordDoc)
        except Exception:
            localCopy = self._downloadDocument()
            # Use a fresh Document: the failed load may have left the first
            # instance in an undefined state.
            doc = Document()
            doc.LoadFromFile(localCopy)
        return doc

    @staticmethod
    def _items(collection):
        """Yield each element of a collection exposing ``Count``/``get_Item``."""
        for index in range(collection.Count):
            yield collection.get_Item(index)

    @classmethod
    def _cellText(cls, cell, strip=False) -> str:
        """Return the text of a table cell, one paragraph after another.

        With ``strip=False`` every paragraph is followed by a single space
        (so the result keeps a trailing space). With ``strip=True`` each
        paragraph and the final result are stripped of surrounding whitespace.
        """
        texts = [paragraph.Text for paragraph in cls._items(cell.Paragraphs)]
        if strip:
            return "".join(text.strip() + " " for text in texts).strip()
        return "".join(text + " " for text in texts)

    def _saveTablesToWorkbook(self, doc, outputPath) -> None:
        """Write every table of ``doc`` to its own worksheet and save the file.

        Worksheets are named ``Table_<section>_<table>`` (both 1-based). The
        workbook is disposed whether or not saving succeeds.
        """
        workbook = Workbook()
        try:
            # Drop the default worksheets so only table sheets remain.
            workbook.Worksheets.Clear()
            for i, section in enumerate(self._items(doc.Sections)):
                for j, table in enumerate(self._items(section.Tables)):
                    sheet = workbook.Worksheets.Add(f'Table_{i+1}_{j+1}')
                    for r, row in enumerate(self._items(table.Rows)):
                        for c, cell in enumerate(self._items(row.Cells)):
                            # Worksheet rows/columns are addressed from 1.
                            sheet.SetCellValue(r + 1, c + 1, self._cellText(cell))
            workbook.SaveToFile(outputPath, FileFormat.Version2016)
        finally:
            workbook.Dispose()

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def openFile(self):
        """Load the document and return the Spire ``Document`` object.

        The caller owns the returned document and should ``Close()`` it.
        """
        return self._loadDocument()

    def extractTextByPage(self) -> str:
        """Return the text of the whole document as one string.

        Despite the name, the text is not split by page: it is whatever
        ``Document.GetText()`` returns.
        reference: https://medium.com/@alice.yang_10652/extract-text-from-word-documents-with-python-a-comprehensive-guide-95a67e23c35c
        """
        doc = self._loadDocument()
        try:
            return doc.GetText()
        finally:
            doc.Close()

    def extractTableAsText(self) -> dict:
        """Return every table as tab/newline-separated text.

        Returns:
            ``{"Section<s>": {"Table<i>": text}}`` with 0-based indices. Each
            section gets an entry even when it holds no table. In ``text``,
            cells are separated by a tab and every row ends with a newline.

        reference:
        https://www.e-iceblue.com/Tutorials/Python/Spire.Doc-for-Python/Program-Guide/Table/Python-Extract-Tables-from-Word-Documents.html
        """
        tablesBySection = {}
        doc = self._loadDocument()
        try:
            for s, section in enumerate(self._items(doc.Sections)):
                sectionTables = {}
                tablesBySection["Section" + str(s)] = sectionTables
                for i, table in enumerate(self._items(section.Tables)):
                    rows = []
                    for row in self._items(table.Rows):
                        cells = [self._cellText(cell) for cell in self._items(row.Cells)]
                        rows.append("\t".join(cells) + "\n")
                    sectionTables["Table" + str(i)] = "".join(rows)
        finally:
            doc.Close()
        return tablesBySection

    def extractTableAsList(self) -> list:
        """Return every table as a list of rows, each row a list of cell strings.

        Tables of all sections are returned in document order in one flat
        list; cell texts are whitespace-stripped.
        """
        tables = []
        doc = self._loadDocument()
        try:
            for section in self._items(doc.Sections):
                for table in self._items(section.Tables):
                    tables.append([
                        [self._cellText(cell, strip=True) for cell in self._items(row.Cells)]
                        for row in self._items(table.Rows)
                    ])
        finally:
            doc.Close()
        return tables

    def extractTableAsExcel(self) -> str:
        """Export every table to one worksheet of an ``.xlsx`` workbook.

        The workbook is saved as ``<target folder>/<document name>.xlsx``.

        Returns:
            The path of the saved workbook, or the string
            ``"No table found on word doc"`` when building or saving the
            workbook fails for any reason (the original contract; presumably
            this covers documents without tables - not verified here).

        Raises:
            Whatever loading/downloading the document raises; only workbook
            errors are converted into the message above.

        reference:
        https://www.e-iceblue.com/Tutorials/Python/Spire.Doc-for-Python/Program-Guide/Table/Python-Extract-Tables-from-Word-Documents.html
        """
        doc = self._loadDocument()
        try:
            try:
                outputPath = os.path.join(self._targetFolder(), self._fileName() + ".xlsx")
                self._saveTablesToWorkbook(doc, outputPath)
            finally:
                # Release the document even when the export fails.
                doc.Close()
        except Exception:
            return self._NO_TABLE_MESSAGE
        return outputPath

    def getReference(self):
        """Placeholder: not implemented yet, returns ``None``."""
        return None

    def getSupMaterial(self):
        """Placeholder: not implemented yet, returns ``None``."""
        return None