File size: 7,858 Bytes
2621d77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8835144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2621d77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
#! pip install spire.doc
#! pip install Spire.XLS
import pandas as pd
from spire.doc import *
from spire.doc.common import *
from spire.xls import *
from spire.xls.common import *
from NER import cleanText
import requests 
class wordDoc():
  """Extract text and tables from a Word document using Spire.Doc.

  ``wordDoc`` is either a local file path or an http(s) URL. When the local
  load fails and the value is a URL, the file is downloaded into
  ``saveFolder`` and loaded from there. ``saveFolder`` is also where
  ``extractTableAsExcel`` writes its workbook.
  """

  # Folder used by extractTableAsExcel when saveFolder is None.
  DEFAULT_OUTPUT_DIR = '/content/drive/MyDrive/CollectData/NER/excel/TestExamples/output/'
  # Seconds to wait on the server while downloading a remote document.
  DOWNLOAD_TIMEOUT = 60
  # Value returned by extractTableAsExcel when no workbook could be produced.
  NO_TABLE_MESSAGE = "No table found on word doc"

  def __init__(self, wordDoc, saveFolder):
    """Remember the document location and the folder used for saved files.

    Args:
      wordDoc: local path or http(s) URL of the Word document.
      saveFolder: folder for downloads and the Excel export; may be None.
    """
    self.wordDoc = wordDoc
    self.saveFolder = saveFolder

  @staticmethod
  def _isUrl(path):
    """Return True when ``path`` is a string starting with http:// or https://."""
    return isinstance(path, str) and path.lower().startswith(("http://", "https://"))

  @staticmethod
  def _items(collection):
    """Yield the elements of a Spire collection exposing Count / get_Item."""
    for index in range(collection.Count):
      yield collection.get_Item(index)

  @classmethod
  def _rawCellText(cls, cell):
    """Return the cell's paragraph texts, each followed by a single space."""
    return ''.join(paragraph.Text + ' ' for paragraph in cls._items(cell.Paragraphs))

  @classmethod
  def _tableCells(cls, table, readCell):
    """Return the table as a list of rows, each a list of ``readCell(cell)``."""
    grid = []
    for row in cls._items(table.Rows):
      grid.append([readCell(cell) for cell in cls._items(row.Cells)])
    return grid

  def _download(self):
    """Download ``self.wordDoc`` and return the local path it was saved to.

    The file is named after the last ``/``-separated component of the URL and
    stored in ``saveFolder`` (or the system temp directory when ``saveFolder``
    is None).

    Raises:
      requests.RequestException: on connection errors, timeouts, or an HTTP
        error status (so an error page is never saved as a document).
    """
    # Local imports: the module header does not import these explicitly.
    import os
    import tempfile

    folder = self.saveFolder if self.saveFolder is not None else tempfile.gettempdir()
    target = os.path.join(folder, self.wordDoc.split("/")[-1])
    response = requests.get(self.wordDoc, timeout=self.DOWNLOAD_TIMEOUT)
    response.raise_for_status()
    with open(target, "wb") as handle:
      handle.write(response.content)
    return target

  def openFile(self):
    """Load the document and return the Spire ``Document`` object.

    The location is first handed to Spire as-is. If that raises and the
    location is an http(s) URL, the file is downloaded and loaded from disk;
    for a non-URL location the original load error is re-raised.

    The caller is responsible for calling ``Close()`` on the result.
    """
    document = Document()
    try:
      document.LoadFromFile(self.wordDoc)
    except Exception:
      if not self._isUrl(self.wordDoc):
        raise
      # Start from a fresh object rather than reusing one whose load failed.
      document = Document()
      document.LoadFromFile(self._download())
    return document

  def extractTextByPage(self):
    """Return the text of the whole document as one string.

    Despite the name, the text is not split by page: this returns whatever
    ``Document.GetText()`` produces.
    reference: https://medium.com/@alice.yang_10652/extract-text-from-word-documents-with-python-a-comprehensive-guide-95a67e23c35c
    """
    doc = self.openFile()
    try:
      return doc.GetText()
    finally:
      doc.Close()

  def extractTableAsText(self):
    """Return every table as tab/newline-delimited text.

    Returns:
      dict mapping ``"Section<i>"`` to a dict mapping ``"Table<j>"`` to the
      table text. Cells are separated by a tab, every row ends with a
      newline, and every paragraph inside a cell is followed by a space.
    reference: https://www.e-iceblue.com/Tutorials/Python/Spire.Doc-for-Python/Program-Guide/Table/Python-Extract-Tables-from-Word-Documents.html
    """
    doc = self.openFile()
    try:
      result = {}
      for sectionIndex, section in enumerate(self._items(doc.Sections)):
        sectionTables = {}
        for tableIndex, table in enumerate(self._items(section.Tables)):
          rows = self._tableCells(table, self._rawCellText)
          sectionTables["Table" + str(tableIndex)] = ''.join(
              '\t'.join(cells) + '\n' for cells in rows)
        result["Section" + str(sectionIndex)] = sectionTables
      return result
    finally:
      doc.Close()

  def extractTableAsList(self):
    """Return every table in the document as a list of rows of cell strings.

    Tables from all sections are returned in one flat list. Paragraph texts
    inside a cell are stripped and joined with a single space.
    """
    def cleanCell(cell):
      texts = (paragraph.Text.strip() for paragraph in self._items(cell.Paragraphs))
      return " ".join(texts).strip()

    doc = self.openFile()
    try:
      tables = []
      for section in self._items(doc.Sections):
        for table in self._items(section.Tables):
          tables.append(self._tableCells(table, cleanCell))
      return tables
    finally:
      doc.Close()

  def extractTableAsExcel(self):
    """Write every table to its own worksheet and return the workbook path.

    Worksheets are named ``Table_<section>_<table>`` (1-based). The workbook
    is saved as ``<saveFolder>/<document name>.xlsx``, or under
    ``DEFAULT_OUTPUT_DIR`` when ``saveFolder`` is None.

    Returns:
      The path of the saved workbook, or ``"No table found on word doc"``
      when the document has no tables or the export fails.
    reference: https://www.e-iceblue.com/Tutorials/Python/Spire.Doc-for-Python/Program-Guide/Table/Python-Extract-Tables-from-Word-Documents.html
    """
    doc = self.openFile()
    wb = None
    try:
      wb = Workbook()
      wb.Worksheets.Clear()
      sheetCount = 0
      for sectionIndex, section in enumerate(self._items(doc.Sections)):
        for tableIndex, table in enumerate(self._items(section.Tables)):
          ws = wb.Worksheets.Add(f'Table_{sectionIndex+1}_{tableIndex+1}')
          sheetCount += 1
          rows = self._tableCells(table, self._rawCellText)
          for rowIndex, cells in enumerate(rows):
            for colIndex, cellText in enumerate(cells):
              # Worksheet coordinates are 1-based.
              ws.SetCellValue(rowIndex + 1, colIndex + 1, cellText)

      if sheetCount == 0:
        # Report the absence of tables explicitly instead of relying on the
        # save step failing for a workbook without worksheets.
        return self.NO_TABLE_MESSAGE

      name = self.wordDoc.split("/")[-1]
      if self.saveFolder is None:
        nameFile = self.DEFAULT_OUTPUT_DIR + name + ".xlsx"
      else:
        nameFile = self.saveFolder + '/' + name + ".xlsx"
      wb.SaveToFile(nameFile, FileFormat.Version2016)
      return nameFile
    except Exception:
      # Best-effort export: callers expect this message rather than an error.
      return self.NO_TABLE_MESSAGE
    finally:
      # Release both objects on success and failure alike.
      if wb is not None:
        wb.Dispose()
      doc.Close()

  def getReference(self):
    """Placeholder; not implemented yet (returns None)."""
    pass

  def getSupMaterial(self):
    """Placeholder; not implemented yet (returns None)."""
    pass