efdemo001A

Sleeping

allinaigc committed on Jun 16, 2024

Commit

3cab13c

verified ·

1 Parent(s): 59c301a

Upload 2 files

Files changed (2) hide show

app.py CHANGED Viewed

@@ -307,8 +307,9 @@ def upload_file(uploaded_file):
 ## streamlit中显示上传文件的模块
 try:
     uploaded_file = st.file_uploader(
-        "选择需要处理的文件（注：可一次选择多个文件）", type=(["txt", "docx", "PDF", "CSV", "xlsx","xls","json"]), accept_multiple_files=True)
     ## 获得上传所有文件的大小。
     uploaded_filesize = round(sum(file.size for file in uploaded_file) / 1000, 2)

 ## streamlit中显示上传文件的模块
 try:
+    ### 目前docx模块在huggingface的python3.10报错。暂时不支持docx文件。
     uploaded_file = st.file_uploader(
+        "选择需要处理的文件（注：可一次选择多个文件）", type=(["txt", "PDF", "CSV", "xlsx","xls","json"]), accept_multiple_files=True)
     ## 获得上传所有文件的大小。
     uploaded_filesize = round(sum(file.size for file in uploaded_file) / 1000, 2)

st_data_parser.py CHANGED Viewed

@@ -1,7 +1,8 @@
 """
 1. 完成了多个文件类型的解析，包括pdf, docx, xlsx, csv, json等。
 1. csv，json, xls, xlxs, docx, pdf文件，直接读取文件内容。
 """
 # -*- coding: utf-8 -*-
 import numpy as np
@@ -20,8 +21,8 @@ from rich import print
 import warnings
 warnings.filterwarnings('ignore')
 # style.use('seaborn')
-import docx ## read docx file. from docx import Document
-from docx import Document
 import pandas as pd
 import PyPDF2
@@ -41,10 +42,10 @@ def parser(file):
             file_content += page_obj.extract_text()
         # pdf_file_obj.close()
-    elif '.docx' in file.name:
-        print('Microsoft Word file detected')
-        doc = Document(file) ## 这里streamlit中的上传格式与普通格式一致。
-        file_content = ' '.join([paragraph.text for paragraph in doc.paragraphs])
     # elif '.xlsx' in file or '.xls' in file:
     elif '.xlsx' in file or '.xls' in file.name:

 """
 1. 完成了多个文件类型的解析，包括pdf, docx, xlsx, csv, json等。
 1. csv，json, xls, xlxs, docx, pdf文件，直接读取文件内容。
+    2. 目前docx模块在huggingface的python3.10报错。暂时不支持docx文件。
 """
 # -*- coding: utf-8 -*-
 import numpy as np
 import warnings
 warnings.filterwarnings('ignore')
 # style.use('seaborn')
+# import docx ## read docx file. from docx import Document
+# from docx import Document
 import pandas as pd
 import PyPDF2
             file_content += page_obj.extract_text()
         # pdf_file_obj.close()
+    # elif '.docx' in file.name:
+    #     print('Microsoft Word file detected')
+    #     doc = Document(file) ## 这里streamlit中的上传格式与普通格式一致。
+    #     file_content = ' '.join([paragraph.text for paragraph in doc.paragraphs])
     # elif '.xlsx' in file or '.xls' in file:
     elif '.xlsx' in file or '.xls' in file.name: