Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files
- app.py +2 -1
- st_data_parser.py +8 -7
app.py
CHANGED
@@ -307,8 +307,9 @@ def upload_file(uploaded_file):
|
|
307 |
|
308 |
## streamlit中显示上传文件的模块
|
309 |
try:
|
|
|
310 |
uploaded_file = st.file_uploader(
|
311 |
-
"选择需要处理的文件(注:可一次选择多个文件)", type=(["txt", "
|
312 |
|
313 |
## 获得上传所有文件的大小。
|
314 |
uploaded_filesize = round(sum(file.size for file in uploaded_file) / 1000, 2)
|
|
|
307 |
|
308 |
## streamlit中显示上传文件的模块
|
309 |
try:
|
310 |
+
### 目前docx模块在huggingface的python3.10报错。暂时不支持docx文件。
|
311 |
uploaded_file = st.file_uploader(
|
312 |
+
"选择需要处理的文件(注:可一次选择多个文件)", type=(["txt", "PDF", "CSV", "xlsx","xls","json"]), accept_multiple_files=True)
|
313 |
|
314 |
## 获得上传所有文件的大小。
|
315 |
uploaded_filesize = round(sum(file.size for file in uploaded_file) / 1000, 2)
|
st_data_parser.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1 |
"""
|
2 |
1. 完成了多个文件类型的解析,包括pdf, docx, xlsx, csv, json等。
|
3 |
1. csv,json, xls, xlsx, docx, pdf文件,直接读取文件内容。
|
4 |
-
|
|
|
5 |
"""
|
6 |
# -*- coding: utf-8 -*-
|
7 |
import numpy as np
|
@@ -20,8 +21,8 @@ from rich import print
|
|
20 |
import warnings
|
21 |
warnings.filterwarnings('ignore')
|
22 |
# style.use('seaborn')
|
23 |
-
import docx ## read docx file. from docx import Document
|
24 |
-
from docx import Document
|
25 |
import pandas as pd
|
26 |
import PyPDF2
|
27 |
|
@@ -41,10 +42,10 @@ def parser(file):
|
|
41 |
file_content += page_obj.extract_text()
|
42 |
# pdf_file_obj.close()
|
43 |
|
44 |
-
elif '.docx' in file.name:
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
|
49 |
# elif '.xlsx' in file or '.xls' in file:
|
50 |
elif '.xlsx' in file or '.xls' in file.name:
|
|
|
1 |
"""
|
2 |
1. 完成了多个文件类型的解析,包括pdf, docx, xlsx, csv, json等。
|
3 |
1. csv,json, xls, xlsx, docx, pdf文件,直接读取文件内容。
|
4 |
+
2. 目前docx模块在huggingface的python3.10报错。暂时不支持docx文件。
|
5 |
+
|
6 |
"""
|
7 |
# -*- coding: utf-8 -*-
|
8 |
import numpy as np
|
|
|
21 |
import warnings
|
22 |
warnings.filterwarnings('ignore')
|
23 |
# style.use('seaborn')
|
24 |
+
# import docx ## read docx file. from docx import Document
|
25 |
+
# from docx import Document
|
26 |
import pandas as pd
|
27 |
import PyPDF2
|
28 |
|
|
|
42 |
file_content += page_obj.extract_text()
|
43 |
# pdf_file_obj.close()
|
44 |
|
45 |
+
# elif '.docx' in file.name:
|
46 |
+
# print('Microsoft Word file detected')
|
47 |
+
# doc = Document(file) ## 这里streamlit中的上传格式与普通格式一致。
|
48 |
+
# file_content = ' '.join([paragraph.text for paragraph in doc.paragraphs])
|
49 |
|
50 |
# elif '.xlsx' in file or '.xls' in file:
|
51 |
elif '.xlsx' in file or '.xls' in file.name:
|