allinaigc committed on
Commit
3cab13c
1 Parent(s): 59c301a

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +2 -1
  2. st_data_parser.py +8 -7
app.py CHANGED
@@ -307,8 +307,9 @@ def upload_file(uploaded_file):
307
 
308
  ## streamlit中显示上传文件的模块
309
  try:
 
310
  uploaded_file = st.file_uploader(
311
- "选择需要处理的文件(注:可一次选择多个文件)", type=(["txt", "docx", "PDF", "CSV", "xlsx","xls","json"]), accept_multiple_files=True)
312
 
313
  ## 获得上传所有文件的大小。
314
  uploaded_filesize = round(sum(file.size for file in uploaded_file) / 1000, 2)
 
307
 
308
  ## streamlit中显示上传文件的模块
309
  try:
310
+ ### 目前docx模块在huggingface的python3.10报错。暂时不支持docx文件。
311
  uploaded_file = st.file_uploader(
312
+ "选择需要处理的文件(注:可一次选择多个文件)", type=(["txt", "PDF", "CSV", "xlsx","xls","json"]), accept_multiple_files=True)
313
 
314
  ## 获得上传所有文件的大小。
315
  uploaded_filesize = round(sum(file.size for file in uploaded_file) / 1000, 2)
st_data_parser.py CHANGED
@@ -1,7 +1,8 @@
1
  """
2
  1. 完成了多个文件类型的解析,包括pdf, docx, xlsx, csv, json等。
3
  1. csv,json, xls, xlsx, docx, pdf文件,直接读取文件内容。
4
-
 
5
  """
6
  # -*- coding: utf-8 -*-
7
  import numpy as np
@@ -20,8 +21,8 @@ from rich import print
20
  import warnings
21
  warnings.filterwarnings('ignore')
22
  # style.use('seaborn')
23
- import docx ## read docx file. from docx import Document
24
- from docx import Document
25
  import pandas as pd
26
  import PyPDF2
27
 
@@ -41,10 +42,10 @@ def parser(file):
41
  file_content += page_obj.extract_text()
42
  # pdf_file_obj.close()
43
 
44
- elif '.docx' in file.name:
45
- print('Microsoft Word file detected')
46
- doc = Document(file) ## 这里streamlit中的上传格式与普通格式一致。
47
- file_content = ' '.join([paragraph.text for paragraph in doc.paragraphs])
48
 
49
  # elif '.xlsx' in file or '.xls' in file:
50
  elif '.xlsx' in file or '.xls' in file.name:
 
1
  """
2
  1. 完成了多个文件类型的解析,包括pdf, docx, xlsx, csv, json等。
3
  1. csv,json, xls, xlsx, docx, pdf文件,直接读取文件内容。
4
+ 2. 目前docx模块在huggingface的python3.10报错。暂时不支持docx文件。
5
+
6
  """
7
  # -*- coding: utf-8 -*-
8
  import numpy as np
 
21
  import warnings
22
  warnings.filterwarnings('ignore')
23
  # style.use('seaborn')
24
+ # import docx ## read docx file. from docx import Document
25
+ # from docx import Document
26
  import pandas as pd
27
  import PyPDF2
28
 
 
42
  file_content += page_obj.extract_text()
43
  # pdf_file_obj.close()
44
 
45
+ # elif '.docx' in file.name:
46
+ # print('Microsoft Word file detected')
47
+ # doc = Document(file) ## 这里streamlit中的上传格式与普通格式一致。
48
+ # file_content = ' '.join([paragraph.text for paragraph in doc.paragraphs])
49
 
50
  # elif '.xlsx' in file or '.xls' in file:
51
  elif '.xlsx' in file or '.xls' in file.name: