allinaigc commited on
Commit
0174b8e
1 Parent(s): eefa3af

Upload st_data_parser.py

Browse files
Files changed (1) hide show
  1. st_data_parser.py +93 -0
st_data_parser.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 1. 完成了多个文件类型的解析,包括pdf, docx, xlsx, csv, json等。
3
+ 2. csv, json, xls, xlsx, docx, pdf文件,直接读取文件内容。
4
+
5
+ """
6
+ # -*- coding: utf-8 -*-
7
+ import numpy as np
8
+ import pandas as pd
9
+ import re
10
+ from re import sub
11
+ import smtplib
12
+ import matplotlib.pyplot as plt
13
+ from itertools import product
14
+ from tqdm import tqdm_notebook, tqdm, trange
15
+ import time
16
+ import pretty_errors
17
+ import seaborn as sns
18
+ from matplotlib.pyplot import style
19
+ from rich import print
20
+ import warnings
21
+ warnings.filterwarnings('ignore')
22
+ # style.use('seaborn')
23
+ import docx ## read docx file. from docx import Document
24
+ from docx import Document
25
+ import pandas as pd
26
+ import PyPDF2
27
+
28
+ ### 解析文件,返回文件内容,包括pdf, docx, xlsx, csv, json等。
29
def parser(file):
    """Return the text content of an uploaded file, chosen by its extension.

    The file type is decided from the extension at the end of ``file.name``,
    compared case-insensitively. Supported types are PDF, DOCX, Excel
    (``.xlsx``/``.xls``/``.xlsm``), CSV and JSON.

    Args:
        file: A binary file-like object exposing a ``name`` attribute and a
            ``getvalue()`` method. The original comments indicate this is a
            Streamlit uploaded file -- confirm against the caller.

    Returns:
        str: The extracted text. PDF pages are concatenated, DOCX paragraphs
        are joined with single spaces, Excel and JSON data are rendered with
        ``DataFrame.to_string()``, and CSV is returned as the raw UTF-8
        decoded text. An empty string is returned for unrecognized
        extensions.

    Raises:
        UnicodeDecodeError: If a CSV or JSON upload is not valid UTF-8.
    """
    import io  # local import: only the JSON branch needs it

    # Inspect only the file *name*. Testing ``'.xlsx' in file`` against the
    # file object itself iterates the stream, never matches, and leaves the
    # read position at end-of-file.
    file_name = file.name.lower()

    if file_name.endswith('.pdf'):
        print('PDF file detected')
        pdf_reader = PyPDF2.PdfReader(file)
        # Guard against pages for which no text can be extracted.
        return ''.join(
            (page.extract_text() or '') for page in pdf_reader.pages
        )

    if file_name.endswith('.docx'):
        print('Microsoft Word file detected')
        # The uploaded object can be passed directly, like a regular file.
        doc = Document(file)
        return ' '.join(paragraph.text for paragraph in doc.paragraphs)

    if file_name.endswith(('.xlsx', '.xls', '.xlsm')):
        print('Excel file detected')
        # The uploaded object can be passed directly, like a regular file.
        return pd.read_excel(file).to_string()

    if file_name.endswith('.csv'):
        print('CSV file detected')
        # Uploaded content is read via getvalue(), not from a path on disk.
        return file.getvalue().decode('utf-8')

    if file_name.endswith('.json'):
        # Use getvalue() rather than read(): the original notes that read()
        # does not work for the uploaded object here.
        json_text = file.getvalue().decode('utf-8')
        # Wrap the text in StringIO: passing a literal JSON string to
        # read_json is deprecated in recent pandas releases.
        return pd.read_json(io.StringIO(json_text)).to_string()

    # Unrecognized extension: nothing to extract.
    return ''
76
+
77
+ # res_1 = parser('summary_qwen.csv')
78
+ # print(res_1) ## pass csv file
79
+
80
+ # res_2 = parser('/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/Coding/text_mining/训练数据/13011800166202403111112051850.json')
81
+ # print(res_2) ## pass json file
82
+
83
+ # res_3 = parser('/Users/yunshi/Downloads/360Data/Data Center/Consulting Material/第二份资料/2 (H)/北大纵横/北大纵横2/北大纵横—-涟钢团ERP管理咨询项目组织结构设计与主业务流程设计报告/过程文件/涟钢资料/8)公司岗位设置/2002年新定员库2/生服公司.xls')
84
+ # print(res_3) ## pass xls file
85
+
86
+ # res_4 = parser('/Users/yunshi/Downloads/同步空间/LLM/2023ChatGPT/Coding/code_interpreter/rawdata/模拟数据.xlsx')
87
+ # print(res_4) ## pass xlsx file
88
+
89
+ # res_5 = parser('/Users/yunshi/Downloads/360Data/Data Center/Business Force/Project/中移在线/客户资料/201806 广州中心基线评审申报资料/各类附件/1.4 信息系统规划-1.docx')
90
+ # print(res_5) ## pass docx file
91
+
92
+ # res_6 = parser('/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/Coding/gradio/中交建/产品演示DEMO/在线国产大模型演示与测试站点.pdf')
93
+ # print(res_6) ## pass pdf file