File size: 4,005 Bytes
0174b8e
 
 
3cab13c
 
0174b8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3cab13c
 
0174b8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3cab13c
 
 
 
0174b8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""
1. 完成了多个文件类型的解析,包括pdf, docx, xlsx, csv, json等。
1. csv,json, xls, xlxs, docx, pdf文件,直接读取文件内容。
    2. 目前docx模块在huggingface的python3.10报错。暂时不支持docx文件。
    
"""
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import re
from re import sub
import smtplib
import matplotlib.pyplot as plt
from itertools import product
from tqdm import tqdm_notebook, tqdm, trange
import time
import pretty_errors
import seaborn as sns
from matplotlib.pyplot import style
from rich import print
import warnings
warnings.filterwarnings('ignore')
# style.use('seaborn')
# import docx ## read docx file. from docx import Document
# from docx import Document
import pandas as pd
import PyPDF2

### Parse a file and return its content as text. Handles pdf, xlsx/xls, csv and json (docx is currently disabled).
def parser(file) -> str:
    """Return the text content of an uploaded file, selected by its extension.

    The file type is detected from the suffix of ``file.name``
    (case-insensitive). Supported types are pdf, xlsx/xls/xlsm, csv and json.

    Args:
        file: A binary file-like object exposing a ``name`` attribute. The
            csv and json branches additionally call ``file.getvalue()``
            (as provided by ``io.BytesIO``-style upload objects such as the
            ones Streamlit hands out), because the raw bytes are needed
            regardless of the current stream position.

    Returns:
        The extracted text. PDF pages are concatenated, Excel and JSON data
        are rendered with ``DataFrame.to_string()``, CSV is returned as the
        decoded raw text. An empty string is returned for any other type.
    """
    # Local import keeps this function self-contained; pandas expects a
    # file-like object rather than a literal JSON string.
    from io import StringIO

    # Inspect the *name*, never the file object itself: ``'.xlsx' in file``
    # would iterate over (and exhaust) the byte stream instead of testing
    # the file name.
    file_name = file.name.lower()

    if file_name.endswith('.pdf'):
        print('PDF file detected')
        pdf_reader = PyPDF2.PdfReader(file)
        # Tolerate pages for which no text is returned.
        return ''.join(page.extract_text() or '' for page in pdf_reader.pages)

    # NOTE: docx support is disabled because the docx module fails under
    # Python 3.10 on Hugging Face. When re-enabled, the content would be the
    # space-joined paragraph texts of ``Document(file)``.

    if file_name.endswith(('.xlsx', '.xls', '.xlsm')):
        print('Excel file detected')
        return pd.read_excel(file).to_string()

    if file_name.endswith('.csv'):
        print('CSV file detected')
        # The raw decoded text is returned as-is (no DataFrame round trip).
        return file.getvalue().decode('utf-8')

    if file_name.endswith('.json'):
        # ``getvalue()`` is used instead of ``read()`` so the result does not
        # depend on the current stream position.
        json_text = file.getvalue().decode('utf-8')
        return pd.read_json(StringIO(json_text)).to_string()

    return ''

# res_1 = parser('summary_qwen.csv')
# print(res_1) ## pass csv file

# res_2 = parser('/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/Coding/text_mining/训练数据/13011800166202403111112051850.json')
# print(res_2) ## pass json file

# res_3 = parser('/Users/yunshi/Downloads/360Data/Data Center/Consulting Material/第二份资料/2 (H)/北大纵横/北大纵横2/北大纵横—-涟钢团ERP管理咨询项目组织结构设计与主业务流程设计报告/过程文件/涟钢资料/8)公司岗位设置/2002年新定员库2/生服公司.xls')
# print(res_3) ## pass xls file

# res_4 = parser('/Users/yunshi/Downloads/同步空间/LLM/2023ChatGPT/Coding/code_interpreter/rawdata/模拟数据.xlsx')
# print(res_4) ## pass xlsx file

# res_5 = parser('/Users/yunshi/Downloads/360Data/Data Center/Business Force/Project/中移在线/客户资料/201806 广州中心基线评审申报资料/各类附件/1.4 信息系统规划-1.docx')
# print(res_5) ## pass docx file

# res_6 = parser('/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/Coding/gradio/中交建/产品演示DEMO/在线国产大模型演示与测试站点.pdf')
# print(res_6) ## pass pdf file