Marathon23 committed on
Commit
5d1e7cd
·
verified ·
1 Parent(s): d3ebb94

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +233 -0
app.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+
3
+ import os
4
+ import openai
5
+ import gradio as gr
6
+ from langchain.chains import ConversationalRetrievalChain
7
+ from langchain.text_splitter import CharacterTextSplitter
8
+ from langchain.document_loaders import PyMuPDFLoader, PyPDFLoader
9
+ from langchain.vectorstores import Chroma
10
+ from langchain.embeddings.openai import OpenAIEmbeddings
11
+ from langchain.chat_models import ChatOpenAI
12
+ import shutil # 用於文件複製
13
+
14
# Read the OpenAI API key from the environment and fail fast when it is absent,
# so a misconfigured deployment stops at startup instead of on the first query.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    # The message names the exact environment variable that is read above.
    raise ValueError("未能獲取 OPENAI_API_KEY。請在 Hugging Face Spaces 的 Secrets 中設置它。")
openai.api_key = api_key
print("OpenAI API 密鑰已設置。")

# Working directory: uploaded PDFs are copied here and the same path is passed
# to Chroma as its persist directory.
VECTORDB_DIR = os.path.abspath("./data")
os.makedirs(VECTORDB_DIR, exist_ok=True)
try:
    os.chmod(VECTORDB_DIR, 0o755)
except OSError as e:
    # Best effort only: a pre-existing directory owned by another user cannot
    # be re-moded, and that alone should not prevent the app from starting.
    print(f"Could not change permissions on {VECTORDB_DIR}: {e}")
print(f"VECTORDB_DIR set to: {VECTORDB_DIR}")
26
+
27
+ # 定義測試 PDF 加載器的函數
28
def test_pdf_loader(file_path, loader_type='PyMuPDFLoader'):
    """Diagnostic helper: try to load one PDF and print what was found.

    Nothing is returned and no exception escapes; every outcome (success,
    empty result, unknown loader, load error) is reported with ``print``.

    Args:
        file_path: Path of the PDF file to load.
        loader_type: ``'PyMuPDFLoader'`` or ``'PyPDFLoader'``; any other value
            is reported as unknown and the function returns immediately.
    """
    print(f"Testing PDF loader ({loader_type}) with file: {file_path}")

    # Validate the loader name up front so the try block below only wraps
    # code that can actually fail.
    if loader_type not in ('PyMuPDFLoader', 'PyPDFLoader'):
        print(f"Unknown loader type: {loader_type}")
        return

    try:
        if loader_type == 'PyMuPDFLoader':
            loader = PyMuPDFLoader(file_path)
        else:
            loader = PyPDFLoader(file_path)
        loaded_docs = loader.load()
        if loaded_docs:
            print(f"Successfully loaded {file_path} with {len(loaded_docs)} documents.")
            print(f"Document content (first 500 chars): {loaded_docs[0].page_content[:500]}")
        else:
            print(f"No documents loaded from {file_path}.")
    except Exception as e:
        # Purely diagnostic: report the failure instead of propagating it.
        print(f"Error loading {file_path} with {loader_type}: {e}")
46
+
47
+ # 定義載入和處理 PDF 文件的函數
48
def load_and_process_documents(file_paths, loader_type='PyMuPDFLoader'):
    """Load PDFs, split them into chunks, and index the chunks in Chroma.

    Files that do not exist, fail to load, or yield no documents are reported
    with ``print`` and skipped; processing continues with the remaining files.

    Args:
        file_paths: Iterable of PDF paths. ``None`` is treated as empty.
        loader_type: ``'PyMuPDFLoader'`` or ``'PyPDFLoader'``. Any other value
            causes every file to be skipped.

    Returns:
        The Chroma vector store built from the split documents, persisted
        under ``VECTORDB_DIR``.

    Raises:
        ValueError: If no document could be loaded, if splitting produced no
            chunks, or if creating the embeddings / vector store failed (the
            underlying exception is chained as ``__cause__``).
    """
    documents = []
    print("開始載入上傳的 PDF 文件。")

    for file_path in file_paths or []:
        print(f"載入 PDF 文件: {file_path}")
        if not os.path.exists(file_path):
            print(f"文件不存在: {file_path}")
            continue
        try:
            if loader_type == 'PyMuPDFLoader':
                loader = PyMuPDFLoader(file_path)
            elif loader_type == 'PyPDFLoader':
                loader = PyPDFLoader(file_path)
            else:
                print(f"Unknown loader type: {loader_type}")
                continue
            loaded_docs = loader.load()
            if loaded_docs:
                print(f"載入 {file_path} 成功,包含 {len(loaded_docs)} 個文檔。")
                # Print the start of the first document as a sanity check.
                print(f"第一個文檔內容: {loaded_docs[0].page_content[:500]}")
                documents.extend(loaded_docs)
            else:
                print(f"載入 {file_path} 但未找到任何文檔。")
        except Exception as e:
            # One unreadable file must not abort the whole batch.
            print(f"載入 {file_path} 時出現錯誤: {e}")

    if not documents:
        raise ValueError("沒有找到任何 PDF 文件或 PDF 文件無法載入。")
    print(f"總共載入了 {len(documents)} 個文檔。")

    # Split long texts into chunks before embedding.
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    documents = text_splitter.split_documents(documents)
    print(f"分割後的文檔數量: {len(documents)}")

    if not documents:
        raise ValueError("分割後的文檔列表為空。請檢查 PDF 文件內容。")

    # Build the embedding client; the API key is passed explicitly.
    try:
        embeddings = OpenAIEmbeddings(openai_api_key=api_key)
        print("初始化 OpenAIEmbeddings 成功。")
    except Exception as e:
        raise ValueError(f"初始化 OpenAIEmbeddings 時出現錯誤: {e}") from e

    # NOTE(review): every call writes into the same persist directory, so
    # chunks from earlier uploads may accumulate there -- confirm whether
    # that is intended.
    try:
        vectordb = Chroma.from_documents(
            documents,
            embedding=embeddings,
            persist_directory=VECTORDB_DIR,
        )
        print("初始化 Chroma 向量資料庫成功。")
    except Exception as e:
        raise ValueError(f"初始化 Chroma 向量資料庫時出現錯誤: {e}") from e

    return vectordb
107
+
108
+ # 定義聊天處理函數
109
def handle_query(user_message, chat_history, vectordb):
    """Answer one user question against the vector store and extend the history.

    Args:
        user_message: The question typed by the user. An empty value leaves
            the history unchanged.
        chat_history: Previous ``(user, assistant)`` turns as shown in the
            chatbot; turns may be tuples or lists, and ``None`` is treated as
            an empty history.
        vectordb: Vector store providing ``as_retriever``.

    Returns:
        A new history list: the given turns followed by either the new
        ``(user_message, answer)`` turn or, on failure, a ``("系統", error)``
        turn. No exception is propagated to the caller.
    """
    # The UI may pass None before the first turn; normalise once so that both
    # the success path and the error path can safely concatenate.
    chat_history = chat_history or []

    try:
        if not user_message:
            return chat_history

        # Role instruction prepended to every question.
        preface = """
        指令: 以繁體中文回答問題,200字以內。你是一位專業心理學家與調酒師,專精於 MBTI 人格與經典調酒主題。
        非相關問題,請回應:「目前僅支援 MBTI 分析與經典調酒主題。」。
        """
        query = f"{preface} 查詢內容:{user_message}"

        # The chatbot component returns turns as lists, whereas the chain's
        # history formatter expects tuples (or message objects), so convert
        # each well-formed turn to a tuple of strings for the chain only.
        chain_history = [
            (str(turn[0] or ""), str(turn[1] or ""))
            for turn in chat_history
            if isinstance(turn, (list, tuple)) and len(turn) == 2
        ]

        # Build the retrieval chain, passing the API key explicitly.
        pdf_qa = ConversationalRetrievalChain.from_llm(
            ChatOpenAI(temperature=0.7, model="gpt-4", openai_api_key=api_key),
            retriever=vectordb.as_retriever(search_kwargs={'k': 6}),
            return_source_documents=True,
        )

        result = pdf_qa.invoke({"question": query, "chat_history": chain_history})

        # Show the user's original message (without the preface) in the UI.
        if "answer" in result:
            return chat_history + [(user_message, result["answer"])]
        return chat_history + [(user_message, "抱歉,未能獲得有效回應。")]

    except Exception as e:
        # Surface the failure inside the chat instead of crashing the UI.
        return chat_history + [("系統", f"出現錯誤: {str(e)}")]
140
+
141
+ # 定義 Gradio 的處理函數
142
def process_files(files, state):
    """Gradio handler: copy the uploaded PDFs into VECTORDB_DIR and index them.

    Args:
        files: List of uploaded file paths; empty or ``None`` means nothing
            was uploaded.
        state: Per-session dict; on success its ``'vectordb'`` entry is set
            to the newly built vector store.

    Returns:
        ``(status_message, state)``. Failures are reported in the status
        message rather than raised.
    """
    print("process_files called")
    if not files:
        return "請上傳至少一個 PDF 文件。", state

    try:
        print(f"Received {len(files)} files")
        saved_file_paths = []
        for file_path in files:
            print(f"Processing file: {file_path}")
            save_path = os.path.join(VECTORDB_DIR, os.path.basename(file_path))
            # Copy the upload into VECTORDB_DIR.
            shutil.copy(file_path, save_path)
            # Only queue files that are really present after the copy.
            if os.path.exists(save_path):
                print(f"File successfully saved to: {save_path}")
                saved_file_paths.append(save_path)
            else:
                print(f"Failed to save file to: {save_path}")

        # List everything currently stored in VECTORDB_DIR for diagnostics.
        saved_files = os.listdir(VECTORDB_DIR)
        print(f"Files in VECTORDB_DIR ({VECTORDB_DIR}): {saved_files}")

        # Each PDF is parsed exactly once here; load_and_process_documents
        # already prints per-file load results and a content preview.
        vectordb = load_and_process_documents(saved_file_paths, loader_type='PyMuPDFLoader')
        state['vectordb'] = vectordb
        return "PDF 文件已成功上傳並處理。您現在可以開始提問。", state
    except Exception as e:
        print(f"Error in process_files: {e}")
        return f"處理文件時出現錯誤: {e}", state
172
+
173
def chat_interface(user_message, chat_history, state):
    """Gradio handler for a submitted question.

    Args:
        user_message: Text from the input box.
        chat_history: Current chatbot history.
        state: Per-session dict whose ``'vectordb'`` entry holds the vector
            store, or ``None`` when nothing has been uploaded yet.

    Returns:
        ``(history, state, textbox_value)``. When no vector store is
        available the history is returned unchanged and the textbox value is
        a prompt to upload PDFs first; otherwise the history is extended via
        ``handle_query`` and the textbox value is the empty string.
    """
    vectordb = state.get('vectordb') if state else None
    # Compare against None explicitly: truthiness of a store object could
    # depend on its contents if the class defines __len__.
    if vectordb is None:
        return chat_history, state, "請先上傳 PDF 文件以進行處理。"

    # Run the query and clear the input box.
    updated_history = handle_query(user_message, chat_history, vectordb)
    return updated_history, state, ""
181
+
182
+ # 設計 Gradio 介面
183
# Build the Gradio interface: one tab for uploading PDFs, one for chatting.
with gr.Blocks() as demo:
    gr.Markdown("<h1 style='text-align: center;'>MBTI 與經典調酒 AI 助理</h1>")

    # Session state shared by both tabs; holds the vector store once built.
    state = gr.State({"vectordb": None})

    with gr.Tab("上傳 PDF 文件"):
        with gr.Row():
            with gr.Column(scale=1):
                upload = gr.File(
                    file_count="multiple",
                    file_types=[".pdf"],
                    label="上傳 PDF 文件",
                    interactive=True,
                    type="filepath",  # handlers receive plain path strings
                )
                upload_btn = gr.Button("上傳並處理")
                upload_status = gr.Textbox(label="上傳狀態", interactive=False)

    with gr.Tab("聊天機器人"):
        chatbot = gr.Chatbot()

        with gr.Row():
            # Integer scales in the same 85:15 ratio as the former
            # fractional values.
            with gr.Column(scale=85):
                txt = gr.Textbox(show_label=False, placeholder="請輸入您的問題...")
            with gr.Column(scale=15, min_width=0):
                submit_btn = gr.Button("提問")

    # The ask button and pressing Enter in the textbox trigger the same handler.
    for trigger in (submit_btn.click, txt.submit):
        trigger(
            chat_interface,
            inputs=[txt, chatbot, state],
            outputs=[chatbot, state, txt],
        )

    # Upload button: copy and index the selected PDFs.
    upload_btn.click(
        process_files,
        inputs=[upload, state],
        outputs=[upload_status, state],
    )

# Start the Gradio app.
demo.launch()