Spaces:

dseditor
/

PDFTojson

Sleeping

App Files Files Community

dseditor committed 4 days ago

Commit

8822081

verified ·

1 Parent(s): 718071b

Upload app.py

Browse files

Files changed (1) hide show

app.py +191 -51

app.py CHANGED Viewed

@@ -1,4 +1,12 @@
-import gradio as gr
 import requests
 import json
 import os
@@ -170,7 +178,7 @@ def extract_abstract(text: str) -> str:
     return abstract_text if abstract_text else "無摘要資訊"
-def process_json_data(json_input: str) -> str:
     """處理JSON數據，補充缺失欄位"""
     try:
         # 解析輸入的JSON
@@ -196,6 +204,9 @@ def process_json_data(json_input: str) -> str:
                 "摘要": ""
             }
             # 如果有下載位置，嘗試下載並提取資訊
             if download_url:
                 print(f"正在處理: {collection_name}")
@@ -225,8 +236,11 @@ def process_json_data(json_input: str) -> str:
                             if not author or author == "犯罪防治研究中心彙編":
                                 processed_item["作者"] = pdf_author
-                        # 設定摘要
-                        processed_item["摘要"] = extracted_info.get("abstract", "無摘要資訊")
                     finally:
                         # 清理臨時文件
@@ -235,11 +249,11 @@ def process_json_data(json_input: str) -> str:
                 else:
                     # 如果無法下載PDF，使用現有資訊
                     processed_item["名稱"] = extract_title_from_collection(collection_name)
-                    processed_item["摘要"] = "無法獲取摘要資訊"
             else:
                 # 如果沒有下載位置，使用現有資訊
                 processed_item["名稱"] = extract_title_from_collection(collection_name)
-                processed_item["摘要"] = "無下載位置，無法提取摘要"
             processed_data.append(processed_item)
@@ -249,27 +263,51 @@ def process_json_data(json_input: str) -> str:
     except Exception as e:
         return f"處理錯誤: {str(e)}"
-def save_json_file(json_data: str, filename: str) -> str:
-    """保存JSON文件"""
     try:
-        if not filename:
             filename = "processed_data.json"
         if not filename.endswith('.json'):
             filename += '.json'
         # 確保文件名安全
-        filename = re.sub(r'[^\w\-_\.]', '_', filename)
         with open(filename, 'w', encoding='utf-8') as f:
             f.write(json_data)
-        return f"文件已保存: {filename}"
     except Exception as e:
-        return f"保存失敗: {str(e)}"
 # PDF網址處理函數
-def process_pdf_urls(urls_text: str) -> str:
     """處理PDF網址列表���直接提取資訊"""
     try:
         # 解析網址
@@ -279,6 +317,7 @@ def process_pdf_urls(urls_text: str) -> str:
             return "請輸入至少一個PDF網址"
         processed_data = []
         for i, url in enumerate(urls, 1):
             print(f"正在處理第 {i}/{len(urls)} 個PDF: {url}")
@@ -295,7 +334,7 @@ def process_pdf_urls(urls_text: str) -> str:
                     item = {
                         "名稱": extracted_info.get("title", f"PDF文件 {i}"),
                         "作者": extracted_info.get("author", "未知作者"),
-                        "摘要": extracted_info.get("abstract", "無摘要資訊"),
                         "下載位置": url,
                         "論文集名稱": f"直接處理PDF {i}"
                     }
@@ -311,7 +350,7 @@ def process_pdf_urls(urls_text: str) -> str:
                 item = {
                     "名稱": f"無法下載的PDF {i}",
                     "作者": "未知作者",
-                    "摘要": "PDF下載失敗，無法提取摘要",
                     "下載位置": url,
                     "論文集名稱": f"處理失敗 {i}"
                 }
@@ -345,14 +384,27 @@ with gr.Blocks(title="PDF資料處理器", theme=gr.themes.Soft()) as demo:
                     json_input = gr.Textbox(
                         label="輸入JSON資料",
                         placeholder="請貼上您的JSON資料...",
-                        lines=10,
                         value='[\n    {\n        "論文集名稱": "刑事政策與犯罪防治研究36",\n        "作者": "犯罪防治研究中心彙編",\n        "下載位置": "https://www.cprc.moj.gov.tw/media/20213330/3_36%E6%9C%9F%E5%85%A8%E5%88%8A%E4%B8%8B%E8%BC%89.pdf?mediaDL=true"\n    }\n]'
                     )
                     filename_input1 = gr.Textbox(
-                        label="保存文件名",
                         placeholder="例: processed_papers.json",
-                        value="processed_papers.json"
                     )
                     process_json_btn = gr.Button("處理JSON資料", variant="primary", size="lg")
@@ -360,7 +412,7 @@ with gr.Blocks(title="PDF資料處理器", theme=gr.themes.Soft()) as demo:
                 with gr.Column(scale=2):
                     output_json1 = gr.Textbox(
                         label="處理結果",
-                        lines=20,
                         show_copy_button=True
                     )
@@ -381,14 +433,27 @@ with gr.Blocks(title="PDF資料處理器", theme=gr.themes.Soft()) as demo:
                     pdf_urls_input = gr.Textbox(
                         label="輸入PDF網址",
                         placeholder="請輸入PDF網址，每行一個...\n\n例如：\nhttps://example.com/paper1.pdf\nhttps://example.com/paper2.pdf\nhttps://example.com/paper3.pdf",
-                        lines=12,
                         value="https://www.cprc.moj.gov.tw/media/20213330/3_36%E6%9C%9F%E5%85%A8%E5%88%8A%E4%B8%8B%E8%BC%89.pdf?mediaDL=true"
                     )
                     filename_input2 = gr.Textbox(
-                        label="保存文件名",
                         placeholder="例: pdf_extracted_data.json",
-                        value="pdf_extracted_data.json"
                     )
                     process_urls_btn = gr.Button("處理PDF網址", variant="primary", size="lg")
@@ -396,7 +461,7 @@ with gr.Blocks(title="PDF資料處理器", theme=gr.themes.Soft()) as demo:
                 with gr.Column(scale=2):
                     output_json2 = gr.Textbox(
                         label="處理結果",
-                        lines=20,
                         show_copy_button=True
                     )
@@ -410,76 +475,151 @@ with gr.Blocks(title="PDF資料處理器", theme=gr.themes.Soft()) as demo:
                         visible=False
                     )
-    def process_and_save_json(json_input, filename):
         # 處理JSON資料
-        result = process_json_data(json_input)
         # 保存文件
-        save_msg = save_json_file(result, filename)
         # 如果保存成功，提供下載
-        if "已保存" in save_msg:
-            return result, save_msg, gr.update(visible=True, value=filename)
         else:
             return result, save_msg, gr.update(visible=False)
-    def process_and_save_urls(urls_input, filename):
         # 處理PDF網址
-        result = process_pdf_urls(urls_input)
         # 保存文件
-        save_msg = save_json_file(result, filename)
         # 如果保存成功，提供下載
-        if "已保存" in save_msg:
-            return result, save_msg, gr.update(visible=True, value=filename)
         else:
             return result, save_msg, gr.update(visible=False)
     # JSON處理按鈕事件
     process_json_btn.click(
         process_and_save_json,
-        inputs=[json_input, filename_input1],
         outputs=[output_json1, save_status1, download_file1]
     )
     # PDF網址處理按鈕事件
     process_urls_btn.click(
         process_and_save_urls,
-        inputs=[pdf_urls_input, filename_input2],
         outputs=[output_json2, save_status2, download_file2]
     )
     gr.Markdown("""
     ## 使用說明：
     ### JSON資料處理模式：
     1. 將現有的JSON資料貼入文本框
-    2. 系統會根據下載位置自動獲取PDF並補充缺失欄位
-    3. 適合處理已有部分資訊的資料集
     ### PDF網址直接處理模式：
     1. 直接貼入PDF網址，每行一個
-    2. 系統會自動下載並提取完整資訊
-    3. 適合批量處理新的PDF文件
-    ### 通用功能：
-    - 自動提取標題、作者、摘要
-    - 避免重複內容
-    - 生成標準化JSON格式
-    - 支援文件下載
     ## 注意事項：
-    - 處理時間取決於PDF文件大小和數量
-    - 系統會自動清理重複的標題內容
-    - 如果PDF無法下載，會使用預設資訊
     - 建議分批處理大量文件以獲得最佳效果
-    ## 範例PDF網址格式：
     ```
-    https://example.com/paper1.pdf
-    https://example.com/paper2.pdf
-    https://example.com/paper3.pdf
     ```
     """)

+    gr.Markdown("""
+    ## 使用說明：
+    ### 🆕 新功能特色：
+    - **🔄 智能檔名**: 自動使用論文集名稱或PDF標題作為JSON檔名
+    - **📝 手動摘要**: 可貼上現有摘要，避免重複提取
+    - **⚡ 高效處理**: 有摘要就不用下載PDF，節省時間
+    ### JSON資料
import gradio as gr
 import requests
 import json
 import os
     return abstract_text if abstract_text else "無摘要資訊"
+def process_json_data(json_input: str, manual_abstract: str = "") -> str:
     """處理JSON數據，補充缺失欄位"""
     try:
         # 解析輸入的JSON
                 "摘要": ""
             }
+            # 如果有手動輸入的摘要，優先使用
+            use_manual_abstract = manual_abstract.strip()
             # 如果有下載位置，嘗試下載並提取資訊
             if download_url:
                 print(f"正在處理: {collection_name}")
                             if not author or author == "犯罪防治研究中心彙編":
                                 processed_item["作者"] = pdf_author
+                        # 設定摘要：優先使用手動輸入，否則使用PDF提取的
+                        if use_manual_abstract:
+                            processed_item["摘要"] = use_manual_abstract
+                        else:
+                            processed_item["摘要"] = extracted_info.get("abstract", "無摘要資訊")
                     finally:
                         # 清理臨時文件
                 else:
                     # 如果無法下載PDF，使用現有資訊
                     processed_item["名稱"] = extract_title_from_collection(collection_name)
+                    processed_item["摘要"] = use_manual_abstract if use_manual_abstract else "無法獲取摘要資訊"
             else:
                 # 如果沒有下載位置，使用現有資訊
                 processed_item["名稱"] = extract_title_from_collection(collection_name)
+                processed_item["摘要"] = use_manual_abstract if use_manual_abstract else "無下載位置，無法提取摘要"
             processed_data.append(processed_item)
     except Exception as e:
         return f"處理錯誤: {str(e)}"
def generate_filename_from_collection(collection_name: str) -> str:
    """Build a filesystem-safe ``.json`` filename from a collection name.

    Every character that is not a word character, a CJK ideograph or a
    hyphen is replaced with ``_``; runs of underscores are collapsed and
    leading/trailing underscores are removed. The stem is capped at 50
    characters before the ``.json`` suffix is appended.

    Args:
        collection_name: Human-readable collection (or PDF) title.

    Returns:
        The sanitized filename, or ``"processed_data.json"`` when the name
        is empty or contains no usable characters.
    """
    default_name = "processed_data.json"
    max_stem_length = 50

    if not collection_name:
        return default_name

    # Replace unsafe characters, then squeeze the resulting underscore runs.
    safe_name = re.sub(r'[^\w\u4e00-\u9fff\-]', '_', collection_name)
    safe_name = re.sub(r'_+', '_', safe_name).strip('_')

    if len(safe_name) > max_stem_length:
        # Cutting can expose an underscore at the new end; drop it so the
        # result never looks like "name_.json".
        safe_name = safe_name[:max_stem_length].rstrip('_')

    # A name made only of punctuation (e.g. "///") sanitizes to nothing;
    # fall back instead of producing the hidden file ".json".
    if not safe_name:
        return default_name

    # '.' is always replaced above, so the stem can never already end in
    # ".json" and the suffix is appended unconditionally.
    return safe_name + '.json'
def save_json_file(json_data: str, filename: str = None, auto_filename: bool = False, collection_name: str = None) -> tuple:
    """Write ``json_data`` to a sanitized ``.json`` file in the working directory.

    The target name is derived from ``collection_name`` when
    ``auto_filename`` is set and a collection name is given; otherwise
    ``filename`` is used, falling back to ``processed_data.json``.

    Returns:
        A ``(status message, filename)`` pair; the filename is ``None``
        when anything went wrong.
    """
    try:
        derive_from_collection = auto_filename and collection_name
        if derive_from_collection:
            target = generate_filename_from_collection(collection_name)
        else:
            target = filename or "processed_data.json"

        if not target.endswith('.json'):
            target = target + '.json'

        # Neutralize anything that is not a word char, CJK, '-', '_' or '.'.
        target = re.sub(r'[^\w\u4e00-\u9fff\-_\.]', '_', target)

        with open(target, 'w', encoding='utf-8') as out:
            out.write(json_data)
    except Exception as e:
        return f"保存失敗: {str(e)}", None

    return f"文件已保存: {target}", target
 # PDF網址處理函數
+def process_pdf_urls(urls_text: str, manual_abstract: str = "") -> str:
     """處理PDF網址列表���直接提取資訊"""
     try:
         # 解析網址
             return "請輸入至少一個PDF網址"
         processed_data = []
+        use_manual_abstract = manual_abstract.strip()
         for i, url in enumerate(urls, 1):
             print(f"正在處理第 {i}/{len(urls)} 個PDF: {url}")
                     item = {
                         "名稱": extracted_info.get("title", f"PDF文件 {i}"),
                         "作者": extracted_info.get("author", "未知作者"),
+                        "摘要": use_manual_abstract if use_manual_abstract else extracted_info.get("abstract", "無摘要資訊"),
                         "下載位置": url,
                         "論文集名稱": f"直接處理PDF {i}"
                     }
                 item = {
                     "名稱": f"無法下載的PDF {i}",
                     "作者": "未知作者",
+                    "摘要": use_manual_abstract if use_manual_abstract else "PDF下載失敗，無法提取摘要",
                     "下載位置": url,
                     "論文集名稱": f"處理失敗 {i}"
                 }
                     json_input = gr.Textbox(
                         label="輸入JSON資料",
                         placeholder="請貼上您的JSON資料...",
+                        lines=8,
                         value='[\n    {\n        "論文集名稱": "刑事政策與犯罪防治研究36",\n        "作者": "犯罪防治研究中心彙編",\n        "下載位置": "https://www.cprc.moj.gov.tw/media/20213330/3_36%E6%9C%9F%E5%85%A8%E5%88%8A%E4%B8%8B%E8%BC%89.pdf?mediaDL=true"\n    }\n]'
                     )
+                    manual_abstract1 = gr.Textbox(
+                        label="手動輸入摘要 (選填)",
+                        placeholder="如果有摘要，請在此輸入。留空則自動從PDF中提取。",
+                        lines=4
+                    )
+                    with gr.Row():
+                        auto_filename1 = gr.Checkbox(
+                            label="自動使用論文集名稱作為檔名",
+                            value=True
+                        )
                     filename_input1 = gr.Textbox(
+                        label="自訂文件名 (僅在未勾選自動檔名時使用)",
                         placeholder="例: processed_papers.json",
+                        value="processed_papers.json",
+                        visible=False
                     )
                     process_json_btn = gr.Button("處理JSON資料", variant="primary", size="lg")
                 with gr.Column(scale=2):
                     output_json1 = gr.Textbox(
                         label="處理結果",
+                        lines=18,
                         show_copy_button=True
                     )
                     pdf_urls_input = gr.Textbox(
                         label="輸入PDF網址",
                         placeholder="請輸入PDF網址，每行一個...\n\n例如：\nhttps://example.com/paper1.pdf\nhttps://example.com/paper2.pdf\nhttps://example.com/paper3.pdf",
+                        lines=8,
                         value="https://www.cprc.moj.gov.tw/media/20213330/3_36%E6%9C%9F%E5%85%A8%E5%88%8A%E4%B8%8B%E8%BC%89.pdf?mediaDL=true"
                     )
+                    manual_abstract2 = gr.Textbox(
+                        label="手動輸入摘要 (選填)",
+                        placeholder="如果有摘要，請在此輸入。留空則自動從PDF中提取。",
+                        lines=4
+                    )
+                    with gr.Row():
+                        auto_filename2 = gr.Checkbox(
+                            label="自動使用第一個PDF標題作為檔名",
+                            value=True
+                        )
                     filename_input2 = gr.Textbox(
+                        label="自訂文件名 (僅在未勾選自動檔名時使用)",
                         placeholder="例: pdf_extracted_data.json",
+                        value="pdf_extracted_data.json",
+                        visible=False
                     )
                     process_urls_btn = gr.Button("處理PDF網址", variant="primary", size="lg")
                 with gr.Column(scale=2):
                     output_json2 = gr.Textbox(
                         label="處理結果",
+                        lines=18,
                         show_copy_button=True
                     )
                         visible=False
                     )
+    def process_and_save_json(json_input, manual_abstract, auto_filename, custom_filename):
         # 處理JSON資料
+        result = process_json_data(json_input, manual_abstract)
+        # 獲取論文集名稱用於自動檔名
+        collection_name = ""
+        try:
+            data = json.loads(json_input)
+            if isinstance(data, list) and len(data) > 0:
+                collection_name = data[0].get("論文集名稱", "")
+            elif isinstance(data, dict):
+                collection_name = data.get("論文集名稱", "")
+        except:
+            pass
         # 保存文件
+        save_msg, actual_filename = save_json_file(
+            result,
+            custom_filename if not auto_filename else None,
+            auto_filename,
+            collection_name
+        )
         # 如果保存成功，提供下載
+        if actual_filename and "已保存" in save_msg:
+            return result, save_msg, gr.update(visible=True, value=actual_filename)
         else:
             return result, save_msg, gr.update(visible=False)
+    def process_and_save_urls(urls_input, manual_abstract, auto_filename, custom_filename):
         # 處理PDF網址
+        result = process_pdf_urls(urls_input, manual_abstract)
+        # 獲取第一個PDF的標題用於自動檔名
+        title_for_filename = ""
+        if auto_filename:
+            try:
+                data = json.loads(result)
+                if isinstance(data, list) and len(data) > 0:
+                    title_for_filename = data[0].get("名稱", "")
+            except:
+                pass
         # 保存文件
+        save_msg, actual_filename = save_json_file(
+            result,
+            custom_filename if not auto_filename else None,
+            auto_filename,
+            title_for_filename
+        )
         # 如果保存成功，提供下載
+        if actual_filename and "已保存" in save_msg:
+            return result, save_msg, gr.update(visible=True, value=actual_filename)
         else:
             return result, save_msg, gr.update(visible=False)
+    # 控制文件名輸入框的顯示/隱藏
+    def toggle_filename_input1(auto_filename):
+        return gr.update(visible=not auto_filename)
+    def toggle_filename_input2(auto_filename):
+        return gr.update(visible=not auto_filename)
+    # 綁定checkbox事件
+    auto_filename1.change(
+        toggle_filename_input1,
+        inputs=[auto_filename1],
+        outputs=[filename_input1]
+    )
+    auto_filename2.change(
+        toggle_filename_input2,
+        inputs=[auto_filename2],
+        outputs=[filename_input2]
+    )
     # JSON處理按鈕事件
     process_json_btn.click(
         process_and_save_json,
+        inputs=[json_input, manual_abstract1, auto_filename1, filename_input1],
         outputs=[output_json1, save_status1, download_file1]
     )
     # PDF網址處理按鈕事件
     process_urls_btn.click(
         process_and_save_urls,
+        inputs=[pdf_urls_input, manual_abstract2, auto_filename2, filename_input2],
         outputs=[output_json2, save_status2, download_file2]
     )
     gr.Markdown("""
     ## 使用說明：
+    ### 🆕 新功能特色：
+    - **🔄 智能檔名**: 自動使用論文集名稱或PDF標題作為JSON檔名
+    - **📝 手動摘要**: 可貼上現有摘要，避免重複提取
+    - **⚡ 高效處理**: 有摘要就不用下載PDF，節省時間
     ### JSON資料處理模式：
     1. 將現有的JSON資料貼入文本框
+    2. **可選**: 在摘要欄位貼上已知的摘要內容
+    3. 勾選"自動使用論文集名稱作為檔名"（推薦）
+    4. 系統會根據下載位置自動獲取PDF並補充缺失欄位
+    5. 如果已提供摘要，系統將跳過PDF摘要提取，節省處理時間
     ### PDF網址直接處理模式：
     1. 直接貼入PDF網址，每行一個
+    2. **可選**: 在摘要欄位貼上統一的摘要內容
+    3. 勾選"自動使用第一個PDF標題作為檔名"（推薦）
+    4. 系統會自動下載並提取完整資訊
+    5. 如果已提供摘要，將應用到所有PDF條目
+    ### 檔名規則：
+    - **自動檔名**: 使用論文集名稱或PDF標題，自動清理特殊字符
+    - **自訂檔名**: 取消勾選自動檔名後可手動指定
+    - 檔名會自動加上`.json`副檔名
+    - 特殊字符會被替換為安全字符
+    ### 摘要處理優先順序：
+    1. **手動輸入的摘要** (最高優先級)
+    2. PDF中提取的摘要
+    3. 預設訊息 ("無摘要資訊")
     ## 注意事項：
+    - 提供手動摘要可大幅加快處理速度
+    - 自動檔名會避免檔案系統不支援的字符
     - 建議分批處理大量文件以獲得最佳效果
+    - 系統會自動清理重複的標題內容
+    ## 範例使用情境：
+    ### 情境1: 已知摘要的快速處理
+    ```
+    有現成的論文摘要 → 貼入摘要欄位 → 勾選自動檔名 → 快速處理
+    ```
+    ### 情境2: 完全自動化處理
+    ```
+    只有PDF網址 → 留空摘要欄位 → 勾選自動檔名 → 全自動提取
+    ```
+    ### 情境3: 批量處理相同摘要
     ```
+    多個PDF同一主題 → 輸入統一摘要 → 批量處理 → 節省時間
     ```
     """)