Spaces:

technicolor
/

InteractiveSurvey

Sleeping

App Files Files Community

technicolor committed on 4 days ago

Commit

80c0e03

1 Parent(s): d8c6d94

update

Browse files

Files changed (11) hide show

README_FIXES.md +107 -0
requirements.txt +42 -0
src/demo/asg_retriever.py +2 -2
src/demo/main.py +1 -1
src/demo/path_utils.py +0 -1
src/demo/survey_generation_pipeline/asg_retriever.py +2 -2
src/demo/survey_generation_pipeline/main.py +1 -1
src/demo/survey_generator_api.py +1 -2
src/demo/views.py +36 -2
startup.py +100 -0
test_cache_fix.py +127 -0

README_FIXES.md ADDED Viewed

	@@ -0,0 +1,107 @@

+# 项目修复说明
+## 修复的问题
+### 1. 导入路径问题
+- **问题**: `HuggingFaceEmbeddings` 的导入来源不统一：同时使用了 `langchain_huggingface` 和已弃用的 `langchain.embeddings`（注意：`langchain_huggingface` 本身并未弃用，它是官方推荐的新包）
+- **修复**: 统一使用 `langchain_community.embeddings` 导入 `HuggingFaceEmbeddings`
+- **影响文件**:
+  - `src/demo/views.py`
+  - `src/demo/asg_retriever.py`
+  - `src/demo/main.py`
+  - `src/demo/survey_generator_api.py`
+  - `src/demo/survey_generation_pipeline/main.py`
+  - `src/demo/survey_generation_pipeline/asg_retriever.py`
+  - `test_cache_fix.py`
+### 2. 文本分割器导入问题
+- **问题**: 使用了已弃用的 `langchain.text_splitter` 导入
+- **修复**: 使用新的 `langchain_text_splitters` 包
+- **影响文件**:
+  - `src/demo/asg_retriever.py`
+  - `src/demo/survey_generation_pipeline/asg_retriever.py`
+### 3. 环境变量和权限问题
+- **问题**: 在 Hugging Face Spaces 中遇到 matplotlib、fontconfig、numba 缓存权限错误
+- **修复**: 动态设置环境变量，在 HF Spaces 中使用临时目录
+- **影响文件**:
+  - `src/demo/views.py` (主要修复)
+  - `src/demo/path_utils.py` (移除 TRANSFORMERS_CACHE)
+### 4. Transformers 缓存警告
+- **问题**: `TRANSFORMERS_CACHE` 环境变量已弃用
+- **修复**: 只设置 `HF_HOME` 和 `HF_HUB_CACHE`
+- **影响文件**:
+  - `src/demo/path_utils.py`
+## 新增文件
+### 1. `startup.py`
+- 环境设置与检查脚本：在脚本自身的进程内设置所有必要的环境变量（这些变量不会传递给之后单独启动的 `manage.py` 进程；应用运行时的环境变量由 `src/demo/views.py` 在导入时设置）
+- 检查关键导入是否正常工作
+- 支持本地和 Hugging Face Spaces 环境
+### 2. `requirements.txt`
+- 完整的依赖列表，包含所有必要的包和版本
+- 确保兼容性和稳定性
+### 3. `README_FIXES.md`
+- 本文档，记录所有修复内容
+## 环境变量设置
+### Hugging Face Spaces 环境
+当检测到 `SPACE_ID` 或 `HF_SPACE_ID` 环境变量时，自动设置：
+```python
+# 临时目录
+temp_base = tempfile.mkdtemp()
+# matplotlib 配置
+os.environ["MPLCONFIGDIR"] = os.path.join(temp_base, "matplotlib")
+# XDG 缓存
+os.environ["XDG_CACHE_HOME"] = temp_base
+# numba 缓存
+os.environ["NUMBA_CACHE_DIR"] = os.path.join(temp_base, "numba_cache")
+# Hugging Face 缓存
+os.environ["HF_HOME"] = os.path.join(temp_base, "hf_cache")
+os.environ["HF_HUB_CACHE"] = os.path.join(temp_base, "hf_cache/hub")
+```
+### 本地环境
+使用默认缓存目录，不进行特殊设置。
+## 使用方法
+### 1. 安装依赖
+```bash
+pip install -r requirements.txt
+```
+### 2. 运行环境检查
+```bash
+python startup.py
+```
+### 3. 启动应用
+```bash
+python manage.py runserver
+```
+## 注意事项
+1. **缓存目录**: 在 Hugging Face Spaces 中，所有缓存都存储在临时目录中，重启后会清除
+2. **权限**: 修复后的代码会自动处理权限问题，无需手动设置
+3. **兼容性**: 代码同时支持本地开发和 Hugging Face Spaces 部署
+4. **依赖**: `requirements.txt` 中给出的是最低版本（`>=`），请确保安装的版本不低于这些下限，避免兼容性问题
+## 验证修复
+运行 `startup.py` 脚本，应该看到：
+- ✅ 所有导入成功
+- ✅ 环境变量正确设置
+- ✅ 无权限错误
+- ✅ 无弃用警告

requirements.txt ADDED Viewed

	@@ -0,0 +1,42 @@

+# Core dependencies
+django>=4.0.0
+langchain>=0.1.0
+langchain-community>=0.0.10
+langchain-huggingface>=0.0.6
+langchain-text-splitters>=0.0.1
+# Hugging Face and ML libraries
+transformers>=4.30.0
+sentence-transformers>=2.2.0
+torch>=2.0.0
+torchvision>=0.15.0
+# Data processing and visualization
+pandas>=1.5.0
+numpy>=1.24.0
+matplotlib>=3.6.0
+seaborn>=0.12.0
+scikit-learn>=1.2.0
+# UMAP and clustering
+umap-learn>=0.5.3
+bertopic>=0.15.0
+numba>=0.57.0
+# Vector database
+chromadb>=0.4.0
+# PDF processing
+pypdf2>=3.0.0
+markdown-pdf>=0.1.0
+# API and networking
+requests>=2.28.0
+openai>=1.0.0
+# Environment and configuration
+python-dotenv>=1.0.0
+# Additional utilities
+tqdm>=4.64.0
+Pillow>=9.4.0

src/demo/asg_retriever.py CHANGED Viewed

@@ -5,11 +5,11 @@ import os
 import json
 import chromadb
 from .asg_splitter import TextSplitting
-from langchain_huggingface import HuggingFaceEmbeddings
 import time
 import concurrent.futures
 from .path_utils import get_path, setup_hf_cache
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 # 设置 Hugging Face 缓存目录
 cache_dir = setup_hf_cache()

 import json
 import chromadb
 from .asg_splitter import TextSplitting
+from langchain_community.embeddings import HuggingFaceEmbeddings
 import time
 import concurrent.futures
 from .path_utils import get_path, setup_hf_cache
+from langchain_text_splitters import RecursiveCharacterTextSplitter
 # 设置 Hugging Face 缓存目录
 cache_dir = setup_hf_cache()

src/demo/main.py CHANGED Viewed

@@ -3,7 +3,7 @@ import json
 import os
 import pandas as pd
-from langchain_huggingface import HuggingFaceEmbeddings
 from asg_retriever import legal_pdf
 from asg_loader import DocumentLoading
 from asg_retriever import Retriever, query_embeddings_new_new

 import os
 import pandas as pd
+from langchain_community.embeddings import HuggingFaceEmbeddings
 from asg_retriever import legal_pdf
 from asg_loader import DocumentLoading
 from asg_retriever import Retriever, query_embeddings_new_new

src/demo/path_utils.py CHANGED Viewed

@@ -8,7 +8,6 @@ def setup_hf_cache():
         # 在 Hugging Face Spaces 中使用临时目录作为缓存
         cache_dir = tempfile.mkdtemp()
         os.environ['HF_HOME'] = cache_dir
-        os.environ['TRANSFORMERS_CACHE'] = os.path.join(cache_dir, 'transformers')
         os.environ['HF_HUB_CACHE'] = os.path.join(cache_dir, 'hub')
         print(f"Using Hugging Face cache directory: {cache_dir}")
         return cache_dir

         # 在 Hugging Face Spaces 中使用临时目录作为缓存
         cache_dir = tempfile.mkdtemp()
         os.environ['HF_HOME'] = cache_dir
         os.environ['HF_HUB_CACHE'] = os.path.join(cache_dir, 'hub')
         print(f"Using Hugging Face cache directory: {cache_dir}")
         return cache_dir

src/demo/survey_generation_pipeline/asg_retriever.py CHANGED Viewed

@@ -5,11 +5,11 @@ import os
 import json
 import chromadb
 from .asg_splitter import TextSplitting
-from langchain_huggingface import HuggingFaceEmbeddings
 import time
 import concurrent.futures
 from ..path_utils import get_path, setup_hf_cache
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 # 设置 Hugging Face 缓存目录
 cache_dir = setup_hf_cache()

 import json
 import chromadb
 from .asg_splitter import TextSplitting
+from langchain_community.embeddings import HuggingFaceEmbeddings
 import time
 import concurrent.futures
 from ..path_utils import get_path, setup_hf_cache
+from langchain_text_splitters import RecursiveCharacterTextSplitter
 # 设置 Hugging Face 缓存目录
 cache_dir = setup_hf_cache()

src/demo/survey_generation_pipeline/main.py CHANGED Viewed

@@ -5,7 +5,7 @@ import os
 import sys
 import pandas as pd
-from langchain_huggingface import HuggingFaceEmbeddings
 import requests
 from tqdm import tqdm
 import urllib

 import sys
 import pandas as pd
+from langchain_community.embeddings import HuggingFaceEmbeddings
 import requests
 from tqdm import tqdm
 import urllib

src/demo/survey_generator_api.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import time
-from langchain_huggingface import HuggingFaceEmbeddings
 from openai import OpenAI
 import ast
 import re
@@ -598,7 +598,6 @@ Survey Paper Content for "{section_title}":
 import re
 import numpy as np
 from numpy.linalg import norm
-from langchain.embeddings import HuggingFaceEmbeddings
 def generate_survey_section_with_citations(context, client, section_title, citation_data_list,
                                            temp=0.5, base_threshold=0.7, dynamic_threshold=True):

 import time
+from langchain_community.embeddings import HuggingFaceEmbeddings
 from openai import OpenAI
 import ast
 import re
 import re
 import numpy as np
 from numpy.linalg import norm
 def generate_survey_section_with_citations(context, client, section_title, citation_data_list,
                                            temp=0.5, base_threshold=0.7, dynamic_threshold=True):

src/demo/views.py CHANGED Viewed

@@ -1,3 +1,38 @@
 from __future__ import unicode_literals
 import sys
@@ -16,7 +51,6 @@ from io import BytesIO
 import hashlib
 import re
-import os
 import csv
 import xml.etree.ElementTree as ET
 import urllib.parse
@@ -39,7 +73,7 @@ from .asg_latex import tex_to_pdf, insert_figures, md_to_tex, preprocess_md
 # from .survey_generator_api import ensure_all_papers_cited
 import glob
-from langchain_huggingface import HuggingFaceEmbeddings
 from dotenv import load_dotenv
 from pathlib import Path
 from markdown_pdf import MarkdownPdf, Section

+# 设置环境变量以解决 Hugging Face Spaces 中的权限问题
+import os
+import tempfile
+# 检测是否在 Hugging Face Spaces 中运行
+is_hf_space = bool(os.environ.get('SPACE_ID') or os.environ.get('HF_SPACE_ID'))
+if is_hf_space:
+    # 在 Hugging Face Spaces 中使用临时目录
+    temp_base = tempfile.mkdtemp()
+    # 设置 matplotlib 配置目录
+    matplotlib_dir = os.path.join(temp_base, "matplotlib")
+    os.makedirs(matplotlib_dir, exist_ok=True)
+    os.environ["MPLCONFIGDIR"] = matplotlib_dir
+    # 设置 XDG 缓存目录
+    os.environ["XDG_CACHE_HOME"] = temp_base
+    # 设置 numba 缓存目录
+    numba_dir = os.path.join(temp_base, "numba_cache")
+    os.makedirs(numba_dir, exist_ok=True)
+    os.environ["NUMBA_CACHE_DIR"] = numba_dir
+    # 设置 Hugging Face 缓存目录
+    hf_dir = os.path.join(temp_base, "hf_cache")
+    os.makedirs(hf_dir, exist_ok=True)
+    os.environ["HF_HOME"] = hf_dir
+    os.environ["HF_HUB_CACHE"] = os.path.join(hf_dir, "hub")
+    print(f"HF Spaces 环境变量已设置: {temp_base}")
+else:
+    # 本地环境，使用默认设置
+    print("本地环境，使用默认缓存目录")
 from __future__ import unicode_literals
 import sys
 import hashlib
 import re
 import csv
 import xml.etree.ElementTree as ET
 import urllib.parse
 # from .survey_generator_api import ensure_all_papers_cited
 import glob
+from langchain_community.embeddings import HuggingFaceEmbeddings
 from dotenv import load_dotenv
 from pathlib import Path
 from markdown_pdf import MarkdownPdf, Section

startup.py ADDED Viewed

	@@ -0,0 +1,100 @@

+#!/usr/bin/env python3
+"""
+启动脚本 - 设置环境变量以解决 Hugging Face Spaces 中的权限问题
+"""
+import os
+import tempfile
+import sys
+def setup_environment():
+    """设置所有必要的环境变量"""
+    # 检测是否在 Hugging Face Spaces 中运行
+    is_hf_space = bool(os.environ.get('SPACE_ID') or os.environ.get('HF_SPACE_ID'))
+    if is_hf_space:
+        print("检测到 Hugging Face Spaces 环境，设置临时目录...")
+        # 创建临时目录
+        temp_base = tempfile.mkdtemp()
+        # 设置 matplotlib 配置目录
+        matplotlib_dir = os.path.join(temp_base, "matplotlib")
+        os.makedirs(matplotlib_dir, exist_ok=True)
+        os.environ["MPLCONFIGDIR"] = matplotlib_dir
+        # 设置 XDG 缓存目录
+        os.environ["XDG_CACHE_HOME"] = temp_base
+        # 设置 numba 缓存目录
+        numba_dir = os.path.join(temp_base, "numba_cache")
+        os.makedirs(numba_dir, exist_ok=True)
+        os.environ["NUMBA_CACHE_DIR"] = numba_dir
+        # 设置 Hugging Face 缓存目录
+        hf_dir = os.path.join(temp_base, "hf_cache")
+        os.makedirs(hf_dir, exist_ok=True)
+        os.environ["HF_HOME"] = hf_dir
+        os.environ["HF_HUB_CACHE"] = os.path.join(hf_dir, "hub")
+        print(f"环境变量已设置:")
+        print(f"  MPLCONFIGDIR: {matplotlib_dir}")
+        print(f"  XDG_CACHE_HOME: {temp_base}")
+        print(f"  NUMBA_CACHE_DIR: {numba_dir}")
+        print(f"  HF_HOME: {hf_dir}")
+        print(f"  HF_HUB_CACHE: {os.environ['HF_HUB_CACHE']}")
+    else:
+        print("本地环境，使用默认缓存目录")
+    return is_hf_space
+def check_imports():
+    """检查关键导入是否正常工作"""
+    try:
+        print("检查导入...")
+        # 检查 langchain 相关导入
+        from langchain_community.embeddings import HuggingFaceEmbeddings
+        print("✅ langchain_community.embeddings 导入成功")
+        from langchain_text_splitters import RecursiveCharacterTextSplitter
+        print("✅ langchain_text_splitters 导入成功")
+        # 检查其他关键库
+        import torch
+        print("✅ PyTorch 导入成功")
+        import transformers
+        print("✅ Transformers 导入成功")
+        import matplotlib
+        print("✅ Matplotlib 导入成功")
+        import numba
+        print("✅ Numba 导入成功")
+        return True
+    except ImportError as e:
+        print(f"❌ 导入失败: {e}")
+        return False
+if __name__ == "__main__":
+    print("=== 环境设置脚本 ===")
+    # 设置环境变量
+    is_hf_space = setup_environment()
+    # 检查导入
+    if check_imports():
+        print("\n✅ 环境设置完成，所有导入正常")
+        if is_hf_space:
+            print("💡 提示: 在 Hugging Face Spaces 中使用临时目录作为缓存")
+        else:
+            print("💡 提示: 在本地环境中使用默认缓存目录")
+    else:
+        print("\n❌ 环境设置失败，请检查依赖安装")
+        sys.exit(1)

test_cache_fix.py ADDED Viewed

	@@ -0,0 +1,127 @@

+#!/usr/bin/env python3
+"""
+测试脚本：验证 Hugging Face 缓存目录修复是否有效
+"""
+import os
+import sys
+import tempfile
+# 添加 src 目录到 Python 路径
+sys.path.insert(0, 'src')
+print(f"Python path: {sys.path[:3]}")
+def test_cache_setup():
+    """测试缓存目录设置"""
+    print("=== 测试缓存目录设置 ===")
+    # 测试本地环境
+    print("\n1. 测试本地环境（无 SPACE_ID）:")
+    if 'SPACE_ID' in os.environ:
+        del os.environ['SPACE_ID']
+    if 'HF_SPACE_ID' in os.environ:
+        del os.environ['HF_SPACE_ID']
+    try:
+        print("   正在导入 setup_hf_cache...")
+        from demo.path_utils import setup_hf_cache
+        print("   导入成功，正在调用 setup_hf_cache...")
+        cache_dir = setup_hf_cache()
+        print(f"   缓存目录: {cache_dir}")
+        print("   ✅ 本地环境测试通过")
+    except Exception as e:
+        print(f"   ❌ 本地环境测试失败: {e}")
+        import traceback
+        traceback.print_exc()
+    # 测试 Hugging Face Spaces 环境
+    print("\n2. 测试 Hugging Face Spaces 环境（有 SPACE_ID）:")
+    os.environ['SPACE_ID'] = 'test_space_123'
+    try:
+        cache_dir = setup_hf_cache()
+        print(f"   缓存目录: {cache_dir}")
+        if cache_dir and os.path.exists(cache_dir):
+            print("   ✅ Hugging Face Spaces 环境测试通过")
+        else:
+            print("   ❌ 缓存目录不存在")
+    except Exception as e:
+        print(f"   ❌ Hugging Face Spaces 环境测试失败: {e}")
+        import traceback
+        traceback.print_exc()
+def test_embedder_initialization():
+    """测试 embedder 初始化"""
+    print("\n=== 测试 Embedder 初始化 ===")
+    # 设置 Hugging Face Spaces 环境
+    os.environ['SPACE_ID'] = 'test_space_456'
+    try:
+        from demo.path_utils import setup_hf_cache
+        from langchain_community.embeddings import HuggingFaceEmbeddings
+        cache_dir = setup_hf_cache()
+        print(f"使用缓存目录: {cache_dir}")
+        # 尝试初始化 embedder
+        print("正在初始化 HuggingFaceEmbeddings...")
+        embedder = HuggingFaceEmbeddings(
+            model_name="sentence-transformers/all-MiniLM-L6-v2",
+            cache_folder=cache_dir
+        )
+        print("✅ Embedder 初始化成功")
+        # 测试简单的嵌入
+        test_text = "This is a test sentence."
+        print("正在测试嵌入...")
+        embedding = embedder.embed_query(test_text)
+        print(f"✅ 嵌入测试成功，向量维度: {len(embedding)}")
+    except Exception as e:
+        print(f"❌ Embedder 初始化失败: {e}")
+        import traceback
+        traceback.print_exc()
+def test_sentence_transformer():
+    """测试 SentenceTransformer"""
+    print("\n=== 测试 SentenceTransformer ===")
+    # 设置 Hugging Face Spaces 环境
+    os.environ['SPACE_ID'] = 'test_space_789'
+    try:
+        from demo.path_utils import setup_hf_cache
+        from sentence_transformers import SentenceTransformer
+        cache_dir = setup_hf_cache()
+        print(f"使用缓存目录: {cache_dir}")
+        # 尝试初始化 SentenceTransformer
+        print("正在初始化 SentenceTransformer...")
+        model = SentenceTransformer(
+            "nomic-ai/nomic-embed-text-v1",
+            trust_remote_code=True,
+            cache_folder=cache_dir
+        )
+        print("✅ SentenceTransformer 初始化成功")
+        # 测试简单的嵌入
+        test_text = "This is a test sentence."
+        print("正在测试嵌入...")
+        embedding = model.encode(test_text)
+        print(f"✅ 嵌入测试成功，向量维度: {len(embedding)}")
+    except Exception as e:
+        print(f"❌ SentenceTransformer 初始化失败: {e}")
+        import traceback
+        traceback.print_exc()
+if __name__ == "__main__":
+    print("开始测试 Hugging Face 缓存目录修复...")
+    test_cache_setup()
+    test_embedder_initialization()
+    test_sentence_transformer()
+    print("\n=== 测试完成 ===")