Spaces:
Sleeping
Sleeping
Commit
·
80c0e03
1
Parent(s):
d8c6d94
update
Browse files- README_FIXES.md +107 -0
- requirements.txt +42 -0
- src/demo/asg_retriever.py +2 -2
- src/demo/main.py +1 -1
- src/demo/path_utils.py +0 -1
- src/demo/survey_generation_pipeline/asg_retriever.py +2 -2
- src/demo/survey_generation_pipeline/main.py +1 -1
- src/demo/survey_generator_api.py +1 -2
- src/demo/views.py +36 -2
- startup.py +100 -0
- test_cache_fix.py +127 -0
README_FIXES.md
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 项目修复说明
|
2 |
+
|
3 |
+
## 修复的问题
|
4 |
+
|
5 |
+
### 1. 导入路径问题
|
6 |
+
- **问题**: 代码中混用了 `langchain_huggingface` 和已弃用的 `langchain.embeddings` 两种导入路径(其中已弃用的是 `langchain.embeddings`)
|
7 |
+
- **修复**: 统一使用 `langchain_community.embeddings` 导入 `HuggingFaceEmbeddings`
|
8 |
+
- **影响文件**:
|
9 |
+
- `src/demo/views.py`
|
10 |
+
- `src/demo/asg_retriever.py`
|
11 |
+
- `src/demo/main.py`
|
12 |
+
- `src/demo/survey_generator_api.py`
|
13 |
+
- `src/demo/survey_generation_pipeline/main.py`
|
14 |
+
- `src/demo/survey_generation_pipeline/asg_retriever.py`
|
15 |
+
- `test_cache_fix.py`
|
16 |
+
|
17 |
+
### 2. 文本分割器导入问题
|
18 |
+
- **问题**: 使用了已弃用的 `langchain.text_splitter` 导入
|
19 |
+
- **修复**: 使用新的 `langchain_text_splitters` 包
|
20 |
+
- **影响文件**:
|
21 |
+
- `src/demo/asg_retriever.py`
|
22 |
+
- `src/demo/survey_generation_pipeline/asg_retriever.py`
|
23 |
+
|
24 |
+
### 3. 环境变量和权限问题
|
25 |
+
- **问题**: 在 Hugging Face Spaces 中遇到 matplotlib、fontconfig、numba 缓存权限错误
|
26 |
+
- **修复**: 动态设置环境变量,在 HF Spaces 中使用临时目录
|
27 |
+
- **影响文件**:
|
28 |
+
- `src/demo/views.py` (主要修复)
|
29 |
+
- `src/demo/path_utils.py` (移除 TRANSFORMERS_CACHE)
|
30 |
+
|
31 |
+
### 4. Transformers 缓存警告
|
32 |
+
- **问题**: `TRANSFORMERS_CACHE` 环境变量已弃用
|
33 |
+
- **修复**: 只设置 `HF_HOME` 和 `HF_HUB_CACHE`
|
34 |
+
- **影响文件**:
|
35 |
+
- `src/demo/path_utils.py`
|
36 |
+
|
37 |
+
## 新增文件
|
38 |
+
|
39 |
+
### 1. `startup.py`
|
40 |
+
- 环境设置脚本,用于在启动时设置所有必要的环境变量
|
41 |
+
- 检查关键导入是否正常工作
|
42 |
+
- 支持本地和 Hugging Face Spaces 环境
|
43 |
+
|
44 |
+
### 2. `requirements.txt`
|
45 |
+
- 完整的依赖列表,包含所有必要的包和版本
|
46 |
+
- 确保兼容性和稳定性
|
47 |
+
|
48 |
+
### 3. `README_FIXES.md`
|
49 |
+
- 本文档,记录所有修复内容
|
50 |
+
|
51 |
+
## 环境变量设置
|
52 |
+
|
53 |
+
### Hugging Face Spaces 环境
|
54 |
+
当检测到 `SPACE_ID` 或 `HF_SPACE_ID` 环境变量时,自动设置:
|
55 |
+
|
56 |
+
```python
|
57 |
+
# 临时目录
|
58 |
+
temp_base = tempfile.mkdtemp()
|
59 |
+
|
60 |
+
# matplotlib 配置
|
61 |
+
os.environ["MPLCONFIGDIR"] = os.path.join(temp_base, "matplotlib")
|
62 |
+
|
63 |
+
# XDG 缓存
|
64 |
+
os.environ["XDG_CACHE_HOME"] = temp_base
|
65 |
+
|
66 |
+
# numba 缓存
|
67 |
+
os.environ["NUMBA_CACHE_DIR"] = os.path.join(temp_base, "numba_cache")
|
68 |
+
|
69 |
+
# Hugging Face 缓存
|
70 |
+
os.environ["HF_HOME"] = os.path.join(temp_base, "hf_cache")
|
71 |
+
os.environ["HF_HUB_CACHE"] = os.path.join(temp_base, "hf_cache/hub")
|
72 |
+
```
|
73 |
+
|
74 |
+
### 本地环境
|
75 |
+
使用默认缓存目录,不进行特殊设置。
|
76 |
+
|
77 |
+
## 使用方法
|
78 |
+
|
79 |
+
### 1. 安装依赖
|
80 |
+
```bash
|
81 |
+
pip install -r requirements.txt
|
82 |
+
```
|
83 |
+
|
84 |
+
### 2. 运行环境检查
|
85 |
+
```bash
|
86 |
+
python startup.py
|
87 |
+
```
|
88 |
+
|
89 |
+
### 3. 启动应用
|
90 |
+
```bash
|
91 |
+
python manage.py runserver
|
92 |
+
```
|
93 |
+
|
94 |
+
## 注意事项
|
95 |
+
|
96 |
+
1. **缓存目录**: 在 Hugging Face Spaces 中,所有缓存都存储在临时目录中,重启后会清除
|
97 |
+
2. **权限**: 修复后的代码会自动处理权限问题,无需手动设置
|
98 |
+
3. **兼容性**: 代码同时支持本地开发和 Hugging Face Spaces 部署
|
99 |
+
4. **依赖**: 确保使用 `requirements.txt` 中的版本,避免兼容性问题
|
100 |
+
|
101 |
+
## 验证修复
|
102 |
+
|
103 |
+
运行 `startup.py` 脚本,应该看到:
|
104 |
+
- ✅ 所有导入成功
|
105 |
+
- ✅ 环境变量正确设置
|
106 |
+
- ✅ 无权限错误
|
107 |
+
- ✅ 无弃用警告
|
requirements.txt
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Core dependencies
|
2 |
+
django>=4.0.0
|
3 |
+
langchain>=0.1.0
|
4 |
+
langchain-community>=0.0.10
|
5 |
+
langchain-huggingface>=0.0.6
|
6 |
+
langchain-text-splitters>=0.0.1
|
7 |
+
|
8 |
+
# Hugging Face and ML libraries
|
9 |
+
transformers>=4.30.0
|
10 |
+
sentence-transformers>=2.2.0
|
11 |
+
torch>=2.0.0
|
12 |
+
torchvision>=0.15.0
|
13 |
+
|
14 |
+
# Data processing and visualization
|
15 |
+
pandas>=1.5.0
|
16 |
+
numpy>=1.24.0
|
17 |
+
matplotlib>=3.6.0
|
18 |
+
seaborn>=0.12.0
|
19 |
+
scikit-learn>=1.2.0
|
20 |
+
|
21 |
+
# UMAP and clustering
|
22 |
+
umap-learn>=0.5.3
|
23 |
+
bertopic>=0.15.0
|
24 |
+
numba>=0.57.0
|
25 |
+
|
26 |
+
# Vector database
|
27 |
+
chromadb>=0.4.0
|
28 |
+
|
29 |
+
# PDF processing
|
30 |
+
pypdf2>=3.0.0
|
31 |
+
markdown-pdf>=0.1.0
|
32 |
+
|
33 |
+
# API and networking
|
34 |
+
requests>=2.28.0
|
35 |
+
openai>=1.0.0
|
36 |
+
|
37 |
+
# Environment and configuration
|
38 |
+
python-dotenv>=1.0.0
|
39 |
+
|
40 |
+
# Additional utilities
|
41 |
+
tqdm>=4.64.0
|
42 |
+
Pillow>=9.4.0
|
src/demo/asg_retriever.py
CHANGED
@@ -5,11 +5,11 @@ import os
|
|
5 |
import json
|
6 |
import chromadb
|
7 |
from .asg_splitter import TextSplitting
|
8 |
-
from
|
9 |
import time
|
10 |
import concurrent.futures
|
11 |
from .path_utils import get_path, setup_hf_cache
|
12 |
-
from
|
13 |
|
14 |
# 设置 Hugging Face 缓存目录
|
15 |
cache_dir = setup_hf_cache()
|
|
|
5 |
import json
|
6 |
import chromadb
|
7 |
from .asg_splitter import TextSplitting
|
8 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
9 |
import time
|
10 |
import concurrent.futures
|
11 |
from .path_utils import get_path, setup_hf_cache
|
12 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
13 |
|
14 |
# 设置 Hugging Face 缓存目录
|
15 |
cache_dir = setup_hf_cache()
|
src/demo/main.py
CHANGED
@@ -3,7 +3,7 @@ import json
|
|
3 |
import os
|
4 |
|
5 |
import pandas as pd
|
6 |
-
from
|
7 |
from asg_retriever import legal_pdf
|
8 |
from asg_loader import DocumentLoading
|
9 |
from asg_retriever import Retriever, query_embeddings_new_new
|
|
|
3 |
import os
|
4 |
|
5 |
import pandas as pd
|
6 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
7 |
from asg_retriever import legal_pdf
|
8 |
from asg_loader import DocumentLoading
|
9 |
from asg_retriever import Retriever, query_embeddings_new_new
|
src/demo/path_utils.py
CHANGED
@@ -8,7 +8,6 @@ def setup_hf_cache():
|
|
8 |
# 在 Hugging Face Spaces 中使用临时目录作为缓存
|
9 |
cache_dir = tempfile.mkdtemp()
|
10 |
os.environ['HF_HOME'] = cache_dir
|
11 |
-
os.environ['TRANSFORMERS_CACHE'] = os.path.join(cache_dir, 'transformers')
|
12 |
os.environ['HF_HUB_CACHE'] = os.path.join(cache_dir, 'hub')
|
13 |
print(f"Using Hugging Face cache directory: {cache_dir}")
|
14 |
return cache_dir
|
|
|
8 |
# 在 Hugging Face Spaces 中使用临时目录作为缓存
|
9 |
cache_dir = tempfile.mkdtemp()
|
10 |
os.environ['HF_HOME'] = cache_dir
|
|
|
11 |
os.environ['HF_HUB_CACHE'] = os.path.join(cache_dir, 'hub')
|
12 |
print(f"Using Hugging Face cache directory: {cache_dir}")
|
13 |
return cache_dir
|
src/demo/survey_generation_pipeline/asg_retriever.py
CHANGED
@@ -5,11 +5,11 @@ import os
|
|
5 |
import json
|
6 |
import chromadb
|
7 |
from .asg_splitter import TextSplitting
|
8 |
-
from
|
9 |
import time
|
10 |
import concurrent.futures
|
11 |
from ..path_utils import get_path, setup_hf_cache
|
12 |
-
from
|
13 |
|
14 |
# 设置 Hugging Face 缓存目录
|
15 |
cache_dir = setup_hf_cache()
|
|
|
5 |
import json
|
6 |
import chromadb
|
7 |
from .asg_splitter import TextSplitting
|
8 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
9 |
import time
|
10 |
import concurrent.futures
|
11 |
from ..path_utils import get_path, setup_hf_cache
|
12 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
13 |
|
14 |
# 设置 Hugging Face 缓存目录
|
15 |
cache_dir = setup_hf_cache()
|
src/demo/survey_generation_pipeline/main.py
CHANGED
@@ -5,7 +5,7 @@ import os
|
|
5 |
import sys
|
6 |
|
7 |
import pandas as pd
|
8 |
-
from
|
9 |
import requests
|
10 |
from tqdm import tqdm
|
11 |
import urllib
|
|
|
5 |
import sys
|
6 |
|
7 |
import pandas as pd
|
8 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
9 |
import requests
|
10 |
from tqdm import tqdm
|
11 |
import urllib
|
src/demo/survey_generator_api.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import time
|
2 |
-
from
|
3 |
from openai import OpenAI
|
4 |
import ast
|
5 |
import re
|
@@ -598,7 +598,6 @@ Survey Paper Content for "{section_title}":
|
|
598 |
import re
|
599 |
import numpy as np
|
600 |
from numpy.linalg import norm
|
601 |
-
from langchain.embeddings import HuggingFaceEmbeddings
|
602 |
|
603 |
def generate_survey_section_with_citations(context, client, section_title, citation_data_list,
|
604 |
temp=0.5, base_threshold=0.7, dynamic_threshold=True):
|
|
|
1 |
import time
|
2 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
3 |
from openai import OpenAI
|
4 |
import ast
|
5 |
import re
|
|
|
598 |
import re
|
599 |
import numpy as np
|
600 |
from numpy.linalg import norm
|
|
|
601 |
|
602 |
def generate_survey_section_with_citations(context, client, section_title, citation_data_list,
|
603 |
temp=0.5, base_threshold=0.7, dynamic_threshold=True):
|
src/demo/views.py
CHANGED
@@ -1,3 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from __future__ import unicode_literals
|
2 |
import sys
|
3 |
|
@@ -16,7 +51,6 @@ from io import BytesIO
|
|
16 |
|
17 |
import hashlib
|
18 |
import re
|
19 |
-
import os
|
20 |
import csv
|
21 |
import xml.etree.ElementTree as ET
|
22 |
import urllib.parse
|
@@ -39,7 +73,7 @@ from .asg_latex import tex_to_pdf, insert_figures, md_to_tex, preprocess_md
|
|
39 |
# from .survey_generator_api import ensure_all_papers_cited
|
40 |
import glob
|
41 |
|
42 |
-
from
|
43 |
from dotenv import load_dotenv
|
44 |
from pathlib import Path
|
45 |
from markdown_pdf import MarkdownPdf, Section
|
|
|
1 |
+
# A __future__ import must be the first statement of the module: Python raises
# "SyntaxError: from __future__ imports must occur at the beginning of the
# file" when any other statement (even `import os`) comes before it.
from __future__ import unicode_literals

# Set environment variables to work around permission problems on
# Hugging Face Spaces. This runs at import time, ahead of the imports that
# follow in this module.
import os
import tempfile

# Detect whether we are running inside Hugging Face Spaces: either variable
# being set to a non-empty value counts.
is_hf_space = bool(os.environ.get('SPACE_ID') or os.environ.get('HF_SPACE_ID'))

if is_hf_space:
    # On Spaces, keep every cache/config directory under one temp directory.
    temp_base = tempfile.mkdtemp()

    # matplotlib configuration directory
    matplotlib_dir = os.path.join(temp_base, "matplotlib")
    os.makedirs(matplotlib_dir, exist_ok=True)
    os.environ["MPLCONFIGDIR"] = matplotlib_dir

    # XDG cache directory
    os.environ["XDG_CACHE_HOME"] = temp_base

    # numba cache directory
    numba_dir = os.path.join(temp_base, "numba_cache")
    os.makedirs(numba_dir, exist_ok=True)
    os.environ["NUMBA_CACHE_DIR"] = numba_dir

    # Hugging Face cache directories
    hf_dir = os.path.join(temp_base, "hf_cache")
    os.makedirs(hf_dir, exist_ok=True)
    os.environ["HF_HOME"] = hf_dir
    os.environ["HF_HUB_CACHE"] = os.path.join(hf_dir, "hub")

    print(f"HF Spaces 环境变量已设置: {temp_base}")
else:
    # Local environment: leave the default cache locations alone.
    print("本地环境,使用默认缓存目录")
|
37 |
import sys
|
38 |
|
|
|
51 |
|
52 |
import hashlib
|
53 |
import re
|
|
|
54 |
import csv
|
55 |
import xml.etree.ElementTree as ET
|
56 |
import urllib.parse
|
|
|
73 |
# from .survey_generator_api import ensure_all_papers_cited
|
74 |
import glob
|
75 |
|
76 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
77 |
from dotenv import load_dotenv
|
78 |
from pathlib import Path
|
79 |
from markdown_pdf import MarkdownPdf, Section
|
startup.py
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
启动脚本 - 设置环境变量以解决 Hugging Face Spaces 中的权限问题
|
4 |
+
"""
|
5 |
+
|
6 |
+
import os
|
7 |
+
import tempfile
|
8 |
+
import sys
|
9 |
+
|
10 |
+
def setup_environment():
    """Point cache/config directories at a writable temp dir on HF Spaces.

    The process is treated as running on Hugging Face Spaces when either the
    ``SPACE_ID`` or ``HF_SPACE_ID`` environment variable is set to a
    non-empty value. In that case a fresh temporary directory is created and
    ``MPLCONFIGDIR``, ``XDG_CACHE_HOME``, ``NUMBA_CACHE_DIR``, ``HF_HOME``
    and ``HF_HUB_CACHE`` are set to paths inside it; each path is created
    before it is exported. Otherwise the environment is left untouched.

    Returns:
        bool: True when the Spaces environment was detected, False otherwise.
    """
    is_hf_space = bool(os.environ.get('SPACE_ID') or os.environ.get('HF_SPACE_ID'))

    if not is_hf_space:
        print("本地环境,使用默认缓存目录")
        return False

    print("检测到 Hugging Face Spaces 环境,设置临时目录...")

    temp_base = tempfile.mkdtemp()
    hf_dir = os.path.join(temp_base, "hf_cache")

    # Insertion order is the order in which the variables are reported.
    cache_dirs = {
        "MPLCONFIGDIR": os.path.join(temp_base, "matplotlib"),
        "XDG_CACHE_HOME": temp_base,
        "NUMBA_CACHE_DIR": os.path.join(temp_base, "numba_cache"),
        "HF_HOME": hf_dir,
        "HF_HUB_CACHE": os.path.join(hf_dir, "hub"),
    }

    print("环境变量已设置:")
    for var_name, directory in cache_dirs.items():
        # Create every directory up front (including the hub cache) so a
        # permission problem surfaces here rather than inside a library.
        os.makedirs(directory, exist_ok=True)
        os.environ[var_name] = directory
        print(f" {var_name}: {directory}")

    return True
|
52 |
+
|
53 |
+
def check_imports():
    """Verify that the key runtime dependencies can be imported.

    Modules are imported in a fixed order and the check stops at the first
    failure, printing the error. Any exception raised while importing is
    reported, not only ``ImportError``: a module's top-level code can fail
    with other errors (for example ``OSError``), and this diagnostic should
    report those instead of crashing with a traceback.

    Returns:
        bool: True when every import succeeded, False otherwise.
    """
    import importlib

    # (module to import, attribute that must exist or None, success message)
    required = (
        ("langchain_community.embeddings", "HuggingFaceEmbeddings",
         "✅ langchain_community.embeddings 导入成功"),
        ("langchain_text_splitters", "RecursiveCharacterTextSplitter",
         "✅ langchain_text_splitters 导入成功"),
        ("torch", None, "✅ PyTorch 导入成功"),
        ("transformers", None, "✅ Transformers 导入成功"),
        ("matplotlib", None, "✅ Matplotlib 导入成功"),
        ("numba", None, "✅ Numba 导入成功"),
    )

    print("检查导入...")
    for module_name, attribute, success_message in required:
        try:
            module = importlib.import_module(module_name)
            if attribute is not None:
                getattr(module, attribute)
        except Exception as e:  # diagnostic boundary: report, do not crash
            print(f"❌ 导入失败: {e}")
            return False
        print(success_message)

    return True
|
83 |
+
|
84 |
+
if __name__ == "__main__":
    # Script entry point: configure the environment, then verify imports.
    print("=== 环境设置脚本 ===")

    is_hf_space = setup_environment()

    # Bail out with a non-zero exit status as soon as the import check fails.
    if not check_imports():
        print("\n❌ 环境设置失败,请检查依赖安装")
        sys.exit(1)

    print("\n✅ 环境设置完成,所有导入正常")
    print(
        "💡 提示: 在 Hugging Face Spaces 中使用临时目录作为缓存"
        if is_hf_space
        else "💡 提示: 在本地环境中使用默认缓存目录"
    )
|
test_cache_fix.py
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
测试脚本:验证 Hugging Face 缓存目录修复是否有效
|
4 |
+
"""
|
5 |
+
|
6 |
+
import os
|
7 |
+
import sys
|
8 |
+
import tempfile
|
9 |
+
|
10 |
+
# Add the src directory to the Python path. The path is resolved relative to
# this script rather than the current working directory, so the `demo`
# package is importable no matter where the script is launched from.
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), 'src'))
print(f"Python path: {sys.path[:3]}")
|
13 |
+
|
14 |
+
def test_cache_setup():
    """Exercise ``setup_hf_cache`` with and without ``SPACE_ID`` set.

    Outcomes are printed rather than asserted; any exception is caught and
    its traceback printed. ``SPACE_ID`` and ``HF_SPACE_ID`` are restored to
    their original values afterwards so the check does not leak state into
    the surrounding process.
    """
    print("=== 测试缓存目录设置 ===")

    tracked = ('SPACE_ID', 'HF_SPACE_ID')
    saved_env = {name: os.environ.get(name) for name in tracked}

    try:
        # Scenario 1: local environment (neither variable set).
        print("\n1. 测试本地环境(无 SPACE_ID):")
        for name in tracked:
            os.environ.pop(name, None)

        try:
            print(" 正在导入 setup_hf_cache...")
            from demo.path_utils import setup_hf_cache
            print(" 导入成功,正在调用 setup_hf_cache...")
            cache_dir = setup_hf_cache()
            print(f" 缓存目录: {cache_dir}")
            print(" ✅ 本地环境测试通过")
        except Exception as e:
            print(f" ❌ 本地环境测试失败: {e}")
            import traceback
            traceback.print_exc()

        # Scenario 2: simulated Hugging Face Spaces environment.
        print("\n2. 测试 Hugging Face Spaces 环境(有 SPACE_ID):")
        os.environ['SPACE_ID'] = 'test_space_123'

        try:
            # Import again here: if scenario 1 failed at its import, the name
            # would be unbound and this step would report a misleading
            # NameError instead of the real import problem.
            from demo.path_utils import setup_hf_cache
            cache_dir = setup_hf_cache()
            print(f" 缓存目录: {cache_dir}")
            if cache_dir and os.path.exists(cache_dir):
                print(" ✅ Hugging Face Spaces 环境测试通过")
            else:
                print(" ❌ 缓存目录不存在")
        except Exception as e:
            print(f" ❌ Hugging Face Spaces 环境测试失败: {e}")
            import traceback
            traceback.print_exc()
    finally:
        # Put the Spaces-detection variables back exactly as they were.
        for name, value in saved_env.items():
            if value is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = value
|
52 |
+
|
53 |
+
def test_embedder_initialization():
    """Check that ``HuggingFaceEmbeddings`` can be built with the cache dir.

    Sets ``SPACE_ID`` to simulate Hugging Face Spaces, obtains a cache
    directory from ``setup_hf_cache``, constructs the embedder and embeds one
    sentence. Results are printed; any exception is caught and its traceback
    printed.
    """
    print("\n=== 测试 Embedder 初始化 ===")

    # Simulate the Hugging Face Spaces environment.
    os.environ.update(SPACE_ID='test_space_456')

    try:
        from demo.path_utils import setup_hf_cache
        from langchain_community.embeddings import HuggingFaceEmbeddings

        hf_cache = setup_hf_cache()
        print(f"使用缓存目录: {hf_cache}")

        # Build the embedder.
        print("正在初始化 HuggingFaceEmbeddings...")
        embedder_kwargs = {
            "model_name": "sentence-transformers/all-MiniLM-L6-v2",
            "cache_folder": hf_cache,
        }
        model = HuggingFaceEmbeddings(**embedder_kwargs)
        print("✅ Embedder 初始化成功")

        # Embed a single sentence as a smoke test.
        print("正在测试嵌入...")
        vector = model.embed_query("This is a test sentence.")
        print(f"✅ 嵌入测试成功,向量维度: {len(vector)}")

    except Exception as err:
        import traceback
        print(f"❌ Embedder 初始化失败: {err}")
        traceback.print_exc()
|
85 |
+
|
86 |
+
def test_sentence_transformer():
    """Check that ``SentenceTransformer`` can be built with the cache dir.

    Sets ``SPACE_ID`` to simulate Hugging Face Spaces, obtains a cache
    directory from ``setup_hf_cache``, loads the model and encodes one
    sentence. Results are printed; any exception is caught and its traceback
    printed.
    """
    print("\n=== 测试 SentenceTransformer ===")

    # Simulate the Hugging Face Spaces environment.
    os.environ.update(SPACE_ID='test_space_789')

    try:
        from demo.path_utils import setup_hf_cache
        from sentence_transformers import SentenceTransformer

        hf_cache = setup_hf_cache()
        print(f"使用缓存目录: {hf_cache}")

        # Load the model.
        print("正在初始化 SentenceTransformer...")
        encoder = SentenceTransformer(
            "nomic-ai/nomic-embed-text-v1",
            trust_remote_code=True,
            cache_folder=hf_cache,
        )
        print("✅ SentenceTransformer 初始化成功")

        # Encode a single sentence as a smoke test.
        print("正在测试嵌入...")
        vector = encoder.encode("This is a test sentence.")
        print(f"✅ 嵌入测试成功,向量维度: {len(vector)}")

    except Exception as err:
        import traceback
        print(f"❌ SentenceTransformer 初始化失败: {err}")
        traceback.print_exc()
|
119 |
+
|
120 |
+
if __name__ == "__main__":
    # Script entry point: run every check in order, then report completion.
    print("开始测试 Hugging Face 缓存目录修复...")

    for run_check in (
        test_cache_setup,
        test_embedder_initialization,
        test_sentence_transformer,
    ):
        run_check()

    print("\n=== 测试完成 ===")
|