perf: improve request handling with async mode
Browse files- docsifer/router.py +9 -12
- docsifer/service.py +62 -23
docsifer/router.py
CHANGED
@@ -1,5 +1,3 @@
|
|
1 |
-
# filename: router.py
|
2 |
-
|
3 |
import logging
|
4 |
import json
|
5 |
import tempfile
|
@@ -17,14 +15,14 @@ from .analytics import Analytics
|
|
17 |
logger = logging.getLogger(__name__)
|
18 |
router = APIRouter(tags=["v1"], responses={404: {"description": "Not found"}})
|
19 |
|
20 |
-
# Initialize analytics (
|
21 |
analytics = Analytics(
|
22 |
url=os.environ.get("REDIS_URL", "redis://localhost:6379/0"),
|
23 |
token=os.environ.get("REDIS_TOKEN", "***"),
|
24 |
sync_interval=30 * 60, # e.g. 30 minutes
|
25 |
)
|
26 |
|
27 |
-
# Initialize the Docsifer service (
|
28 |
docsifer_service = DocsiferService(model_name="gpt-4o")
|
29 |
|
30 |
|
@@ -45,12 +43,13 @@ async def convert_document(
|
|
45 |
):
|
46 |
"""
|
47 |
Convert a file or an HTML page from a URL into Markdown.
|
48 |
-
If 'file' is provided, it
|
49 |
-
|
50 |
-
- '
|
|
|
51 |
"""
|
52 |
try:
|
53 |
-
# Parse
|
54 |
try:
|
55 |
openai_config = json.loads(openai) if openai else {}
|
56 |
except json.JSONDecodeError:
|
@@ -63,7 +62,7 @@ async def convert_document(
|
|
63 |
|
64 |
cleanup = settings_config.get("cleanup", True)
|
65 |
|
66 |
-
# If a file is provided, use the
|
67 |
if file is not None:
|
68 |
with tempfile.TemporaryDirectory() as tmpdir:
|
69 |
temp_path = Path(tmpdir) / file.filename
|
@@ -74,7 +73,6 @@ async def convert_document(
|
|
74 |
openai_config=openai_config,
|
75 |
cleanup=cleanup,
|
76 |
)
|
77 |
-
# Otherwise, fetch HTML from URL and convert
|
78 |
elif url:
|
79 |
async with aiohttp.ClientSession() as session:
|
80 |
async with session.get(url) as resp:
|
@@ -94,7 +92,7 @@ async def convert_document(
|
|
94 |
status_code=400, detail="Provide either 'file' or 'url'."
|
95 |
)
|
96 |
|
97 |
-
#
|
98 |
background_tasks.add_task(analytics.access, token_count)
|
99 |
return ConvertResponse(**result)
|
100 |
|
@@ -108,7 +106,6 @@ async def convert_document(
|
|
108 |
async def get_stats():
|
109 |
"""
|
110 |
Return usage statistics (access, tokens) from the Analytics system.
|
111 |
-
All data is stored under "docsifer".
|
112 |
"""
|
113 |
try:
|
114 |
data = await analytics.stats()
|
|
|
|
|
|
|
1 |
import logging
|
2 |
import json
|
3 |
import tempfile
|
|
|
15 |
logger = logging.getLogger(__name__)
|
16 |
router = APIRouter(tags=["v1"], responses={404: {"description": "Not found"}})
|
17 |
|
18 |
+
# Initialize analytics (aggregated under "docsifer")
|
19 |
analytics = Analytics(
|
20 |
url=os.environ.get("REDIS_URL", "redis://localhost:6379/0"),
|
21 |
token=os.environ.get("REDIS_TOKEN", "***"),
|
22 |
sync_interval=30 * 60, # e.g. 30 minutes
|
23 |
)
|
24 |
|
25 |
+
# Initialize the Docsifer service (using "gpt-4o" for token counting)
|
26 |
docsifer_service = DocsiferService(model_name="gpt-4o")
|
27 |
|
28 |
|
|
|
43 |
):
|
44 |
"""
|
45 |
Convert a file or an HTML page from a URL into Markdown.
|
46 |
+
If 'file' is provided, it takes priority over 'url'.
|
47 |
+
|
48 |
+
- 'openai' is a JSON string with keys such as {"api_key": "...", "base_url": "..."}.
|
49 |
+
- 'settings' is a JSON string with keys such as {"cleanup": bool}.
|
50 |
"""
|
51 |
try:
|
52 |
+
# Parse the JSON configuration parameters.
|
53 |
try:
|
54 |
openai_config = json.loads(openai) if openai else {}
|
55 |
except json.JSONDecodeError:
|
|
|
62 |
|
63 |
cleanup = settings_config.get("cleanup", True)
|
64 |
|
65 |
+
# If a file is provided, use it; otherwise, fetch the content from the URL.
|
66 |
if file is not None:
|
67 |
with tempfile.TemporaryDirectory() as tmpdir:
|
68 |
temp_path = Path(tmpdir) / file.filename
|
|
|
73 |
openai_config=openai_config,
|
74 |
cleanup=cleanup,
|
75 |
)
|
|
|
76 |
elif url:
|
77 |
async with aiohttp.ClientSession() as session:
|
78 |
async with session.get(url) as resp:
|
|
|
92 |
status_code=400, detail="Provide either 'file' or 'url'."
|
93 |
)
|
94 |
|
95 |
+
# Record token usage in the background.
|
96 |
background_tasks.add_task(analytics.access, token_count)
|
97 |
return ConvertResponse(**result)
|
98 |
|
|
|
106 |
async def get_stats():
|
107 |
"""
|
108 |
Return usage statistics (access, tokens) from the Analytics system.
|
|
|
109 |
"""
|
110 |
try:
|
111 |
data = await analytics.stats()
|
docsifer/service.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1 |
-
# filename: service.py
|
2 |
-
|
3 |
from __future__ import annotations
|
4 |
|
|
|
5 |
import logging
|
6 |
import tempfile
|
7 |
import magic
|
@@ -14,7 +13,6 @@ from pyquery import PyQuery as pq
|
|
14 |
from markitdown import MarkItDown
|
15 |
from openai import OpenAI
|
16 |
|
17 |
-
|
18 |
logger = logging.getLogger(__name__)
|
19 |
logging.basicConfig(level=logging.INFO)
|
20 |
|
@@ -23,16 +21,17 @@ class DocsiferService:
|
|
23 |
"""
|
24 |
A service that converts local files to Markdown using MarkItDown,
|
25 |
optionally with an OpenAI LLM for advanced extraction.
|
26 |
-
Token counting uses
|
27 |
"""
|
28 |
|
29 |
def __init__(self, model_name: str = "gpt-4o"):
|
30 |
"""
|
31 |
Initialize the DocsiferService with a basic MarkItDown instance
|
32 |
-
and a tiktoken encoder for counting tokens using
|
33 |
"""
|
34 |
self._basic_markitdown = MarkItDown() # MarkItDown without LLM
|
35 |
-
|
|
|
36 |
try:
|
37 |
self._encoder = tiktoken.encoding_for_model(model_name)
|
38 |
except Exception as e:
|
@@ -47,8 +46,13 @@ class DocsiferService:
|
|
47 |
|
48 |
def _init_markitdown_with_llm(self, openai_config: Dict[str, Any]) -> MarkItDown:
|
49 |
"""
|
50 |
-
|
51 |
-
|
|
|
|
|
|
|
|
|
|
|
52 |
"""
|
53 |
api_key = openai_config.get("api_key", "")
|
54 |
if not api_key:
|
@@ -64,26 +68,34 @@ class DocsiferService:
|
|
64 |
|
65 |
def _maybe_cleanup_html(self, html_file: Path) -> None:
|
66 |
"""
|
67 |
-
If the file is HTML, remove <style> tags
|
|
|
|
|
|
|
68 |
"""
|
69 |
try:
|
70 |
content = html_file.read_text(encoding="utf-8", errors="ignore")
|
71 |
d = pq(content)
|
72 |
-
# Remove hidden elements and styles
|
73 |
d(":hidden").remove()
|
74 |
d("[style='display:none']").remove()
|
75 |
d('*[style*="display:none"]').remove()
|
76 |
d("style").remove()
|
77 |
-
cleaned_html = str(d)
|
78 |
-
cleaned_html = cleaned_html.strip()
|
79 |
html_file.write_text(cleaned_html, encoding="utf-8")
|
80 |
except Exception as e:
|
81 |
logger.error("HTML cleanup failed for %s: %s", html_file, e)
|
82 |
|
83 |
def _count_tokens(self, text: str) -> int:
|
84 |
"""
|
85 |
-
Count tokens using the configured tiktoken encoder.
|
86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
"""
|
88 |
try:
|
89 |
return len(self._encoder.encode(text))
|
@@ -93,14 +105,21 @@ class DocsiferService:
|
|
93 |
)
|
94 |
return len(text.split())
|
95 |
|
96 |
-
|
97 |
self, file_path: str, openai_config: Optional[dict] = None, cleanup: bool = True
|
98 |
) -> Tuple[Dict[str, str], int]:
|
99 |
"""
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
"""
|
105 |
src = Path(file_path)
|
106 |
if not src.exists():
|
@@ -108,7 +127,7 @@ class DocsiferService:
|
|
108 |
|
109 |
logger.info("Converting file: %s (cleanup=%s)", file_path, cleanup)
|
110 |
|
111 |
-
#
|
112 |
with tempfile.TemporaryDirectory() as tmpdir:
|
113 |
mime_type = magic.from_file(str(src), mime=True)
|
114 |
guessed_ext = mimetypes.guess_extension(mime_type) or ".tmp"
|
@@ -128,11 +147,11 @@ class DocsiferService:
|
|
128 |
guessed_ext,
|
129 |
)
|
130 |
|
131 |
-
#
|
132 |
if cleanup and guessed_ext.lower() in (".html", ".htm"):
|
133 |
self._maybe_cleanup_html(tmp_path)
|
134 |
|
135 |
-
# Decide whether to use LLM or basic
|
136 |
if openai_config and openai_config.get("api_key"):
|
137 |
md_converter = self._init_markitdown_with_llm(openai_config)
|
138 |
else:
|
@@ -144,7 +163,7 @@ class DocsiferService:
|
|
144 |
logger.error("MarkItDown conversion failed: %s", e)
|
145 |
raise RuntimeError(f"Conversion failed for '{file_path}': {e}")
|
146 |
|
147 |
-
# Count tokens
|
148 |
token_count = self._count_tokens(result_obj.text_content)
|
149 |
|
150 |
result_dict = {
|
@@ -152,3 +171,23 @@ class DocsiferService:
|
|
152 |
"markdown": result_obj.text_content,
|
153 |
}
|
154 |
return result_dict, token_count
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from __future__ import annotations
|
2 |
|
3 |
+
import asyncio
|
4 |
import logging
|
5 |
import tempfile
|
6 |
import magic
|
|
|
13 |
from markitdown import MarkItDown
|
14 |
from openai import OpenAI
|
15 |
|
|
|
16 |
logger = logging.getLogger(__name__)
|
17 |
logging.basicConfig(level=logging.INFO)
|
18 |
|
|
|
21 |
"""
|
22 |
A service that converts local files to Markdown using MarkItDown,
|
23 |
optionally with an OpenAI LLM for advanced extraction.
|
24 |
+
Token counting uses a tiktoken encoder (heuristically with the provided model).
|
25 |
"""
|
26 |
|
27 |
def __init__(self, model_name: str = "gpt-4o"):
|
28 |
"""
|
29 |
Initialize the DocsiferService with a basic MarkItDown instance
|
30 |
+
and a tiktoken encoder for counting tokens using the provided model.
|
31 |
"""
|
32 |
self._basic_markitdown = MarkItDown() # MarkItDown without LLM
|
33 |
+
|
34 |
+
# Use the given model for token counting
|
35 |
try:
|
36 |
self._encoder = tiktoken.encoding_for_model(model_name)
|
37 |
except Exception as e:
|
|
|
46 |
|
47 |
def _init_markitdown_with_llm(self, openai_config: Dict[str, Any]) -> MarkItDown:
|
48 |
"""
|
49 |
+
Initialize a MarkItDown instance configured with an OpenAI LLM if an API key is provided.
|
50 |
+
|
51 |
+
Args:
|
52 |
+
openai_config: A dictionary containing OpenAI configuration (e.g., api_key, model, base_url).
|
53 |
+
|
54 |
+
Returns:
|
55 |
+
A MarkItDown instance configured with the OpenAI client, or the basic instance if no key is provided.
|
56 |
"""
|
57 |
api_key = openai_config.get("api_key", "")
|
58 |
if not api_key:
|
|
|
68 |
|
69 |
def _maybe_cleanup_html(self, html_file: Path) -> None:
|
70 |
"""
|
71 |
+
If the file is HTML, remove <style> tags and hidden elements to clean up the content.
|
72 |
+
|
73 |
+
Args:
|
74 |
+
html_file: Path to the HTML file.
|
75 |
"""
|
76 |
try:
|
77 |
content = html_file.read_text(encoding="utf-8", errors="ignore")
|
78 |
d = pq(content)
|
79 |
+
# Remove hidden elements and inline styles that hide content.
|
80 |
d(":hidden").remove()
|
81 |
d("[style='display:none']").remove()
|
82 |
d('*[style*="display:none"]').remove()
|
83 |
d("style").remove()
|
84 |
+
cleaned_html = str(d).strip()
|
|
|
85 |
html_file.write_text(cleaned_html, encoding="utf-8")
|
86 |
except Exception as e:
|
87 |
logger.error("HTML cleanup failed for %s: %s", html_file, e)
|
88 |
|
89 |
def _count_tokens(self, text: str) -> int:
|
90 |
"""
|
91 |
+
Count tokens in the given text using the configured tiktoken encoder.
|
92 |
+
Falls back to a whitespace-based count if an error occurs.
|
93 |
+
|
94 |
+
Args:
|
95 |
+
text: The text to count tokens in.
|
96 |
+
|
97 |
+
Returns:
|
98 |
+
The number of tokens.
|
99 |
"""
|
100 |
try:
|
101 |
return len(self._encoder.encode(text))
|
|
|
105 |
)
|
106 |
return len(text.split())
|
107 |
|
108 |
+
def _convert_file_sync(
|
109 |
self, file_path: str, openai_config: Optional[dict] = None, cleanup: bool = True
|
110 |
) -> Tuple[Dict[str, str], int]:
|
111 |
"""
|
112 |
+
Synchronously convert a file at `file_path` to Markdown.
|
113 |
+
This helper method performs blocking file I/O, MIME detection, temporary file handling,
|
114 |
+
optional HTML cleanup, and MarkItDown conversion.
|
115 |
+
|
116 |
+
Args:
|
117 |
+
file_path: Path to the source file.
|
118 |
+
openai_config: Optional dictionary with OpenAI configuration.
|
119 |
+
cleanup: Whether to perform HTML cleanup if the file is an HTML file.
|
120 |
+
|
121 |
+
Returns:
|
122 |
+
A tuple containing a dictionary with keys "filename" and "markdown", and the token count.
|
123 |
"""
|
124 |
src = Path(file_path)
|
125 |
if not src.exists():
|
|
|
127 |
|
128 |
logger.info("Converting file: %s (cleanup=%s)", file_path, cleanup)
|
129 |
|
130 |
+
# Create a temporary directory so that MarkItDown sees the proper file extension.
|
131 |
with tempfile.TemporaryDirectory() as tmpdir:
|
132 |
mime_type = magic.from_file(str(src), mime=True)
|
133 |
guessed_ext = mimetypes.guess_extension(mime_type) or ".tmp"
|
|
|
147 |
guessed_ext,
|
148 |
)
|
149 |
|
150 |
+
# Perform HTML cleanup if requested.
|
151 |
if cleanup and guessed_ext.lower() in (".html", ".htm"):
|
152 |
self._maybe_cleanup_html(tmp_path)
|
153 |
|
154 |
+
# Decide whether to use LLM-enhanced conversion or the basic converter.
|
155 |
if openai_config and openai_config.get("api_key"):
|
156 |
md_converter = self._init_markitdown_with_llm(openai_config)
|
157 |
else:
|
|
|
163 |
logger.error("MarkItDown conversion failed: %s", e)
|
164 |
raise RuntimeError(f"Conversion failed for '{file_path}': {e}")
|
165 |
|
166 |
+
# Count tokens in the resulting markdown text.
|
167 |
token_count = self._count_tokens(result_obj.text_content)
|
168 |
|
169 |
result_dict = {
|
|
|
171 |
"markdown": result_obj.text_content,
|
172 |
}
|
173 |
return result_dict, token_count
|
174 |
+
|
175 |
+
async def convert_file(
|
176 |
+
self, file_path: str, openai_config: Optional[dict] = None, cleanup: bool = True
|
177 |
+
) -> Tuple[Dict[str, str], int]:
|
178 |
+
"""
|
179 |
+
Asynchronously convert a file at `file_path` to Markdown.
|
180 |
+
This method offloads the blocking conversion process to a separate thread.
|
181 |
+
|
182 |
+
Args:
|
183 |
+
file_path: Path to the file to convert.
|
184 |
+
openai_config: Optional OpenAI configuration dictionary.
|
185 |
+
cleanup: Whether to perform HTML cleanup if applicable.
|
186 |
+
|
187 |
+
Returns:
|
188 |
+
A tuple containing the result dictionary (with keys "filename" and "markdown")
|
189 |
+
and the token count.
|
190 |
+
"""
|
191 |
+
return await asyncio.to_thread(
|
192 |
+
self._convert_file_sync, file_path, openai_config, cleanup
|
193 |
+
)
|