lamhieu commited on
Commit
5893331
·
1 Parent(s): 06b057b

perf: improve request handling with async mode

Browse files
Files changed (2) hide show
  1. docsifer/router.py +9 -12
  2. docsifer/service.py +62 -23
docsifer/router.py CHANGED
@@ -1,5 +1,3 @@
1
- # filename: router.py
2
-
3
  import logging
4
  import json
5
  import tempfile
@@ -17,14 +15,14 @@ from .analytics import Analytics
17
  logger = logging.getLogger(__name__)
18
  router = APIRouter(tags=["v1"], responses={404: {"description": "Not found"}})
19
 
20
- # Initialize analytics (single aggregator = "docsifer")
21
  analytics = Analytics(
22
  url=os.environ.get("REDIS_URL", "redis://localhost:6379/0"),
23
  token=os.environ.get("REDIS_TOKEN", "***"),
24
  sync_interval=30 * 60, # e.g. 30 minutes
25
  )
26
 
27
- # Initialize the Docsifer service (token counting with gpt-4o)
28
  docsifer_service = DocsiferService(model_name="gpt-4o")
29
 
30
 
@@ -45,12 +43,13 @@ async def convert_document(
45
  ):
46
  """
47
  Convert a file or an HTML page from a URL into Markdown.
48
- If 'file' is provided, it has priority over 'url'.
49
- - 'openai' is a JSON string with keys: {"api_key": "...", "base_url": "..."}
50
- - 'settings' is a JSON string with keys: {"cleanup": bool}
 
51
  """
52
  try:
53
- # Parse configs
54
  try:
55
  openai_config = json.loads(openai) if openai else {}
56
  except json.JSONDecodeError:
@@ -63,7 +62,7 @@ async def convert_document(
63
 
64
  cleanup = settings_config.get("cleanup", True)
65
 
66
- # If a file is provided, use the existing flow
67
  if file is not None:
68
  with tempfile.TemporaryDirectory() as tmpdir:
69
  temp_path = Path(tmpdir) / file.filename
@@ -74,7 +73,6 @@ async def convert_document(
74
  openai_config=openai_config,
75
  cleanup=cleanup,
76
  )
77
- # Otherwise, fetch HTML from URL and convert
78
  elif url:
79
  async with aiohttp.ClientSession() as session:
80
  async with session.get(url) as resp:
@@ -94,7 +92,7 @@ async def convert_document(
94
  status_code=400, detail="Provide either 'file' or 'url'."
95
  )
96
 
97
- # Track usage
98
  background_tasks.add_task(analytics.access, token_count)
99
  return ConvertResponse(**result)
100
 
@@ -108,7 +106,6 @@ async def convert_document(
108
  async def get_stats():
109
  """
110
  Return usage statistics (access, tokens) from the Analytics system.
111
- All data is stored under "docsifer".
112
  """
113
  try:
114
  data = await analytics.stats()
 
 
 
1
  import logging
2
  import json
3
  import tempfile
 
15
  logger = logging.getLogger(__name__)
16
  router = APIRouter(tags=["v1"], responses={404: {"description": "Not found"}})
17
 
18
+ # Initialize analytics (aggregated under "docsifer")
19
  analytics = Analytics(
20
  url=os.environ.get("REDIS_URL", "redis://localhost:6379/0"),
21
  token=os.environ.get("REDIS_TOKEN", "***"),
22
  sync_interval=30 * 60, # e.g. 30 minutes
23
  )
24
 
25
+ # Initialize the Docsifer service (using "gpt-4o" for token counting)
26
  docsifer_service = DocsiferService(model_name="gpt-4o")
27
 
28
 
 
43
  ):
44
  """
45
  Convert a file or an HTML page from a URL into Markdown.
46
+ If 'file' is provided, it takes priority over 'url'.
47
+
48
+ - 'openai' is a JSON string with keys such as {"api_key": "...", "base_url": "..."}.
49
+ - 'settings' is a JSON string with keys such as {"cleanup": bool}.
50
  """
51
  try:
52
+ # Parse the JSON configuration parameters.
53
  try:
54
  openai_config = json.loads(openai) if openai else {}
55
  except json.JSONDecodeError:
 
62
 
63
  cleanup = settings_config.get("cleanup", True)
64
 
65
+ # If a file is provided, use it; otherwise, fetch the content from the URL.
66
  if file is not None:
67
  with tempfile.TemporaryDirectory() as tmpdir:
68
  temp_path = Path(tmpdir) / file.filename
 
73
  openai_config=openai_config,
74
  cleanup=cleanup,
75
  )
 
76
  elif url:
77
  async with aiohttp.ClientSession() as session:
78
  async with session.get(url) as resp:
 
92
  status_code=400, detail="Provide either 'file' or 'url'."
93
  )
94
 
95
+ # Record token usage in the background.
96
  background_tasks.add_task(analytics.access, token_count)
97
  return ConvertResponse(**result)
98
 
 
106
  async def get_stats():
107
  """
108
  Return usage statistics (access, tokens) from the Analytics system.
 
109
  """
110
  try:
111
  data = await analytics.stats()
docsifer/service.py CHANGED
@@ -1,7 +1,6 @@
1
- # filename: service.py
2
-
3
  from __future__ import annotations
4
 
 
5
  import logging
6
  import tempfile
7
  import magic
@@ -14,7 +13,6 @@ from pyquery import PyQuery as pq
14
  from markitdown import MarkItDown
15
  from openai import OpenAI
16
 
17
-
18
  logger = logging.getLogger(__name__)
19
  logging.basicConfig(level=logging.INFO)
20
 
@@ -23,16 +21,17 @@ class DocsiferService:
23
  """
24
  A service that converts local files to Markdown using MarkItDown,
25
  optionally with an OpenAI LLM for advanced extraction.
26
- Token counting uses "gpt-4o" as a heuristic via tiktoken.
27
  """
28
 
29
  def __init__(self, model_name: str = "gpt-4o"):
30
  """
31
  Initialize the DocsiferService with a basic MarkItDown instance
32
- and a tiktoken encoder for counting tokens using "gpt-4o".
33
  """
34
  self._basic_markitdown = MarkItDown() # MarkItDown without LLM
35
- # Use "gpt-4o" for token counting
 
36
  try:
37
  self._encoder = tiktoken.encoding_for_model(model_name)
38
  except Exception as e:
@@ -47,8 +46,13 @@ class DocsiferService:
47
 
48
  def _init_markitdown_with_llm(self, openai_config: Dict[str, Any]) -> MarkItDown:
49
  """
50
- If openai_config has an 'api_key', configure openai and return
51
- a MarkItDown instance with that OpenAI client.
 
 
 
 
 
52
  """
53
  api_key = openai_config.get("api_key", "")
54
  if not api_key:
@@ -64,26 +68,34 @@ class DocsiferService:
64
 
65
  def _maybe_cleanup_html(self, html_file: Path) -> None:
66
  """
67
- If the file is HTML, remove <style> tags, optionally hidden elements, etc.
 
 
 
68
  """
69
  try:
70
  content = html_file.read_text(encoding="utf-8", errors="ignore")
71
  d = pq(content)
72
- # Remove hidden elements and styles
73
  d(":hidden").remove()
74
  d("[style='display:none']").remove()
75
  d('*[style*="display:none"]').remove()
76
  d("style").remove()
77
- cleaned_html = str(d)
78
- cleaned_html = cleaned_html.strip()
79
  html_file.write_text(cleaned_html, encoding="utf-8")
80
  except Exception as e:
81
  logger.error("HTML cleanup failed for %s: %s", html_file, e)
82
 
83
  def _count_tokens(self, text: str) -> int:
84
  """
85
- Count tokens using the configured tiktoken encoder.
86
- Fallback to whitespace-based counting if an error occurs.
 
 
 
 
 
 
87
  """
88
  try:
89
  return len(self._encoder.encode(text))
@@ -93,14 +105,21 @@ class DocsiferService:
93
  )
94
  return len(text.split())
95
 
96
- async def convert_file(
97
  self, file_path: str, openai_config: Optional[dict] = None, cleanup: bool = True
98
  ) -> Tuple[Dict[str, str], int]:
99
  """
100
- Converts a file at `file_path` to Markdown.
101
- - If `cleanup` is True and file is .html/.htm, does HTML cleanup.
102
- - If `openai_config` has a valid API key, use LLM-based MarkItDown.
103
- Returns ({"filename": filename, "markdown": md_string}, token_count).
 
 
 
 
 
 
 
104
  """
105
  src = Path(file_path)
106
  if not src.exists():
@@ -108,7 +127,7 @@ class DocsiferService:
108
 
109
  logger.info("Converting file: %s (cleanup=%s)", file_path, cleanup)
110
 
111
- # Use a temp directory so MarkItDown sees the real file extension
112
  with tempfile.TemporaryDirectory() as tmpdir:
113
  mime_type = magic.from_file(str(src), mime=True)
114
  guessed_ext = mimetypes.guess_extension(mime_type) or ".tmp"
@@ -128,11 +147,11 @@ class DocsiferService:
128
  guessed_ext,
129
  )
130
 
131
- # If it's HTML and cleanup is requested
132
  if cleanup and guessed_ext.lower() in (".html", ".htm"):
133
  self._maybe_cleanup_html(tmp_path)
134
 
135
- # Decide whether to use LLM or basic
136
  if openai_config and openai_config.get("api_key"):
137
  md_converter = self._init_markitdown_with_llm(openai_config)
138
  else:
@@ -144,7 +163,7 @@ class DocsiferService:
144
  logger.error("MarkItDown conversion failed: %s", e)
145
  raise RuntimeError(f"Conversion failed for '{file_path}': {e}")
146
 
147
- # Count tokens
148
  token_count = self._count_tokens(result_obj.text_content)
149
 
150
  result_dict = {
@@ -152,3 +171,23 @@ class DocsiferService:
152
  "markdown": result_obj.text_content,
153
  }
154
  return result_dict, token_count
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
+ import asyncio
4
  import logging
5
  import tempfile
6
  import magic
 
13
  from markitdown import MarkItDown
14
  from openai import OpenAI
15
 
 
16
  logger = logging.getLogger(__name__)
17
  logging.basicConfig(level=logging.INFO)
18
 
 
21
  """
22
  A service that converts local files to Markdown using MarkItDown,
23
  optionally with an OpenAI LLM for advanced extraction.
24
+ Token counting uses a tiktoken encoder (heuristically with the provided model).
25
  """
26
 
27
  def __init__(self, model_name: str = "gpt-4o"):
28
  """
29
  Initialize the DocsiferService with a basic MarkItDown instance
30
+ and a tiktoken encoder for counting tokens using the provided model.
31
  """
32
  self._basic_markitdown = MarkItDown() # MarkItDown without LLM
33
+
34
+ # Use the given model for token counting
35
  try:
36
  self._encoder = tiktoken.encoding_for_model(model_name)
37
  except Exception as e:
 
46
 
47
  def _init_markitdown_with_llm(self, openai_config: Dict[str, Any]) -> MarkItDown:
48
  """
49
+ Initialize a MarkItDown instance configured with an OpenAI LLM if an API key is provided.
50
+
51
+ Args:
52
+ openai_config: A dictionary containing OpenAI configuration (e.g., api_key, model, base_url).
53
+
54
+ Returns:
55
+ A MarkItDown instance configured with the OpenAI client, or the basic instance if no key is provided.
56
  """
57
  api_key = openai_config.get("api_key", "")
58
  if not api_key:
 
68
 
69
  def _maybe_cleanup_html(self, html_file: Path) -> None:
70
  """
71
+ If the file is HTML, remove <style> tags and hidden elements to clean up the content.
72
+
73
+ Args:
74
+ html_file: Path to the HTML file.
75
  """
76
  try:
77
  content = html_file.read_text(encoding="utf-8", errors="ignore")
78
  d = pq(content)
79
+ # Remove hidden elements and inline styles that hide content.
80
  d(":hidden").remove()
81
  d("[style='display:none']").remove()
82
  d('*[style*="display:none"]').remove()
83
  d("style").remove()
84
+ cleaned_html = str(d).strip()
 
85
  html_file.write_text(cleaned_html, encoding="utf-8")
86
  except Exception as e:
87
  logger.error("HTML cleanup failed for %s: %s", html_file, e)
88
 
89
  def _count_tokens(self, text: str) -> int:
90
  """
91
+ Count tokens in the given text using the configured tiktoken encoder.
92
+ Falls back to a whitespace-based count if an error occurs.
93
+
94
+ Args:
95
+ text: The text to count tokens in.
96
+
97
+ Returns:
98
+ The number of tokens.
99
  """
100
  try:
101
  return len(self._encoder.encode(text))
 
105
  )
106
  return len(text.split())
107
 
108
+ def _convert_file_sync(
109
  self, file_path: str, openai_config: Optional[dict] = None, cleanup: bool = True
110
  ) -> Tuple[Dict[str, str], int]:
111
  """
112
+ Synchronously convert a file at `file_path` to Markdown.
113
+ This helper method performs blocking file I/O, MIME detection, temporary file handling,
114
+ optional HTML cleanup, and MarkItDown conversion.
115
+
116
+ Args:
117
+ file_path: Path to the source file.
118
+ openai_config: Optional dictionary with OpenAI configuration.
119
+ cleanup: Whether to perform HTML cleanup if the file is an HTML file.
120
+
121
+ Returns:
122
+ A tuple containing a dictionary with keys "filename" and "markdown", and the token count.
123
  """
124
  src = Path(file_path)
125
  if not src.exists():
 
127
 
128
  logger.info("Converting file: %s (cleanup=%s)", file_path, cleanup)
129
 
130
+ # Create a temporary directory so that MarkItDown sees the proper file extension.
131
  with tempfile.TemporaryDirectory() as tmpdir:
132
  mime_type = magic.from_file(str(src), mime=True)
133
  guessed_ext = mimetypes.guess_extension(mime_type) or ".tmp"
 
147
  guessed_ext,
148
  )
149
 
150
+ # Perform HTML cleanup if requested.
151
  if cleanup and guessed_ext.lower() in (".html", ".htm"):
152
  self._maybe_cleanup_html(tmp_path)
153
 
154
+ # Decide whether to use LLM-enhanced conversion or the basic converter.
155
  if openai_config and openai_config.get("api_key"):
156
  md_converter = self._init_markitdown_with_llm(openai_config)
157
  else:
 
163
  logger.error("MarkItDown conversion failed: %s", e)
164
  raise RuntimeError(f"Conversion failed for '{file_path}': {e}")
165
 
166
+ # Count tokens in the resulting markdown text.
167
  token_count = self._count_tokens(result_obj.text_content)
168
 
169
  result_dict = {
 
171
  "markdown": result_obj.text_content,
172
  }
173
  return result_dict, token_count
174
+
175
+ async def convert_file(
176
+ self, file_path: str, openai_config: Optional[dict] = None, cleanup: bool = True
177
+ ) -> Tuple[Dict[str, str], int]:
178
+ """
179
+ Asynchronously convert a file at `file_path` to Markdown.
180
+ This method offloads the blocking conversion process to a separate thread.
181
+
182
+ Args:
183
+ file_path: Path to the file to convert.
184
+ openai_config: Optional OpenAI configuration dictionary.
185
+ cleanup: Whether to perform HTML cleanup if applicable.
186
+
187
+ Returns:
188
+ A tuple containing the result dictionary (with keys "filename" and "markdown")
189
+ and the token count.
190
+ """
191
+ return await asyncio.to_thread(
192
+ self._convert_file_sync, file_path, openai_config, cleanup
193
+ )