lamhieu commited on
Commit
5893331
·
1 Parent(s): 06b057b

perf: improve request handling with async mode

Browse files
Files changed (2) hide show
  1. docsifer/router.py +9 -12
  2. docsifer/service.py +62 -23
docsifer/router.py CHANGED
@@ -1,5 +1,3 @@
1
- # filename: router.py
2
-
3
  import logging
4
  import json
5
  import tempfile
@@ -17,14 +15,14 @@ from .analytics import Analytics
17
  logger = logging.getLogger(__name__)
18
  router = APIRouter(tags=["v1"], responses={404: {"description": "Not found"}})
19
 
20
- # Initialize analytics (single aggregator = "docsifer")
21
  analytics = Analytics(
22
  url=os.environ.get("REDIS_URL", "redis://localhost:6379/0"),
23
  token=os.environ.get("REDIS_TOKEN", "***"),
24
  sync_interval=30 * 60, # e.g. 30 minutes
25
  )
26
 
27
- # Initialize the Docsifer service (token counting with gpt-4o)
28
  docsifer_service = DocsiferService(model_name="gpt-4o")
29
 
30
 
@@ -45,12 +43,13 @@ async def convert_document(
45
  ):
46
  """
47
  Convert a file or an HTML page from a URL into Markdown.
48
- If 'file' is provided, it has priority over 'url'.
49
- - 'openai' is a JSON string with keys: {"api_key": "...", "base_url": "..."}
50
- - 'settings' is a JSON string with keys: {"cleanup": bool}
 
51
  """
52
  try:
53
- # Parse configs
54
  try:
55
  openai_config = json.loads(openai) if openai else {}
56
  except json.JSONDecodeError:
@@ -63,7 +62,7 @@ async def convert_document(
63
 
64
  cleanup = settings_config.get("cleanup", True)
65
 
66
- # If a file is provided, use the existing flow
67
  if file is not None:
68
  with tempfile.TemporaryDirectory() as tmpdir:
69
  temp_path = Path(tmpdir) / file.filename
@@ -74,7 +73,6 @@ async def convert_document(
74
  openai_config=openai_config,
75
  cleanup=cleanup,
76
  )
77
- # Otherwise, fetch HTML from URL and convert
78
  elif url:
79
  async with aiohttp.ClientSession() as session:
80
  async with session.get(url) as resp:
@@ -94,7 +92,7 @@ async def convert_document(
94
  status_code=400, detail="Provide either 'file' or 'url'."
95
  )
96
 
97
- # Track usage
98
  background_tasks.add_task(analytics.access, token_count)
99
  return ConvertResponse(**result)
100
 
@@ -108,7 +106,6 @@ async def convert_document(
108
  async def get_stats():
109
  """
110
  Return usage statistics (access, tokens) from the Analytics system.
111
- All data is stored under "docsifer".
112
  """
113
  try:
114
  data = await analytics.stats()
 
 
 
1
  import logging
2
  import json
3
  import tempfile
 
15
  logger = logging.getLogger(__name__)
16
  router = APIRouter(tags=["v1"], responses={404: {"description": "Not found"}})
17
 
18
+ # Initialize analytics (aggregated under "docsifer")
19
  analytics = Analytics(
20
  url=os.environ.get("REDIS_URL", "redis://localhost:6379/0"),
21
  token=os.environ.get("REDIS_TOKEN", "***"),
22
  sync_interval=30 * 60, # e.g. 30 minutes
23
  )
24
 
25
+ # Initialize the Docsifer service (using "gpt-4o" for token counting)
26
  docsifer_service = DocsiferService(model_name="gpt-4o")
27
 
28
 
 
43
  ):
44
  """
45
  Convert a file or an HTML page from a URL into Markdown.
46
+ If 'file' is provided, it takes priority over 'url'.
47
+
48
+ - 'openai' is a JSON string with keys such as {"api_key": "...", "base_url": "..."}.
49
+ - 'settings' is a JSON string with keys such as {"cleanup": bool}.
50
  """
51
  try:
52
+ # Parse the JSON configuration parameters.
53
  try:
54
  openai_config = json.loads(openai) if openai else {}
55
  except json.JSONDecodeError:
 
62
 
63
  cleanup = settings_config.get("cleanup", True)
64
 
65
+ # If a file is provided, use it; otherwise, fetch the content from the URL.
66
  if file is not None:
67
  with tempfile.TemporaryDirectory() as tmpdir:
68
  temp_path = Path(tmpdir) / file.filename
 
73
  openai_config=openai_config,
74
  cleanup=cleanup,
75
  )
 
76
  elif url:
77
  async with aiohttp.ClientSession() as session:
78
  async with session.get(url) as resp:
 
92
  status_code=400, detail="Provide either 'file' or 'url'."
93
  )
94
 
95
+ # Record token usage in the background.
96
  background_tasks.add_task(analytics.access, token_count)
97
  return ConvertResponse(**result)
98
 
 
106
  async def get_stats():
107
  """
108
  Return usage statistics (access, tokens) from the Analytics system.
 
109
  """
110
  try:
111
  data = await analytics.stats()
docsifer/service.py CHANGED
@@ -1,7 +1,6 @@
1
- # filename: service.py
2
-
3
  from __future__ import annotations
4
 
 
5
  import logging
6
  import tempfile
7
  import magic
@@ -14,7 +13,6 @@ from pyquery import PyQuery as pq
14
  from markitdown import MarkItDown
15
  from openai import OpenAI
16
 
17
-
18
  logger = logging.getLogger(__name__)
19
  logging.basicConfig(level=logging.INFO)
20
 
@@ -23,16 +21,17 @@ class DocsiferService:
23
  """
24
  A service that converts local files to Markdown using MarkItDown,
25
  optionally with an OpenAI LLM for advanced extraction.
26
- Token counting uses "gpt-4o" as a heuristic via tiktoken.
27
  """
28
 
29
  def __init__(self, model_name: str = "gpt-4o"):
30
  """
31
  Initialize the DocsiferService with a basic MarkItDown instance
32
- and a tiktoken encoder for counting tokens using "gpt-4o".
33
  """
34
  self._basic_markitdown = MarkItDown() # MarkItDown without LLM
35
- # Use "gpt-4o" for token counting
 
36
  try:
37
  self._encoder = tiktoken.encoding_for_model(model_name)
38
  except Exception as e:
@@ -47,8 +46,13 @@ class DocsiferService:
47
 
48
  def _init_markitdown_with_llm(self, openai_config: Dict[str, Any]) -> MarkItDown:
49
  """
50
- If openai_config has an 'api_key', configure openai and return
51
- a MarkItDown instance with that OpenAI client.
 
 
 
 
 
52
  """
53
  api_key = openai_config.get("api_key", "")
54
  if not api_key:
@@ -64,26 +68,34 @@ class DocsiferService:
64
 
65
  def _maybe_cleanup_html(self, html_file: Path) -> None:
66
  """
67
- If the file is HTML, remove <style> tags, optionally hidden elements, etc.
 
 
 
68
  """
69
  try:
70
  content = html_file.read_text(encoding="utf-8", errors="ignore")
71
  d = pq(content)
72
- # Remove hidden elements and styles
73
  d(":hidden").remove()
74
  d("[style='display:none']").remove()
75
  d('*[style*="display:none"]').remove()
76
  d("style").remove()
77
- cleaned_html = str(d)
78
- cleaned_html = cleaned_html.strip()
79
  html_file.write_text(cleaned_html, encoding="utf-8")
80
  except Exception as e:
81
  logger.error("HTML cleanup failed for %s: %s", html_file, e)
82
 
83
  def _count_tokens(self, text: str) -> int:
84
  """
85
- Count tokens using the configured tiktoken encoder.
86
- Fallback to whitespace-based counting if an error occurs.
 
 
 
 
 
 
87
  """
88
  try:
89
  return len(self._encoder.encode(text))
@@ -93,14 +105,21 @@ class DocsiferService:
93
  )
94
  return len(text.split())
95
 
96
- async def convert_file(
97
  self, file_path: str, openai_config: Optional[dict] = None, cleanup: bool = True
98
  ) -> Tuple[Dict[str, str], int]:
99
  """
100
- Converts a file at `file_path` to Markdown.
101
- - If `cleanup` is True and file is .html/.htm, does HTML cleanup.
102
- - If `openai_config` has a valid API key, use LLM-based MarkItDown.
103
- Returns ({"filename": filename, "markdown": md_string}, token_count).
 
 
 
 
 
 
 
104
  """
105
  src = Path(file_path)
106
  if not src.exists():
@@ -108,7 +127,7 @@ class DocsiferService:
108
 
109
  logger.info("Converting file: %s (cleanup=%s)", file_path, cleanup)
110
 
111
- # Use a temp directory so MarkItDown sees the real file extension
112
  with tempfile.TemporaryDirectory() as tmpdir:
113
  mime_type = magic.from_file(str(src), mime=True)
114
  guessed_ext = mimetypes.guess_extension(mime_type) or ".tmp"
@@ -128,11 +147,11 @@ class DocsiferService:
128
  guessed_ext,
129
  )
130
 
131
- # If it's HTML and cleanup is requested
132
  if cleanup and guessed_ext.lower() in (".html", ".htm"):
133
  self._maybe_cleanup_html(tmp_path)
134
 
135
- # Decide whether to use LLM or basic
136
  if openai_config and openai_config.get("api_key"):
137
  md_converter = self._init_markitdown_with_llm(openai_config)
138
  else:
@@ -144,7 +163,7 @@ class DocsiferService:
144
  logger.error("MarkItDown conversion failed: %s", e)
145
  raise RuntimeError(f"Conversion failed for '{file_path}': {e}")
146
 
147
- # Count tokens
148
  token_count = self._count_tokens(result_obj.text_content)
149
 
150
  result_dict = {
@@ -152,3 +171,23 @@ class DocsiferService:
152
  "markdown": result_obj.text_content,
153
  }
154
  return result_dict, token_count
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
+ import asyncio
4
  import logging
5
  import tempfile
6
  import magic
 
13
  from markitdown import MarkItDown
14
  from openai import OpenAI
15
 
 
16
  logger = logging.getLogger(__name__)
17
  logging.basicConfig(level=logging.INFO)
18
 
 
21
  """
22
  A service that converts local files to Markdown using MarkItDown,
23
  optionally with an OpenAI LLM for advanced extraction.
24
+ Token counting uses a tiktoken encoder (heuristically with the provided model).
25
  """
26
 
27
  def __init__(self, model_name: str = "gpt-4o"):
28
  """
29
  Initialize the DocsiferService with a basic MarkItDown instance
30
+ and a tiktoken encoder for counting tokens using the provided model.
31
  """
32
  self._basic_markitdown = MarkItDown() # MarkItDown without LLM
33
+
34
+ # Use the given model for token counting
35
  try:
36
  self._encoder = tiktoken.encoding_for_model(model_name)
37
  except Exception as e:
 
46
 
47
  def _init_markitdown_with_llm(self, openai_config: Dict[str, Any]) -> MarkItDown:
48
  """
49
+ Initialize a MarkItDown instance configured with an OpenAI LLM if an API key is provided.
50
+
51
+ Args:
52
+ openai_config: A dictionary containing OpenAI configuration (e.g., api_key, model, base_url).
53
+
54
+ Returns:
55
+ A MarkItDown instance configured with the OpenAI client, or the basic instance if no key is provided.
56
  """
57
  api_key = openai_config.get("api_key", "")
58
  if not api_key:
 
68
 
69
  def _maybe_cleanup_html(self, html_file: Path) -> None:
70
  """
71
+ If the file is HTML, remove <style> tags and hidden elements to clean up the content.
72
+
73
+ Args:
74
+ html_file: Path to the HTML file.
75
  """
76
  try:
77
  content = html_file.read_text(encoding="utf-8", errors="ignore")
78
  d = pq(content)
79
+ # Remove hidden elements and inline styles that hide content.
80
  d(":hidden").remove()
81
  d("[style='display:none']").remove()
82
  d('*[style*="display:none"]').remove()
83
  d("style").remove()
84
+ cleaned_html = str(d).strip()
 
85
  html_file.write_text(cleaned_html, encoding="utf-8")
86
  except Exception as e:
87
  logger.error("HTML cleanup failed for %s: %s", html_file, e)
88
 
89
  def _count_tokens(self, text: str) -> int:
90
  """
91
+ Count tokens in the given text using the configured tiktoken encoder.
92
+ Falls back to a whitespace-based count if an error occurs.
93
+
94
+ Args:
95
+ text: The text to count tokens in.
96
+
97
+ Returns:
98
+ The number of tokens.
99
  """
100
  try:
101
  return len(self._encoder.encode(text))
 
105
  )
106
  return len(text.split())
107
 
108
+ def _convert_file_sync(
109
  self, file_path: str, openai_config: Optional[dict] = None, cleanup: bool = True
110
  ) -> Tuple[Dict[str, str], int]:
111
  """
112
+ Synchronously convert a file at `file_path` to Markdown.
113
+ This helper method performs blocking file I/O, MIME detection, temporary file handling,
114
+ optional HTML cleanup, and MarkItDown conversion.
115
+
116
+ Args:
117
+ file_path: Path to the source file.
118
+ openai_config: Optional dictionary with OpenAI configuration.
119
+ cleanup: Whether to perform HTML cleanup if the file is an HTML file.
120
+
121
+ Returns:
122
+ A tuple containing a dictionary with keys "filename" and "markdown", and the token count.
123
  """
124
  src = Path(file_path)
125
  if not src.exists():
 
127
 
128
  logger.info("Converting file: %s (cleanup=%s)", file_path, cleanup)
129
 
130
+ # Create a temporary directory so that MarkItDown sees the proper file extension.
131
  with tempfile.TemporaryDirectory() as tmpdir:
132
  mime_type = magic.from_file(str(src), mime=True)
133
  guessed_ext = mimetypes.guess_extension(mime_type) or ".tmp"
 
147
  guessed_ext,
148
  )
149
 
150
+ # Perform HTML cleanup if requested.
151
  if cleanup and guessed_ext.lower() in (".html", ".htm"):
152
  self._maybe_cleanup_html(tmp_path)
153
 
154
+ # Decide whether to use LLM-enhanced conversion or the basic converter.
155
  if openai_config and openai_config.get("api_key"):
156
  md_converter = self._init_markitdown_with_llm(openai_config)
157
  else:
 
163
  logger.error("MarkItDown conversion failed: %s", e)
164
  raise RuntimeError(f"Conversion failed for '{file_path}': {e}")
165
 
166
+ # Count tokens in the resulting markdown text.
167
  token_count = self._count_tokens(result_obj.text_content)
168
 
169
  result_dict = {
 
171
  "markdown": result_obj.text_content,
172
  }
173
  return result_dict, token_count
174
+
175
+ async def convert_file(
176
+ self, file_path: str, openai_config: Optional[dict] = None, cleanup: bool = True
177
+ ) -> Tuple[Dict[str, str], int]:
178
+ """
179
+ Asynchronously convert a file at `file_path` to Markdown.
180
+ This method offloads the blocking conversion process to a separate thread.
181
+
182
+ Args:
183
+ file_path: Path to the file to convert.
184
+ openai_config: Optional OpenAI configuration dictionary.
185
+ cleanup: Whether to perform HTML cleanup if applicable.
186
+
187
+ Returns:
188
+ A tuple containing the result dictionary (with keys "filename" and "markdown")
189
+ and the token count.
190
+ """
191
+ return await asyncio.to_thread(
192
+ self._convert_file_sync, file_path, openai_config, cleanup
193
+ )