lamhieu committed on
Commit
1799b05
·
1 Parent(s): 7df81c3

chore: update something

Browse files
Files changed (2) hide show
  1. docsifer/router.py +21 -16
  2. docsifer/service.py +61 -54
docsifer/router.py CHANGED
@@ -2,12 +2,12 @@ import logging
2
  import json
3
  import tempfile
4
  import os
5
- import aiohttp
6
  from pathlib import Path
7
 
8
  from fastapi import APIRouter, HTTPException, UploadFile, File, Form, BackgroundTasks
9
  from pydantic import BaseModel
10
- from scuid import scuid
11
 
12
  from .service import DocsiferService
13
  from .analytics import Analytics
@@ -69,24 +69,29 @@ async def convert_document(
69
  contents = await file.read()
70
  temp_path.write_bytes(contents)
71
  result, token_count = await docsifer_service.convert_file(
72
- file_path=str(temp_path),
73
  openai_config=openai_config,
74
  cleanup=cleanup,
75
  )
76
  elif url:
77
- async with aiohttp.ClientSession() as session:
78
- async with session.get(url) as resp:
79
- if resp.status != 200:
80
- raise ValueError(f"Failed to fetch URL: status {resp.status}")
81
- data = await resp.read()
82
- with tempfile.TemporaryDirectory() as tmpdir:
83
- temp_path = Path(tmpdir) / f"{scuid()}.html"
84
- temp_path.write_bytes(data)
85
- result, token_count = await docsifer_service.convert_file(
86
- file_path=str(temp_path),
87
- openai_config=openai_config,
88
- cleanup=cleanup,
89
- )
 
 
 
 
 
90
  else:
91
  raise HTTPException(
92
  status_code=400, detail="Provide either 'file' or 'url'."
 
2
  import json
3
  import tempfile
4
  import os
5
+ # import aiohttp
6
  from pathlib import Path
7
 
8
  from fastapi import APIRouter, HTTPException, UploadFile, File, Form, BackgroundTasks
9
  from pydantic import BaseModel
10
+ # from scuid import scuid
11
 
12
  from .service import DocsiferService
13
  from .analytics import Analytics
 
69
  contents = await file.read()
70
  temp_path.write_bytes(contents)
71
  result, token_count = await docsifer_service.convert_file(
72
+ source=str(temp_path),
73
  openai_config=openai_config,
74
  cleanup=cleanup,
75
  )
76
  elif url:
77
+ # async with aiohttp.ClientSession() as session:
78
+ # async with session.get(url) as resp:
79
+ # if resp.status != 200:
80
+ # raise ValueError(f"Failed to fetch URL: status {resp.status}")
81
+ # data = await resp.read()
82
+ # with tempfile.TemporaryDirectory() as tmpdir:
83
+ # temp_path = Path(tmpdir) / f"{scuid()}.html"
84
+ # temp_path.write_bytes(data)
85
+ # result, token_count = await docsifer_service.convert_file(
86
+ # source=str(temp_path),
87
+ # openai_config=openai_config,
88
+ # cleanup=cleanup,
89
+ # )
90
+ result, token_count = await docsifer_service.convert_file(
91
+ source=str(url),
92
+ openai_config=openai_config,
93
+ cleanup=cleanup,
94
+ )
95
  else:
96
  raise HTTPException(
97
  status_code=400, detail="Provide either 'file' or 'url'."
docsifer/service.py CHANGED
@@ -7,6 +7,7 @@ import magic
7
  import mimetypes
8
  from pathlib import Path
9
  from typing import Optional, Dict, Tuple, Any
 
10
 
11
  import tiktoken
12
  from pyquery import PyQuery as pq
@@ -105,8 +106,8 @@ class DocsiferService:
105
  )
106
  return len(text.split())
107
 
108
- def _convert_file_sync(
109
- self, file_path: str, openai_config: Optional[dict] = None, cleanup: bool = True
110
  ) -> Tuple[Dict[str, str], int]:
111
  """
112
  Synchronously convert a file at `file_path` to Markdown.
@@ -114,73 +115,79 @@ class DocsiferService:
114
  optional HTML cleanup, and MarkItDown conversion.
115
 
116
  Args:
117
- file_path: Path to the source file.
118
  openai_config: Optional dictionary with OpenAI configuration.
119
  cleanup: Whether to perform HTML cleanup if the file is an HTML file.
120
 
121
  Returns:
122
  A tuple containing a dictionary with keys "filename" and "markdown", and the token count.
123
  """
124
- src = Path(file_path)
125
- if not src.exists():
126
- raise FileNotFoundError(f"File not found: {file_path}")
127
-
128
- logger.info("Converting file: %s (cleanup=%s)", file_path, cleanup)
129
-
130
- # Create a temporary directory so that MarkItDown sees the proper file extension.
131
- with tempfile.TemporaryDirectory() as tmpdir:
132
- mime_type = magic.from_file(str(src), mime=True)
133
- guessed_ext = mimetypes.guess_extension(mime_type) or ".tmp"
134
- if not mime_type:
135
- logger.warning(f"Could not detect file type for: {src}")
136
- new_filename = src.name
137
- else:
138
- logger.debug(f"Detected MIME type '{mime_type}' for: {src}")
139
- new_filename = f"{src.stem}{guessed_ext}"
140
- tmp_path = Path(tmpdir) / new_filename
141
- tmp_path.write_bytes(src.read_bytes())
142
-
143
- logger.info(
144
- "Using temp file: %s, MIME type: %s, Guessed ext: %s",
145
- tmp_path,
146
- mime_type,
147
- guessed_ext,
148
- )
149
-
150
- # Perform HTML cleanup if requested.
151
- if cleanup and guessed_ext.lower() in (".html", ".htm"):
152
- self._maybe_cleanup_html(tmp_path)
153
-
154
- # Decide whether to use LLM-enhanced conversion or the basic converter.
155
- if openai_config and openai_config.get("api_key"):
156
- md_converter = self._init_markitdown_with_llm(openai_config)
157
- else:
158
- md_converter = self._basic_markitdown
 
 
 
 
 
 
159
 
160
- try:
161
- result_obj = md_converter.convert(str(tmp_path))
162
- except Exception as e:
163
- logger.error("MarkItDown conversion failed: %s", e)
164
- raise RuntimeError(f"Conversion failed for '{file_path}': {e}")
165
 
166
- # Count tokens in the resulting markdown text.
167
- token_count = self._count_tokens(result_obj.text_content)
168
 
169
- result_dict = {
170
- "filename": src.name,
171
- "markdown": result_obj.text_content,
172
- }
173
- return result_dict, token_count
174
 
175
  async def convert_file(
176
- self, file_path: str, openai_config: Optional[dict] = None, cleanup: bool = True
177
  ) -> Tuple[Dict[str, str], int]:
178
  """
179
- Asynchronously convert a file at `file_path` to Markdown.
180
  This method offloads the blocking conversion process to a separate thread.
181
 
182
  Args:
183
- file_path: Path to the file to convert.
184
  openai_config: Optional OpenAI configuration dictionary.
185
  cleanup: Whether to perform HTML cleanup if applicable.
186
 
@@ -189,5 +196,5 @@ class DocsiferService:
189
  and the token count.
190
  """
191
  return await asyncio.to_thread(
192
- self._convert_file_sync, file_path, openai_config, cleanup
193
  )
 
7
  import mimetypes
8
  from pathlib import Path
9
  from typing import Optional, Dict, Tuple, Any
10
+ from scuid import scuid
11
 
12
  import tiktoken
13
  from pyquery import PyQuery as pq
 
106
  )
107
  return len(text.split())
108
 
109
+ def _convert_sync(
110
+ self, source: str, openai_config: Optional[dict] = None, cleanup: bool = True
111
  ) -> Tuple[Dict[str, str], int]:
112
  """
113
  Synchronously convert a file path or URL given by `source` to Markdown.
 
115
  optional HTML cleanup, and MarkItDown conversion.
116
 
117
  Args:
118
+ source: Path to the source file or URL to fetch content from.
119
  openai_config: Optional dictionary with OpenAI configuration.
120
  cleanup: Whether to perform HTML cleanup if the file is an HTML file.
121
 
122
  Returns:
123
  A tuple containing a dictionary with keys "filename" and "markdown", and the token count.
124
  """
125
+ if source.startswith("http"):
126
+ filename = f"{scuid()}.html"
127
+ else:
128
+ src = Path(source)
129
+ if not src.exists():
130
+ raise FileNotFoundError(f"File not found: {source}")
131
+
132
+ logger.info("Converting file: %s (cleanup=%s)", source, cleanup)
133
+
134
+ # Create a temporary directory so that MarkItDown sees the proper file extension.
135
+ with tempfile.TemporaryDirectory() as tmpdir:
136
+ mime_type = magic.from_file(str(src), mime=True)
137
+ guessed_ext = mimetypes.guess_extension(mime_type) or ".tmp"
138
+ if not mime_type:
139
+ logger.warning(f"Could not detect file type for: {src}")
140
+ new_filename = src.name
141
+ else:
142
+ logger.debug(f"Detected MIME type '{mime_type}' for: {src}")
143
+ new_filename = f"{src.stem}{guessed_ext}"
144
+ tmp_path = Path(tmpdir) / new_filename
145
+ tmp_path.write_bytes(src.read_bytes())
146
+
147
+ logger.info(
148
+ "Using temp file: %s, MIME type: %s, Guessed ext: %s",
149
+ tmp_path,
150
+ mime_type,
151
+ guessed_ext,
152
+ )
153
+
154
+ # Perform HTML cleanup if requested.
155
+ if cleanup and guessed_ext.lower() in (".html", ".htm"):
156
+ self._maybe_cleanup_html(tmp_path)
157
+
158
+ filename = src.name
159
+ source = str(tmp_path)
160
+
161
+ # Decide whether to use LLM-enhanced conversion or the basic converter.
162
+ if openai_config and openai_config.get("api_key"):
163
+ md_converter = self._init_markitdown_with_llm(openai_config)
164
+ else:
165
+ md_converter = self._basic_markitdown
166
 
167
+ try:
168
+ result_obj = md_converter.convert(source)
169
+ except Exception as e:
170
+ logger.error("MarkItDown conversion failed: %s", e)
171
+ raise RuntimeError(f"Conversion failed for '{source}': {e}")
172
 
173
+ # Count tokens in the resulting markdown text.
174
+ token_count = self._count_tokens(result_obj.text_content)
175
 
176
+ result_dict = {
177
+ "filename": filename,
178
+ "markdown": result_obj.text_content,
179
+ }
180
+ return result_dict, token_count
181
 
182
  async def convert_file(
183
+ self, source: str, openai_config: Optional[dict] = None, cleanup: bool = True
184
  ) -> Tuple[Dict[str, str], int]:
185
  """
186
+ Asynchronously convert a file at `source` to Markdown.
187
  This method offloads the blocking conversion process to a separate thread.
188
 
189
  Args:
190
+ source: Path to the file to convert or a URL to fetch content from.
191
  openai_config: Optional OpenAI configuration dictionary.
192
  cleanup: Whether to perform HTML cleanup if applicable.
193
 
 
196
  and the token count.
197
  """
198
  return await asyncio.to_thread(
199
+ self._convert_sync, source, openai_config, cleanup
200
  )