lamhieu committed on
Commit
5f39983
·
1 Parent(s): c805c37

chore: update something

Browse files
Files changed (1) hide show
  1. docsifer/service.py +15 -14
docsifer/service.py CHANGED
@@ -130,6 +130,8 @@ class DocsiferService:
130
  Returns:
131
  A tuple containing a dictionary with keys "filename" and "markdown", and the token count.
132
  """
 
 
133
  if source.startswith("http"):
134
  filename = f"{scuid()}.html"
135
  else:
@@ -160,38 +162,37 @@ class DocsiferService:
160
  )
161
 
162
  # Perform HTML cleanup if requested.
163
- # if cleanup and guessed_ext.lower() in (".html", ".htm"):
164
- # self._maybe_cleanup_html(tmp_path)
165
 
 
166
  filename = new_filename
167
  source = tmp_path
168
 
169
  # Decide whether to use LLM-enhanced conversion or the basic converter.
170
  if openai_config and openai_config.get("api_key"):
171
- print("openai_config:\n", openai_config)
172
  md_converter = self._init_markitdown_with_llm(openai_config)
173
  else:
174
- print("no openai_config")
175
  md_converter = self._basic_markitdown
176
 
177
  # Load cookies if provided in the HTTP config.
178
- # if http_config:
179
- # if "cookies" in http_config:
180
- # requests.cookies.cookiejar_from_dict(
181
- # http_config["cookies"],
182
- # requests.cookies.RequestsCookieJar,
183
- # overwrite=True,
184
- # )
185
 
186
  try:
187
- result_obj = md_converter.convert(source=str(source))
188
  print("result_obj:\n", result_obj.text_content)
189
  except Exception as e:
190
  logger.error("MarkItDown conversion failed: %s", e)
191
  raise RuntimeError(f"Conversion failed for '{source}': {e}")
192
 
193
- # if isinstance(source, Path) and source.exists():
194
- # source.unlink()
195
 
196
  # Count tokens in the resulting markdown text.
197
  token_count = self._count_tokens(result_obj.text_content)
 
130
  Returns:
131
  A tuple containing a dictionary with keys "filename" and "markdown", and the token count.
132
  """
133
+ file_extension = None
134
+
135
  if source.startswith("http"):
136
  filename = f"{scuid()}.html"
137
  else:
 
162
  )
163
 
164
  # Perform HTML cleanup if requested.
165
+ if cleanup and guessed_ext.lower() in (".html", ".htm"):
166
+ self._maybe_cleanup_html(tmp_path)
167
 
168
+ file_extension = guessed_ext.lstrip(".")
169
  filename = new_filename
170
  source = tmp_path
171
 
172
  # Decide whether to use LLM-enhanced conversion or the basic converter.
173
  if openai_config and openai_config.get("api_key"):
 
174
  md_converter = self._init_markitdown_with_llm(openai_config)
175
  else:
 
176
  md_converter = self._basic_markitdown
177
 
178
  # Load cookies if provided in the HTTP config.
179
+ if http_config:
180
+ if "cookies" in http_config:
181
+ requests.cookies.cookiejar_from_dict(
182
+ http_config["cookies"],
183
+ requests.cookies.RequestsCookieJar,
184
+ overwrite=True,
185
+ )
186
 
187
  try:
188
+ result_obj = md_converter.convert(source, file_extension=file_extension)
189
  print("result_obj:\n", result_obj.text_content)
190
  except Exception as e:
191
  logger.error("MarkItDown conversion failed: %s", e)
192
  raise RuntimeError(f"Conversion failed for '{source}': {e}")
193
 
194
+ if isinstance(source, Path) and source.exists():
195
+ source.unlink()
196
 
197
  # Count tokens in the resulting markdown text.
198
  token_count = self._count_tokens(result_obj.text_content)