Svngoku committed on
Commit
39c7fb4
·
verified ·
1 Parent(s): cde3785

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -6
app.py CHANGED
@@ -173,7 +173,7 @@ def chunk_markdown(
173
  chunk_overlap: int = 200,
174
  strip_headers: bool = True
175
  ) -> List[Document]:
176
- """Chunks markdown text, preserving headers in metadata."""
177
  if not markdown_text_with_images or not markdown_text_with_images.strip():
178
  logger.warning("chunk_markdown received empty input.")
179
  return []
@@ -188,12 +188,15 @@ def chunk_markdown(
188
  header_chunks = markdown_splitter.split_text(markdown_text_with_images)
189
 
190
  if not header_chunks:
 
191
  return []
192
 
193
  final_chunks = []
194
  if chunk_size > 0:
195
  text_splitter = RecursiveCharacterTextSplitter(
196
- chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len,
 
 
197
  separators=["\n\n", "\n", "(?<=\. )", "(?<=\? )", "(?<=! )", ", ", "; ", " ", ""],
198
  add_start_index=True
199
  )
@@ -206,17 +209,23 @@ def chunk_markdown(
206
  else:
207
  final_chunks = [chunk for chunk in header_chunks if chunk.page_content]
208
 
 
209
  for chunk in final_chunks:
 
 
 
 
210
  images_in_chunk = re.findall(
211
- r"!\[.*?\]\((data:image/[a-zA-Z+]+;base64,[A-Za-z0-9+/=]+)\)",
212
  chunk.page_content
213
  )
214
- if not hasattr(chunk, 'metadata'):
215
- chunk.metadata = {}
216
- chunk.metadata["images_base64"] = images_in_chunk
217
 
 
218
  return final_chunks
219
 
 
220
  def get_hf_token(explicit_token: str = None) -> str:
221
  """Retrieve Hugging Face token with fallback mechanisms."""
222
  global hf_token_global
 
173
  chunk_overlap: int = 200,
174
  strip_headers: bool = True
175
  ) -> List[Document]:
176
+ """Chunks markdown text, preserving headers in metadata and extracting base64 images."""
177
  if not markdown_text_with_images or not markdown_text_with_images.strip():
178
  logger.warning("chunk_markdown received empty input.")
179
  return []
 
188
  header_chunks = markdown_splitter.split_text(markdown_text_with_images)
189
 
190
  if not header_chunks:
191
+ logger.warning("No chunks created from markdown splitting.")
192
  return []
193
 
194
  final_chunks = []
195
  if chunk_size > 0:
196
  text_splitter = RecursiveCharacterTextSplitter(
197
+ chunk_size=chunk_size,
198
+ chunk_overlap=chunk_overlap,
199
+ length_function=len,
200
  separators=["\n\n", "\n", "(?<=\. )", "(?<=\? )", "(?<=! )", ", ", "; ", " ", ""],
201
  add_start_index=True
202
  )
 
209
  else:
210
  final_chunks = [chunk for chunk in header_chunks if chunk.page_content]
211
 
212
+ # Extract base64 images and add to metadata
213
  for chunk in final_chunks:
214
+ if not hasattr(chunk, 'metadata'):
215
+ chunk.metadata = {}
216
+
217
+ # Improved regex to capture full base64 data URI
218
  images_in_chunk = re.findall(
219
+ r"!\[.*?\]\((data:image/[a-zA-Z]+;base64,[A-Za-z0-9+/]+={0,2})\)",
220
  chunk.page_content
221
  )
222
+ chunk.metadata["images_base64"] = images_in_chunk if images_in_chunk else []
223
+ logger.debug(f"Chunk metadata updated with {len(images_in_chunk)} base64 images")
 
224
 
225
+ logger.info(f"Created {len(final_chunks)} chunks with base64 metadata")
226
  return final_chunks
227
 
228
+
229
  def get_hf_token(explicit_token: str = None) -> str:
230
  """Retrieve Hugging Face token with fallback mechanisms."""
231
  global hf_token_global