Update app.py
app.py CHANGED
@@ -173,7 +173,7 @@ def chunk_markdown(
     chunk_overlap: int = 200,
     strip_headers: bool = True
 ) -> List[Document]:
-    """Chunks markdown text, preserving headers in metadata."""
+    """Chunks markdown text, preserving headers in metadata and extracting base64 images."""
     if not markdown_text_with_images or not markdown_text_with_images.strip():
         logger.warning("chunk_markdown received empty input.")
         return []
@@ -188,12 +188,15 @@ def chunk_markdown(
     header_chunks = markdown_splitter.split_text(markdown_text_with_images)
 
     if not header_chunks:
+        logger.warning("No chunks created from markdown splitting.")
         return []
 
     final_chunks = []
     if chunk_size > 0:
         text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=chunk_size,
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            length_function=len,
             separators=["\n\n", "\n", "(?<=\. )", "(?<=\? )", "(?<=! )", ", ", "; ", " ", ""],
             add_start_index=True
         )
@@ -206,17 +209,23 @@ def chunk_markdown(
     else:
         final_chunks = [chunk for chunk in header_chunks if chunk.page_content]
 
+    # Extract base64 images and add to metadata
     for chunk in final_chunks:
+        if not hasattr(chunk, 'metadata'):
+            chunk.metadata = {}
+
+        # Improved regex to capture full base64 data URI
         images_in_chunk = re.findall(
-            r"!\[.*?\]\((data:image/[a-zA-Z
+            r"!\[.*?\]\((data:image/[a-zA-Z]+;base64,[A-Za-z0-9+/]+={0,2})\)",
             chunk.page_content
         )
-        if
-
-            chunk.metadata["images_base64"] = images_in_chunk
+        chunk.metadata["images_base64"] = images_in_chunk if images_in_chunk else []
+        logger.debug(f"Chunk metadata updated with {len(images_in_chunk)} base64 images")
 
+    logger.info(f"Created {len(final_chunks)} chunks with base64 metadata")
     return final_chunks
 
+
 def get_hf_token(explicit_token: str = None) -> str:
     """Retrieve Hugging Face token with fallback mechanisms."""
     global hf_token_global
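
For readers following the second hunk, the sketch below shows, in isolation, the two-stage chunking that the RecursiveCharacterTextSplitter configuration plugs into. It is a minimal illustration, not code from app.py: the import path, the headers_to_split_on list, the sample input, the concrete chunk_size/chunk_overlap values, and the closing split_documents() call are all assumptions; only the splitter keyword arguments mirror the diff.

# Illustrative sketch only; assumptions are marked in the comments below.
from langchain_text_splitters import (
    MarkdownHeaderTextSplitter,
    RecursiveCharacterTextSplitter,
)

markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[("#", "h1"), ("##", "h2"), ("###", "h3")],  # assumed header map
    strip_headers=True,
)
header_chunks = markdown_splitter.split_text(
    "# Title\n\nSome body text that would be re-split below if it were long enough."
)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,        # app.py passes its chunk_size argument here
    chunk_overlap=200,      # forwarded explicitly by this commit
    length_function=len,    # made explicit by this commit
    separators=["\n\n", "\n", "(?<=\. )", "(?<=\? )", "(?<=! )", ", ", "; ", " ", ""],
    add_start_index=True,   # records each chunk's start offset in metadata
)
# Note: the lookbehind separators above are only treated as regexes if
# is_separator_regex=True is passed; by default they are matched literally.
final_chunks = text_splitter.split_documents(header_chunks)  # assumed application step
print(len(final_chunks), final_chunks[0].metadata)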
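
The tightened image pattern from the third hunk can be exercised on its own. The sample chunk text and the shortened base64 payload below are made up for illustration; the regex itself is the one the commit introduces.

import re

# The commit's pattern: capture the whole data URI in one group
# (mime subtype, the ";base64," marker, the payload, optional "=" padding).
IMAGE_PATTERN = r"!\[.*?\]\((data:image/[a-zA-Z]+;base64,[A-Za-z0-9+/]+={0,2})\)"

# Made-up chunk text with a deliberately short base64 payload.
sample_chunk_text = (
    "Some prose around an embedded figure.\n\n"
    "![diagram](data:image/png;base64,iVBORw0KGgoAAAANSUhEUg==)\n\n"
    "More prose after the figure."
)

images_in_chunk = re.findall(IMAGE_PATTERN, sample_chunk_text)
print(images_in_chunk)
# ['data:image/png;base64,iVBORw0KGgoAAAANSUhEUg==']

Because the data URI is the only capture group, re.findall returns just the URIs, which the loop in the diff then stores under chunk.metadata["images_base64"], defaulting to an empty list when no images are found.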