Svngoku committed on
Commit
716b14f
·
verified ·
1 Parent(s): 4371128

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -11
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import gradio as gr
2
  from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
3
  from langchain.schema import Document
4
- from typing import List, Dict
5
  import logging
6
  import re
7
  from pathlib import Path
@@ -41,7 +41,7 @@ def replace_images_in_markdown(markdown_str: str, images_dict: Dict[str, str]) -
41
  return markdown_str
42
 
43
  # Function to combine markdown from OCR response
44
- def get_combined_markdown(ocr_response) -> tuple[str, str]:
45
  markdowns = []
46
  raw_markdowns = []
47
  image_data = {} # Collect all image data
@@ -147,7 +147,11 @@ def chunk_markdown(
147
  # Add images to metadata
148
  for chunk in chunks:
149
  image_names = extract_image_names_from_markdown(chunk.page_content)
150
- chunk.metadata["images"] = {name: image_data.get(name, None) for name in image_names}
 
 
 
 
151
 
152
  logger.info(f"Created {len(chunks)} chunks")
153
  return chunks
@@ -156,13 +160,6 @@ def chunk_markdown(
156
  logger.error(f"Error processing markdown: {str(e)}")
157
  raise
158
 
159
- # Placeholder image generation
160
- def text_to_base64_dummy(text: str, chunk_index: int):
161
- img = Image.new('RGB', (200, 200), color='white')
162
- buffer = io.BytesIO()
163
- img.save(buffer, format="PNG")
164
- return base64.b64encode(buffer.getvalue()).decode("utf-8")
165
-
166
  # Process file: OCR -> Chunk -> Save
167
  def process_file_and_save(file, chunk_size, chunk_overlap, strip_headers, hf_token, repo_name):
168
  try:
@@ -175,7 +172,7 @@ def process_file_and_save(file, chunk_size, chunk_overlap, strip_headers, hf_tok
175
  chunks = chunk_markdown(combined_markdown, image_data, chunk_size, chunk_overlap, strip_headers)
176
 
177
  # Step 3: Prepare dataset
178
- data = {
179
  "chunk_id": [],
180
  "content": [],
181
  "metadata": [],
 
1
  import gradio as gr
2
  from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
3
  from langchain.schema import Document
4
+ from typing import List, Dict, Any
5
  import logging
6
  import re
7
  from pathlib import Path
 
41
  return markdown_str
42
 
43
  # Function to combine markdown from OCR response
44
+ def get_combined_markdown(ocr_response) -> tuple[str, str, Dict[str, str]]:
45
  markdowns = []
46
  raw_markdowns = []
47
  image_data = {} # Collect all image data
 
147
  # Add images to metadata
148
  for chunk in chunks:
149
  image_names = extract_image_names_from_markdown(chunk.page_content)
150
+ images = {name: image_data.get(name, None) for name in image_names}
151
+ # Add a dummy field if the images dictionary is empty
152
+ if not images:
153
+ images = {"dummy": None}
154
+ chunk.metadata["images"] = images
155
 
156
  logger.info(f"Created {len(chunks)} chunks")
157
  return chunks
 
160
  logger.error(f"Error processing markdown: {str(e)}")
161
  raise
162
 
 
 
 
 
 
 
 
163
  # Process file: OCR -> Chunk -> Save
164
  def process_file_and_save(file, chunk_size, chunk_overlap, strip_headers, hf_token, repo_name):
165
  try:
 
172
  chunks = chunk_markdown(combined_markdown, image_data, chunk_size, chunk_overlap, strip_headers)
173
 
174
  # Step 3: Prepare dataset
175
+ data: Dict[str, List[Any]] = {
176
  "chunk_id": [],
177
  "content": [],
178
  "metadata": [],