Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
import gradio as gr
|
2 |
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
|
3 |
from langchain.schema import Document
|
4 |
-
from typing import List, Dict
|
5 |
import logging
|
6 |
import re
|
7 |
from pathlib import Path
|
@@ -41,7 +41,7 @@ def replace_images_in_markdown(markdown_str: str, images_dict: Dict[str, str]) -
|
|
41 |
return markdown_str
|
42 |
|
43 |
# Function to combine markdown from OCR response
|
44 |
-
def get_combined_markdown(ocr_response) -> tuple[str, str]:
|
45 |
markdowns = []
|
46 |
raw_markdowns = []
|
47 |
image_data = {} # Collect all image data
|
@@ -147,7 +147,11 @@ def chunk_markdown(
|
|
147 |
# Add images to metadata
|
148 |
for chunk in chunks:
|
149 |
image_names = extract_image_names_from_markdown(chunk.page_content)
|
150 |
-
|
|
|
|
|
|
|
|
|
151 |
|
152 |
logger.info(f"Created {len(chunks)} chunks")
|
153 |
return chunks
|
@@ -156,13 +160,6 @@ def chunk_markdown(
|
|
156 |
logger.error(f"Error processing markdown: {str(e)}")
|
157 |
raise
|
158 |
|
159 |
-
# Placeholder image generation
|
160 |
-
def text_to_base64_dummy(text: str, chunk_index: int):
|
161 |
-
img = Image.new('RGB', (200, 200), color='white')
|
162 |
-
buffer = io.BytesIO()
|
163 |
-
img.save(buffer, format="PNG")
|
164 |
-
return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
165 |
-
|
166 |
# Process file: OCR -> Chunk -> Save
|
167 |
def process_file_and_save(file, chunk_size, chunk_overlap, strip_headers, hf_token, repo_name):
|
168 |
try:
|
@@ -175,7 +172,7 @@ def process_file_and_save(file, chunk_size, chunk_overlap, strip_headers, hf_tok
|
|
175 |
chunks = chunk_markdown(combined_markdown, image_data, chunk_size, chunk_overlap, strip_headers)
|
176 |
|
177 |
# Step 3: Prepare dataset
|
178 |
-
data = {
|
179 |
"chunk_id": [],
|
180 |
"content": [],
|
181 |
"metadata": [],
|
|
|
1 |
import gradio as gr
|
2 |
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
|
3 |
from langchain.schema import Document
|
4 |
+
from typing import List, Dict, Any
|
5 |
import logging
|
6 |
import re
|
7 |
from pathlib import Path
|
|
|
41 |
return markdown_str
|
42 |
|
43 |
# Function to combine markdown from OCR response
|
44 |
+
def get_combined_markdown(ocr_response) -> tuple[str, str, Dict[str, str]]:
|
45 |
markdowns = []
|
46 |
raw_markdowns = []
|
47 |
image_data = {} # Collect all image data
|
|
|
147 |
# Add images to metadata
|
148 |
for chunk in chunks:
|
149 |
image_names = extract_image_names_from_markdown(chunk.page_content)
|
150 |
+
images = {name: image_data.get(name, None) for name in image_names}
|
151 |
+
# Add a dummy field if the images dictionary is empty
|
152 |
+
if not images:
|
153 |
+
images = {"dummy": None}
|
154 |
+
chunk.metadata["images"] = images
|
155 |
|
156 |
logger.info(f"Created {len(chunks)} chunks")
|
157 |
return chunks
|
|
|
160 |
logger.error(f"Error processing markdown: {str(e)}")
|
161 |
raise
|
162 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
# Process file: OCR -> Chunk -> Save
|
164 |
def process_file_and_save(file, chunk_size, chunk_overlap, strip_headers, hf_token, repo_name):
|
165 |
try:
|
|
|
172 |
chunks = chunk_markdown(combined_markdown, image_data, chunk_size, chunk_overlap, strip_headers)
|
173 |
|
174 |
# Step 3: Prepare dataset
|
175 |
+
data: Dict[str, List[Any]] = {
|
176 |
"chunk_id": [],
|
177 |
"content": [],
|
178 |
"metadata": [],
|