Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
import gradio as gr
|
2 |
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
|
3 |
from langchain.schema import Document
|
4 |
-
from typing import List
|
5 |
import logging
|
6 |
import re
|
7 |
from pathlib import Path
|
@@ -35,22 +35,22 @@ def encode_image(image_path):
|
|
35 |
return f"Error: {e}"
|
36 |
|
37 |
# Function to replace images in markdown with base64 strings
|
38 |
-
def replace_images_in_markdown(markdown_str: str, images_dict:
|
39 |
for img_name, base64_str in images_dict.items():
|
40 |
markdown_str = markdown_str.replace(f"", f"")
|
41 |
return markdown_str
|
42 |
|
43 |
# Function to combine markdown from OCR response
|
44 |
-
def get_combined_markdown(ocr_response) -> tuple:
|
45 |
markdowns = []
|
46 |
raw_markdowns = []
|
|
|
47 |
for page in ocr_response.pages:
|
48 |
-
image_data = {}
|
49 |
for img in page.images:
|
50 |
image_data[img.id] = img.image_base64
|
51 |
markdowns.append(replace_images_in_markdown(page.markdown, image_data))
|
52 |
raw_markdowns.append(page.markdown)
|
53 |
-
return "\n\n".join(markdowns), "\n\n".join(raw_markdowns)
|
54 |
|
55 |
# Perform OCR on uploaded file
|
56 |
def perform_ocr_file(file):
|
@@ -85,22 +85,23 @@ def perform_ocr_file(file):
|
|
85 |
include_image_base64=True
|
86 |
)
|
87 |
else:
|
88 |
-
return "Unsupported file type. Please provide a PDF or an image (png, jpeg, jpg).", ""
|
89 |
|
90 |
-
combined_markdown, raw_markdown = get_combined_markdown(ocr_response)
|
91 |
-
return combined_markdown, raw_markdown
|
92 |
except Exception as e:
|
93 |
-
return f"Error during OCR: {str(e)}", ""
|
94 |
|
95 |
-
# Function to extract
|
96 |
-
def
|
97 |
-
# Regex to match markdown image syntax
|
98 |
-
pattern = r"!\[
|
99 |
-
return re.findall(pattern, markdown_text)
|
100 |
|
101 |
# Function to chunk markdown text with image handling
|
102 |
def chunk_markdown(
|
103 |
markdown_text: str,
|
|
|
104 |
chunk_size: int = 1000,
|
105 |
chunk_overlap: int = 200,
|
106 |
strip_headers: bool = True
|
@@ -111,7 +112,6 @@ def chunk_markdown(
|
|
111 |
("#", "Header 1"),
|
112 |
("##", "Header 2"),
|
113 |
("###", "Header 3"),
|
114 |
-
("####", "Header 4")
|
115 |
]
|
116 |
|
117 |
# Initialize MarkdownHeaderTextSplitter
|
@@ -146,8 +146,8 @@ def chunk_markdown(
|
|
146 |
|
147 |
# Add images to metadata
|
148 |
for chunk in chunks:
|
149 |
-
|
150 |
-
chunk.metadata["images"] =
|
151 |
|
152 |
logger.info(f"Created {len(chunks)} chunks")
|
153 |
return chunks
|
@@ -167,29 +167,24 @@ def text_to_base64_dummy(text: str, chunk_index: int):
|
|
167 |
def process_file_and_save(file, chunk_size, chunk_overlap, strip_headers, hf_token, repo_name):
|
168 |
try:
|
169 |
# Step 1: Perform OCR
|
170 |
-
combined_markdown, raw_markdown = perform_ocr_file(file)
|
171 |
if "Error" in combined_markdown:
|
172 |
return combined_markdown
|
173 |
|
174 |
# Step 2: Chunk the markdown
|
175 |
-
chunks = chunk_markdown(combined_markdown, chunk_size, chunk_overlap, strip_headers)
|
176 |
|
177 |
# Step 3: Prepare dataset
|
178 |
data = {
|
179 |
"chunk_id": [],
|
180 |
"content": [],
|
181 |
"metadata": [],
|
182 |
-
"images": [] # Changed to store list of images
|
183 |
}
|
184 |
|
185 |
for i, chunk in enumerate(chunks):
|
186 |
data["chunk_id"].append(i)
|
187 |
data["content"].append(chunk.page_content)
|
188 |
-
data["metadata"].append(
|
189 |
-
images = chunk.metadata.get("images", [])
|
190 |
-
if not images: # If no images, add a placeholder
|
191 |
-
images = [text_to_base64_dummy(chunk.page_content, i)]
|
192 |
-
data["images"].append(images)
|
193 |
|
194 |
# Step 4: Create and push dataset to Hugging Face
|
195 |
dataset = Dataset.from_dict(data)
|
|
|
1 |
import gradio as gr
|
2 |
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
|
3 |
from langchain.schema import Document
|
4 |
+
from typing import List, Dict
|
5 |
import logging
|
6 |
import re
|
7 |
from pathlib import Path
|
|
|
35 |
return f"Error: {e}"
|
36 |
|
37 |
# Function to replace images in markdown with base64 strings
|
38 |
+
def replace_images_in_markdown(markdown_str: str, images_dict: Dict[str, str]) -> str:
|
39 |
for img_name, base64_str in images_dict.items():
|
40 |
markdown_str = markdown_str.replace(f"", f"")
|
41 |
return markdown_str
|
42 |
|
43 |
# Function to combine markdown from OCR response
|
44 |
+
def get_combined_markdown(ocr_response) -> tuple[str, str]:
|
45 |
markdowns = []
|
46 |
raw_markdowns = []
|
47 |
+
image_data = {} # Collect all image data
|
48 |
for page in ocr_response.pages:
|
|
|
49 |
for img in page.images:
|
50 |
image_data[img.id] = img.image_base64
|
51 |
markdowns.append(replace_images_in_markdown(page.markdown, image_data))
|
52 |
raw_markdowns.append(page.markdown)
|
53 |
+
return "\n\n".join(markdowns), "\n\n".join(raw_markdowns), image_data
|
54 |
|
55 |
# Perform OCR on uploaded file
|
56 |
def perform_ocr_file(file):
|
|
|
85 |
include_image_base64=True
|
86 |
)
|
87 |
else:
|
88 |
+
return "Unsupported file type. Please provide a PDF or an image (png, jpeg, jpg).", "", {}
|
89 |
|
90 |
+
combined_markdown, raw_markdown, image_data = get_combined_markdown(ocr_response)
|
91 |
+
return combined_markdown, raw_markdown, image_data
|
92 |
except Exception as e:
|
93 |
+
return f"Error during OCR: {str(e)}", "", {}
|
94 |
|
95 |
+
# Function to extract image names from markdown content
|
96 |
+
def extract_image_names_from_markdown(markdown_text: str) -> List[str]:
|
97 |
+
# Regex to match markdown image syntax
|
98 |
+
pattern = r"!\[(.*?)\]\("
|
99 |
+
return [match.replace(" for match in re.findall(pattern, markdown_text)]
|
100 |
|
101 |
# Function to chunk markdown text with image handling
|
102 |
def chunk_markdown(
|
103 |
markdown_text: str,
|
104 |
+
image_data: Dict[str, str],
|
105 |
chunk_size: int = 1000,
|
106 |
chunk_overlap: int = 200,
|
107 |
strip_headers: bool = True
|
|
|
112 |
("#", "Header 1"),
|
113 |
("##", "Header 2"),
|
114 |
("###", "Header 3"),
|
|
|
115 |
]
|
116 |
|
117 |
# Initialize MarkdownHeaderTextSplitter
|
|
|
146 |
|
147 |
# Add images to metadata
|
148 |
for chunk in chunks:
|
149 |
+
image_names = extract_image_names_from_markdown(chunk.page_content)
|
150 |
+
chunk.metadata["images"] = {name: image_data.get(name, None) for name in image_names}
|
151 |
|
152 |
logger.info(f"Created {len(chunks)} chunks")
|
153 |
return chunks
|
|
|
167 |
def process_file_and_save(file, chunk_size, chunk_overlap, strip_headers, hf_token, repo_name):
|
168 |
try:
|
169 |
# Step 1: Perform OCR
|
170 |
+
combined_markdown, raw_markdown, image_data = perform_ocr_file(file)
|
171 |
if "Error" in combined_markdown:
|
172 |
return combined_markdown
|
173 |
|
174 |
# Step 2: Chunk the markdown
|
175 |
+
chunks = chunk_markdown(combined_markdown, image_data, chunk_size, chunk_overlap, strip_headers)
|
176 |
|
177 |
# Step 3: Prepare dataset
|
178 |
data = {
|
179 |
"chunk_id": [],
|
180 |
"content": [],
|
181 |
"metadata": [],
|
|
|
182 |
}
|
183 |
|
184 |
for i, chunk in enumerate(chunks):
|
185 |
data["chunk_id"].append(i)
|
186 |
data["content"].append(chunk.page_content)
|
187 |
+
data["metadata"].append(chunk.metadata)
|
|
|
|
|
|
|
|
|
188 |
|
189 |
# Step 4: Create and push dataset to Hugging Face
|
190 |
dataset = Dataset.from_dict(data)
|