Svngoku committed on
Commit
4371128
·
verified ·
1 Parent(s): e991854

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -25
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import gradio as gr
2
  from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
3
  from langchain.schema import Document
4
- from typing import List
5
  import logging
6
  import re
7
  from pathlib import Path
@@ -35,22 +35,22 @@ def encode_image(image_path):
35
  return f"Error: {e}"
36
 
37
  # Function to replace images in markdown with base64 strings
38
- def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
39
  for img_name, base64_str in images_dict.items():
40
  markdown_str = markdown_str.replace(f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})")
41
  return markdown_str
42
 
43
  # Function to combine markdown from OCR response
44
- def get_combined_markdown(ocr_response) -> tuple:
45
  markdowns = []
46
  raw_markdowns = []
 
47
  for page in ocr_response.pages:
48
- image_data = {}
49
  for img in page.images:
50
  image_data[img.id] = img.image_base64
51
  markdowns.append(replace_images_in_markdown(page.markdown, image_data))
52
  raw_markdowns.append(page.markdown)
53
- return "\n\n".join(markdowns), "\n\n".join(raw_markdowns)
54
 
55
  # Perform OCR on uploaded file
56
  def perform_ocr_file(file):
@@ -85,22 +85,23 @@ def perform_ocr_file(file):
85
  include_image_base64=True
86
  )
87
  else:
88
- return "Unsupported file type. Please provide a PDF or an image (png, jpeg, jpg).", ""
89
 
90
- combined_markdown, raw_markdown = get_combined_markdown(ocr_response)
91
- return combined_markdown, raw_markdown
92
  except Exception as e:
93
- return f"Error during OCR: {str(e)}", ""
94
 
95
- # Function to extract base64 images from markdown content
96
- def extract_images_from_markdown(markdown_text: str) -> List[str]:
97
- # Regex to match markdown image syntax with base64 data
98
- pattern = r"!\[.*?\]\((data:image/[a-z]+;base64,[^\)]+)\)"
99
- return re.findall(pattern, markdown_text)
100
 
101
  # Function to chunk markdown text with image handling
102
  def chunk_markdown(
103
  markdown_text: str,
 
104
  chunk_size: int = 1000,
105
  chunk_overlap: int = 200,
106
  strip_headers: bool = True
@@ -111,7 +112,6 @@ def chunk_markdown(
111
  ("#", "Header 1"),
112
  ("##", "Header 2"),
113
  ("###", "Header 3"),
114
- ("####", "Header 4")
115
  ]
116
 
117
  # Initialize MarkdownHeaderTextSplitter
@@ -146,8 +146,8 @@ def chunk_markdown(
146
 
147
  # Add images to metadata
148
  for chunk in chunks:
149
- images = extract_images_from_markdown(chunk.page_content)
150
- chunk.metadata["images"] = images
151
 
152
  logger.info(f"Created {len(chunks)} chunks")
153
  return chunks
@@ -167,29 +167,24 @@ def text_to_base64_dummy(text: str, chunk_index: int):
167
  def process_file_and_save(file, chunk_size, chunk_overlap, strip_headers, hf_token, repo_name):
168
  try:
169
  # Step 1: Perform OCR
170
- combined_markdown, raw_markdown = perform_ocr_file(file)
171
  if "Error" in combined_markdown:
172
  return combined_markdown
173
 
174
  # Step 2: Chunk the markdown
175
- chunks = chunk_markdown(combined_markdown, chunk_size, chunk_overlap, strip_headers)
176
 
177
  # Step 3: Prepare dataset
178
  data = {
179
  "chunk_id": [],
180
  "content": [],
181
  "metadata": [],
182
- "images": [] # Changed to store list of images
183
  }
184
 
185
  for i, chunk in enumerate(chunks):
186
  data["chunk_id"].append(i)
187
  data["content"].append(chunk.page_content)
188
- data["metadata"].append({k: v for k, v in chunk.metadata.items() if k != "images"}) # Exclude images from metadata column
189
- images = chunk.metadata.get("images", [])
190
- if not images: # If no images, add a placeholder
191
- images = [text_to_base64_dummy(chunk.page_content, i)]
192
- data["images"].append(images)
193
 
194
  # Step 4: Create and push dataset to Hugging Face
195
  dataset = Dataset.from_dict(data)
 
1
  import gradio as gr
2
  from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
3
  from langchain.schema import Document
4
+ from typing import List, Dict
5
  import logging
6
  import re
7
  from pathlib import Path
 
35
  return f"Error: {e}"
36
 
37
  # Function to replace images in markdown with base64 strings
38
def replace_images_in_markdown(markdown_str: str, images_dict: Dict[str, str]) -> str:
    """Rewrite markdown image references to embed their base64 payloads.

    Every occurrence of ``![name](name)`` in *markdown_str* whose ``name``
    appears as a key in *images_dict* is replaced by ``![name](base64)``.
    References not present in the mapping are left untouched.
    """
    result = markdown_str
    for name, payload in images_dict.items():
        result = result.replace(f"![{name}]({name})", f"![{name}]({payload})")
    return result
42
 
43
  # Function to combine markdown from OCR response
44
def get_combined_markdown(ocr_response) -> tuple[str, str, Dict[str, str]]:
    """Merge the per-page OCR output into a single markdown document.

    Args:
        ocr_response: OCR result exposing ``pages``, where each page has
            ``markdown`` (str) and ``images`` (objects with ``id`` and
            ``image_base64``).

    Returns:
        A 3-tuple of:
        - combined markdown with image references replaced by base64 data,
        - combined raw markdown (references untouched),
        - the accumulated image-id -> base64 mapping for all pages.

    Note: ``image_data`` deliberately accumulates ACROSS pages so a page's
    markdown can resolve images introduced on earlier pages.
    """
    markdowns = []
    raw_markdowns = []
    image_data: Dict[str, str] = {}  # grows across pages, not reset per page
    for page in ocr_response.pages:
        for img in page.images:
            image_data[img.id] = img.image_base64
        markdowns.append(replace_images_in_markdown(page.markdown, image_data))
        raw_markdowns.append(page.markdown)
    return "\n\n".join(markdowns), "\n\n".join(raw_markdowns), image_data
54
 
55
  # Perform OCR on uploaded file
56
  def perform_ocr_file(file):
 
85
  include_image_base64=True
86
  )
87
  else:
88
+ return "Unsupported file type. Please provide a PDF or an image (png, jpeg, jpg).", "", {}
89
 
90
+ combined_markdown, raw_markdown, image_data = get_combined_markdown(ocr_response)
91
+ return combined_markdown, raw_markdown, image_data
92
  except Exception as e:
93
+ return f"Error during OCR: {str(e)}", "", {}
94
 
95
+ # Function to extract image names from markdown content
96
def extract_image_names_from_markdown(markdown_text: str) -> List[str]:
    """Return the names (alt texts) of all markdown image references.

    Matches each ``![name](...)`` occurrence and returns the captured
    ``name`` strings in document order. The OCR pipeline emits references
    as ``![id](id)``, so the alt text doubles as the image id.
    """
    # re.findall with a single capture group already yields just the names;
    # the previous .replace("![","").replace("](","") chain was a no-op on
    # normal captures and could corrupt alt texts containing "![".
    pattern = r"!\[(.*?)\]\("
    return re.findall(pattern, markdown_text)
100
 
101
  # Function to chunk markdown text with image handling
102
  def chunk_markdown(
103
  markdown_text: str,
104
+ image_data: Dict[str, str],
105
  chunk_size: int = 1000,
106
  chunk_overlap: int = 200,
107
  strip_headers: bool = True
 
112
  ("#", "Header 1"),
113
  ("##", "Header 2"),
114
  ("###", "Header 3"),
 
115
  ]
116
 
117
  # Initialize MarkdownHeaderTextSplitter
 
146
 
147
  # Add images to metadata
148
  for chunk in chunks:
149
+ image_names = extract_image_names_from_markdown(chunk.page_content)
150
+ chunk.metadata["images"] = {name: image_data.get(name, None) for name in image_names}
151
 
152
  logger.info(f"Created {len(chunks)} chunks")
153
  return chunks
 
167
  def process_file_and_save(file, chunk_size, chunk_overlap, strip_headers, hf_token, repo_name):
168
  try:
169
  # Step 1: Perform OCR
170
+ combined_markdown, raw_markdown, image_data = perform_ocr_file(file)
171
  if "Error" in combined_markdown:
172
  return combined_markdown
173
 
174
  # Step 2: Chunk the markdown
175
+ chunks = chunk_markdown(combined_markdown, image_data, chunk_size, chunk_overlap, strip_headers)
176
 
177
  # Step 3: Prepare dataset
178
  data = {
179
  "chunk_id": [],
180
  "content": [],
181
  "metadata": [],
 
182
  }
183
 
184
  for i, chunk in enumerate(chunks):
185
  data["chunk_id"].append(i)
186
  data["content"].append(chunk.page_content)
187
+ data["metadata"].append(chunk.metadata)
 
 
 
 
188
 
189
  # Step 4: Create and push dataset to Hugging Face
190
  dataset = Dataset.from_dict(data)