asynchronousai committed on
Commit
cfdee1a
·
verified ·
1 Parent(s): 8168ab7

Chunker built in

Browse files
Files changed (1) hide show
  1. app.py +6 -1
app.py CHANGED
@@ -1,7 +1,9 @@
1
  import gradio as gr
2
  from docling.document_converter import DocumentConverter
 
3
  import spaces
4
 
 
5
  def convert_document(file, output_format):
6
  # Load document and convert it using Docling
7
  converter = DocumentConverter()
@@ -9,6 +11,8 @@ def convert_document(file, output_format):
9
 
10
  # Check available attributes in DoclingDocument
11
  available_attributes = dir(result.document)
 
 
12
 
13
  # Choose the output format
14
  if output_format == "Markdown":
@@ -20,7 +24,8 @@ def convert_document(file, output_format):
20
 
21
  # Placeholder metadata extraction based on available attributes
22
  metadata = {
23
- "Available Attributes": available_attributes
 
24
  }
25
 
26
  return converted_text, metadata
 
1
  import gradio as gr
2
  from docling.document_converter import DocumentConverter
3
+ from docling_core.transforms.chunker import HierarchicalChunker
4
  import spaces
5
 
6
+
7
  def convert_document(file, output_format):
8
  # Load document and convert it using Docling
9
  converter = DocumentConverter()
 
11
 
12
  # Check available attributes in DoclingDocument
13
  available_attributes = dir(result.document)
14
+ document = result.document
15
+
16
 
17
  # Choose the output format
18
  if output_format == "Markdown":
 
24
 
25
  # Placeholder metadata extraction based on available attributes
26
  metadata = {
27
+ "Available Attributes": available_attributes,
28
+ "Chunked": list(HierarchicalChunker().chunk(document))
29
  }
30
 
31
  return converted_text, metadata