NEXAS committed on
Commit
0732be7
·
verified ·
1 Parent(s): 0a394f8

Update utils/ingestion.py

Browse files
Files changed (1) hide show
  1. utils/ingestion.py +19 -7
utils/ingestion.py CHANGED
@@ -1,7 +1,9 @@
1
  import json
2
  import time
3
  import os
 
4
  from pathlib import Path
 
5
  from typing import Dict, Any, List
6
  import chromadb
7
 
@@ -15,10 +17,11 @@ from docling.document_converter import (
15
  )
16
  from docling.pipeline.simple_pipeline import SimplePipeline
17
  from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
18
- from docling.document import DoclingDocument
19
  from docling.chunking.hierarchical_chunker import HierarchicalChunker
20
  from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
21
 
 
 
22
  class DocumentProcessor:
23
  def __init__(self):
24
  """Initialize document processor with Docling v2 changes"""
@@ -41,11 +44,13 @@ class DocumentProcessor:
41
  InputFormat.PPTX,
42
  InputFormat.TXT, # Added text format
43
  InputFormat.CSV, # Added CSV format
 
 
44
  ],
45
  format_options={
46
  InputFormat.PDF: PdfFormatOption(
47
- pipeline_options=pipeline_options,
48
- backend=PyPdfiumDocumentBackend()
49
  ),
50
  InputFormat.DOCX: WordFormatOption(
51
  pipeline_cls=SimplePipeline
@@ -61,17 +66,24 @@ class DocumentProcessor:
61
 
62
  try:
63
  conv_result = self.converter.convert(file_path)
64
- doc: DoclingDocument = conv_result.document
65
  except Exception as e:
66
  print(f"❌ Conversion failed: {e}")
67
  return None
68
 
69
- # Save document as markdown
70
  output_dir = Path("parsed-doc")
71
  output_dir.mkdir(parents=True, exist_ok=True)
72
  doc_filename = Path(file_path).stem
73
- md_filename = output_dir / f"{doc_filename}.md"
74
- doc.save_as_markdown(md_filename)
 
 
 
 
 
 
 
75
 
76
  chunker = HierarchicalChunker()
77
  chunks = list(chunker.chunk(doc))
 
1
  import json
2
  import time
3
  import os
4
+ import logging
5
  from pathlib import Path
6
+ import yaml
7
  from typing import Dict, Any, List
8
  import chromadb
9
 
 
17
  )
18
  from docling.pipeline.simple_pipeline import SimplePipeline
19
  from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 
20
  from docling.chunking.hierarchical_chunker import HierarchicalChunker
21
  from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
22
 
23
+ _log = logging.getLogger(__name__)
24
+
25
  class DocumentProcessor:
26
  def __init__(self):
27
  """Initialize document processor with Docling v2 changes"""
 
44
  InputFormat.PPTX,
45
  InputFormat.TXT, # Added text format
46
  InputFormat.CSV, # Added CSV format
47
+ InputFormat.ASCIIDOC, # Added AsciiDoc format
48
+ InputFormat.MD, # Added Markdown format
49
  ],
50
  format_options={
51
  InputFormat.PDF: PdfFormatOption(
52
+ pipeline_cls=StandardPdfPipeline,
53
+ backend=PyPdfiumDocumentBackend
54
  ),
55
  InputFormat.DOCX: WordFormatOption(
56
  pipeline_cls=SimplePipeline
 
66
 
67
  try:
68
  conv_result = self.converter.convert(file_path)
69
+ doc = conv_result.document
70
  except Exception as e:
71
  print(f"❌ Conversion failed: {e}")
72
  return None
73
 
74
+ # Save document as markdown, JSON, and YAML
75
  output_dir = Path("parsed-doc")
76
  output_dir.mkdir(parents=True, exist_ok=True)
77
  doc_filename = Path(file_path).stem
78
+
79
+ with (output_dir / f"{doc_filename}.md").open("w") as fp:
80
+ fp.write(doc.export_to_markdown())
81
+
82
+ with (output_dir / f"{doc_filename}.json").open("w") as fp:
83
+ fp.write(json.dumps(doc.export_to_dict()))
84
+
85
+ with (output_dir / f"{doc_filename}.yaml").open("w") as fp:
86
+ fp.write(yaml.safe_dump(doc.export_to_dict()))
87
 
88
  chunker = HierarchicalChunker()
89
  chunks = list(chunker.chunk(doc))