mratanusarkar committed on
Commit
be6fbc6
·
1 Parent(s): 3494fdb

add: pdfplumber text loader

Browse files
medrag_multi_modal/__init__.py CHANGED
@@ -1,6 +1,6 @@
1
  from .document_loader import (
2
  ImageLoader,
3
- MarkerTextLoader,
4
  PyMuPDF4LLMTextLoader,
5
  PyPDF2TextLoader,
6
  TextImageLoader,
@@ -10,7 +10,7 @@ from .retrieval import MultiModalRetriever
10
  __all__ = [
11
  "PyMuPDF4LLMTextLoader",
12
  "PyPDF2TextLoader",
13
- "MarkerTextLoader",
14
  "ImageLoader",
15
  "TextImageLoader",
16
  "MultiModalRetriever",
 
1
  from .document_loader import (
2
  ImageLoader,
3
+ PDFPlumberTextLoader,
4
  PyMuPDF4LLMTextLoader,
5
  PyPDF2TextLoader,
6
  TextImageLoader,
 
10
  __all__ = [
11
  "PyMuPDF4LLMTextLoader",
12
  "PyPDF2TextLoader",
13
+ "PDFPlumberTextLoader",
14
  "ImageLoader",
15
  "TextImageLoader",
16
  "MultiModalRetriever",
medrag_multi_modal/document_loader/__init__.py CHANGED
@@ -1,11 +1,11 @@
1
  from .load_image import ImageLoader
2
  from .load_text_image import TextImageLoader
3
- from .text_loader import MarkerTextLoader, PyMuPDF4LLMTextLoader, PyPDF2TextLoader
4
 
5
  __all__ = [
6
  "PyMuPDF4LLMTextLoader",
7
  "PyPDF2TextLoader",
8
- "MarkerTextLoader",
9
  "ImageLoader",
10
  "TextImageLoader",
11
  ]
 
1
  from .load_image import ImageLoader
2
  from .load_text_image import TextImageLoader
3
+ from .text_loader import PDFPlumberTextLoader, PyMuPDF4LLMTextLoader, PyPDF2TextLoader
4
 
5
  __all__ = [
6
  "PyMuPDF4LLMTextLoader",
7
  "PyPDF2TextLoader",
8
+ "PDFPlumberTextLoader",
9
  "ImageLoader",
10
  "TextImageLoader",
11
  ]
medrag_multi_modal/document_loader/text_loader/__init__.py CHANGED
@@ -1,9 +1,9 @@
1
- from .marker_text_loader import MarkerTextLoader
2
  from .pymupdf4llm_text_loader import PyMuPDF4LLMTextLoader
3
  from .pypdf2_text_loader import PyPDF2TextLoader
4
 
5
  __all__ = [
6
  "PyMuPDF4LLMTextLoader",
7
  "PyPDF2TextLoader",
8
- "MarkerTextLoader",
9
  ]
 
1
+ from .pdfplumber_text_loader import PDFPlumberTextLoader
2
  from .pymupdf4llm_text_loader import PyMuPDF4LLMTextLoader
3
  from .pypdf2_text_loader import PyPDF2TextLoader
4
 
5
  __all__ = [
6
  "PyMuPDF4LLMTextLoader",
7
  "PyPDF2TextLoader",
8
+ "PDFPlumberTextLoader",
9
  ]
medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict
2
+
3
+ import pdfplumber
4
+
5
+ from .base_text_loader import BaseTextLoader
6
+
7
+
8
class PDFPlumberTextLoader(BaseTextLoader):
    """Text loader that extracts per-page text from a PDF with `pdfplumber`.

    Relies on `self.document_file_path`, `self.document_name` and `self.url`,
    which are presumably populated by `BaseTextLoader` (not visible here --
    confirm against the base class).
    """

    async def _process_page(self, page_idx: int) -> Dict[str, str]:
        """Extract the text of one page and bundle it with document metadata.

        Args:
            page_idx: Index of the page in `pdf.pages` to process.

        Returns:
            A dictionary with the keys `"text"`, `"page_idx"`,
            `"document_name"`, `"file_path"` and `"file_url"`. `"text"` is
            always a string; it is empty when no text could be extracted.
        """
        # NOTE(review): the PDF is opened anew on every call and the call is
        # synchronous inside a coroutine -- confirm this is acceptable for the
        # way the base loader schedules pages.
        with pdfplumber.open(self.document_file_path) as pdf:
            page = pdf.pages[page_idx]
            # Depending on the pdfplumber version, a page without extractable
            # text may yield None; normalise so "text" is always a string.
            text = page.extract_text() or ""

        return {
            "text": text,
            "page_idx": page_idx,
            "document_name": self.document_name,
            "file_path": self.document_file_path,
            "file_url": self.url,
        }