Spaces:
Sleeping
Sleeping
Commit
·
be6fbc6
1
Parent(s):
3494fdb
add: pdfplumber text loader
Browse files
medrag_multi_modal/__init__.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
from .document_loader import (
|
2 |
ImageLoader,
|
3 |
-
|
4 |
PyMuPDF4LLMTextLoader,
|
5 |
PyPDF2TextLoader,
|
6 |
TextImageLoader,
|
@@ -10,7 +10,7 @@ from .retrieval import MultiModalRetriever
|
|
10 |
__all__ = [
|
11 |
"PyMuPDF4LLMTextLoader",
|
12 |
"PyPDF2TextLoader",
|
13 |
-
"
|
14 |
"ImageLoader",
|
15 |
"TextImageLoader",
|
16 |
"MultiModalRetriever",
|
|
|
1 |
from .document_loader import (
|
2 |
ImageLoader,
|
3 |
+
PDFPlumberTextLoader,
|
4 |
PyMuPDF4LLMTextLoader,
|
5 |
PyPDF2TextLoader,
|
6 |
TextImageLoader,
|
|
|
10 |
__all__ = [
|
11 |
"PyMuPDF4LLMTextLoader",
|
12 |
"PyPDF2TextLoader",
|
13 |
+
"PDFPlumberTextLoader",
|
14 |
"ImageLoader",
|
15 |
"TextImageLoader",
|
16 |
"MultiModalRetriever",
|
medrag_multi_modal/document_loader/__init__.py
CHANGED
@@ -1,11 +1,11 @@
|
|
1 |
from .load_image import ImageLoader
|
2 |
from .load_text_image import TextImageLoader
|
3 |
-
from .text_loader import
|
4 |
|
5 |
__all__ = [
|
6 |
"PyMuPDF4LLMTextLoader",
|
7 |
"PyPDF2TextLoader",
|
8 |
-
"
|
9 |
"ImageLoader",
|
10 |
"TextImageLoader",
|
11 |
]
|
|
|
1 |
from .load_image import ImageLoader
|
2 |
from .load_text_image import TextImageLoader
|
3 |
+
from .text_loader import PDFPlumberTextLoader, PyMuPDF4LLMTextLoader, PyPDF2TextLoader
|
4 |
|
5 |
__all__ = [
|
6 |
"PyMuPDF4LLMTextLoader",
|
7 |
"PyPDF2TextLoader",
|
8 |
+
"PDFPlumberTextLoader",
|
9 |
"ImageLoader",
|
10 |
"TextImageLoader",
|
11 |
]
|
medrag_multi_modal/document_loader/text_loader/__init__.py
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
-
from .
|
2 |
from .pymupdf4llm_text_loader import PyMuPDF4LLMTextLoader
|
3 |
from .pypdf2_text_loader import PyPDF2TextLoader
|
4 |
|
5 |
__all__ = [
|
6 |
"PyMuPDF4LLMTextLoader",
|
7 |
"PyPDF2TextLoader",
|
8 |
-
"
|
9 |
]
|
|
|
1 |
+
from .pdfplumber_text_loader import PDFPlumberTextLoader
|
2 |
from .pymupdf4llm_text_loader import PyMuPDF4LLMTextLoader
|
3 |
from .pypdf2_text_loader import PyPDF2TextLoader
|
4 |
|
5 |
__all__ = [
|
6 |
"PyMuPDF4LLMTextLoader",
|
7 |
"PyPDF2TextLoader",
|
8 |
+
"PDFPlumberTextLoader",
|
9 |
]
|
medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict
|
2 |
+
|
3 |
+
import pdfplumber
|
4 |
+
|
5 |
+
from .base_text_loader import BaseTextLoader
|
6 |
+
|
7 |
+
|
8 |
+
class PDFPlumberTextLoader(BaseTextLoader):
    """Text loader that extracts page text from a PDF with `pdfplumber`."""

    async def _process_page(self, page_idx: int) -> Dict[str, str]:
        """Extract the text of one page together with document metadata.

        Args:
            page_idx: Index of the page in `pdf.pages` to extract.

        Returns:
            A dictionary holding the extracted text under "text" (an empty
            string when the page yields no text), plus "page_idx",
            "document_name", "file_path" and "file_url".
        """
        # The PDF is opened for this call only; the context manager closes
        # it as soon as the page text has been read.
        with pdfplumber.open(self.document_file_path) as pdf:
            page = pdf.pages[page_idx]
            # Some pdfplumber releases return None instead of "" for a page
            # without extractable text; normalise so "text" is always a str.
            text = page.extract_text() or ""

        return {
            "text": text,
            "page_idx": page_idx,
            "document_name": self.document_name,
            "file_path": self.document_file_path,
            "file_url": self.url,
        }
|