|
from abc import ABC, abstractmethod |
|
from pathlib import Path |
|
from urllib.parse import urlparse |
|
import requests |
|
import fitz |
|
import io |
|
import re |
|
import hashlib |
|
import os |
|
|
|
|
|
class FileTypeError(Exception):
    """Raised when the file type does not match the expected file type."""
|
|
|
|
|
class FileSchemeError(Exception):
    """Raised when the file scheme does not match the expected file scheme."""
|
|
|
|
|
class FileProcessor(ABC):
    """Abstract base class for processors that read one file and extract data.

    A subclass sets ``type`` to the file extension it accepts (lowercase,
    without the leading dot) and implements the abstract methods below.
    """

    # Extension accepted by the processor; ``None`` here, so the base class
    # itself never passes ``_check_file_type``.
    type = None

    # URL schemes that are classified as remote ("url") locations.
    _REMOTE_SCHEMES = ("http", "https", "ftp")

    # Compiled once: any run of whitespace (including newlines).
    _WHITESPACE_RE = re.compile(r"\s+")

    def __init__(self, path):
        """Store ``path``, classify it as local/remote and validate its extension.

        Args:
            path: File system path or URL of the file to process.

        Raises:
            FileSchemeError: If ``path`` uses an unsupported URL scheme.
            FileTypeError: If the extension of ``path`` differs from ``type``.
        """
        self.path = path
        self.file_scheme = self._get_file_scheme()
        self.__class__._check_file_type(path)

    @abstractmethod
    def get_file_data(self):
        """Return the data extracted from the file (defined by subclasses)."""

    @abstractmethod
    def _get_file_metadata(self):
        """Return the metadata of the file (defined by subclasses)."""

    @abstractmethod
    def _get_file_paragraphs(self):
        """Return the paragraphs of the file (defined by subclasses)."""

    @classmethod
    def _check_file_type(cls, path):
        """Raise ``FileTypeError`` unless the extension of ``path`` equals ``cls.type``.

        The comparison is case-insensitive. For remote URLs only the path
        component is inspected, so a query string or fragment
        (``report.pdf?download=1``) does not hide the real extension.
        """
        location = os.fspath(path)
        parsed = urlparse(location)
        if parsed.scheme.lower() in cls._REMOTE_SCHEMES:
            # Drop "?query" and "#fragment" before reading the suffix.
            location = parsed.path

        file_type = Path(location).suffix.lower()[1:]
        if file_type != cls.type:
            raise FileTypeError(
                f"Invalid file type. {cls.__name__} expects a {cls.type} file"
            )

    def _get_file_scheme(self):
        """Classify ``self.path`` as ``"local"`` or ``"url"``.

        Returns:
            ``"local"`` when the path has no scheme, uses the ``file`` scheme
            or names an existing file; ``"url"`` for http, https and ftp.

        Raises:
            FileSchemeError: For any other scheme.
        """
        scheme = urlparse(os.fspath(self.path)).scheme.lower()

        # The isfile() check rescues existing paths that urlparse mistakes for
        # a URL because they contain a colon (e.g. a drive letter).
        if not scheme or scheme == "file" or os.path.isfile(self.path):
            return "local"
        if scheme in self._REMOTE_SCHEMES:
            return "url"
        raise FileSchemeError("Unknown scheme")

    def _preprocess_text(self, text):
        """Collapse whitespace runs to single spaces and drop unencodable characters.

        Leading and trailing whitespace is reduced to one space, not removed.
        """
        text = self._WHITESPACE_RE.sub(" ", text)
        # Round-trip through UTF-8 to discard characters that cannot be
        # encoded (such as lone surrogates).
        return text.encode("utf-8", "ignore").decode("utf-8", "ignore")

    def _generate_hash(self, string):
        """Return the hexadecimal MD5 digest of ``string`` encoded as UTF-8."""
        hash_object = hashlib.md5()
        hash_object.update(string.encode("utf-8", "ignore"))
        return hash_object.hexdigest()

    @staticmethod
    def generate_paragraphs():
        """Placeholder; always raises ``NotImplementedError``.

        Declared static so the call fails with ``NotImplementedError`` whether
        it is made on the class or on an instance.
        """
        raise NotImplementedError

    @staticmethod
    def generate_metadata():
        """Placeholder; always raises ``NotImplementedError``.

        Declared static so the call fails with ``NotImplementedError`` whether
        it is made on the class or on an instance.
        """
        raise NotImplementedError
|
|
|
|
|
class PDFProcessor(FileProcessor):
    """Extract document metadata and text paragraphs from a PDF via ``fitz``."""

    type = "pdf"

    # Seconds to wait for a remote PDF before the download is abandoned.
    request_timeout = 30

    def __init__(self, path):
        """Validate ``path`` through the base class (expects a ``.pdf`` file)."""
        super().__init__(path)

    def get_file_data(self, merge_length=200):
        """Open the PDF and return ``(file_metadata, file_paragraphs)``.

        Args:
            merge_length: Forwarded to ``_get_file_paragraphs``; consecutive
                blocks whose combined length stays below it are merged.

        Returns:
            A tuple of the metadata dict and the list of paragraph dicts.
        """
        file = self._open_file()
        try:
            file_metadata = self._get_file_metadata(file)
            file_paragraphs = self._get_file_paragraphs(
                file,
                file_metadata,
                start_page=1,
                end_page=None,
                merge_length=merge_length,
            )
        finally:
            # Release the document even when extraction fails.
            file.close()

        return file_metadata, file_paragraphs

    def _get_file_metadata(self, file):
        """Build the metadata dict for the opened document ``file``.

        The document ``id`` is the hash of the file name concatenated with the
        PDF title. Missing metadata (no dict at all, or no title) is treated
        as an empty title instead of raising.
        """
        metadata = file.metadata or {}
        file_name = Path(self.path).name
        title = metadata.get("title") or ""

        if self.file_scheme == "local":
            url = str(Path(self.path).resolve())
        else:
            url = self.path

        return {
            "id": self._generate_hash(str(file_name) + title),
            "title": title,
            "author": metadata.get("author"),
            "subject": metadata.get("subject"),
            "creation_date": metadata.get("creationDate"),
            "modification_date": metadata.get("modDate"),
            "n_pages": file.page_count,
            "url": url,
            "file_name": file_name,
            "short_name": file_name,
            # Not derivable from the PDF itself; left blank here.
            "release_date": "",
            "report_type": "",
            "source": "",
        }

    def _get_file_paragraphs(
        self, file, file_metadata, start_page=1, end_page=None, merge_length=200
    ):
        """Return the paragraph dicts for pages ``start_page``..``end_page``.

        Args:
            file: Opened document providing ``load_page``.
            file_metadata: Dict with at least ``"n_pages"`` and ``"id"``.
            start_page: First page to read, 1-based.
            end_page: Last page to read, 1-based inclusive; defaults to the
                page count from ``file_metadata``.
            merge_length: A block is appended to the previous paragraph when
                their combined length is below this value.

        A block is also merged into the previous paragraph when it starts
        with a lowercase letter, regardless of length.
        """
        if end_page is None:
            end_page = file_metadata["n_pages"]

        file_paragraphs = []

        for page_index in range(start_page - 1, end_page):
            page = file.load_page(page_index)

            for block in page.get_text("blocks"):
                # page_index is 0-based, so the 1-based page number is
                # page_index + 1 whatever start_page is.
                paragraph = self._process_block(
                    block, page, page_index + 1, file_metadata["id"]
                )
                if paragraph is None:
                    continue

                if not file_paragraphs:
                    file_paragraphs.append(paragraph)
                    continue

                previous = file_paragraphs[-1]
                # Slicing keeps this safe should the content ever be empty.
                first_char = paragraph["content"][:1]
                is_short = (
                    len(previous["content"]) + len(paragraph["content"])
                    < merge_length
                )
                is_continuation = first_char.islower() and first_char.isalpha()

                if is_short or is_continuation:
                    previous["content"] += " " + paragraph["content"]
                    previous["length"] = len(previous["content"])
                else:
                    file_paragraphs.append(paragraph)

        return file_paragraphs

    def _open_file(self):
        """Open ``self.path`` as a ``fitz`` document, downloading it if remote.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            FileSchemeError: If ``self.file_scheme`` is neither "url" nor "local".
        """
        if self.file_scheme == "url":
            # NOTE(review): "ftp" is classified as "url" upstream, but requests
            # is presumably unable to fetch ftp:// links - confirm.
            response = requests.get(self.path, timeout=self.request_timeout)
            # Fail here rather than handing an error page to the PDF parser.
            response.raise_for_status()
            return fitz.open(stream=io.BytesIO(response.content), filetype="pdf")
        if self.file_scheme == "local":
            return fitz.open(self.path)
        raise FileSchemeError(f"Unsupported file scheme: {self.file_scheme}")

    def _process_block(self, block, page, page_number, file_id):
        """Convert one text block into a paragraph dict, or ``None`` to skip it.

        Args:
            block: 7-tuple ``(x0, y0, x1, y1, content, block_no, block_type)``.
            page: Page the block belongs to; its ``rect`` supplies the size
                used to normalise the coordinates.
            page_number: Page number stored in the paragraph.
            file_id: Document id stored in the paragraph.

        Returns:
            The paragraph dict, or ``None`` for blocks of type 1 and for
            blocks without any non-whitespace text.
        """
        x0, y0, x1, y1, content, block_no, block_type = block

        if block_type == 1:
            return None

        content = self._preprocess_text(content)
        if not content.strip():
            return None

        # NOTE(review): the id is derived from the block tuple only, so the
        # same block repeated at the same position on another page (a running
        # header, say) gets the same id. Left unchanged to keep ids stable.
        paragraph_id = self._generate_hash("_".join(map(str, block)))

        width, height = page.rect.width, page.rect.height
        return {
            "id": paragraph_id,
            "document_id": file_id,
            "content_type": "text" if block_type == 0 else "image",
            "content": content,
            "length": len(content),
            "idx_block": block_no,
            "page_number": page_number,
            # Horizontal coordinates are scaled by the page width and vertical
            # ones by the page height.
            "x0": x0 / width,
            "y0": y0 / height,
            "x1": x1 / width,
            "y1": y1 / height,
        }
|
|
|
|
|
class HTMLProcessor(FileProcessor):
    """Processor for HTML files; only loading the raw markup is implemented.

    NOTE(review): with ``type = "html"`` the base-class extension check
    rejects ``.htm`` files and URLs without an extension - confirm whether
    that is intended.
    """

    type = "html"

    # Seconds to wait for a remote page before the download is abandoned.
    request_timeout = 30

    def __init__(self, path):
        """Validate ``path`` through the base class (expects a ``.html`` file)."""
        super().__init__(path)

    def get_file_data(self):
        """Not implemented yet; currently does nothing and returns ``None``."""
        pass

    def _get_file_metadata(self):
        """Not implemented yet; currently does nothing and returns ``None``."""
        pass

    def _get_file_paragraphs(self):
        """Not implemented yet; currently does nothing and returns ``None``."""
        pass

    def _open_file(self):
        """Return the HTML markup at ``self.path`` as a string.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            FileSchemeError: If ``self.file_scheme`` is neither "url" nor "local".
        """
        if self.file_scheme == "url":
            response = requests.get(self.path, timeout=self.request_timeout)
            # Do not treat an HTTP error page as the document content.
            response.raise_for_status()
            return response.text
        if self.file_scheme == "local":
            # The context manager closes the handle even if read() fails.
            with open(self.path, "r") as handle:
                return handle.read()
        raise FileSchemeError(f"Unsupported file scheme: {self.file_scheme}")
|
|