Spaces:

abadesalex
/

DocuRAG

Sleeping

File size: 1,669 Bytes

819bacd

import io
import os
from typing import Union

import pdfplumber
import PyPDF2
from fastapi import UploadFile
from PyPDF2 import PdfReader

from src.db_local_storage.files_db import TEXT_FILES_DIRECTORY
from src.db_local_storage.documents_db import documents_text


class ExtractTextFeature:
    """Static helpers for extracting text from uploaded PDFs and saving it.

    The class holds no state; it only groups the related functions.
    """

    @staticmethod
    def ensure_directory_exists(folder_path: str) -> None:
        """Create ``folder_path`` (and any missing parents) if it is absent."""
        # exist_ok=True removes the check-then-create race that occurs when
        # two callers try to create the same directory at the same time.
        os.makedirs(folder_path, exist_ok=True)

    @staticmethod
    async def saveFile(content, filename, directory: str) -> str:
        """Write ``content`` as UTF-8 text into ``directory``.

        Args:
            content: Text to write.
            filename: Name of the target file. Only its final path component
                is used, so a name such as ``../../etc/passwd`` cannot escape
                ``directory``.
            directory: Existing directory that receives the file.

        Returns:
            The path of the file that was written.
        """
        # The name originates from an upload and is therefore untrusted:
        # strip any directory components before joining.
        safe_name = os.path.basename(filename)
        file_path = os.path.join(directory, safe_name)
        # Explicit encoding so non-ASCII text does not depend on the
        # platform's default locale encoding.
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(content)
        return file_path

    @staticmethod
    async def save_text_from_pdf(file: UploadFile, text) -> str:
        """Persist ``text`` under ``file.filename`` in ``TEXT_FILES_DIRECTORY``.

        Args:
            file: The upload whose filename is used for the saved file.
            text: The text to store.

        Returns:
            ``text`` unchanged.
        """
        ExtractTextFeature.ensure_directory_exists(TEXT_FILES_DIRECTORY)
        await ExtractTextFeature.saveFile(text, file.filename, TEXT_FILES_DIRECTORY)

        return text

    @staticmethod
    async def extract_text_from_pdf(file: UploadFile) -> Union[str, dict]:
        """Extract the text of an uploaded PDF and register it in ``documents_text``.

        Args:
            file: The uploaded PDF.

        Returns:
            The concatenated text of all pages, or the dict
            ``{"message": "Document already exists"}`` when an entry with the
            same filename is already present in ``documents_text`` (in which
            case nothing is added).
        """
        content = await file.read()

        # Reject duplicates before doing the comparatively expensive parse.
        if any(document["filename"] == file.filename for document in documents_text):
            return {"message": "Document already exists"}

        with pdfplumber.open(io.BytesIO(content)) as pdf:
            # A page without extractable text may yield None rather than a
            # string; treat that as empty so the join cannot fail.
            text = "".join(page.extract_text() or "" for page in pdf.pages)

        documents_text.append(
            {
                "id": len(documents_text) + 1,
                "filename": file.filename,
                "text": text,
            }
        )

        return text