Spaces:

peter2000
/

policy_test

Runtime error

App Files Files Community

peter2000 committed on Jul 28, 2022

Commit

72663fa

•

1 Parent(s): be432e1

Create new file

Browse files

Files changed (1) hide show

udfPreprocess/ docPreprocessing.py +66 -0

udfPreprocess/ docPreprocessing.py ADDED Viewed

	@@ -0,0 +1,66 @@

+from typing import Callable, Dict, List, Optional
+from pathlib import Path
+import re
+import logging
+import string
+import streamlit as st
+logger = logging.getLogger(__name__)
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+from haystack.utils import convert_files_to_docs, fetch_archive_from_http
+from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter
+from haystack.schema import Document
+import pdfplumber
+import pandas as pd
+import tempfile
+import sqlite3
+def load_document(
+    file: str,
+    file_name,
+    encoding: Optional[str] = None,
+    id_hash_keys: Optional[List[str]] = None,
+) -> List[Document]:
+    """
+    takes docx, txt and pdf files as input and extracts text as well as the filename as metadata. Since haystack
+    does not take care of all pdf files, pdfplumber is attached to the pipeline in case the pdf extraction fails
+    via Haystack.
+    Returns a list of type haystack.schema.Document
+    """
+    if file_name.name.endswith('.pdf'):
+        converter = PDFToTextConverter(remove_numeric_tables=True)
+    if file_name.name.endswith('.txt'):
+        converter = TextConverter()
+    if file_name.name.endswith('.docx'):
+        converter = DocxToTextConverter()
+    documents = []
+    logger.info("Converting {}".format(file_name))
+    # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single Document
+    document = converter.convert(
+                file_path=file, meta=None, encoding=encoding, id_hash_keys=id_hash_keys
+            )[0]
+    text = document.content
+    documents.append(Document(content=text, meta={"name": file_name}, id_hash_keys=id_hash_keys))
+    '''check if text is empty and apply different pdf processor. This can happen whith certain pdf types.'''
+    for i in documents:
+        if i.content == "":
+            st.write("using pdfplumber")
+            text = []
+            with pdfplumber.open(file) as pdf:
+                for page in pdf.pages:
+                    text.append(page.extract_text())
+            i.content = ' '.join([page for page in text])
+    return documents