peter2000 committed on
Commit
72663fa
1 Parent(s): be432e1

Create new file

Browse files
Files changed (1) hide show
  1. udfPreprocess/ docPreprocessing.py +66 -0
udfPreprocess/ docPreprocessing.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, Dict, List, Optional
2
+
3
+ from pathlib import Path
4
+ import re
5
+ import logging
6
+ import string
7
+ import streamlit as st
8
+ logger = logging.getLogger(__name__)
9
+
10
+ import os
11
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
12
+
13
+ from haystack.utils import convert_files_to_docs, fetch_archive_from_http
14
+ from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter
15
+ from haystack.schema import Document
16
+ import pdfplumber
17
+
18
+ import pandas as pd
19
+
20
+ import tempfile
21
+ import sqlite3
22
+
23
+
24
+
25
def load_document(
    file: str,
    file_name,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
    """
    Extract the text of a docx, txt or pdf file into a haystack Document.

    The converter is chosen from the extension of ``file_name.name``
    (matched case-insensitively). For pdf files whose haystack extraction
    yields no usable text, pdfplumber is used as a second attempt.

    Args:
        file: Path of the file on disk; passed to the haystack converter and,
            for the fallback, to ``pdfplumber.open``.
        file_name: Object exposing a ``.name`` attribute holding the original
            file name (presumably a Streamlit uploaded file - TODO confirm).
            It is stored unchanged under the ``"name"`` meta key.
        encoding: Forwarded to the haystack converter.
        id_hash_keys: Forwarded to the haystack converter and to the
            resulting Document.

    Returns:
        A list containing a single ``haystack.schema.Document``.

    Raises:
        ValueError: If the file extension is not .pdf, .txt or .docx.
    """
    lowered_name = file_name.name.lower()
    is_pdf = lowered_name.endswith('.pdf')

    if is_pdf:
        converter = PDFToTextConverter(remove_numeric_tables=True)
    elif lowered_name.endswith('.txt'):
        converter = TextConverter()
    elif lowered_name.endswith('.docx'):
        converter = DocxToTextConverter()
    else:
        # Fail with a clear message instead of an UnboundLocalError on
        # `converter` further down.
        raise ValueError(
            f"Unsupported file type: {file_name.name!r} "
            "(expected .pdf, .txt or .docx)"
        )

    logger.info("Converting %s", file_name)
    # PDFToTextConverter, TextConverter, and DocxToTextConverter return a
    # list containing a single Document.
    converted = converter.convert(
        file_path=file, meta=None, encoding=encoding, id_hash_keys=id_hash_keys
    )[0]
    text = converted.content

    # Some pdf types come back without any text from haystack; retry those
    # with pdfplumber. Only pdfs are retried, since pdfplumber cannot open
    # txt/docx files.
    if is_pdf and not (text or "").strip():
        st.write("using pdfplumber")
        with pdfplumber.open(file) as pdf:
            # extract_text() may return None for a page without a text layer;
            # substitute "" so the join cannot raise TypeError.
            text = ' '.join(page.extract_text() or "" for page in pdf.pages)

    # Build the Document only once the final text is known, so anything
    # haystack derives from the content at construction time (such as the
    # document id) reflects the real text rather than the empty string.
    # NOTE(review): meta["name"] holds the file_name object itself, not
    # file_name.name - kept as in the original; confirm this is intended.
    return [Document(content=text, meta={"name": file_name}, id_hash_keys=id_hash_keys)]