peter2000 committed on
Commit
0fed137
1 Parent(s): f6b92bd

Delete udfPreprocess/ docPreprocessing.py

Browse files
Files changed (1) hide show
  1. udfPreprocess/ docPreprocessing.py +0 -66
udfPreprocess/ docPreprocessing.py DELETED
@@ -1,66 +0,0 @@
1
- from typing import Callable, Dict, List, Optional
2
-
3
- from pathlib import Path
4
- import re
5
- import logging
6
- import string
7
- import streamlit as st
8
- logger = logging.getLogger(__name__)
9
-
10
- import os
11
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
12
-
13
- from haystack.utils import convert_files_to_docs, fetch_archive_from_http
14
- from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter
15
- from haystack.schema import Document
16
- import pdfplumber
17
-
18
- import pandas as pd
19
-
20
- import tempfile
21
- import sqlite3
22
-
23
-
24
-
25
def load_document(
    file: str,
    file_name,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
    """
    Convert one docx, txt or pdf file into a list holding a single haystack Document.

    The text is extracted with the matching haystack converter. Since haystack
    does not handle every pdf, pdfplumber is used as a fallback when the
    haystack extraction of a pdf yields no text.

    Args:
        file: path of the file to convert; passed to the converter and to pdfplumber.
        file_name: object exposing a ``.name`` attribute whose extension selects
            the converter (presumably a Streamlit uploaded file - TODO confirm).
        encoding: forwarded unchanged to the converter.
        id_hash_keys: forwarded to the converter and to the returned Document.

    Returns:
        A list containing one ``haystack.schema.Document``.

    Raises:
        ValueError: if the file name does not end with .pdf, .txt or .docx.
    """
    # Match on the lowercased name so that e.g. "REPORT.PDF" is accepted too.
    lowered_name = file_name.name.lower()
    is_pdf = lowered_name.endswith('.pdf')
    if is_pdf:
        converter = PDFToTextConverter(remove_numeric_tables=True)
    elif lowered_name.endswith('.txt'):
        converter = TextConverter()
    elif lowered_name.endswith('.docx'):
        converter = DocxToTextConverter()
    else:
        # Previously `converter` stayed unbound here and the call below failed
        # with an opaque UnboundLocalError.
        raise ValueError(
            "Unsupported file type for {!r}: expected .pdf, .txt or .docx".format(
                file_name.name
            )
        )

    logger.info("Converting {}".format(file_name))
    # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single Document
    converted = converter.convert(
        file_path=file, meta=None, encoding=encoding, id_hash_keys=id_hash_keys
    )[0]

    # NOTE(review): meta stores the `file_name` object itself, not `file_name.name`;
    # kept as-is because callers outside this view may rely on it - confirm.
    document = Document(
        content=converted.content, meta={"name": file_name}, id_hash_keys=id_hash_keys
    )

    # Some pdf types yield no text through haystack; retry those with pdfplumber.
    # Only pdfs qualify: pdfplumber cannot open an empty txt/docx file.
    if is_pdf and not document.content:
        st.write("using pdfplumber")
        with pdfplumber.open(file) as pdf:
            # extract_text() may return None for pages without a text layer,
            # which would make str.join raise TypeError.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        document.content = ' '.join(page_texts)

    return [document]