Spaces:
Runtime error
Runtime error
Update scripts/process.py
Browse files- scripts/process.py +17 -2
scripts/process.py
CHANGED
@@ -1,4 +1,6 @@
|
|
1 |
import streamlit as st
|
|
|
|
|
2 |
import os
|
3 |
from haystack.utils import fetch_archive_from_http, clean_wiki_text, convert_files_to_docs
|
4 |
from haystack.schema import Answer
|
@@ -19,10 +21,11 @@ os.environ['TOKENIZERS_PARALLELISM'] ="false"
|
|
19 |
# docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
|
20 |
# document_store.write_documents(docs)
|
21 |
|
22 |
-
|
23 |
def load_document(
|
24 |
file_path: str,
|
25 |
file_name,
|
|
|
26 |
id_hash_keys: Optional[List[str]] = None,
|
27 |
) -> List[Document]:
|
28 |
|
@@ -49,14 +52,26 @@ def load_document(
|
|
49 |
# return a list containing a single Document
|
50 |
document = converter.convert(
|
51 |
file_path=file_path, meta=None,
|
52 |
-
|
53 |
)[0]
|
54 |
text = document.content
|
55 |
documents.append(Document(content=text,
|
56 |
meta={"name": file_name},
|
57 |
id_hash_keys=id_hash_keys))
|
58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
return documents
|
|
|
60 |
|
61 |
def preprocessing(document):
|
62 |
"""
|
|
|
1 |
import streamlit as st
|
2 |
+
from typing import Callable, Dict, List, Optional
|
3 |
+
|
4 |
import os
|
5 |
from haystack.utils import fetch_archive_from_http, clean_wiki_text, convert_files_to_docs
|
6 |
from haystack.schema import Answer
|
|
|
21 |
# docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
|
22 |
# document_store.write_documents(docs)
|
23 |
|
24 |
+
|
25 |
def load_document(
|
26 |
file_path: str,
|
27 |
file_name,
|
28 |
+
encoding: Optional[str] = None,
|
29 |
id_hash_keys: Optional[List[str]] = None,
|
30 |
) -> List[Document]:
|
31 |
|
|
|
52 |
# return a list containing a single Document
|
53 |
document = converter.convert(
|
54 |
file_path=file_path, meta=None,
|
55 |
+
encoding=encoding, id_hash_keys=id_hash_keys
|
56 |
)[0]
|
57 |
text = document.content
|
58 |
documents.append(Document(content=text,
|
59 |
meta={"name": file_name},
|
60 |
id_hash_keys=id_hash_keys))
|
61 |
|
62 |
+
'''check if text is empty and apply different pdf processor. \
|
63 |
+
This can happen with certain pdf types.'''
|
64 |
+
for i in documents:
|
65 |
+
if i.content == "":
|
66 |
+
st.write("using pdfplumber")
|
67 |
+
text = []
|
68 |
+
with pdfplumber.open(file_path) as pdf:
|
69 |
+
for page in pdf.pages:
|
70 |
+
text.append(page.extract_text())
|
71 |
+
i.content = ' '.join([page for page in text])
|
72 |
+
|
73 |
return documents
|
74 |
+
|
75 |
|
76 |
def preprocessing(document):
|
77 |
"""
|