Spaces:

peter2000
/

policy_test

Runtime error

peter2000 committed on Sep 27, 2022

Commit

6fa214d

1 Parent(s): b913c31

Update scripts/process.py

Files changed (1) hide show

scripts/process.py CHANGED Viewed

@@ -1,4 +1,6 @@
 import streamlit as st
 import os
 from haystack.utils import fetch_archive_from_http, clean_wiki_text, convert_files_to_docs
 from haystack.schema import Answer
@@ -19,10 +21,11 @@ os.environ['TOKENIZERS_PARALLELISM'] ="false"
 #    docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
 #    document_store.write_documents(docs)
-#pipeline = start_haystack()
 def load_document(
     file_path: str,
     file_name,
     id_hash_keys: Optional[List[str]] = None,
 ) -> List[Document]:
@@ -49,14 +52,26 @@ def load_document(
     # return a list containing a single Document
     document = converter.convert(
                 file_path=file_path, meta=None,
-                 id_hash_keys=id_hash_keys
                 )[0]
     text = document.content
     documents.append(Document(content=text,
                               meta={"name": file_name},
                               id_hash_keys=id_hash_keys))
     return documents
 def preprocessing(document):
     """

 import streamlit as st
+from typing import Callable, Dict, List, Optional
 import os
 from haystack.utils import fetch_archive_from_http, clean_wiki_text, convert_files_to_docs
 from haystack.schema import Answer
 #    docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
 #    document_store.write_documents(docs)
 def load_document(
     file_path: str,
     file_name,
+    encoding: Optional[str] = None,
     id_hash_keys: Optional[List[str]] = None,
 ) -> List[Document]:
     # return a list containing a single Document
     document = converter.convert(
                 file_path=file_path, meta=None,
+                encoding=encoding, id_hash_keys=id_hash_keys
                 )[0]
     text = document.content
     documents.append(Document(content=text,
                               meta={"name": file_name},
                               id_hash_keys=id_hash_keys))
+    '''check if text is empty and apply different pdf processor. \
+    This can happen whith certain pdf types.'''
+    for i in documents:
+        if i.content == "":
+            st.write("using pdfplumber")
+            text = []
+            with pdfplumber.open(file_path) as pdf:
+                for page in pdf.pages:
+                    text.append(page.extract_text())
+            i.content = ' '.join([page for page in text])
     return documents
 def preprocessing(document):
     """