import streamlit as st
from typing import Callable, Dict, List, Optional
import re
import os
import logging

import pdfplumber
from haystack.utils import fetch_archive_from_http, clean_wiki_text, convert_files_to_docs
from haystack.schema import Answer, Document
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import FARMReader, TfidfRetriever, PreProcessor
from haystack.nodes.file_converter import PDFToTextConverter, TextConverter, DocxToTextConverter
from markdown import markdown
from annotated_text import annotation
from PIL import Image

logger = logging.getLogger(__name__)

os.environ['TOKENIZERS_PARALLELISM'] = "false"
#def load_and_write_data(document_store):
#    doc_dir = './article_txt_got'
#    docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
#    document_store.write_documents(docs)
def basic(s):
    """
    :param s: string to be processed
    :return: processed string: see comments in the source code for more info
    """
    # Text Lowercase
    #s = s.lower()
    # Remove punctuation
    #translator = str.maketrans(' ', ' ', string.punctuation)
    #s = s.translate(translator)
    # Remove URLs
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    s = re.sub(r"http\S+", " ", s)
    # Remove new line characters
    #s = re.sub('\n', ' ', s)
    # Remove distracting single quotes
    #s = re.sub("\'", " ", s)
    # Remove all remaining numbers and non alphanumeric characters
    #s = re.sub(r'\d+', ' ', s)
    #s = re.sub(r'\W+', ' ', s)
    # define custom words to replace:
    #s = re.sub(r'strengthenedstakeholder', 'strengthened stakeholder', s)
    return s.strip()
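
# Usage sketch (hypothetical input, not part of the app): the call
# basic("More info at https://example.org here") replaces the URL token with
# whitespace and strips leading/trailing spaces, returning roughly "More info at  here".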
def load_document(
    file_path: str,
    file_name: str,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
    """
    Takes docx, txt and pdf files as input and extracts the text as well as the
    filename as metadata. Since Haystack cannot handle every pdf file, pdfplumber
    is attached to the pipeline in case the pdf extraction via Haystack fails.

    Returns a list of type haystack.schema.Document
    """
    st.write(file_name)
    if file_name.endswith('.pdf'):
        converter = PDFToTextConverter(remove_numeric_tables=True)
    if file_name.endswith('.txt'):
        converter = TextConverter()
    if file_name.endswith('.docx'):
        converter = DocxToTextConverter()

    documents = []
    logger.info("Converting {}".format(file_name))
    # PDFToTextConverter, TextConverter, and DocxToTextConverter
    # return a list containing a single Document
    document = converter.convert(
        file_path=file_path, meta=None,
        encoding=encoding, id_hash_keys=id_hash_keys
    )[0]
    text = document.content
    documents.append(Document(content=text,
                              meta={"name": file_name},
                              id_hash_keys=id_hash_keys))

    # Check whether the extracted text is empty and apply a different pdf processor.
    # This can happen with certain pdf types.
    for i in documents:
        if i.content == "":
            st.write("using pdfplumber")
            text = []
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    text.append(page.extract_text())
            i.content = ' '.join([page for page in text])

    return documents
def preprocessing(document):
    """
    Takes in a list of haystack Document objects, splits them into paragraphs
    and applies simple cleaning.

    Returns a cleaned list of haystack Document objects, one paragraph per object.
    """
    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="sentence",
        split_length=3,
        split_respect_sentence_boundary=False,
        split_overlap=1
    )
    docs_processed = preprocessor.process(document)
    for item in docs_processed:
        item.content = basic(item.content)

    st.write("your document has been split into", len(docs_processed), "paragraphs")

    # create dataframe of text and list of all text
    #df = pd.DataFrame(docs_processed)
    #all_text = " ".join(df.content.to_list())
    #par_list = df.content.to_list()

    return docs_processed  #, df, all_text, par_list
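
# Minimal usage sketch (hypothetical wiring, kept commented out so it does not run
# inside the app): it assumes the Streamlit front end saves an uploaded file to a
# temporary path before calling the helpers above; `uploaded` and `temp_path` are
# placeholder names, not part of this module.
#
#uploaded = st.file_uploader("Upload a pdf, txt or docx file")
#if uploaded is not None:
#    temp_path = os.path.join("/tmp", uploaded.name)
#    with open(temp_path, "wb") as f:
#        f.write(uploaded.getbuffer())
#    docs = load_document(file_path=temp_path, file_name=uploaded.name)
#    paragraphs = preprocessing(docs)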