Spaces:
Runtime error
Runtime error
File size: 4,622 Bytes
91975ca 6fa214d c992322 91975ca 0c277f0 8e1e329 d4df53b daf15f4 0c277f0 91975ca ce202e4 0c277f0 91975ca 0c277f0 c992322 6fa214d c992322 91975ca c2c2862 6fa214d c2c2862 91975ca c2c2862 91975ca 5e46932 0c277f0 91975ca 0c277f0 91975ca 0c277f0 91975ca c2c2862 91975ca c2c2862 6fa214d c2c2862 91975ca c2c2862 6fa214d 0c277f0 6fa214d bbe4709 8481e1e 0c277f0 bbe4709 0c277f0 bbe4709 0c277f0 bbe4709 0c277f0 bbe4709 0c277f0 bbe4709 91975ca |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
import streamlit as st
from typing import Callable, Dict, List, Optional
import re
import os
from haystack.utils import fetch_archive_from_http, clean_wiki_text, convert_files_to_docs
from haystack.schema import Answer
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import FARMReader, TfidfRetriever
from haystack.nodes.file_converter import PDFToTextConverter, TextConverter
from haystack.schema import Document
from haystack.nodes import PreProcessor
import logging
from markdown import markdown
from annotated_text import annotation
from PIL import Image
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Set TOKENIZERS_PARALLELISM to "false" for this process.
# NOTE(review): presumably this silences the Hugging Face tokenizers
# fork/parallelism warning -- confirm against the tokenizers docs.
os.environ['TOKENIZERS_PARALLELISM'] ="false"
#def load_and_write_data(document_store):
# doc_dir = './article_txt_got'
# docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
# document_store.write_documents(docs)
def basic(s: str) -> str:
    """
    Strip URLs from a piece of text and trim the surrounding whitespace.

    Two passes are applied, in this order:

    1. every line that starts with ``http://`` or ``https://`` is replaced,
       together with the line breaks that directly follow it, by one space;
    2. every remaining ``http`` followed by non-whitespace characters is
       replaced by one space.

    Case, punctuation, digits and line breaks inside the text are kept.

    :param s: string to be processed
    :return: the text without URLs and without leading/trailing whitespace
    """
    # Pass 1: drop whole lines that begin with a URL (MULTILINE makes ``^``
    # match at the start of every line, not only at the start of the text).
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    # Pass 2: drop URLs embedded in running text.
    s = re.sub(r"http\S+", " ", s)
    return s.strip()
def load_document(
    file_path: str,
    file_name,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
    """
    Convert a single docx, txt or pdf file into a Haystack document.

    The converter is chosen from the extension of ``file_name``
    (case-insensitive). The extracted text is stored as the document
    content and the file name as ``meta["name"]``. If a PDF yields no text
    via Haystack, the text is extracted page by page with pdfplumber
    instead.

    :param file_path: path of the file to read
    :param file_name: display name of the file; its extension selects the
        converter and it is stored in the document metadata
    :param encoding: passed through to the Haystack converter
    :param id_hash_keys: passed through to the converter and to ``Document``
    :return: a list containing exactly one ``haystack.schema.Document``
    :raises ValueError: if the extension is not .pdf, .txt or .docx
    """
    st.write(file_name)

    lowered_name = file_name.lower()
    is_pdf = lowered_name.endswith('.pdf')
    if is_pdf:
        converter = PDFToTextConverter(remove_numeric_tables=True)
    elif lowered_name.endswith('.txt'):
        converter = TextConverter()
    elif lowered_name.endswith('.docx'):
        # Imported here because the module-level imports do not provide it.
        from haystack.nodes.file_converter import DocxToTextConverter
        converter = DocxToTextConverter()
    else:
        # Fail with a clear message instead of an unbound ``converter``.
        raise ValueError(
            "Unsupported file type for {!r}: expected .pdf, .txt or .docx".format(file_name)
        )

    logger.info("Converting %s", file_name)
    # PDFToTextConverter, TextConverter, and DocxToTextConverter
    # return a list containing a single Document
    text = converter.convert(
        file_path=file_path, meta=None,
        encoding=encoding, id_hash_keys=id_hash_keys
    )[0].content

    # Some PDF types come back without any usable text; retry those with
    # pdfplumber. The fallback is limited to PDFs because pdfplumber cannot
    # open txt/docx files.
    if is_pdf and not (text or "").strip():
        # Imported lazily: only needed on this fallback path.
        import pdfplumber

        st.write("using pdfplumber")
        with pdfplumber.open(file_path) as pdf:
            # extract_text() may return None for pages without a text layer.
            text = ' '.join(page.extract_text() or "" for page in pdf.pages)

    # Build the Document only once the final text is known, rather than
    # patching ``content`` on an already constructed object.
    return [Document(content=text,
                     meta={"name": file_name},
                     id_hash_keys=id_hash_keys)]
def preprocessing(document):
    """
    Split Haystack documents into short passages and clean each passage.

    Every document is run through a Haystack ``PreProcessor`` that removes
    empty lines, surplus whitespace and headers/footers, and splits the
    text into windows of 3 sentences with an overlap of 1 sentence. Each
    resulting passage is then cleaned with ``basic``. The number of
    passages is reported in the Streamlit app.

    :param document: iterable of haystack Document objects
    :return: list with the cleaned passages of ALL input documents, one
        haystack Document per passage (empty list for empty input)
    """
    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="sentence",
        split_length=3,
        split_respect_sentence_boundary=False,
        split_overlap=1
    )

    # Collect the passages of every document; assigning inside the loop
    # would keep only the last document and leave the name unbound when
    # the input is empty.
    docs_processed = []
    for doc in document:
        docs_processed.extend(preprocessor.process([doc]))

    for item in docs_processed:
        item.content = basic(item.content)

    st.write("your document has been splitted to", len(docs_processed), "paragraphs")
    return docs_processed
|