import streamlit as st
from typing import Callable, Dict, List, Optional
import re
import os
import logging

import pdfplumber
from haystack.utils import fetch_archive_from_http, clean_wiki_text, convert_files_to_docs
from haystack.schema import Answer, Document
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import FARMReader, TfidfRetriever, PreProcessor
from haystack.nodes.file_converter import PDFToTextConverter, TextConverter, DocxToTextConverter
from markdown import markdown
from annotated_text import annotation
from PIL import Image

logger = logging.getLogger(__name__)

os.environ['TOKENIZERS_PARALLELISM'] = "false"
#def load_and_write_data(document_store):
#    doc_dir = './article_txt_got'
#    docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
#    document_store.write_documents(docs)
def basic(s):
    """
    :param s: string to be processed
    :return: processed string: see comments in the source code for more info
    """
    # Text Lowercase
    #s = s.lower()
    # Remove punctuation
    #translator = str.maketrans(' ', ' ', string.punctuation)
    #s = s.translate(translator)
    # Remove URLs
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    s = re.sub(r"http\S+", " ", s)
    # Remove new line characters
    #s = re.sub('\n', ' ', s)
    # Remove distracting single quotes
    #s = re.sub("\'", " ", s)
    # Remove all remaining numbers and non alphanumeric characters
    #s = re.sub(r'\d+', ' ', s)
    #s = re.sub(r'\W+', ' ', s)
    # define custom words to replace:
    #s = re.sub(r'strengthenedstakeholder', 'strengthened stakeholder', s)
    return s.strip()
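
# Usage sketch (hypothetical input, not part of the app): the call
# basic("More info at https://example.org here") replaces the URL token with
# whitespace and strips leading/trailing spaces, returning roughly "More info at  here".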
def load_document(
    file_path: str,
    file_name: str,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
    """
    Takes docx, txt and pdf files as input and extracts the text as well as the
    filename as metadata. Since Haystack cannot handle every pdf file, pdfplumber
    is attached to the pipeline in case the pdf extraction via Haystack fails.

    Returns a list of type haystack.schema.Document
    """
    st.write(file_name)
    if file_name.endswith('.pdf'):
        converter = PDFToTextConverter(remove_numeric_tables=True)
    if file_name.endswith('.txt'):
        converter = TextConverter()
    if file_name.endswith('.docx'):
        converter = DocxToTextConverter()

    documents = []
    logger.info("Converting {}".format(file_name))
    # PDFToTextConverter, TextConverter, and DocxToTextConverter
    # return a list containing a single Document
    document = converter.convert(
        file_path=file_path, meta=None,
        encoding=encoding, id_hash_keys=id_hash_keys
    )[0]
    text = document.content
    documents.append(Document(content=text,
                              meta={"name": file_name},
                              id_hash_keys=id_hash_keys))

    # Check whether the extracted text is empty and apply a different pdf processor.
    # This can happen with certain pdf types.
    for i in documents:
        if i.content == "":
            st.write("using pdfplumber")
            text = []
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    text.append(page.extract_text())
            i.content = ' '.join([page for page in text])

    return documents
def preprocessing(document):
    """
    Takes in a list of haystack Document objects, splits them into paragraphs
    and applies simple cleaning.

    Returns a cleaned list of haystack Document objects, one paragraph per object.
    """
    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="sentence",
        split_length=3,
        split_respect_sentence_boundary=False,
        split_overlap=1
    )
    docs_processed = preprocessor.process(document)
    for item in docs_processed:
        item.content = basic(item.content)

    st.write("your document has been split into", len(docs_processed), "paragraphs")

    # create dataframe of text and list of all text
    #df = pd.DataFrame(docs_processed)
    #all_text = " ".join(df.content.to_list())
    #par_list = df.content.to_list()

    return docs_processed  #, df, all_text, par_list
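
# Minimal usage sketch (hypothetical wiring, kept commented out so it does not run
# inside the app): it assumes the Streamlit front end saves an uploaded file to a
# temporary path before calling the helpers above; `uploaded` and `temp_path` are
# placeholder names, not part of this module.
#
#uploaded = st.file_uploader("Upload a pdf, txt or docx file")
#if uploaded is not None:
#    temp_path = os.path.join("/tmp", uploaded.name)
#    with open(temp_path, "wb") as f:
#        f.write(uploaded.getbuffer())
#    docs = load_document(file_path=temp_path, file_name=uploaded.name)
#    paragraphs = preprocessing(docs)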