peter2000 committed on
Commit
c992322
1 Parent(s): daf15f4

Update scripts/process.py

Browse files
Files changed (1) hide show
  1. scripts/process.py +27 -0
scripts/process.py CHANGED
@@ -1,6 +1,7 @@
1
  import streamlit as st
2
  from typing import Callable, Dict, List, Optional
3
 
 
4
  import os
5
  from haystack.utils import fetch_archive_from_http, clean_wiki_text, convert_files_to_docs
6
  from haystack.schema import Answer
@@ -26,7 +27,33 @@ os.environ['TOKENIZERS_PARALLELISM'] ="false"
26
  # docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
27
  # document_store.write_documents(docs)
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
 
 
 
 
 
30
  def load_document(
31
  file_path: str,
32
  file_name,
 
1
  import streamlit as st
2
  from typing import Callable, Dict, List, Optional
3
 
4
+ import re
5
  import os
6
  from haystack.utils import fetch_archive_from_http, clean_wiki_text, convert_files_to_docs
7
  from haystack.schema import Answer
 
27
  # docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
28
  # document_store.write_documents(docs)
29
 
30
def basic(s: str) -> str:
    """
    Remove URLs from a string and trim surrounding whitespace.

    Only URL removal is performed; case, punctuation, digits and interior
    newlines (other than those consumed together with a line-leading URL)
    are left untouched.

    :param s: string to be processed
    :return: the string with URLs replaced by a single space and with
        leading/trailing whitespace stripped
    """
    # A line that *starts* with a URL is dropped entirely, together with the
    # line break(s) that follow it, and replaced by one space.
    # NOTE(review): this also discards any ordinary text that follows the URL
    # on the same line - confirm that this is intended.
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)

    # Remaining inline URLs. The scheme separator is required so that plain
    # words which merely begin with "http" (e.g. "httpd") are not deleted.
    s = re.sub(r"https?://\S+", " ", s)

    return s.strip()
56
+
57
  def load_document(
58
  file_path: str,
59
  file_name,