ppsingh committed on
Commit
330533c
1 Parent(s): 8ee40d6

Update auditqa/doc_process.py

Browse files
Files changed (1) hide show
  1. auditqa/doc_process.py +30 -1
auditqa/doc_process.py CHANGED
@@ -2,6 +2,8 @@ import glob
2
  import os
3
  from langchain_text_splitters import MarkdownHeaderTextSplitter
4
  from langchain_community.document_loaders import UnstructuredMarkdownLoader
 
 
5
  path_to_data = "./data/"
6
 
7
  def process_markdown():
@@ -25,4 +27,31 @@ def process_markdown():
25
  print("Exception: ", e)
26
  docs_processed = [markdown_splitter.split_text(doc) for doc in docs]
27
  print(len(docs_processed))
28
- print(docs_processed[0])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import os
3
  from langchain_text_splitters import MarkdownHeaderTextSplitter
4
  from langchain_community.document_loaders import UnstructuredMarkdownLoader
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
6
+ from transformers import AutoTokenizer
7
  path_to_data = "./data/"
8
 
9
  def process_markdown():
 
27
  print("Exception: ", e)
28
  docs_processed = [markdown_splitter.split_text(doc) for doc in docs]
29
  print(len(docs_processed))
30
+ print(docs_processed[0])
31
+
32
def process_pdf():
    """Load the PDF files found in ``path_to_data`` and split them into chunks.

    Each file matching ``*.pdf`` directly under ``path_to_data`` is loaded
    with ``PyMuPDFLoader``; files that fail to load are reported on stdout
    and skipped. The loaded documents are then split with a
    ``RecursiveCharacterTextSplitter`` built from the
    ``BAAI/bge-small-en-v1.5`` tokenizer, and the per-file results are
    flattened into a single list.

    The function prints the number of chunks and, when there is at least
    one, the first chunk. It returns ``None``.
    """
    # Imported locally so the function is self-contained: the module-level
    # imports only bring in the markdown loader, not the PDF one.
    from langchain_community.document_loaders import PyMuPDFLoader

    # PDFs, not markdown: the "*.md" pattern was carried over from
    # process_markdown and handed markdown files to the PDF loader.
    files = glob.glob(path_to_data + "*.pdf")
    docs = []
    for file in files:
        try:
            docs.append(PyMuPDFLoader(file).load())
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            print("Exception: ", e)

    chunk_size = 256
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
        chunk_size=chunk_size,
        chunk_overlap=chunk_size // 10,  # 10% overlap between neighbours
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n", ".", " ", ""],
    )

    # Split file by file and flatten the results into one list of chunks.
    docs_processed = []
    for pages in docs:
        docs_processed.extend(text_splitter.split_documents(pages))

    print(len(docs_processed))
    # Guard the preview: with no PDFs (or only failed loads) there is no
    # first element and indexing would raise IndexError.
    if docs_processed:
        print(docs_processed[0])
56
+
57
+