Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Update auditqa/doc_process.py
Browse files
- auditqa/doc_process.py +3 -3
auditqa/doc_process.py
CHANGED
@@ -13,8 +13,8 @@ path_to_data = "./data/"
|
|
13 |
|
14 |
|
15 |
def process_pdf():
|
16 |
-
files = {'
|
17 |
-
'
|
18 |
docs = {}
|
19 |
for file,value in files.items():
|
20 |
try:
|
@@ -26,7 +26,7 @@ def process_pdf():
|
|
26 |
# text splitter based on the tokenizer of a model of your choosing
|
27 |
# to make texts fit exactly a transformer's context window size
|
28 |
# langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
|
29 |
-
chunk_size =
|
30 |
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
|
31 |
AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
|
32 |
chunk_size=chunk_size,
|
|
|
13 |
|
14 |
|
15 |
def process_pdf():
|
16 |
+
files = {'MWTS2021':'./data/MWTS2021.pdf',
|
17 |
+
'MWTS2022':'./data/MWTS2022.pdf'}
|
18 |
docs = {}
|
19 |
for file,value in files.items():
|
20 |
try:
|
|
|
26 |
# text splitter based on the tokenizer of a model of your choosing
|
27 |
# to make texts fit exactly a transformer's context window size
|
28 |
# langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
|
29 |
+
chunk_size = 512
|
30 |
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
|
31 |
AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
|
32 |
chunk_size=chunk_size,
|