Spaces:
Runtime error
Runtime error
Commit
·
e5e07a1
1
Parent(s):
0adc74f
cleaned rag documents
Browse files
- conversation.py +9 -1
conversation.py
CHANGED
@@ -136,8 +136,16 @@ import time
|
|
136 |
from db_func import insert_one
|
137 |
from langchain.agents import AgentExecutor
|
138 |
import re
|
|
|
139 |
|
|
|
140 |
|
|
|
|
|
|
|
|
|
|
|
|
|
141 |
def get_bert_embeddings(sentence):
|
142 |
embeddings = []
|
143 |
input_ids = tokenizer.encode(sentence, return_tensors="pt")
|
@@ -214,7 +222,7 @@ def run(input_):
|
|
214 |
for i in range(len(sources)):
|
215 |
temp = sources[i].replace('.pdf', '').replace('.txt', '').replace("AAO", "").replace("2022-2023", "").replace("data/book", "").replace("text", "").replace(" ", " ")
|
216 |
source_text += f"{i+1}. {temp}\n"
|
217 |
-
cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', docs[i]
|
218 |
doc_text += f"{i+1}. {cleaned_text}\n"
|
219 |
|
220 |
# output_text = f"{output_text} \n\nSources: \n{source_text}"
|
|
|
136 |
from db_func import insert_one
|
137 |
from langchain.agents import AgentExecutor
|
138 |
import re
|
139 |
+
import wordninja
|
140 |
|
141 |
+
def clean_text(text):
    """Re-segment a document chunk into lowercase, space-separated words.

    The text is lowercased, every whitespace character is removed, and the
    resulting run of characters is handed to ``wordninja.split`` so that word
    boundaries are re-inferred. The words are then joined with single spaces.

    Args:
        text: The raw document text (a ``str``).

    Returns:
        The re-segmented text, or an empty string when ``text`` is empty or
        contains only whitespace.
    """
    # str.split() with no argument splits on any whitespace run and discards
    # leading/trailing whitespace, so joining the pieces removes it all.
    collapsed = ''.join(text.lower().split())
    if not collapsed:
        # Nothing to segment; avoid calling wordninja on an empty string.
        return ''

    # The previous version assigned to a misspelled name and read an
    # undefined one here, raising NameError on every call.
    words = wordninja.split(collapsed)
    return ' '.join(words)
|
148 |
+
|
149 |
def get_bert_embeddings(sentence):
|
150 |
embeddings = []
|
151 |
input_ids = tokenizer.encode(sentence, return_tensors="pt")
|
|
|
222 |
for i in range(len(sources)):
|
223 |
temp = sources[i].replace('.pdf', '').replace('.txt', '').replace("AAO", "").replace("2022-2023", "").replace("data/book", "").replace("text", "").replace(" ", " ")
|
224 |
source_text += f"{i+1}. {temp}\n"
|
225 |
+
cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', clean_text(docs[i]))
|
226 |
doc_text += f"{i+1}. {cleaned_text}\n"
|
227 |
|
228 |
# output_text = f"{output_text} \n\nSources: \n{source_text}"
|