aakash0017 committed
Commit e5e07a1 · 1 Parent(s): 0adc74f

cleaned rag documents

Files changed (1)
  1. conversation.py +9 -1
conversation.py CHANGED
@@ -136,8 +136,16 @@ import time
 from db_func import insert_one
 from langchain.agents import AgentExecutor
 import re
+import wordninja


+def clean_text(text):
+    text = text.strip().lower()
+    output_paragraph = ' '.join(''.join(text.split()).split(' '))
+    words = wordninja.split(output_paragraph)
+
+    return ' '.join(words)
+
 def get_bert_embeddings(sentence):
     embeddings = []
     input_ids = tokenizer.encode(sentence, return_tensors="pt")
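The new helper targets a common RAG cleanup problem: text extracted from PDFs often loses its spaces, so retrieved chunks arrive with words fused together. wordninja re-segments such strings using an English word-frequency model. A minimal standalone sketch of the helper above; the sample inputs are ours, not from the commit:

import wordninja  # pip install wordninja

def clean_text(text):
    # lowercase and trim, then delete every internal whitespace character
    text = text.strip().lower()
    output_paragraph = ' '.join(''.join(text.split()).split(' '))
    # wordninja re-segments the glued string into likely English words
    words = wordninja.split(output_paragraph)
    return ' '.join(words)

print(clean_text("DerekAnderson"))
# -> 'derek anderson' (wordninja's own README example, lowercased here)
print(clean_text("  thepetitioner mustfile anappeal  "))
# roughly 'the petitioner must file an appeal'

Note that ' '.join(''.join(text.split()).split(' ')) simply deletes all whitespace, since the de-spaced string contains no ' ' left to split on; wordninja then reintroduces the word boundaries.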
 
@@ -214,7 +222,7 @@ def run(input_):
     for i in range(len(sources)):
         temp = sources[i].replace('.pdf', '').replace('.txt', '').replace("AAO", "").replace("2022-2023", "").replace("data/book", "").replace("text", "").replace("  ", " ")
         source_text += f"{i+1}. {temp}\n"
-        cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', docs[i].strip().lower())
+        cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', clean_text(docs[i]))
         doc_text += f"{i+1}. {cleaned_text}\n"

     # output_text = f"{output_text} \n\nSources: \n{source_text}"
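Downstream in run(), each retrieved chunk now goes through clean_text before the regex strips any remaining punctuation, so the numbered context block is built from plain, properly spaced tokens. A small sketch of the combined effect, reusing the clean_text sketch above; the docs list and chunk text are invented for illustration:

import re

docs = ["Eligibility:The petitionermustfile anappeal."]  # hypothetical retrieved chunk
doc_text = ""
for i in range(len(docs)):
    # re-segment fused words, then drop any character that is not alphanumeric or whitespace
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', clean_text(docs[i]))
    doc_text += f"{i+1}. {cleaned_text}\n"
print(doc_text)
# roughly '1. eligibility the petitioner must file an appeal'

The character filter appears largely redundant here, since wordninja already drops punctuation while segmenting; the regex remains as a safety net for anything it passes through.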