# pdf_rag_chatbot/src/indexing.py
# Author: hohieu
# Commit: "init project" (841b0ff)
import os
from urllib.parse import quote

import pandas as pd
import pymongo
from pyvi.ViTokenizer import tokenize

from services.generate_embedding import generate_embedding
# Spreadsheet identifier and tab name of a Google Sheet.
# NOTE(review): neither constant is referenced in the visible code; presumably
# callers pass them to indexData(sheet_id, sheet_name) — confirm.
SHEET_ID = "1MKB6MHgL_lrPB1I69fj2VcVrgmSAMLVNZR1EwSyTSeA"
SHEET_NAME = "Q&A"
# Connect DB
# The connection string may be supplied through the MONGODB_URI environment
# variable so that credentials do not have to live in source control. The
# literal below is kept only as a backward-compatible fallback.
# NOTE(review): the fallback embeds a password — rotate it and drop the literal
# once every deployment sets MONGODB_URI.
client = pymongo.MongoClient(
    os.environ.get(
        "MONGODB_URI",
        "mongodb+srv://rag:[email protected]/?retryWrites=true&w=majority&appName=RAG",
    )
)
# Database "rag", collection "questionAndAnswers": shared by the helpers below.
db = client.rag
collection = db.questionAndAnswers
def insertQuestionAndAnswers(questionAndAnswers):
    """Insert a batch of question/answer documents into the collection.

    Args:
        questionAndAnswers: Documents handed to ``collection.insert_many``.

    Returns:
        The value returned by ``collection.insert_many``, or ``None`` when the
        batch is empty and nothing was written.
    """
    # insert_many raises on an empty batch, so treat "nothing to insert" as a
    # no-op instead of letting the caller crash.
    if not questionAndAnswers:
        return None
    return collection.insert_many(questionAndAnswers)
def deleteByUserId(user_id: str):
    """Delete every document whose ``user_id`` field equals *user_id*.

    Args:
        user_id: Value matched against the ``user_id`` field.

    Returns:
        The value returned by ``collection.delete_many``.
    """
    owner_filter = {'user_id': user_id}
    outcome = collection.delete_many(owner_filter)
    return outcome
def readDataFromGoogleSheet(sheet_id: str, sheet_name: str):
    """Download one tab of a Google Sheet as CSV and return its Q&A rows.

    Args:
        sheet_id: Spreadsheet identifier placed in the document URL.
        sheet_name: Name of the tab to export.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts, one per CSV row,
        read from the ``Question`` and ``Answer`` columns.

    Raises:
        KeyError: If the CSV has no ``Question`` or ``Answer`` column.
    """
    # Percent-encode the tab name: an unescaped "&" or space (e.g. a tab
    # called "Q&A") would otherwise split or corrupt the query string.
    encoded_sheet_name = quote(sheet_name, safe="")
    url = (
        f"https://docs.google.com/spreadsheets/d/{sheet_id}"
        f"/gviz/tq?tqx=out:csv&sheet={encoded_sheet_name}"
    )
    df = pd.read_csv(url)
    items = [
        {
            "question": row["Question"],
            "answer": row["Answer"],
        }
        for _, row in df.iterrows()
    ]
    # Report the number of rows; df.size would be rows * columns.
    print(f'read from google sheet {len(items)} items')
    return items
def indexData(sheet_id: str, sheet_name: str):
    """Rebuild the stored Q&A documents for one Google Sheet.

    Reads the sheet, tokenizes each question, computes its embedding, then
    replaces every document whose ``user_id`` equals *sheet_id* with the
    freshly built ones.

    Args:
        sheet_id: Spreadsheet identifier; also stored as each document's
            ``user_id``.
        sheet_name: Name of the tab to read.
    """
    items = readDataFromGoogleSheet(sheet_id, sheet_name)
    questionAndAnswers = []
    for item in items:
        tokenized_question = tokenize(item["question"])
        questionAndAnswers.append(
            {
                "question": tokenized_question,
                "answer": item["answer"],
                "question_embedding": generate_embedding(tokenized_question),
                "user_id": sheet_id,
            }
        )
    # All embeddings are built before the delete, so an error while embedding
    # leaves the previously stored documents untouched.
    deleteByUserId(sheet_id)
    # An empty sheet yields no documents; skip the insert rather than passing
    # an empty batch to the database layer.
    if questionAndAnswers:
        insertQuestionAndAnswers(questionAndAnswers)
# for index, article in enumerate(data):
# if(index< 6580):
# continue;
# if(len(str(article['title'])) == 0 or len(str(article['description'])) == 0 or len(str(article['link'])) == 0 ):
# continue
# tokenized_title = tokenize(article['title'])
# tokenized_description = tokenize(article['description'])
# article = {
# 'title': tokenized_title,
# 'description': tokenized_description,
# 'link': article['link'],
# # 'title_embedding': generate_embedding(tokenized_title),
# 'title_embedding': [],
# 'description_embedding': generate_embedding(tokenized_title + ": " + tokenized_description),
# }
# print(f"processed {index}/{len(articles)}")
# save_db(article)