Spaces:
Runtime error
Runtime error
import os
from urllib.parse import quote

import pandas as pd
import pymongo
from pyvi.ViTokenizer import tokenize

from services.generate_embedding import generate_embedding
# Default Google Sheet id and tab name. Nothing in this part of the file reads
# them; presumably callers pass them to indexData() -- TODO confirm.
SHEET_ID = "1MKB6MHgL_lrPB1I69fj2VcVrgmSAMLVNZR1EwSyTSeA"
SHEET_NAME = "Q&A"

# Connect DB
# The connection string embeds credentials, so it should come from the
# environment rather than from source control. MONGODB_URI takes precedence;
# the literal is kept only as a backward-compatible fallback for deployments
# that do not set the variable yet.
client = pymongo.MongoClient(
    os.environ.get(
        "MONGODB_URI",
        "mongodb+srv://rag:[email protected]/?retryWrites=true&w=majority&appName=RAG",
    )
)
db = client.rag
collection = db.questionAndAnswers
def insertQuestionAndAnswers(questionAndAnswers):
    """Bulk-insert the given documents into the module-level ``collection``.

    Args:
        questionAndAnswers: The documents to insert, forwarded unchanged to
            ``collection.insert_many``.

    Returns:
        Whatever ``collection.insert_many`` returns.
    """
    insert_result = collection.insert_many(questionAndAnswers)
    return insert_result
def deleteByUserId(user_id: str):
    """Delete every document whose ``user_id`` field equals *user_id*.

    Args:
        user_id: Value matched against the ``user_id`` field.

    Returns:
        Whatever ``collection.delete_many`` returns.
    """
    user_filter = {'user_id': user_id}
    return collection.delete_many(user_filter)
def readDataFromGoogleSheet(sheet_id: str, sheet_name: str):
    """Download one tab of a Google Sheet as CSV and return its Q&A rows.

    The sheet is fetched through the ``gviz/tq`` CSV export URL and must
    expose ``Question`` and ``Answer`` columns.

    Args:
        sheet_id: Identifier of the spreadsheet (the ``/d/<id>/`` URL part).
        sheet_name: Name of the tab to read.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts, one per row that
        has both cells filled in.

    Raises:
        KeyError: If the ``Question`` or ``Answer`` column is missing.
    """
    # Percent-encode the tab name: a raw "&" (as in "Q&A"), space or "#" would
    # otherwise end the ``sheet`` query parameter early and request a
    # different tab than the one asked for.
    encoded_sheet_name = quote(sheet_name, safe="")
    url = (
        f"https://docs.google.com/spreadsheets/d/{sheet_id}"
        f"/gviz/tq?tqx=out:csv&sheet={encoded_sheet_name}"
    )
    df = pd.read_csv(url)

    # Empty cells are parsed as NaN (a float); skip incomplete rows instead of
    # handing a non-string question/answer to the callers.
    df = df.dropna(subset=["Question", "Answer"])

    items = [
        {"question": question, "answer": answer}
        for question, answer in zip(df["Question"], df["Answer"])
    ]
    # Report the number of rows kept; ``df.size`` would be rows * columns.
    print(f'read from google sheet {len(items)} items')
    return items
def indexData(sheet_id: str, sheet_name: str):
    """Rebuild the stored question/answer documents for one Google Sheet.

    Reads the rows via ``readDataFromGoogleSheet``, tokenizes each question,
    computes its embedding, then replaces every document previously stored
    under ``user_id == sheet_id`` with the freshly built ones.

    Args:
        sheet_id: Identifier of the spreadsheet; also stored as ``user_id`` on
            every document and used to select the documents to replace.
        sheet_name: Name of the tab to read.
    """
    items = readDataFromGoogleSheet(sheet_id, sheet_name)

    questionAndAnswers = []
    for item in items:
        tokenized_question = tokenize(item["question"])
        questionAndAnswers.append(
            {
                # The tokenized form is what gets stored and embedded.
                "question": tokenized_question,
                "answer": item["answer"],
                "question_embedding": generate_embedding(tokenized_question),
                "user_id": sheet_id,
            }
        )

    # Every document is built before anything is deleted, so a failure while
    # tokenizing or embedding leaves the existing data untouched.
    deleteByUserId(sheet_id)

    # pymongo's insert_many rejects an empty document list with an exception;
    # an empty sheet therefore just clears the previous documents.
    if questionAndAnswers:
        insertQuestionAndAnswers(questionAndAnswers)
# for index, article in enumerate(data): | |
# if(index< 6580): | |
# continue; | |
# if(len(str(article['title'])) == 0 or len(str(article['description'])) == 0 or len(str(article['link'])) == 0 ): | |
# continue | |
# tokenized_title = tokenize(article['title']) | |
# tokenized_description = tokenize(article['description']) | |
# article = { | |
# 'title': tokenized_title, | |
# 'description': tokenized_description, | |
# 'link': article['link'], | |
# # 'title_embedding': generate_embedding(tokenized_title), | |
# 'title_embedding': [], | |
# 'description_embedding': generate_embedding(tokenized_title + ": " + tokenized_description), | |
# } | |
# print(f"processed {index}/{len(articles)}") | |
# save_db(article) | |