from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging | |
import os | |
from datasets import load_dataset | |
import pandas as pd | |
import pyarrow as pa | |
import pyarrow.dataset as ds | |
from datasets import Dataset | |
import re | |
from langchain_community.embeddings import SentenceTransformerEmbeddings | |
from langchain_community.vectorstores import FAISS | |
from sklearn.metrics.pairwise import cosine_similarity | |
import json | |
import pickle | |
import numpy as np | |
# Sentence-embedding model shared by the FAISS index loader below and by
# question embedding at query time.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_llm = SentenceTransformerEmbeddings(model_name=model_name)

# SECURITY: allow_dangerous_deserialization=True permits unpickling of the
# stored index metadata. Only load index files that come from a trusted source.
loaded_vector_db = FAISS.load_local('index.faiss', embedding_llm, allow_dangerous_deserialization=True)
def load_data(text_filename='docs_text.json', embeddings_filename='docs_embeddings.json'):
    """Load the document texts and their embeddings from two JSON files.

    Args:
        text_filename: Path of the JSON file holding the document texts.
        embeddings_filename: Path of the JSON file holding the embeddings.

    Returns:
        A ``(docs_text, docs_embeddings)`` tuple containing the decoded JSON
        content of each file, in that order.
    """
    with open(text_filename, 'r', encoding='utf-8') as f:
        docs_text = json.load(f)
    # Same explicit encoding as the text file, so decoding never depends on
    # the platform's locale default.
    with open(embeddings_filename, 'r', encoding='utf-8') as f:
        docs_embeddings = json.load(f)
    return docs_text, docs_embeddings
# docs_text, docs_embeddings = load_data()
def mot_cle(path):
    """Read a comma-separated keyword file and return the keywords as a list.

    Args:
        path: Path of a text file whose content is keywords separated by
            commas.

    Returns:
        The keywords in file order, each stripped of surrounding whitespace.
        Blank entries (for example from a trailing comma or an empty file)
        are dropped.
    """
    # Decode explicitly as UTF-8 so accented keywords do not depend on the
    # platform's locale encoding.
    with open(path, 'r', encoding='utf-8') as fichier:
        contenu = fichier.read()

    # Split on commas, trim each entry and keep only the non-empty ones.
    tableau_de_mots = [mot.strip() for mot in contenu.split(',') if mot.strip()]

    # Echo the keywords so the loaded list can be checked by eye.
    for mot in tableau_de_mots:
        print(mot)

    return tableau_de_mots
# Cosine-similarity tiers, tried from most to least confident.
_SIM_STRONG = 0.72   # one best document, price-aware prompt
_SIM_GOOD = 0.65     # up to two documents
_SIM_WEAK = 0.4      # up to four documents (or one when fewer than three match)
_SIM_VAGUE = 0.3     # nothing usable, but the question is not fully off-topic

_CONTEXT_SEPARATOR = "\n---------------------\n"

_OUT_OF_SCOPE_MESSAGE = "As a chatbot for Djezzy, I can provide information exclusively about our affiliated companies. Unfortunately, I'm unable to respond to inquiries outside of that scope."
_UNCLEAR_QUESTION_MESSAGE = "I apologize, I don't fully understand your question. You can contact our customer service for answers to your needs, or if you can provide more details, I would be happy to help."


def _match_keywords(question, keywords):
    """Return the keywords (original casing) found in *question*; bigram matches win over single words."""
    words = question.lower().split()
    bigrams = {' '.join(pair) for pair in zip(words, words[1:])}
    by_lower = {keyword.lower(): keyword for keyword in keywords}

    matches = [keyword for lowered, keyword in by_lower.items() if lowered in bigrams]
    if not matches:
        single_words = set(words)
        matches = [keyword for lowered, keyword in by_lower.items() if lowered in single_words]
    return matches


def _build_prompt(context, query_text, *, strict=False):
    """Wrap *context* and *query_text* in the ``[INST] <<SYS>>`` instruction template."""
    if strict:
        # Variant used for the highest-confidence match: it additionally asks
        # the model to tell prices apart.
        return f"[INST] <<SYS>>\n As Djezzy's chatbot\nread 3 times each paraphrase in the context and Answer the question .\ndo not take into consideration the paragraphs which have no relation to the question\n if there is not a paragraph that is related to the question, respond that for this question it's best to reach out to our customer service team . They'll be able to assist you with your needs\n just give me the answer I don't want any other details \n differentiates between each price and gives the correct answer and does not distinguish between the offers of each price\n ###context:{context}<</SYS>>\n\n {query_text}[/INST]"
    return f"[INST] <<SYS>>\n As Djezzy's chatbot\nread each paraphrase in the context and Answer the question .\ndo not take into consideration the paragraphs which have no relation to the question\n if there is not a paragraph that is related to the question, respond that for this question it's best to reach out to our customer service team . They'll be able to assist you with your needs\n just give me the answer I don't want any other details \n ###context:{context}<</SYS>>\n\n ###question: {query_text} [/INST]"


def _rank_by_similarity(query_vector, docs_text, docs_embeddings):
    """Return ``(text, score)`` pairs sorted by descending cosine similarity to *query_vector*."""
    doc_matrix = np.asarray(docs_embeddings)
    if doc_matrix.size == 0:
        return []
    # One vectorised call instead of one call per document; ravel() turns the
    # (n, 1) result into plain per-document scores.
    scores = cosine_similarity(doc_matrix, query_vector.reshape(1, -1)).ravel()
    return sorted(zip(docs_text, scores), key=lambda pair: pair[1], reverse=True)


def _docs_above(ranked, threshold):
    """Return the texts of *ranked* whose score is strictly greater than *threshold*."""
    return [text for text, score in ranked if score > threshold]


def _record_fallback(message):
    """Print a canned reply and append it to the module-level ``generates`` list when one exists."""
    print(message)
    # NOTE(review): `generates` is not defined in the visible part of this
    # module; presumably it is a module-level list of produced answers.
    # Append only if it exists, so a missing global does not raise NameError.
    history = globals().get('generates')
    if history is not None:
        history.append(message)


def pip(question, docs_text, docs_embeddings, mots_a_verifier):
    """Build the LLM prompt for *question* from the most relevant documents.

    Retrieval strategy:

    1. If the question contains one of ``mots_a_verifier`` (matched
       case-insensitively, two-word keywords first, then single words), the
       FAISS index is queried once per matched keyword with a
       ``{'document': keyword}`` filter and the best hit of each is used as
       context.
    2. Otherwise (or when no keyword hit is returned) the question is embedded
       and compared to ``docs_embeddings`` by cosine similarity. Context is
       picked from the first non-empty tier: ``> 0.72`` (best document only,
       price-aware prompt), ``> 0.65`` (up to two documents), ``> 0.4`` (up to
       four documents when at least three match, otherwise one).
    3. If no document scores above ``0.4``, a canned reply is printed and
       recorded instead of building a prompt.

    Args:
        question: The user's question.
        docs_text: Document texts, aligned with ``docs_embeddings``.
        docs_embeddings: One embedding vector per entry of ``docs_text``.
        mots_a_verifier: Keywords to look for in the question.

    Returns:
        The prompt string, or ``None`` when no usable context was found and a
        canned reply was emitted instead.
    """
    query_text = question

    mots_trouves = _match_keywords(query_text, mots_a_verifier)
    if mots_trouves:
        keyword_hits = []
        for keyword in mots_trouves:
            # NOTE(review): the original referenced an undefined `vector_db`;
            # the index loaded at module level is `loaded_vector_db`.
            found = loaded_vector_db.similarity_search(
                query_text,
                k=1,
                filter={'document': keyword},
            )
            # The filtered search may return nothing; skip instead of
            # indexing into an empty list.
            if found:
                keyword_hits.append(found[0].page_content)
        if keyword_hits:
            return _build_prompt(_CONTEXT_SEPARATOR.join(keyword_hits), query_text)
        # No indexed document for the matched keywords: fall back to the
        # embedding-similarity search below.

    query_embedding_array = np.array(embedding_llm.embed_query(query_text))
    ranked = _rank_by_similarity(query_embedding_array, docs_text, docs_embeddings)

    strong = _docs_above(ranked, _SIM_STRONG)
    if strong:
        return _build_prompt(_CONTEXT_SEPARATOR.join(strong[:1]), query_text, strict=True)

    good = _docs_above(ranked, _SIM_GOOD)
    if good:
        return _build_prompt(_CONTEXT_SEPARATOR.join(good[:2]), query_text)

    weak = _docs_above(ranked, _SIM_WEAK)
    if weak:
        # With three or more candidates keep up to four; with fewer, only the
        # single best one is trusted.
        chosen = weak[:4] if len(weak) >= 3 else weak[:1]
        return _build_prompt(_CONTEXT_SEPARATOR.join(chosen), query_text)

    # Every score is <= 0.4 here, so ">= 0.3" selects the "vague" band and
    # also covers a score of exactly 0.4.
    is_vague = any(score >= _SIM_VAGUE for _, score in ranked)
    _record_fallback(_UNCLEAR_QUESTION_MESSAGE if is_vague else _OUT_OF_SCOPE_MESSAGE)
    # No prompt can be built on this path; the original hit an unbound
    # `prompt` at the final return.
    return None