File size: 20,740 Bytes
702d7ab 3b3520e a8f4003 ca0effb 2f2cdfe a8f4003 d236dca a8f4003 7d74414 c36a40b 1281b5f 702d7ab c36a40b db8ea96 28f53a4 00d851b 1d37135 9bf431d 1d37135 d236dca 1f67682 51d1a9e db8ea96 00d851b db8ea96 c396b0b db8ea96 1281b5f 702d7ab a8f4003 1281b5f a8f4003 1281b5f 7e486cf 1281b5f a8f4003 e1abe10 8920296 1281b5f ba0d2a5 d236dca be321c7 d236dca 1281b5f 301c860 1281b5f 3051da3 1281b5f 3051da3 1281b5f 301c860 1281b5f 38f4aec d236dca 2867533 1281b5f 38f4aec d236dca 2867533 d236dca 1275d53 1281b5f 640c692 1281b5f ee90423 be321c7 fd04a06 1281b5f 640c692 1281b5f ee90423 be321c7 ee90423 be321c7 1281b5f 640c692 1281b5f ee90423 be321c7 ba0d2a5 be321c7 1281b5f 640c692 1281b5f 539436b be321c7 ba0d2a5 be321c7 1281b5f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 |
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging
import os,torch
from datasets import load_dataset
from langdetect import detect
from langdetect import detect_langs
from langdetect import DetectorFactory
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
from datasets import Dataset
import re
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import FAISS
from sklearn.metrics.pairwise import cosine_similarity
import json
import pickle
import numpy as np
import shutil
import tempfile
# Relative paths of the two files that make up the saved FAISS store
# (the index itself and its pickled companion file).
index_source='index.faiss'
hh_source='index.pkl'
# Sentence-embedding model: handed to FAISS.load_local below and used by
# pip() to embed the incoming question.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_llm = SentenceTransformerEmbeddings(model_name=model_name)
from transformers import T5Tokenizer, T5ForConditionalGeneration
# FLAN-T5 tokenizer/model pair; pip() uses them to translate questions
# detected as French into English before retrieval.
tokenizer1 = T5Tokenizer.from_pretrained("google/flan-t5-base")
model1 = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
# Load the FAISS vector store from a scratch copy of its two files.
with tempfile.TemporaryDirectory() as temp_dir:
    # Stage the index file inside the temporary directory.
    index_target = os.path.join(temp_dir, 'index.faiss')
    shutil.copy(index_source, index_target)
    # Stage the pickled companion file next to it.
    hh_target = os.path.join(temp_dir, 'index.pkl')
    shutil.copy(hh_source, hh_target)
    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
    # pickled data -- only use index files that come from a trusted source.
    # NOTE(review): the directory is deleted when this block exits; presumably
    # the store is fully loaded in memory by then -- confirm.
    vector_db = FAISS.load_local(
        temp_dir,
        embedding_llm,
        allow_dangerous_deserialization=True,
    )
def load_data(text_filename='docs_text.json', embeddings_filename='docs_embeddings.json'):
    """Load the document texts and their embeddings from two JSON files.

    Args:
        text_filename: Path of the JSON file holding the document texts.
        embeddings_filename: Path of the JSON file holding the embeddings.

    Returns:
        A tuple ``(docs_text, docs_embeddings)`` with the parsed content of
        the two files, in that order.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If either file is not valid JSON.
    """
    with open(text_filename, 'r', encoding='utf-8') as text_file:
        docs_text = json.load(text_file)
    # Decode the embeddings file as UTF-8 too, so both files are read the same
    # way regardless of the platform's default encoding.
    with open(embeddings_filename, 'r', encoding='utf-8') as embeddings_file:
        docs_embeddings = json.load(embeddings_file)
    return docs_text, docs_embeddings
#docs_text, docs_embeddings = load_data()
def mot_cle(path):
    """Read a comma-separated keyword file and return its keywords.

    The file content is split on commas and each piece is stripped of
    surrounding whitespace. Every keyword is also printed, one per line, so
    the loaded list can be checked visually.

    Args:
        path: Path of the text file containing the comma-separated keywords.

    Returns:
        The list of stripped keywords in file order. Empty pieces (for
        example after a trailing comma) are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Explicit UTF-8 so accented keywords decode the same on every platform.
    with open(path, 'r', encoding='utf-8') as fichier:
        contenu = fichier.read()
    # Split and strip once, then reuse the list for printing and returning.
    tableau_de_mots = [mot.strip() for mot in contenu.split(',')]
    for mot in tableau_de_mots:
        print(mot)
    return tableau_de_mots
# Separator placed between retrieved passages in the prompt context.
_CONTEXT_SEPARATOR = "\n---------------------\n"

# Canned answers used when retrieval finds nothing usable.
_OUT_OF_SCOPE_ANSWER = "As a chatbot for Djezzy, I can provide information exclusively about our affiliated companies. Unfortunately, I'm unable to respond to inquiries outside of that scope."
_UNCLEAR_ANSWER = "I apologize, I don't fully understand your question. You can contact our customer service for answers to your needs, or if you can provide more details, I would be happy to help."


def _translate_fr_to_en(text):
    """Translate *text* to English with the module-level tokenizer1/model1."""
    input_text = f"translate french to English: {text}"
    input_ids = tokenizer1(input_text, return_tensors="pt").input_ids
    outputs = model1.generate(input_ids, max_length=100)
    decoded = tokenizer1.decode(outputs[0])
    print(decoded)
    # Drop anything written between angle brackets (presumably the special
    # tokens left in the decoded text), then trim surrounding whitespace.
    return re.sub(r'<.*?>', '', decoded).strip()


def _match_keywords(question, mots_a_verifier):
    """Return the keywords of *mots_a_verifier* that occur in *question*.

    Matching is case-insensitive. Two-word sequences of the question are
    tried first; single words are only tried when no two-word match exists.
    Keywords are returned with their original casing, in keyword order.
    """
    words = question.lower().split()
    keywords_by_lower = {mot.lower(): mot for mot in mots_a_verifier}
    bigrams = {' '.join(pair) for pair in zip(words, words[1:])}
    found = [original for lowered, original in keywords_by_lower.items() if lowered in bigrams]
    if not found:
        unigrams = set(words)
        found = [original for lowered, original in keywords_by_lower.items() if lowered in unigrams]
    return found


def _rank_by_similarity(query_text, docs_text, docs_embeddings):
    """Return ``(doc, similarity)`` pairs sorted by decreasing cosine similarity."""
    query_vector = np.array(embedding_llm.embed_query(query_text)).reshape(1, -1)
    # One call per document keeps an empty embedding list valid; the 1x1
    # result is converted to a plain float so sorting compares numbers.
    similarities = [
        float(cosine_similarity(doc.reshape(1, -1), query_vector)[0, 0])
        for doc in np.array(docs_embeddings)
    ]
    print(similarities)
    return sorted(zip(docs_text, similarities), key=lambda pair: pair[1], reverse=True)


def pip(question, docs_text, docs_embeddings, mots_a_verifier, vector_db):
    """Build the LLM prompt for a user question.

    The question's language is detected; a French question is translated to
    English before retrieval. If the (translated) question contains one of
    the known keywords, one passage per keyword is fetched from *vector_db*.
    Otherwise the documents are ranked by cosine similarity to the question
    and the context is chosen by similarity tier (> 0.72, > 0.65, > 0.4).
    When no document passes 0.4, a canned answer is requested instead: an
    apology if some document scores in [0.3, 0.4), an out-of-scope message
    otherwise. French questions get prompts asking for a French answer.

    This function only builds and returns the prompt; it does not call a
    generation model. Progress information is printed to stdout.

    Args:
        question: The user's question.
        docs_text: Document texts, parallel to *docs_embeddings*.
        docs_embeddings: One embedding vector per document.
        mots_a_verifier: Keywords to look for in the question.
        vector_db: Store exposing ``similarity_search(query, k=..., filter=...)``
            whose results have a ``page_content`` attribute.

    Returns:
        The prompt string.
    """
    # NOTE(review): detect_langs is called unguarded -- confirm how it behaves
    # on empty or non-textual input.
    q1 = question  # original wording, reused in some of the French prompts
    print(q1)
    detected_languages = detect_langs(question)
    lang = max(detected_languages, key=lambda candidate: candidate.prob).lang
    print(lang)
    is_french = lang == 'fr'

    query_text = _translate_fr_to_en(question) if is_french else question
    print(query_text)

    # --- Keyword path: one filtered vector search per keyword found. ---
    mots_trouves = _match_keywords(query_text, mots_a_verifier)
    if mots_trouves:
        hits = []
        for keyword in mots_trouves:
            result = vector_db.similarity_search(
                query_text,
                k=1,
                filter={'document': keyword},
            )
            # A filtered search may return nothing; skip it instead of
            # failing on result[0].
            if result:
                hits.append(result[0])
        context = _CONTEXT_SEPARATOR.join(hit.page_content for hit in hits)
        print(context)
        if is_french:
            return f" As Djezzy's chatbot\nread each paraphrase in the context and Answer the question .\ndo not take into consideration the paragraphs which have no relation to the question\n if there is not a paragraph that is related to the question, respond that for this question it's best to reach out to our customer service team . They'll be able to assist you with your needs\n just give me the answer I don't want any other details \n dont' mention that you used the provided context\n give me the answer in french language \n ###context:{context}\n ###question: {q1}"
        return f" As Djezzy's chatbot\nread each paraphrase in the context and Answer the question .\ndo not take into consideration the paragraphs which have no relation to the question\n if there is not a paragraph that is related to the question, respond that for this question it's best to reach out to our customer service team . They'll be able to assist you with your needs\n just give me the answer I don't want any other details \n ###context:{context}\n ###question: {query_text} "

    # --- Similarity path: pick the context by similarity tier. ---
    ranked = _rank_by_similarity(query_text, docs_text, docs_embeddings)

    # Tier 1: very close match -> only the single best document.
    strong = [doc for doc, sim in ranked if sim > 0.72]
    if strong:
        context = _CONTEXT_SEPARATOR.join(strong[:1])
        print(context)
        if is_french:
            return f" As Djezzy's chatbot\nread each paraphrase in the context and Answer the question .\ndo not take into consideration the paragraphs which have no relation to the question\n if there is not a paragraph that is related to the question, respond that for this question it's best to reach out to our customer service team . They'll be able to assist you with your needs\n just give me the answer I don't want any other details \n dont' mention that you used the provided context\n translate the answer in french ,don't mention that you translate the answer\n ###context:{context}\n ###question: {q1} "
        return f"As Djezzy's chatbot\nread 3 times each paraphrase in the context and Answer the question .\ndo not take into consideration the paragraphs which have no relation to the question\n if there is not a paragraph that is related to the question, respond that for this question it's best to reach out to our customer service team . They'll be able to assist you with your needs\n just give me the answer I don't want any other details \n differentiates between each price and gives the correct answer and does not distinguish between the offers of each price\n ###context:{context}\n {query_text}"

    # Tier 2: good match -> up to the two best documents.
    good = [doc for doc, sim in ranked if sim > 0.65]
    if good:
        context = _CONTEXT_SEPARATOR.join(good[:2])
        print(context)
        if is_french:
            return f" As Djezzy's chatbot\nread each paraphrase in the context and Answer the question .\ndo not take into consideration the paragraphs which have no relation to the question\n if there is not a paragraph that is related to the question, respond that for this question it's best to reach out to our customer service team . They'll be able to assist you with your needs\n just give me the answer I don't want any other details \n dont' mention that you used the provided context\n translate the answer in french and write theme ,don't mention that you translate the answer\n ###context:{context}\n ###question: {query_text}"
        return f" As Djezzy's chatbot\nread each paraphrase in the context and Answer the question .\ndo not take into consideration the paragraphs which have no relation to the question\n if there is not a paragraph that is related to the question, respond that for this question it's best to reach out to our customer service team . They'll be able to assist you with your needs\n just give me the answer I don't want any other details \n ###context:{context}\n ###question: {query_text} "

    # Tier 3: loose match -> up to four documents when at least three
    # qualify, otherwise only the best one (so two matches yield one).
    fair = [doc for doc, sim in ranked if sim > 0.4]
    if fair:
        selected = fair[:4] if len(fair) >= 3 else fair[:1]
        context = _CONTEXT_SEPARATOR.join(selected)
        print(context)
        if is_french:
            return f"[INST] <<SYS>>\n As Djezzy's chatbot\nread each paraphrase in the context and Answer the question .\ndo not take into consideration the paragraphs which have no relation to the question\n if there is not a paragraph that is related to the question, respond that for this question it's best to reach out to our customer service team . They'll be able to assist you with your needs\n just give me the answer I don't want any other details \n dont' mention that you used the provided context\n translate the answer in french and write theme ,don't mention that you translate the answer \n ###context:{context}<</SYS>>\n\n ###question: {query_text} [/INST]"
        return f"As Djezzy's chatbot\nread each paraphrase in the context and Answer the question .\ndo not take into consideration the paragraphs which have no relation to the question\n if there is not a paragraph that is related to the question, respond that for this question it's best to reach out to our customer service team . They'll be able to assist you with your needs\n just give me the answer I don't want any other details \n dont' mention that you used the provided context ###context:{context}\n ###question: {query_text} "

    # No usable context: ask the model to repeat a canned answer.
    marginal = [doc for doc, sim in ranked if 0.3 <= sim < 0.4]
    if marginal:
        # Something is vaguely related: apologise and ask for details.
        print(_UNCLEAR_ANSWER)
        if is_french:
            # The French variant asks for the apology to be translated, so
            # both languages deliver the same message in this branch.
            return f"for this question translate this answer in frensh and write theme,don't add anything and don't mention that you translate the answer :{_UNCLEAR_ANSWER}"
        return f"for this question write this answer and don't add anything: {_UNCLEAR_ANSWER}"

    # Nothing related at all: the question is out of scope.
    print(_OUT_OF_SCOPE_ANSWER)
    if is_french:
        return f"for this question translate this answer in frensh and write theme , don't add anything and don't mention that you translate the answer :{_OUT_OF_SCOPE_ANSWER}"
    return f" for this question write this answer and don't add anything :{_OUT_OF_SCOPE_ANSWER}"
|