# Module setup for the Djezzy retrieval chatbot helpers.
#
# Importing this module has side effects: it loads a sentence-embedding model,
# the FLAN-T5 translation model, and a FAISS vector index from the current
# working directory.

import json
import os
import pickle
import re
import shutil
import tempfile

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
from datasets import Dataset
from datasets import load_dataset
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import FAISS
from langdetect import DetectorFactory
from langdetect import detect
from langdetect import detect_langs
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Files that make up the persisted FAISS store (expected in the working directory).
index_source = 'index.faiss'
hh_source = 'index.pkl'

# Sentence-embedding model used both for the FAISS store and for query embedding.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_llm = SentenceTransformerEmbeddings(model_name=model_name)

# FLAN-T5 is used by pip() to translate French questions to English.
tokenizer1 = T5Tokenizer.from_pretrained("google/flan-t5-base")
model1 = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

# The index files are copied into a throw-away directory and loaded from there.
# NOTE(review): the reason for the copy is not visible in this file (possibly a
# workaround for paths FAISS cannot open directly) -- confirm before removing.
with tempfile.TemporaryDirectory() as temp_dir:
    index_target = os.path.join(temp_dir, 'index.faiss')
    hh_target = os.path.join(temp_dir, 'index.pkl')

    shutil.copy(index_source, index_target)
    shutil.copy(hh_source, hh_target)

    # SECURITY: allow_dangerous_deserialization=True lets load_local unpickle
    # index.pkl, which can execute arbitrary code. Only load index files that
    # were produced by a trusted process.
    vector_db = FAISS.load_local(temp_dir, embedding_llm, allow_dangerous_deserialization=True)


def load_data(text_filename='docs_text.json', embeddings_filename='docs_embeddings.json'):
    """Load document texts and their precomputed embeddings from two JSON files.

    Args:
        text_filename: Path of the JSON file holding the document texts.
        embeddings_filename: Path of the JSON file holding the embeddings.

    Returns:
        A ``(docs_text, docs_embeddings)`` tuple containing whatever each JSON
        file decodes to, unmodified.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If either file is not valid JSON.
    """
    # Both files are read as UTF-8 explicitly so decoding does not depend on
    # the platform's default encoding.
    with open(text_filename, 'r', encoding='utf-8') as f:
        docs_text = json.load(f)

    with open(embeddings_filename, 'r', encoding='utf-8') as f:
        docs_embeddings = json.load(f)

    return docs_text, docs_embeddings


def mot_cle(path):
    """Read a comma-separated keyword file and return the keywords as a list.

    Each keyword is stripped of surrounding whitespace and echoed to stdout,
    one per line. Entries that are empty after stripping (for example the one
    produced by a trailing comma or a final newline) are dropped.

    Args:
        path: Path of the text file containing the comma-separated keywords.

    Returns:
        The list of non-empty, stripped keywords in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Explicit encoding so accented keywords decode the same on every platform.
    with open(path, 'r', encoding='utf-8') as fichier:
        contenu = fichier.read()

    stripped = (mot.strip() for mot in contenu.split(','))
    tableau_de_mots = [mot for mot in stripped if mot]

    for mot in tableau_de_mots:
        print(mot)

    return tableau_de_mots


# --- Prompt building blocks ---------------------------------------------------
# Everything below is runtime text that ends up in the returned prompt. The
# wording (including its typos and spacing) is kept verbatim on purpose so the
# downstream model sees exactly the same instructions as before.
# NOTE(review): the [INST]/<<SYS>> markers suggest a Llama-2-style chat model
# consumes the prompt; the consumer is not visible in this file.

# Separator placed between retrieved passages inside the prompt context.
_CONTEXT_SEPARATOR = "\n---------------------\n"

# Similarity thresholds for the embedding fallback, from most to least confident.
_SIM_HIGH = 0.72
_SIM_MEDIUM = 0.65
_SIM_LOW = 0.4
_SIM_UNCLEAR = 0.3

# Canned answers used when retrieval finds nothing usable.
_OUT_OF_SCOPE_ANSWER = (
    "As a chatbot for Djezzy, I can provide information exclusively about our "
    "affiliated companies. Unfortunately, I'm unable to respond to inquiries "
    "outside of that scope."
)
_UNCLEAR_ANSWER = (
    "I apologize, I don't fully understand your question. You can contact "
    "our customer service for answers to your needs, or if you can provide "
    "more details, I would be happy to help."
)

# Shared pieces of the retrieval prompt.
_SYSTEM_OPEN = "[INST] <<SYS>>\n As Djezzy's chatbot\n"
_READ_ONCE = "read each paraphrase in the context and Answer the question .\n"
_READ_THRICE = "read 3 times each paraphrase in the context and Answer the question .\n"
_RULES = (
    "do not take into consideration the paragraphs which have no relation to the question\n"
    " if there is not a paragraph that is related to the question, respond that for this"
    " question it's best to reach out to our customer service team ."
    " They'll be able to assist you with your needs\n"
    " just give me the answer I don't want any other details \n"
)
_NO_CONTEXT_MENTION = " dont' mention that you used the provided context"
_PRICE_HINT = (
    " differentiates between each price and gives the correct answer and does not"
    " distinguish between the offers of each price\n"
)

# French "answer in French" clauses; each retrieval tier has its own wording.
_FR_CLAUSE_DIRECT = (
    "translate and give me the answer in french ,don't mention that you translate"
    " the answer ,don't write [frensh]<>"
)
_FR_CLAUSE_MEDIUM = (
    "translate the answer in french and write theme ,don't mention that you translate"
    " the answer , don't write [frensh]<>"
)
_FR_CLAUSE_LOW = (
    "translate the answer in french and write theme ,don't mention that you translate"
    " the answer and don't write [frensh]<>"
)


def _detect_main_language(text):
    """Return the code of the language langdetect rates most probable for *text*."""
    candidates = detect_langs(text)
    return max(candidates, key=lambda candidate: candidate.prob).lang


def _translate_french_to_english(text):
    """Translate *text* to English with the module-level FLAN-T5 model."""
    input_ids = tokenizer1(f"translate french to English: {text}", return_tensors="pt").input_ids
    outputs = model1.generate(input_ids, max_length=100)
    decoded = tokenizer1.decode(outputs[0])
    print(decoded)
    # The decoded string still contains <...> special tokens; strip them.
    return re.sub(r'<.*?>', '', decoded).strip()


def _match_keywords(query_text, mots_a_verifier):
    """Return the keywords of *mots_a_verifier* that occur in *query_text*.

    Matching is case-insensitive and whitespace-token based. Two-word matches
    are preferred; single-word matches are used only when no bigram matches.
    Keywords are returned with their original casing, in keyword-list order.
    """
    # NOTE(review): tokens keep their punctuation ("legend?" != "legend").
    words = query_text.lower().split()
    bigrams = {' '.join(pair) for pair in zip(words, words[1:])}
    keywords_by_lower = {mot.lower(): mot for mot in mots_a_verifier}

    matches = [original for lowered, original in keywords_by_lower.items() if lowered in bigrams]
    if not matches:
        unigrams = set(words)
        matches = [original for lowered, original in keywords_by_lower.items() if lowered in unigrams]
    return matches


def _rank_documents(query_text, docs_text, docs_embeddings):
    """Return ``(document, cosine_similarity)`` pairs, most similar first."""
    doc_matrix = np.asarray(docs_embeddings)
    if doc_matrix.size == 0:
        return []

    query_vector = np.asarray(embedding_llm.embed_query(query_text)).reshape(1, -1)
    # One vectorised call; ravel() turns the (n_docs, 1) result into plain scores.
    scores = cosine_similarity(doc_matrix, query_vector).ravel()
    # Stable sort on the negated scores keeps the original order among ties.
    order = np.argsort(-scores, kind="stable")

    documents = list(docs_text)
    # Texts and embeddings are paired positionally; ignore any unpaired surplus.
    limit = min(len(documents), len(scores))
    return [(documents[i], float(scores[i])) for i in order if i < limit]


def _rag_prompt(context, query_text, lang, fr_clause,
                en_read=_READ_ONCE, en_extra="", en_label_question=True):
    """Assemble the retrieval prompt for *lang* around *context* and *query_text*.

    French prompts always use the single-read instruction, the "don't mention
    the context" rule, the tier-specific *fr_clause* and a labelled question.
    The ``en_*`` arguments select the English variant for the tier.
    """
    if lang == 'fr':
        read_line = _READ_ONCE
        extra = f"{_NO_CONTEXT_MENTION}\n {fr_clause}"
        label_question = True
    else:
        read_line, extra, label_question = en_read, en_extra, en_label_question

    if label_question:
        question_part = f"###question: {query_text} [/INST]"
    else:
        question_part = f"{query_text}[/INST]"
    return f"{_SYSTEM_OPEN}{read_line}{_RULES}{extra} ###context:{context}<</SYS>>\n\n {question_part}"


def pip(question, docs_text, docs_embeddings, mots_a_verifier, vector_db):
    """Build the LLM prompt for a user question.

    The question's language is detected; French questions are translated to
    English before retrieval. Context is then retrieved in one of two ways:

    * If a known keyword occurs in the question, the best FAISS hit for each
      matched keyword (filtered on ``document == keyword``) forms the context.
    * Otherwise documents are ranked by cosine similarity to the question and
      the first non-empty tier decides the prompt: above 0.72 (top 1), above
      0.65 (top 2), above 0.4 (top 4 if at least 3 qualify, else top 1),
      at least 0.3 (canned "unclear question" answer), else a canned
      out-of-scope answer.

    When the detected language is French, the prompt additionally asks for the
    answer in French.

    Args:
        question: The user's question.
        docs_text: Document texts, positionally aligned with *docs_embeddings*.
        docs_embeddings: One embedding vector per document.
        mots_a_verifier: Keywords that trigger the filtered FAISS lookup.
        vector_db: Vector store exposing ``similarity_search(query, k, filter)``.

    Returns:
        The prompt string.
    """
    lang = _detect_main_language(question)
    print(lang)

    query_text = _translate_french_to_english(question) if lang == 'fr' else question

    # --- Keyword path: filtered vector-store lookup per matched keyword -------
    mots_trouves = _match_keywords(query_text, mots_a_verifier)
    if mots_trouves:
        passages = []
        for mot in mots_trouves:
            hits = vector_db.similarity_search(query_text, k=1, filter={'document': mot})
            # A keyword may have no indexed document; skip it instead of
            # failing with IndexError on hits[0].
            if hits:
                passages.append(hits[0].page_content)
        context = _CONTEXT_SEPARATOR.join(passages)
        return _rag_prompt(context, query_text, lang, _FR_CLAUSE_DIRECT)

    # --- Embedding path: tiered cosine-similarity fallback --------------------
    ranked = _rank_documents(query_text, docs_text, docs_embeddings)

    high = [doc for doc, score in ranked if score > _SIM_HIGH]
    if high:
        return _rag_prompt(
            _CONTEXT_SEPARATOR.join(high[:1]), query_text, lang, _FR_CLAUSE_DIRECT,
            en_read=_READ_THRICE, en_extra=_PRICE_HINT, en_label_question=False,
        )

    medium = [doc for doc, score in ranked if score > _SIM_MEDIUM]
    if medium:
        return _rag_prompt(_CONTEXT_SEPARATOR.join(medium[:2]), query_text, lang, _FR_CLAUSE_MEDIUM)

    low = [doc for doc, score in ranked if score > _SIM_LOW]
    if low:
        # Existing behaviour: four passages once at least three qualify,
        # otherwise only the best one.
        chosen = low[:4] if len(low) >= 3 else low[:1]
        return _rag_prompt(
            _CONTEXT_SEPARATOR.join(chosen), query_text, lang, _FR_CLAUSE_LOW,
            en_extra=_NO_CONTEXT_MENTION,
        )

    # No score exceeds _SIM_LOW here, so ">= _SIM_UNCLEAR" covers [0.3, 0.4]
    # including exactly 0.4, which previously fell through to out-of-scope.
    if any(score >= _SIM_UNCLEAR for _, score in ranked):
        print(_UNCLEAR_ANSWER)
        if lang == 'fr':
            # The French variant used to carry the out-of-scope answer by
            # mistake; it now carries the same apology as the English one.
            return (
                "for this question translate this answer in frensh and write theme,"
                "don't add anything and don't mention that you translate the answer :"
                f"{_UNCLEAR_ANSWER}"
            )
        return f"for this question write this answer and don't add anything: {_UNCLEAR_ANSWER}"

    print(_OUT_OF_SCOPE_ANSWER)
    if lang == 'fr':
        return (
            "for this question translate this answer in frensh and write theme ,"
            " don't add anything and don't mention that you translate the answer :"
            f"{_OUT_OF_SCOPE_ANSWER}"
        )
    return f" for this question write this answer and don't add anything :{_OUT_OF_SCOPE_ANSWER}"