Spaces:

tarekfer8
/

tarek

Sleeping

File size: 20,740 Bytes

702d7ab
3b3520e
a8f4003
ca0effb
2f2cdfe
a8f4003
d236dca
 
 
a8f4003
 
 
 
 
7d74414
c36a40b
1281b5f
702d7ab
c36a40b
 
db8ea96
 
28f53a4
00d851b
 
1d37135
9bf431d
1d37135
 
d236dca
 
1f67682
51d1a9e
 
db8ea96
 
00d851b
 
db8ea96
 
 
 
 
 
 
c396b0b
 
db8ea96
1281b5f
702d7ab
a8f4003
 
1281b5f
 
a8f4003
 
1281b5f
 
 
 
 
7e486cf
 
1281b5f
 
 
 
 
 
 
 
 
 
 
 
a8f4003
 
e1abe10
8920296
1281b5f
ba0d2a5
 
d236dca
 
 
be321c7
d236dca
 
 
 
 
 
 
 
 
 
 
 
 
 
1281b5f
 
 
 
 
 
 
 
301c860
1281b5f
 
 
 
 
 
 
 
 
3051da3
 
1281b5f
 
3051da3
1281b5f
 
 
301c860
1281b5f
 
 
 
 
 
 
 
 
 
38f4aec
d236dca
2867533
1281b5f
 
38f4aec
d236dca
2867533
d236dca
1275d53
1281b5f
 
640c692
1281b5f
ee90423
be321c7
fd04a06
1281b5f
 
 
 
 
 
 
 
 
 
640c692
1281b5f
ee90423
be321c7
ee90423
be321c7
1281b5f
 
 
 
 
 
 
 
 
 
 
640c692
1281b5f
ee90423
be321c7
ba0d2a5
be321c7
1281b5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
640c692
1281b5f
539436b
be321c7
ba0d2a5
be321c7
1281b5f



from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging

import os,torch
from datasets import load_dataset
from langdetect import detect
from langdetect import detect_langs
from langdetect import DetectorFactory
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
from datasets import Dataset
import re
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import FAISS
from sklearn.metrics.pairwise import cosine_similarity
import json
import pickle
import numpy as np
import shutil
import tempfile

# Module-level setup: everything below runs once at import time and loads the
# models and the vector index used by `pip`.

# File names of the serialized FAISS index and its pickled companion file,
# resolved relative to the current working directory.
index_source='index.faiss'
hh_source='index.pkl'
# Sentence-embedding model used both to load the FAISS store and to embed
# queries in `pip`.
model_name = "sentence-transformers/all-MiniLM-L6-v2"


embedding_llm = SentenceTransformerEmbeddings(model_name=model_name)

from transformers import T5Tokenizer, T5ForConditionalGeneration

# FLAN-T5 tokenizer/model pair; `pip` uses it to translate French questions
# into English before retrieval.
tokenizer1 = T5Tokenizer.from_pretrained("google/flan-t5-base")
model1 = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
# NOTE(review): the index files are copied into a throwaway directory and
# loaded from there; why they are not loaded in place is not evident from this
# file. The directory is removed when the `with` block exits, while
# `vector_db` stays bound at module level.
with tempfile.TemporaryDirectory() as temp_dir:
    # Paths of the target files inside the temporary directory
    index_target = os.path.join(temp_dir, 'index.faiss')
    hh_target = os.path.join(temp_dir, 'index.pkl')
    
    # Copy the files into the temporary directory
    shutil.copy(index_source, index_target)
    shutil.copy(hh_source, hh_target)
    
    # Load the FAISS database from the temporary directory.
    # NOTE(review): allow_dangerous_deserialization=True opts in to loading the
    # pickled `index.pkl`; only safe if that file comes from a trusted source.
    vector_db = FAISS.load_local(temp_dir, embedding_llm, allow_dangerous_deserialization=True)



def load_data(text_filename='docs_text.json', embeddings_filename='docs_embeddings.json'):
    """Load the document texts and their embeddings from two JSON files.

    Args:
        text_filename: Path of the JSON file holding the document texts.
        embeddings_filename: Path of the JSON file holding the embeddings.

    Returns:
        A ``(docs_text, docs_embeddings)`` tuple containing the decoded JSON
        content of each file, in that order.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If either file is not valid JSON.
    """
    with open(text_filename, 'r', encoding='utf-8') as text_file:
        docs_text = json.load(text_file)

    # Read with an explicit encoding as well, so both files are decoded the
    # same way regardless of the platform's locale default.
    with open(embeddings_filename, 'r', encoding='utf-8') as embeddings_file:
        docs_embeddings = json.load(embeddings_file)

    return docs_text, docs_embeddings
#docs_text, docs_embeddings = load_data()
def mot_cle(path):
    """Read a comma-separated keyword file and return its keywords.

    Each entry is stripped of surrounding whitespace (including newlines).
    Entries that are empty after stripping, e.g. produced by a trailing comma
    or an empty file, are dropped. Every kept keyword is also printed, one per
    line, as a quick visual check.

    Args:
        path: Path of the text file containing the keywords.

    Returns:
        The list of keywords in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Explicit UTF-8 so accented keywords decode the same on every platform.
    with open(path, 'r', encoding='utf-8') as fichier:
        contenu = fichier.read()

    # Split on commas, trim each entry and discard the empty ones.
    candidats = (mot.strip() for mot in contenu.split(','))
    tableau_de_mots = [mot for mot in candidats if mot]

    # Print the keywords for verification.
    for mot in tableau_de_mots:
        print(mot)

    return tableau_de_mots



# Separator inserted between retrieved passages inside a prompt.
_CONTEXT_SEPARATOR = "\n---------------------\n"

# Canned answer used when nothing in the corpus is close to the question.
_OUT_OF_SCOPE_MESSAGE = (
    "As a chatbot for Djezzy, I can provide information exclusively about our "
    "affiliated companies. Unfortunately, I'm unable to respond to inquiries "
    "outside of that scope."
)

# Canned answer used when the corpus is only weakly related to the question.
_CLARIFY_MESSAGE = (
    "I apologize, I don't fully understand your question. You can contact our "
    "customer service for answers to your needs, or if you can provide more "
    "details, I would be happy to help."
)

# Instruction text shared verbatim by every retrieval prompt (it follows the
# leading "read " / "read 3 times  " of each prompt).
_PROMPT_RULES = (
    "each paraphrase in the context and Answer  the  question .\n"
    "do not take into consideration the paragraphs which have no relation to the question\n"
    " if there is not a paragraph that is related to the question, respond that for this question it's best to reach out to our customer service team . They'll be able to assist you with your needs\n"
    " just give me the answer I don't want any other details \n"
)


def _translate_french_to_english(text):
    """Translate *text* to English with the module-level FLAN-T5 model."""
    input_ids = tokenizer1(f"translate french to English: {text}", return_tensors="pt").input_ids
    outputs = model1.generate(input_ids, max_length=100)
    decoded = tokenizer1.decode(outputs[0])
    print(decoded)
    # Strip anything written as <...> from the decoded output, then trim.
    return re.sub(r'<.*?>', '', decoded).strip()


def _match_keywords(question, keywords):
    """Return the keywords equal to a bigram of *question*, else to a word.

    Matching is case-insensitive; the returned values keep the spelling found
    in *keywords*. Single-word matches are only considered when no bigram
    matched.
    """
    words = question.lower().split()
    bigrams = {' '.join(pair) for pair in zip(words, words[1:])}
    by_lowercase = {keyword.lower(): keyword for keyword in keywords}

    matches = [original for lowered, original in by_lowercase.items() if lowered in bigrams]
    if not matches:
        unigrams = set(words)
        matches = [original for lowered, original in by_lowercase.items() if lowered in unigrams]
    return matches


def _keyword_prompt(query_text, original_question, is_french, keywords, vector_db):
    """Build the prompt from the best passage of each matched keyword."""
    passages = []
    for keyword in keywords:
        hits = vector_db.similarity_search(query_text, k=1, filter={'document': keyword})
        # A keyword may have no indexed passage; skip it instead of failing.
        if hits:
            passages.append(hits[0].page_content)
    context = _CONTEXT_SEPARATOR.join(passages)
    print(context)

    if is_french:
        return f" As Djezzy's chatbot\nread {_PROMPT_RULES} dont' mention that you used the provided context\n give me the  answer in french language  \n ###context:{context}\n  ###question: {original_question}"
    return f" As Djezzy's chatbot\nread {_PROMPT_RULES} ###context:{context}\n  ###question: {query_text} "


def _similarity_prompt(query_text, original_question, is_french, docs_text, docs_embeddings):
    """Build the prompt from embedding similarity, tiered by score."""
    query_vector = np.array(embedding_llm.embed_query(query_text)).reshape(1, -1)
    doc_matrix = np.array(docs_embeddings)
    if doc_matrix.size:
        # One call scores every document row against the query.
        similarities = cosine_similarity(doc_matrix, query_vector).ravel()
    else:
        similarities = np.array([])
    print(similarities)

    # Highest score first; sorted() is stable, so ties keep corpus order.
    ranked = sorted(zip(docs_text, similarities), key=lambda pair: pair[1], reverse=True)

    def above(threshold):
        return [doc for doc, score in ranked if score > threshold]

    # Tier 1: a very close match; use only the single best passage.
    very_close = above(0.72)
    if very_close:
        context = _CONTEXT_SEPARATOR.join(very_close[:1])
        print(context)
        if is_french:
            return f" As Djezzy's chatbot\nread {_PROMPT_RULES} dont' mention that you used the provided context\n translate the answer in french ,don't mention that you translate the answer\n   ###context:{context}\n  ###question: {original_question} "
        return f"As Djezzy's chatbot\nread 3 times  {_PROMPT_RULES} differentiates between each price and gives the correct answer and does not distinguish between the offers of each price\n ###context:{context}\n {query_text}"

    # Tier 2: close matches; use up to the two best passages.
    close = above(0.65)
    if close:
        context = _CONTEXT_SEPARATOR.join(close[:2])
        print(context)
        if is_french:
            return f" As Djezzy's chatbot\nread {_PROMPT_RULES} dont' mention that you used the provided context\n translate the answer in french and write theme ,don't mention that you translate the answer\n   ###context:{context}\n  ###question: {query_text}"
        return f"  As Djezzy's chatbot\nread {_PROMPT_RULES} ###context:{context}\n  ###question: {query_text} "

    # Tier 3: loosely related; up to four passages when at least three
    # qualify, otherwise only the best one.
    related = above(0.4)
    if related:
        chosen = related[:4] if len(related) >= 3 else related[:1]
        context = _CONTEXT_SEPARATOR.join(chosen)
        print(context)
        if is_french:
            return f"[INST] <<SYS>>\n As Djezzy's chatbot\nread {_PROMPT_RULES} dont' mention that you used the provided context\n translate the answer in french and write theme ,don't mention that you translate the answer \n   ###context:{context}<</SYS>>\n\n  ###question: {query_text} [/INST]"
        return f"As Djezzy's chatbot\nread {_PROMPT_RULES} dont' mention that you used the provided context  ###context:{context}\n  ###question: {query_text} "

    # Tier 4: only weak matches in [0.3, 0.4); ask the user to clarify.
    if any(0.3 <= score < 0.4 for _, score in ranked):
        print(_CLARIFY_MESSAGE)
        if is_french:
            return "for this question translate this answer in frensh  and write theme,don't add anything and don't mention that you translate the answer :" + _CLARIFY_MESSAGE
        return "for this question write this answer and don't add anything: " + _CLARIFY_MESSAGE

    # Nothing relevant at all: decline as out of scope.
    print(_OUT_OF_SCOPE_MESSAGE)
    if is_french:
        return "for this question translate this answer in frensh  and write theme , don't add anything and don't mention that you translate the answer :" + _OUT_OF_SCOPE_MESSAGE
    return " for this question write this answer and don't add anything  :" + _OUT_OF_SCOPE_MESSAGE


def pip(question,docs_text, docs_embeddings,mots_a_verifier,vector_db):
    """Build the LLM prompt answering *question* from the document corpus.

    The question's language is detected; a French question is translated to
    English for retrieval. If the (translated) question contains one of the
    known keywords, the context is fetched from ``vector_db`` filtered on that
    keyword. Otherwise the context is chosen by cosine similarity between the
    question embedding and ``docs_embeddings``, with canned fallback prompts
    when no document is similar enough. No text generation happens here; the
    prompt is only built and returned. Intermediate values are printed as a
    trace.

    Args:
        question: The user's question.
        docs_text: Document texts, aligned index-by-index with
            ``docs_embeddings``.
        docs_embeddings: One embedding vector per document.
        mots_a_verifier: Keywords to look for in the question; a matched
            keyword is used as the ``document`` metadata filter value.
        vector_db: Vector store exposing ``similarity_search(query, k, filter)``.

    Returns:
        The prompt string to send to the language model.

    Raises:
        Errors from ``detect_langs`` propagate unchanged (presumably raised
        for text with no detectable language - TODO confirm).
    """
    q1 = question
    print(q1)

    most_likely = max(detect_langs(question), key=lambda candidate: candidate.prob)
    lang = most_likely.lang
    print(lang)
    is_french = lang == 'fr'

    # Retrieval runs on English text; the original wording is kept for the
    # French prompts that quote the question as asked.
    query_text = _translate_french_to_english(question) if is_french else question
    print(query_text)

    mots_trouves = _match_keywords(query_text, mots_a_verifier)
    if mots_trouves:
        return _keyword_prompt(query_text, q1, is_french, mots_trouves, vector_db)
    return _similarity_prompt(query_text, q1, is_french, docs_text, docs_embeddings)