Spaces:

tarekfer8
/

tarek

Sleeping

App Files Files Community

tarek / djezzy.py

tarekfer8

Update djezzy.py

b643fd1 verified about 1 year ago

raw

history blame

18 kB



	from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging

	import os
	from datasets import load_dataset
	from langdetect import detect
	from langdetect import detect_langs
	from langdetect import DetectorFactory
	import pandas as pd
	import pyarrow as pa
	import pyarrow.dataset as ds
	from datasets import Dataset
	import re
	from langchain_community.embeddings import SentenceTransformerEmbeddings
	from langchain_community.vectorstores import FAISS
	from sklearn.metrics.pairwise import cosine_similarity
	import json
	import pickle
	import numpy as np
	import shutil
	import tempfile
	# Files of the persisted FAISS store, resolved relative to the working directory.
	index_source = 'index.faiss'
	hh_source = 'index.pkl'
	model_name = "sentence-transformers/all-MiniLM-L6-v2"

	# Sentence-embedding model: used to embed queries and handed to FAISS.load_local below.
	embedding_llm = SentenceTransformerEmbeddings(model_name=model_name)

	from transformers import T5Tokenizer, T5ForConditionalGeneration

	# FLAN-T5 tokenizer/model pair, used by pip() to translate French questions to English.
	tokenizer1 = T5Tokenizer.from_pretrained("google/flan-t5-base")
	model1 = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

	# Stage both index files in a scratch directory and load the store from there.
	# Everything that needs temp_dir must stay inside the `with`: the directory is
	# deleted as soon as the block exits.
	with tempfile.TemporaryDirectory() as temp_dir:
	    # Target paths inside the temporary directory.
	    index_target = os.path.join(temp_dir, 'index.faiss')
	    hh_target = os.path.join(temp_dir, 'index.pkl')

	    # Copy the files into the temporary directory.
	    shutil.copy(index_source, index_target)
	    shutil.copy(hh_source, hh_target)

	    # Load the FAISS database from the temporary directory.
	    # SECURITY: allow_dangerous_deserialization=True makes load_local unpickle
	    # index.pkl, which can execute arbitrary code. Only ship an index.pkl that
	    # was produced by a trusted build of this project.
	    vector_db = FAISS.load_local(temp_dir, embedding_llm, allow_dangerous_deserialization=True)



	def load_data(text_filename='docs_text.json', embeddings_filename='docs_embeddings.json'):
	    """Load the document texts and their embeddings from two JSON files.

	    Args:
	        text_filename: Path of the JSON file holding the document texts.
	        embeddings_filename: Path of the JSON file holding the embeddings.

	    Returns:
	        A ``(docs_text, docs_embeddings)`` tuple containing the decoded JSON
	        content of each file, in that order.

	    Raises:
	        OSError: If either file cannot be opened.
	        json.JSONDecodeError: If either file is not valid JSON.
	    """
	    with open(text_filename, 'r', encoding='utf-8') as f:
	        docs_text = json.load(f)

	    # Read with the same explicit encoding as the text file rather than the
	    # platform default, so both files load identically on every machine.
	    with open(embeddings_filename, 'r', encoding='utf-8') as f:
	        docs_embeddings = json.load(f)

	    return docs_text, docs_embeddings
	#docs_text, docs_embeddings = load_data()
	def mot_cle(path):
	    """Read a comma-separated keyword file and return the keywords as a list.

	    Each keyword is stripped of surrounding whitespace. Empty entries (for
	    example from a trailing comma or an empty file) are dropped so callers
	    never receive ``''`` as a keyword.

	    Args:
	        path: Path of the keyword file.

	    Returns:
	        The keywords in file order.

	    Raises:
	        OSError: If the file cannot be opened.
	    """
	    # Explicit UTF-8 so accented keywords decode the same way on every platform.
	    with open(path, 'r', encoding='utf-8') as fichier:
	        contenu = fichier.read()

	    # Split on commas, trim each entry, and keep only the non-empty ones.
	    return [mot.strip() for mot in contenu.split(',') if mot.strip()]



	# Separator placed between context passages inside a prompt.
	_CONTEXT_SEPARATOR = "\n---------------------\n"

	# Cosine-similarity cut-offs used when no keyword matches the question.
	_SIM_STRICT = 0.72
	_SIM_MEDIUM = 0.65
	_SIM_LOOSE = 0.4
	_SIM_WEAK = 0.3

	# Shared fragments of the prompt templates ([INST] / <<SYS>> chat markers).
	_PROMPT_HEAD = "[INST] <<SYS>>\n As Djezzy's chatbot\n"
	_PROMPT_RULES = (
	    "do not take into consideration the paragraphs which have no relation to the question\n"
	    " if there is not a paragraph that is related to the question, respond that for this question"
	    " it's best to reach out to our customer service team . They'll be able to assist you with your needs\n"
	    " just give me the answer I don't want any other details \n"
	)

	# Template for keyword matches and for similarity above _SIM_MEDIUM.
	_PROMPT_DEFAULT = (
	    _PROMPT_HEAD
	    + "read each paraphrase in the context and Answer the question .\n"
	    + _PROMPT_RULES
	    + " ###context:{context}<</SYS>>\n\n ###question: {query_text} [/INST]"
	)
	# Template for similarity above _SIM_LOOSE only.
	_PROMPT_LOOSE = (
	    _PROMPT_HEAD
	    + "read each paraphrase in the context and Answer the question .\n"
	    + _PROMPT_RULES
	    + " dont' mention that you used the provided context ###context:{context}<</SYS>>\n\n ###question: {query_text} [/INST]"
	)
	# Template for similarity above _SIM_STRICT.
	_PROMPT_STRICT = (
	    _PROMPT_HEAD
	    + "read 3 times each paraphrase in the context and Answer the question .\n"
	    + _PROMPT_RULES
	    + " differentiates between each price and gives the correct answer and does not distinguish between the offers of each price\n"
	    + " ###context:{context}<</SYS>>\n\n {query_text}[/INST]"
	)

	# Canned answers used when no usable context is found.
	_OUT_OF_SCOPE_MESSAGE = "As a chatbot for Djezzy, I can provide information exclusively about our affiliated companies. Unfortunately, I'm unable to respond to inquiries outside of that scope."
	_OUT_OF_SCOPE_PROMPT_EN = " for this question write this answer and don't add anything :" + _OUT_OF_SCOPE_MESSAGE
	_OUT_OF_SCOPE_PROMPT_FR = "pour cette question écrivez cette réponse et n'ajoutez rien :En tant que chatbot pour Djezzy, je peux fournir des informations exclusivement sur nos sociétés affiliées. Malheureusement, je ne suis pas en mesure de répondre aux demandes en dehors de ce cadre."
	_UNCLEAR_MESSAGE = "I apologize, I don't fully understand your question. You can contact our customer service for answers to your needs, or if you can provide more details, I would be happy to help."
	_UNCLEAR_PROMPT_EN = "for this question write this answer and don't add anything: " + _UNCLEAR_MESSAGE
	_UNCLEAR_PROMPT_FR = "pour cette question écrivez cette réponse et n'ajoutez rien:Je m'excuse, je ne comprends pas bien votre question. Vous pouvez contacter notre service client pour obtenir des réponses à vos besoins, ou si vous pouvez fournir plus de détails, je serai heureux de vous aider."


	def _detect_language(text):
	    """Return the most probable language code of *text*, or 'en' if detection fails."""
	    from langdetect.lang_detect_exception import LangDetectException

	    try:
	        candidates = detect_langs(text)
	    except LangDetectException:
	        # Raised for text with no usable features (empty, digits only, ...).
	        return 'en'
	    if not candidates:
	        return 'en'
	    return max(candidates, key=lambda candidate: candidate.prob).lang


	def _translate_french_to_english(text):
	    """Translate *text* to English with the module-level FLAN-T5 model."""
	    input_ids = tokenizer1(f"translate french to English: {text}", return_tensors="pt").input_ids
	    outputs = model1.generate(input_ids, max_length=100)
	    decoded = tokenizer1.decode(outputs[0])
	    print(decoded)
	    # Drop every <...> span from the decoded text (this is how the tokenizer's
	    # special-token markers are removed), then trim.
	    cleaned = re.sub(r'<.*?>', '', decoded).strip()
	    # Keep the original question if the model produced nothing usable.
	    return cleaned or text


	def _find_keywords(query_text, mots_a_verifier):
	    """Return the known keywords present in *query_text*, in their original casing.

	    Two-word keywords (matched against adjacent word pairs) take priority;
	    single-word keywords are only considered when no pair matches.
	    Matching is case-insensitive and on whitespace-separated words.
	    """
	    words = query_text.lower().split()
	    bigrams = {f"{first} {second}" for first, second in zip(words, words[1:])}
	    by_lowercase = {mot.lower(): mot for mot in mots_a_verifier}

	    found = [original for lowered, original in by_lowercase.items() if lowered in bigrams]
	    if not found:
	        unigrams = set(words)
	        found = [original for lowered, original in by_lowercase.items() if lowered in unigrams]
	    return found


	def _prompt_from_keywords(query_text, keywords, vector_db):
	    """Build the prompt from the best vector-store hit for each matched keyword."""
	    passages = []
	    for keyword in keywords:
	        hits = vector_db.similarity_search(
	            query_text,
	            k=1,
	            filter={'document': keyword},
	        )
	        # A filtered search may return nothing; skip it instead of raising IndexError.
	        if hits:
	            passages.append(hits[0].page_content)
	    context = _CONTEXT_SEPARATOR.join(passages)
	    return _PROMPT_DEFAULT.format(context=context, query_text=query_text)


	def _prompt_from_embeddings(query_text, lang, docs_text, docs_embeddings):
	    """Build the prompt by ranking *docs_text* by cosine similarity to the query."""
	    query_vector = np.asarray(embedding_llm.embed_query(query_text)).reshape(1, -1)
	    doc_matrix = np.asarray(docs_embeddings)
	    if doc_matrix.size:
	        # One vectorised call instead of one call per document.
	        scores = cosine_similarity(doc_matrix, query_vector).ravel()
	    else:
	        scores = []
	    ranked = sorted(zip(docs_text, scores), key=lambda pair: pair[1], reverse=True)

	    def above(threshold):
	        return [doc for doc, score in ranked if score > threshold]

	    strict = above(_SIM_STRICT)
	    if strict:
	        context = _CONTEXT_SEPARATOR.join(strict[:1])
	        return _PROMPT_STRICT.format(context=context, query_text=query_text)

	    medium = above(_SIM_MEDIUM)
	    if medium:
	        context = _CONTEXT_SEPARATOR.join(medium[:2])
	        return _PROMPT_DEFAULT.format(context=context, query_text=query_text)

	    loose = above(_SIM_LOOSE)
	    if loose:
	        # NOTE: with fewer than three matches only the best one is used
	        # (so two matches yield one passage); kept as in the original logic.
	        chosen = loose[:4] if len(loose) >= 3 else loose[:1]
	        context = _CONTEXT_SEPARATOR.join(chosen)
	        return _PROMPT_LOOSE.format(context=context, query_text=query_text)

	    if any(_SIM_WEAK <= score < _SIM_LOOSE for _, score in ranked):
	        # Something is vaguely related: ask the user to rephrase.
	        print(_UNCLEAR_MESSAGE)
	        return _UNCLEAR_PROMPT_FR if lang == 'fr' else _UNCLEAR_PROMPT_EN

	    # Nothing related at all: the question is out of scope.
	    print(_OUT_OF_SCOPE_MESSAGE)
	    return _OUT_OF_SCOPE_PROMPT_FR if lang == 'fr' else _OUT_OF_SCOPE_PROMPT_EN


	def pip(question, docs_text, docs_embeddings, mots_a_verifier, vector_db):
	    """Build the LLM prompt for a user question.

	    Steps:
	      1. Detect the question's language; French questions are translated to
	         English with the module-level FLAN-T5 model.
	      2. If the (translated) question contains known keywords, retrieve one
	         passage per keyword from ``vector_db`` (filtered on the ``document``
	         metadata field) and use them as context.
	      3. Otherwise rank ``docs_text`` by cosine similarity between
	         ``docs_embeddings`` and the query embedding, and pick the context
	         and prompt template according to the similarity thresholds
	         (0.72 / 0.65 / 0.4). With no match above 0.4, a canned
	         "please rephrase" (some score in [0.3, 0.4)) or "out of scope"
	         prompt is returned, in French when the question was French.

	    Args:
	        question: The user's question.
	        docs_text: Document texts, aligned with ``docs_embeddings``.
	        docs_embeddings: One embedding vector per document.
	        mots_a_verifier: Known keywords to look for in the question.
	        vector_db: Vector store exposing ``similarity_search(query, k, filter)``.

	    Returns:
	        The prompt string to send to the language model.
	    """
	    lang = _detect_language(question)
	    query_text = _translate_french_to_english(question) if lang == 'fr' else question

	    keywords = _find_keywords(query_text, mots_a_verifier)
	    if keywords:
	        return _prompt_from_keywords(query_text, keywords, vector_db)
	    return _prompt_from_embeddings(query_text, lang, docs_text, docs_embeddings)