Update djezzy.py
Browse files
djezzy.py
CHANGED
@@ -4,7 +4,9 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
|
4 |
|
5 |
import os
|
6 |
from datasets import load_dataset
|
7 |
-
|
|
|
|
|
8 |
import pandas as pd
|
9 |
import pyarrow as pa
|
10 |
import pyarrow.dataset as ds
|
@@ -23,6 +25,11 @@ hh_source='index.pkl'
|
|
23 |
model_name = "sentence-transformers/all-MiniLM-L6-v2"
|
24 |
|
25 |
embedding_llm = SentenceTransformerEmbeddings(model_name=model_name)
|
|
|
|
|
|
|
|
|
|
|
26 |
with tempfile.TemporaryDirectory() as temp_dir:
|
27 |
# Chemins des fichiers cibles dans le répertoire temporaire
|
28 |
index_target = os.path.join(temp_dir, 'index.faiss')
|
@@ -69,6 +76,23 @@ def mot_cle(path):
|
|
69 |
|
70 |
def pip(question,docs_text, docs_embeddings,mots_a_verifier,vector_db):
|
71 |
query_text = question
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
query_embedding = embedding_llm.embed_query(query_text)
|
73 |
query_embedding_array = np.array(query_embedding)
|
74 |
docs_embeddings=np.array(docs_embeddings)
|
@@ -106,10 +130,14 @@ def pip(question,docs_text, docs_embeddings,mots_a_verifier,vector_db):
|
|
106 |
if not similar_docsA:
|
107 |
print("As a chatbot for Djezzy, I can provide information exclusively about our affiliated companies. Unfortunately, I'm unable to respond to inquiries outside of that scope.")
|
108 |
prompt=" for this question write this answer and don't add anything :As a chatbot for Djezzy, I can provide information exclusively about our affiliated companies. Unfortunately, I'm unable to respond to inquiries outside of that scope."
|
109 |
-
|
|
|
110 |
else:
|
111 |
print("I apologize, I don't fully understand your question. You can contact our customer service for answers to your needs, or if you can provide more details, I would be happy to help.")
|
112 |
prompt="for this question write this answer and don't add anything: I apologize, I don't fully understand your question. You can contact our customer service for answers to your needs, or if you can provide more details, I would be happy to help."
|
|
|
|
|
|
|
113 |
|
114 |
else:
|
115 |
context="\n---------------------\n".join([doc for doc,_ in similar_docs[:4]]if len(similar_docs) >=3 else [doc for doc, _ in similar_docs[:1]])
|
|
|
4 |
|
5 |
import os
|
6 |
from datasets import load_dataset
|
7 |
+
from langdetect import detect
|
8 |
+
from langdetect import detect_langs
|
9 |
+
from langdetect import DetectorFactory
|
10 |
import pandas as pd
|
11 |
import pyarrow as pa
|
12 |
import pyarrow.dataset as ds
|
|
|
25 |
model_name = "sentence-transformers/all-MiniLM-L6-v2"
|
26 |
|
27 |
embedding_llm = SentenceTransformerEmbeddings(model_name=model_name)
|
28 |
+
|
29 |
+
from transformers import T5Tokenizer, T5ForConditionalGeneration
|
30 |
+
|
31 |
+
tokenizer1 = T5Tokenizer.from_pretrained("google/flan-t5-xl")
|
32 |
+
model1 = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", device_map=device_map, load_in_8bit=True)
|
33 |
with tempfile.TemporaryDirectory() as temp_dir:
|
34 |
# Chemins des fichiers cibles dans le répertoire temporaire
|
35 |
index_target = os.path.join(temp_dir, 'index.faiss')
|
|
|
76 |
|
77 |
def pip(question,docs_text, docs_embeddings,mots_a_verifier,vector_db):
|
78 |
query_text = question
|
79 |
+
detected_languages=detect_langs(question)
|
80 |
+
main_language = max(detected_languages, key=lambda lang: lang.prob)
|
81 |
+
lang = main_language.lang
|
82 |
+
|
83 |
+
|
84 |
+
|
85 |
+
if lang=='fr':
|
86 |
+
input_text = f"translate french to English: {query_text}"
|
87 |
+
input_ids = tokenizer1(input_text, return_tensors="pt").input_ids
|
88 |
+
|
89 |
+
outputs = model1.generate(input_ids,max_length = 100)
|
90 |
+
print(tokenizer1.decode(outputs[0]))
|
91 |
+
text=tokenizer1.decode(outputs[0])
|
92 |
+
cleaned_text = re.sub(r'<.*?>', '', text) # Remove <...> markers from the decoded text (presumably tokenizer special tokens such as <pad> and </s>, not HTML tags)
|
93 |
+
cleaned_text = cleaned_text.strip() # Enlève les espaces de début et de fin
|
94 |
+
query_text=cleaned_text
|
95 |
+
|
96 |
query_embedding = embedding_llm.embed_query(query_text)
|
97 |
query_embedding_array = np.array(query_embedding)
|
98 |
docs_embeddings=np.array(docs_embeddings)
|
|
|
130 |
if not similar_docsA:
|
131 |
print("As a chatbot for Djezzy, I can provide information exclusively about our affiliated companies. Unfortunately, I'm unable to respond to inquiries outside of that scope.")
|
132 |
prompt=" for this question write this answer and don't add anything :As a chatbot for Djezzy, I can provide information exclusively about our affiliated companies. Unfortunately, I'm unable to respond to inquiries outside of that scope."
|
133 |
+
if lang=='fr':
|
134 |
+
prompt="pour cette question écrivez cette réponse et n'ajoutez rien :En tant que chatbot pour Djezzy, je peux fournir des informations exclusivement sur nos sociétés affiliées. Malheureusement, je ne suis pas en mesure de répondre aux demandes en dehors de ce cadre."
|
135 |
else:
|
136 |
print("I apologize, I don't fully understand your question. You can contact our customer service for answers to your needs, or if you can provide more details, I would be happy to help.")
|
137 |
prompt="for this question write this answer and don't add anything: I apologize, I don't fully understand your question. You can contact our customer service for answers to your needs, or if you can provide more details, I would be happy to help."
|
138 |
+
if lang=='fr':
|
139 |
+
prompt="pour cette question écrivez cette réponse et n'ajoutez rien:Je m'excuse, je ne comprends pas bien votre question. Vous pouvez contacter notre service client pour obtenir des réponses à vos besoins, ou si vous pouvez fournir plus de détails, je serai heureux de vous aider."
|
140 |
+
|
141 |
|
142 |
else:
|
143 |
context="\n---------------------\n".join([doc for doc,_ in similar_docs[:4]]if len(similar_docs) >=3 else [doc for doc, _ in similar_docs[:1]])
|