MohammedNasser committed on
Commit
bf18e69
1 Parent(s): 3ba55e9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +152 -81
app.py CHANGED
@@ -1,46 +1,55 @@
 
1
  import os
2
  import fitz
3
  from dotenv import load_dotenv
4
  from langchain_community.document_loaders import UnstructuredPDFLoader
5
  from langchain_community.vectorstores import FAISS
6
- from langchain_huggingface import HuggingFaceEmbeddings
7
-
8
  from langchain_text_splitters import CharacterTextSplitter
9
  from langchain_groq import ChatGroq
10
  from langchain.memory import ConversationBufferMemory
11
  from langchain.chains import ConversationalRetrievalChain
12
- from pdf2image import convert_from_path
13
- import pytesseract
14
  from gtts import gTTS
15
- import uuid
16
- import gradio as gr
17
- import warnings
18
- warnings.filterwarnings("ignore", category=FutureWarning)
19
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  # Load environment variables
21
  load_dotenv()
22
  secret_key = os.getenv("GROQ_API_KEY")
23
 
24
  os.environ["GROQ_API_KEY"] = secret_key
25
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
26
-
27
- # File directories
28
  UPLOAD_FOLDER = 'uploads/'
29
- AUDIO_FOLDER = 'static/audio/'
30
-
31
- # Ensure directories exist
32
  for folder in [UPLOAD_FOLDER, AUDIO_FOLDER]:
33
  if not os.path.exists(folder):
34
  os.makedirs(folder)
35
 
 
36
  def load_pdf(file_path):
37
- """
38
- Load and preprocess Arabic text from a PDF file.
39
- """
40
  pages = convert_from_path(file_path, 500)
41
  documents = []
42
- for imgBlob in pages:
43
- # Perform OCR on each image
44
  text = pytesseract.image_to_string(imgBlob, lang="ara")
45
  documents.append(text)
46
  return documents
@@ -49,10 +58,7 @@ def prepare_vectorstore(data):
49
  text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20, separator="\n")
50
  texts = data
51
  vectorstore = FAISS.from_texts(texts, embeddings)
52
-
53
- # Save FAISS index to disk
54
  vectorstore.save_local("faiss_index")
55
-
56
  return vectorstore
57
 
58
  def load_vectorstore():
@@ -73,73 +79,138 @@ def create_chain(vectorstore):
73
  return chain
74
 
75
  def process_pdf(pdf_file):
76
- if pdf_file is not None:
77
- file_path = os.path.join(UPLOAD_FOLDER, pdf_file.name)
78
- pdf_file.save(file_path)
79
-
80
- # Load PDF, prepare vectorstore
81
- data = load_pdf(file_path)
82
- vectorstore = prepare_vectorstore(data)
83
- chain = create_chain(vectorstore)
84
-
85
- return chain, f"تم تحميل الملف '{pdf_file.name}' بنجاح!"
86
- return None, "الرجاء تحميل ملف PDF ."
87
-
88
- def chat_with_bot(user_input, chain):
89
- if chain is None:
90
- return "يرجى تحميل ملف PDF أولاً."
91
-
92
- prompt=f"""
93
- You are an expert Arabic-language assistant specialized in analyzing and responding to queries about Arabic PDF documents. Your responses should be precise, informative, and reflect the professional tone and structure expected in formal Arabic communication. Focus on extracting and presenting relevant information from the document clearly and systematically, while avoiding colloquial or informal language.
94
-
95
- When responding, ensure the following:
96
-
97
- - Your answer directly reflects the content of the document.
98
- - If the requested information is not available in the document, clearly state that.
99
- - Keep your response concise yet comprehensive, addressing the question fully.
100
- - Always respond in formal Arabic, without using English.\n
101
-
102
- Question: {user_input}\n
103
- Helpful Answer:"""
104
-
105
  response = chain({"question": prompt})
106
  assistant_response = response["answer"]
107
-
108
- # Generate and save audio response
109
- audio_id = str(uuid.uuid4())
110
- audio_file = f"{audio_id}.mp3"
111
  tts = gTTS(text=assistant_response, lang='ar')
 
112
  tts.save(os.path.join(AUDIO_FOLDER, audio_file))
113
 
114
- return assistant_response, f"{AUDIO_FOLDER}/{audio_file}"
115
-
116
- # Gradio app interface
117
- def chatbot_interface(pdf_file, user_input):
118
- chain, message = process_pdf(pdf_file)
119
-
120
- if user_input and chain:
121
- response_text, audio_path = chat_with_bot(user_input, chain)
122
- return response_text, audio_path
123
- else:
124
- return "يرجى إدخال السؤال.", None
125
-
126
-
127
- with gr.Blocks() as demo:
128
- gr.Markdown("<h1 style='text-align:center;'>الشات بوت العربي لـ PDF</h1>")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
  with gr.Row():
131
- pdf_input = gr.File(label="اختر ملف 📑 PDF للدردشة", type="filepath")
 
132
 
133
- with gr.Row():
134
- user_input = gr.Textbox(label="سؤالك")
135
-
136
- with gr.Row():
137
- submit_button = gr.Button("رفع وبدء الدردشة")
138
-
139
- with gr.Row():
140
- output_text = gr.Textbox(label="الجواب")
141
- audio_output = gr.Audio(label="الرد الصوتي")
 
 
 
 
 
 
142
 
143
- submit_button.click(chatbot_interface, inputs=[pdf_input, user_input], outputs=[output_text, audio_output])
 
144
 
145
  demo.launch()
 
1
+ import gradio as gr
2
  import os
3
  import fitz
4
  from dotenv import load_dotenv
5
  from langchain_community.document_loaders import UnstructuredPDFLoader
6
  from langchain_community.vectorstores import FAISS
7
+ from langchain_community.embeddings import HuggingFaceEmbeddings
 
8
  from langchain_text_splitters import CharacterTextSplitter
9
  from langchain_groq import ChatGroq
10
  from langchain.memory import ConversationBufferMemory
11
  from langchain.chains import ConversationalRetrievalChain
 
 
12
  from gtts import gTTS
13
+ import sys
14
+
15
+ try:
16
+ import pytesseract
17
+ from pdf2image import convert_from_path
18
+ except ImportError as e:
19
+ print(f"Error: {e}. Please make sure all system dependencies are installed.")
20
+ sys.exit(1)
21
+
22
+ # Rest of your imports...
23
+
24
+ # Set the Tesseract path
25
+ pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'
26
+
27
+ # Test Tesseract installation
28
+ try:
29
+ pytesseract.get_languages()
30
+ except pytesseract.TesseractNotFoundError:
31
+ print("Error: Tesseract is not installed or not in the system PATH.")
32
+ sys.exit(1)
33
+
34
  # Load environment variables
35
  load_dotenv()
36
  secret_key = os.getenv("GROQ_API_KEY")
37
 
38
  os.environ["GROQ_API_KEY"] = secret_key
39
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
40
+ # Ensure the necessary folders exist
 
41
  UPLOAD_FOLDER = 'uploads/'
42
+ AUDIO_FOLDER = 'audio/'
 
 
43
  for folder in [UPLOAD_FOLDER, AUDIO_FOLDER]:
44
  if not os.path.exists(folder):
45
  os.makedirs(folder)
46
 
47
+
48
  def load_pdf(file_path):
49
+ """Load and preprocess Arabic text from a PDF file."""
 
 
50
  pages = convert_from_path(file_path, 500)
51
  documents = []
52
+ for pageNum, imgBlob in enumerate(pages):
 
53
  text = pytesseract.image_to_string(imgBlob, lang="ara")
54
  documents.append(text)
55
  return documents
 
58
  text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20, separator="\n")
59
  texts = data
60
  vectorstore = FAISS.from_texts(texts, embeddings)
 
 
61
  vectorstore.save_local("faiss_index")
 
62
  return vectorstore
63
 
64
  def load_vectorstore():
 
79
  return chain
80
 
81
  def process_pdf(pdf_file):
82
+ file_path = os.path.join(UPLOAD_FOLDER, pdf_file.name)
83
+ with open(file_path, "wb") as f:
84
+ f.write(pdf_file.read())
85
+ data = load_pdf(file_path)
86
+ vectorstore = prepare_vectorstore(data)
87
+ return "PDF processed successfully. You can now start chatting!"
88
+
89
def chat(user_input, history):
    """Answer a question about the indexed PDF and save a spoken version.

    Args:
        user_input: The user's question.
        history: Previous turns of the conversation. Only its length is used,
            to number the audio file; ``None`` is treated as empty.

    Returns:
        A ``(answer_text, audio_path)`` tuple, where ``audio_path`` is the
        location of the generated MP3 inside ``AUDIO_FOLDER``.
    """
    # NOTE(review): this function is passed to gr.ChatInterface, which
    # documents a plain string return value - confirm how the tuple is meant
    # to be consumed by the UI.
    vectorstore = load_vectorstore()
    chain = create_chain(vectorstore)

    prompt = f"""
You are an expert Arabic-language assistant specialized in analyzing and responding to queries about Arabic PDF documents. Your responses should be precise, informative, and reflect the professional tone and structure expected in formal Arabic communication. Focus on extracting and presenting relevant information from the document clearly and systematically, while avoiding colloquial or informal language.

When responding, ensure the following:
- Your answer directly reflects the content of the document.
- If the requested information is not available in the document, clearly state that.
- Keep your response concise yet comprehensive, addressing the question fully.
- Always respond in formal Arabic, without using English.

Question: {user_input}
Helpful Answer:"""

    response = chain({"question": prompt})
    assistant_response = response["answer"]

    # Generate audio file
    tts = gTTS(text=assistant_response, lang='ar')
    audio_file = f"response_{len(history or [])}.mp3"
    audio_path = os.path.join(AUDIO_FOLDER, audio_file)
    tts.save(audio_path)

    # Return the full path: the bare file name does not point at the file that
    # was just written inside AUDIO_FOLDER.
    return assistant_response, audio_path
114
+
115
+ custom_css = """
116
+ body {
117
+ font-family: 'Noto Kufi Arabic', sans-serif;
118
+ background: linear-gradient(135deg, #799351 0%, #A67B5B 100%);
119
+ background-size: cover;
120
+ background-position: center;
121
+ background-attachment: fixed;
122
+ }
123
+
124
+ .gradio-container {
125
+ max-width: 800px !important;
126
+ margin: auto !important;
127
+ background: rgba(255, 255, 255, 0.9);
128
+ border-radius: 20px;
129
+ box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.37);
130
+ backdrop-filter: blur(4px);
131
+ border: 1px solid rgba(255, 255, 255, 0.18);
132
+ padding: 20px;
133
+ }
134
+
135
+ h1, h2, h3 {
136
+ color: #1A4D2E;
137
+ font-weight: bold;
138
+ text-align: center;
139
+ }
140
+
141
+ p {
142
+ color: #A89F91;
143
+ }
144
+
145
+ .gradio-button {
146
+ background-color: #5F6F65 !important;
147
+ color: #FFFFFF !important;
148
+ }
149
+
150
+ .gradio-button:hover {
151
+ background-color: #FFFFFF !important;
152
+ color: #5F6F65 !important;
153
+ }
154
+
155
+ .chat-message {
156
+ border-radius: 10px;
157
+ padding: 10px;
158
+ margin-bottom: 10px;
159
+ }
160
+
161
+ .chat-message.user {
162
+ background-color: #E7F0DC;
163
+ }
164
+
165
+ .chat-message.bot {
166
+ background-color: #F7EED3;
167
+ }
168
+
169
+ .chat-message::before {
170
+ content: '';
171
+ display: inline-block;
172
+ width: 24px;
173
+ height: 24px;
174
+ background-size: contain;
175
+ background-repeat: no-repeat;
176
+ margin-right: 10px;
177
+ vertical-align: middle;
178
+ }
179
+
180
+ .chat-message.user::before {
181
+ content: '👤';
182
+ }
183
+
184
+ .chat-message.bot::before {
185
+ content: '🤖';
186
+ }
187
+ """
188
+ # Gradio interface
189
+ with gr.Blocks(css=custom_css) as demo:
190
+ gr.Markdown("# ديمو بوت للقاء مركز حضرموت للدراسات التاريخية")
191
+ gr.Markdown("## المنعقد السبت 14 - سبتمبر 2024")
192
 
193
  with gr.Row():
194
+ pdf_input = gr.File(label="اختر ملف PDF للدردشة")
195
+ process_button = gr.Button("رفع وبدء الدردشة")
196
 
197
+ chat_interface = gr.ChatInterface(
198
+ chat,
199
+ chatbot=gr.Chatbot(height=400),
200
+ textbox=gr.Textbox(placeholder="اكتب سؤالك هنا...", container=False),
201
+ title="الدردشة مع البوت",
202
+ description="اسأل أي سؤال عن محتوى الملف PDF",
203
+ theme="soft",
204
+ examples=["ما هو موضوع الوثيقة؟", "من هم الأشخاص المذكورون؟", "ما هي التواريخ الرئيسية المذكورة؟"],
205
+ cache_examples=True,
206
+ retry_btn=None,
207
+ undo_btn="مسح آخر رسالة",
208
+ clear_btn="مسح المحادثة",
209
+ )
210
+
211
+ audio_output = gr.Audio(label="الرد الصوتي")
212
 
213
+ process_button.click(process_pdf, inputs=[pdf_input], outputs=[chat_interface.textbox])
214
+ chat_interface.submit(lambda x, y: y[-1][1], inputs=[chat_interface.textbox, chat_interface.chatbot], outputs=[audio_output])
215
 
216
  demo.launch()