MohammedNasser committed on
Commit
d800d23
1 Parent(s): 345a26b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -79
app.py CHANGED
@@ -37,63 +37,31 @@ for folder in [UPLOAD_FOLDER, AUDIO_FOLDER]:
37
  if not os.path.exists(folder):
38
  os.makedirs(folder)
39
 
40
-
41
  def load_pdf(file_path):
42
  """Load and preprocess Arabic text from a PDF file."""
43
- pages = convert_from_path(file_path, 500)
 
 
 
 
 
44
  documents = []
45
  for pageNum, imgBlob in enumerate(pages):
46
- text = pytesseract.image_to_string(imgBlob, lang="ara")
47
- documents.append(text)
48
- return documents
49
-
 
 
50
 
51
- import os
52
- from langchain.vectorstores import FAISS
53
- from huggingface_hub import Repository
54
-
55
- def save_faiss_index_to_hub(vectorstore, repo_id="MohammedNasser/faiss-index"):
56
- index_dir = "faiss_index"
57
-
58
- # Ensure the index directory exists
59
- if not os.path.exists(index_dir):
60
- os.makedirs(index_dir)
61
-
62
- # Save FAISS index locally
63
- vectorstore.save_local(index_dir)
64
-
65
- # Initialize Hugging Face repository
66
- repo = Repository(local_dir=index_dir, clone_from=repo_id, repo_type="dataset")
67
-
68
- # Push the FAISS index files to the Hugging Face Hub
69
- repo.push_to_hub(commit_message="Pushing FAISS index")
70
-
71
- print(f"FAISS index saved to Hugging Face Hub: {repo_id}")
72
 
73
  def prepare_vectorstore(data):
74
- index_dir = "faiss_index"
75
- if not os.path.exists(index_dir):
76
- os.makedirs(index_dir)
77
  text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20, separator="\n")
78
- texts = data
79
  vectorstore = FAISS.from_texts(texts, embeddings)
80
- save_faiss_index_to_hub(vectorstore)
81
- return vectorstore
82
-
83
-
84
- def load_vectorstore(repo_id="MohammedNasser/faiss-index"):
85
- index_dir = "faiss_index"
86
 
87
- # Ensure the index directory exists
88
- if not os.path.exists(index_dir):
89
- os.makedirs(index_dir)
90
-
91
-
92
-
93
- # Download the FAISS index files from Hugging Face Hub
94
- hf_hub_download(repo_id=repo_id, filename="index.faiss", local_dir=index_dir, repo_type="dataset")
95
- hf_hub_download(repo_id=repo_id, filename="index.json", local_dir=index_dir, repo_type="dataset")
96
- vectorstore = FAISS.load_local(index_dir, embeddings, allow_dangerous_deserialization=True)
97
  return vectorstore
98
 
99
  def create_chain(vectorstore):
@@ -108,41 +76,56 @@ def create_chain(vectorstore):
108
  chain_type="map_reduce"
109
  )
110
  return chain
111
-
112
  def process_pdf(pdf_file):
 
113
  file_path = os.path.join(UPLOAD_FOLDER, pdf_file.name)
114
- with open(file_path, "wb") as f:
115
- f.write(pdf_file.read())
116
- data = load_pdf(file_path)
117
- vectorstore = prepare_vectorstore(data)
118
- return "PDF processed successfully. You can now start chatting!"
 
 
 
 
119
 
120
  def chat(user_input, history):
121
- vectorstore = load_vectorstore()
122
- chain = create_chain(vectorstore)
123
-
124
- prompt = f"""
125
- You are an expert Arabic-language assistant specialized in analyzing and responding to queries about Arabic PDF documents. Your responses should be precise, informative, and reflect the professional tone and structure expected in formal Arabic communication. Focus on extracting and presenting relevant information from the document clearly and systematically, while avoiding colloquial or informal language.
126
-
127
- When responding, ensure the following:
128
- - Your answer directly reflects the content of the document.
129
- - If the requested information is not available in the document, clearly state that.
130
- - Keep your response concise yet comprehensive, addressing the question fully.
131
- - Always respond in formal Arabic, without using English.
132
-
133
- Question: {user_input}
134
- Helpful Answer:"""
135
-
136
- response = chain({"question": prompt})
137
- assistant_response = response["answer"]
138
-
139
- # Generate audio file
140
- tts = gTTS(text=assistant_response, lang='ar')
141
- audio_file = f"response_{len(history)}.mp3"
142
- tts.save(os.path.join(AUDIO_FOLDER, audio_file))
143
-
144
- return assistant_response, audio_file
145
-
 
 
 
 
 
 
 
 
 
 
146
  custom_css = """
147
  body {
148
  font-family: 'Noto Kufi Arabic', sans-serif;
@@ -216,6 +199,7 @@ p {
216
  content: '🤖';
217
  }
218
  """
 
219
  # Gradio interface
220
  with gr.Blocks(css=custom_css) as demo:
221
  gr.Markdown("# ديمو بوت للقاء مركز حضرموت للدراسات التاريخية")
@@ -251,4 +235,3 @@ with gr.Blocks(css=custom_css) as demo:
251
 
252
  demo.launch()
253
 
254
-
 
37
  if not os.path.exists(folder):
38
  os.makedirs(folder)
39
 
40
+ vectorstore=None
41
def load_pdf(file_path):
    """Load and preprocess Arabic text from a PDF file.

    Each page is rasterized at 500 DPI and run through Tesseract OCR with
    the Arabic language model; the per-page text is returned as a list of
    strings (one entry per page, empty string for pages where OCR failed).
    Returns an empty list if the PDF cannot be rasterized at all.
    """
    try:
        pages = convert_from_path(file_path, 500)
    except Exception as err:
        print(f"Error loading PDF: {err}")
        return []

    page_texts = []
    for page_index, page_image in enumerate(pages):
        try:
            page_texts.append(pytesseract.image_to_string(page_image, lang="ara"))
        except Exception as err:
            print(f"Error processing page {page_index}: {err}")
            # Keep one entry per page even on OCR failure so page
            # alignment is preserved for downstream consumers.
            page_texts.append("")

    return page_texts
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
def prepare_vectorstore(data):
    """Split raw page texts into chunks and index them in a FAISS vector store.

    Args:
        data: list of plain-text strings (one per PDF page), as produced by
            ``load_pdf``.

    Returns:
        A FAISS vector store built over the chunks using the module-level
        ``embeddings``.
    """
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20, separator="\n")
    # BUG FIX: ``load_pdf`` yields plain strings, but ``split_documents``
    # expects langchain Document objects — and would then hand Documents to
    # ``FAISS.from_texts``, which expects strings. Split each page's text
    # directly with ``split_text`` instead.
    texts = [chunk for page_text in data for chunk in text_splitter.split_text(page_text)]
    vectorstore = FAISS.from_texts(texts, embeddings)
    return vectorstore
66
 
67
  def create_chain(vectorstore):
 
76
  chain_type="map_reduce"
77
  )
78
  return chain
79
+
80
def process_pdf(pdf_file):
    """Persist an uploaded PDF, OCR it, and (re)build the global vector store.

    Args:
        pdf_file: uploaded file object with ``.name`` and ``.read()``.
            NOTE(review): newer Gradio file components may pass a tempfile
            path instead of a readable object — confirm against the Gradio
            version in use.

    Returns:
        A user-facing status string. On success the module-level
        ``vectorstore`` is replaced with an index over the new document.
    """
    global vectorstore
    file_path = os.path.join(UPLOAD_FOLDER, pdf_file.name)
    try:
        with open(file_path, "wb") as f:
            f.write(pdf_file.read())
        data = load_pdf(file_path)
        # ROBUSTNESS: load_pdf returns [] when the PDF cannot be rasterized;
        # indexing an empty corpus would fail or yield a useless store.
        if not data:
            return "Error processing PDF."
        vectorstore = prepare_vectorstore(data)
        return "PDF processed successfully. You can now start chatting!"
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return "Error processing PDF."
92
 
93
def chat(user_input, history):
    """Answer *user_input* from the processed PDF and synthesize Arabic audio.

    Returns ``(answer_text, audio_filename)``. The audio filename is the
    bare mp3 name saved under ``AUDIO_FOLDER``; it is empty when no PDF has
    been processed yet, when TTS fails, or on any other error.
    """
    if vectorstore is None:
        return "Please process a PDF file first.", ""

    try:
        qa_chain = create_chain(vectorstore)
        prompt = f"""
    You are an expert Arabic-language assistant specialized in analyzing and responding to queries about Arabic PDF documents. Your responses should be precise, informative, and reflect the professional tone and structure expected in formal Arabic communication. Focus on extracting and presenting relevant information from the document clearly and systematically, while avoiding colloquial or informal language.

    When responding, ensure the following:
    - Your answer directly reflects the content of the document.
    - If the requested information is not available in the document, clearly state that.
    - Keep your response concise yet comprehensive, addressing the question fully.
    - Always respond in formal Arabic, without using English.

    Question: {user_input}
    Helpful Answer:"""

        result = qa_chain({"question": prompt})
        answer = result["answer"]

        # Synthesize the answer as Arabic speech; fall back to no audio
        # rather than failing the whole reply when TTS breaks.
        audio_file = f"response_{len(history)}.mp3"
        try:
            gTTS(text=answer, lang='ar').save(os.path.join(AUDIO_FOLDER, audio_file))
        except Exception as e:
            print(f"Error generating audio file: {e}")
            audio_file = ""

        return answer, audio_file

    except Exception as e:
        print(f"Error during chat: {e}")
        return "An error occurred while processing your request.", ""
128
+
129
  custom_css = """
130
  body {
131
  font-family: 'Noto Kufi Arabic', sans-serif;
 
199
  content: '🤖';
200
  }
201
  """
202
+
203
  # Gradio interface
204
  with gr.Blocks(css=custom_css) as demo:
205
  gr.Markdown("# ديمو بوت للقاء مركز حضرموت للدراسات التاريخية")
 
235
 
236
  demo.launch()
237