import gradio as gr
from huggingface_hub import InferenceClient
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pdfplumber
import re

# Zephyr-7B-beta served through the Hugging Face Inference API
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
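
# Note: anonymous calls to the Inference API are rate-limited; when running
# locally you may need to authenticate, e.g.
# InferenceClient("HuggingFaceH4/zephyr-7b-beta", token=...) with your own
# Hugging Face token (the token value is an assumption, not part of this app).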


def extract_text_from_pdf(pdf_path):
    """Concatenate the text of every page in the PDF."""
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"  # newline keeps words from fusing across page breaks
    return text


def clean_extracted_text(text):
    """Remove extraction artifacts: file:// links, dates, and stray symbols."""
    cleaned_text = re.sub(r'file://[^\n]*', '', text)                  # local file URLs
    cleaned_text = re.sub(r'\d{1,2}/\d{1,2}/\d{4}', '', cleaned_text)  # d/m/yyyy dates
    # Keep Latin letters, digits, Arabic (U+0600-U+06FF), Latin-1 accents
    # (U+00C0-U+00FF), and whitespace; drop everything else.
    cleaned_text = re.sub(r'[^a-zA-Z0-9\u0600-\u06FF\s\u00C0-\u00FF]+', '', cleaned_text)
    return cleaned_text.strip()


# Source document: an Arabic-French dictionary of scientific terms
pdf_path = "Noor-Book.com القاموس عربي فرنسي بالمصطلحات العلمية و الصور 3 (1).pdf"

pdf_text = extract_text_from_pdf(pdf_path)
cleaned_text = clean_extracted_text(pdf_text)


def chunk_text(text, chunk_size=300):
    """Greedily pack sentences into chunks of roughly chunk_size characters."""
    sentences = text.split('. ')
    chunks, current_chunk = [], ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= chunk_size:
            current_chunk += sentence + ". "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + ". "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
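
# For example (toy input, not from the PDF):
# chunk_text("aa. bb. cc", chunk_size=5) returns ['aa.', 'bb.', 'cc.'].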

chunked_text = chunk_text(cleaned_text)


# Sentence embeddings plus a flat L2 FAISS index over the chunks
model = SentenceTransformer("all-MiniLM-L6-v2")
index = faiss.IndexFlatL2(model.get_sentence_embedding_dimension())

embeddings = model.encode(chunked_text, convert_to_numpy=True)
index.add(embeddings)
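
# Optional retrieval sanity check (a minimal sketch; the query string is
# hypothetical):
# q = model.encode(["photosynthèse"], convert_to_numpy=True)
# _, hits = index.search(q, 3)
# print([chunked_text[i] for i in hits[0]])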


def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Retrieve the k nearest chunks to ground the answer
    query_embedding = model.encode([message], convert_to_numpy=True)
    k = 5
    _, indices = index.search(query_embedding, k)
    relevant_chunks = " ".join([chunked_text[idx] for idx in indices[0]])

    # Fold the retrieved context into the user turn; the system message is
    # already sent as its own role, so it is not repeated here.
    prompt = f"User Query: {message}\n\nRelevant Information: {relevant_chunks}"
    response = ""

    # Stream tokens; `chunk` avoids shadowing the `message` argument.
    for chunk in client.chat_completion(
        [{"role": "system", "content": system_message}, {"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        if token:  # the final stream event may carry no content
            response += token
        yield response
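
# Note: gr.ChatInterface passes `history` automatically; this sketch ignores it,
# but earlier turns could be appended to the messages list before the call.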


demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful and empathetic mental health assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)


if __name__ == "__main__":
    demo.launch()
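
# Optional: demo.launch(share=True) would also create a temporary public link.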