import gradio as gr
from huggingface_hub import InferenceClient
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pdfplumber
import re

# Initialize the InferenceClient
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
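# Note (assumption): gated or rate-limited serverless inference may require an
# access token; if so, pass one explicitly, e.g.
#   client = InferenceClient("HuggingFaceH4/zephyr-7b-beta", token=os.environ["HF_TOKEN"])
# (requires `import os`; HF_TOKEN is a conventional env var name, not set here).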

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"  # newline keeps words at page boundaries from merging
    return text

# Clean the extracted text
def clean_extracted_text(text):
    # Remove noise such as file paths, dates, and non-text characters
    cleaned_text = re.sub(r'file://[^\n]*', '', text)  # Remove file paths
    cleaned_text = re.sub(r'\d{1,2}/\d{1,2}/\d{4}', '', cleaned_text)  # Remove dates
    cleaned_text = re.sub(r'[^a-zA-Z0-9\u0600-\u06FF\s\u00C0-\u00FF]+', '', cleaned_text)  # Keep Arabic and basic text
    return cleaned_text.strip()
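# Illustrative example (hedged, made-up input): a line such as
#   "Page 12/05/2021 — définition: lumière ضوء"
# loses the date and the dash/colon punctuation, leaving roughly
# "Page   définition lumière ضوء" (accented Latin and Arabic characters are kept).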

# Path to the uploaded PDF file
pdf_path = "Noor-Book.com  القاموس عربي فرنسي بالمصطلحات العلمية و الصور 3  (1).pdf"

# Extract and clean text from the provided PDF
pdf_text = extract_text_from_pdf(pdf_path)
cleaned_text = clean_extracted_text(pdf_text)

# Split the cleaned text into chunks for processing
def chunk_text(text, chunk_size=300):
    sentences = text.split('. ')
    chunks, current_chunk = [], ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= chunk_size:
            current_chunk += sentence + ". "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + ". "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
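# Worked example: chunk_text("A. B. C.", chunk_size=4) returns ["A. B.", "C."].
# A chunk can slightly exceed chunk_size because the length check runs before
# the ". " separator is re-appended. Splitting on ". " also assumes sentence-
# style punctuation; dictionary-style text with few periods may produce very
# long chunks.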

# Chunk the cleaned text
chunked_text = chunk_text(cleaned_text)

# Load pre-trained Sentence Transformer model for embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")
index = faiss.IndexFlatL2(model.get_sentence_embedding_dimension())

# Generate embeddings for the chunks (encode returns float32 NumPy arrays directly)
embeddings = model.encode(chunked_text, convert_to_numpy=True)
index.add(embeddings)
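
# Minimal retrieval sanity check (illustrative sketch; defined but never called,
# and the query string is an arbitrary example). Note that all-MiniLM-L6-v2 is
# primarily English-trained; a multilingual model such as
# paraphrase-multilingual-MiniLM-L12-v2 may retrieve Arabic/French text better.
def _retrieval_sanity_check(query="lumière", k=3):
    query_vec = model.encode([query], convert_to_numpy=True)
    _, ids = index.search(query_vec, k)
    return [chunked_text[i][:80] for i in ids[0]]  # first 80 chars of each hit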

# Generate a streamed response from the model, grounded in the retrieved chunks
def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Step 1: Retrieve the most relevant chunks for the user query
    query_embedding = model.encode([message], convert_to_numpy=True)
    k = 5  # Number of relevant chunks to retrieve
    _, indices = index.search(query_embedding, k)
    relevant_chunks = " ".join(chunked_text[idx] for idx in indices[0])

    # Step 2: Build the user prompt so the retrieved context actually reaches
    # the model (previously the prompt was built but never used)
    prompt = f"User Query: {message}\n\nRelevant Information: {relevant_chunks}"
    response = ""

    # Step 3: Stream the response from the Hugging Face model
    # (the loop variable must not shadow the `message` parameter)
    for chunk in client.chat_completion(
        [{"role": "system", "content": system_message}, {"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        if token:  # the final streamed chunk may carry an empty delta
            response += token
            yield response
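
# Illustrative usage outside Gradio (hedged sketch; the query is an arbitrary
# example meaning "what does the word 'light' mean?"): `respond` is a
# generator, so iterate it to drive the stream.
#   for partial in respond("ما معنى كلمة ضوء؟", [], "You are a helpful assistant.", 256, 0.7, 0.95):
#       final = partial
#   print(final)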

# Create the Gradio interface with additional inputs
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant that answers questions using the Arabic-French scientific dictionary.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)
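
# Note: gr.ChatInterface calls `respond` with (message, history) followed by the
# additional_inputs above, matching the function signature. During testing,
# demo.launch(share=True) below would expose a temporary public link.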

# Launch the Gradio interface
if __name__ == "__main__":
    demo.launch()