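# Document QA with RAG: embed a PDF's lines with a sentence-transformers model,
# index them with FAISS, retrieve the closest passages for each question, and
# generate answers with the Typhoon 8B instruct model behind a Gradio interface.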
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from PyPDF2 import PdfReader
import gradio as gr
from datasets import Dataset, load_from_disk
from sentence_transformers import SentenceTransformer
import numpy as np

# Extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, "rb") as f:
        reader = PdfReader(f)
        for page in reader.pages:
            # extract_text() can return None for pages without a text layer
            text += page.extract_text() or ""
    return text

# Load model and tokenizer (half precision keeps the 8B model's memory footprint manageable)
model_name = "scb10x/llama-3-typhoon-v1.5x-8b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)

# Load a sentence transformer model for embedding generation
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Extract text from the provided PDF
pdf_path = "/home/user/app/TOPF 2564.pdf"  # Ensure this path is correct
pdf_text = extract_text_from_pdf(pdf_path)
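# Treat each non-empty line of the extracted text as a retrievable passage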
passages = [{"title": "", "text": line} for line in pdf_text.split('\n') if line.strip()]

# Convert text to embeddings
embeddings = embedding_model.encode([passage["text"] for passage in passages])

# Create a Dataset with embeddings
dataset = Dataset.from_dict({
    "title": [p["title"] for p in passages],
    "text": [p["text"] for p in passages],
    "embeddings": embeddings.tolist(),
})

# Save the dataset and create an index in the current working directory
dataset_path = "/home/user/app/rag_document_dataset"
index_path = "/home/user/app/rag_document_index"

# Ensure the output directories exist
os.makedirs(dataset_path, exist_ok=True)
os.makedirs(index_path, exist_ok=True)

# Save the dataset to disk and reload it before attaching the FAISS index
dataset.save_to_disk(dataset_path)
dataset = load_from_disk(dataset_path)

# Add a FAISS index over the embeddings column (requires the faiss package;
# datasets builds and manages the index internally)
def add_faiss_index(dataset, column):
    dataset.add_faiss_index(column=column)
    return dataset

dataset = add_faiss_index(dataset, column="embeddings")
# Persist the index as a file inside the index directory so it can be reloaded later
dataset.save_faiss_index("embeddings", os.path.join(index_path, "embeddings.faiss"))

# Custom retriever: use the FAISS index to find the passages closest to the query
def retrieve(query):
    query_embedding = embedding_model.encode(query).astype(np.float32)
    scores, samples = dataset.get_nearest_examples("embeddings", query_embedding, k=5)
    # samples is a dict of columns; join the retrieved passage texts
    retrieved_passages = " ".join(samples["text"])
    return retrieved_passages

# Define the chat function (the raw context argument is unused; retrieval supplies the grounding text)
def answer_question(question, context):
    retrieved_context = retrieve(question)
    inputs = tokenizer(question + " " + retrieved_context, return_tensors="pt")

    # Generate the answer and decode only the newly generated tokens
    outputs = model.generate(**inputs, max_new_tokens=256)
    answer = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    return answer

# Gradio interface setup
def ask(question):
    return answer_question(question, pdf_text)

demo = gr.Interface(
    fn=ask,
    inputs=gr.Textbox(lines=2, placeholder="Ask something..."),
    outputs="text",
    title="Document QA with RAG",
    description="Ask questions based on the provided document."
)

if __name__ == "__main__":
    demo.launch()