import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from PyPDF2 import PdfReader
import gradio as gr
from datasets import Dataset, load_from_disk
from sentence_transformers import SentenceTransformer
import numpy as np
# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, "rb") as f:
        reader = PdfReader(f)
        for page in reader.pages:
            # extract_text() can return None for image-only pages
            text += page.extract_text() or ""
    return text
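# Splitting the PDF on single newlines (as done below) can yield very short
# passages; a minimal fixed-size chunker is sketched here as a hypothetical
# alternative, not wired into the pipeline:
def chunk_text(text, max_chars=500):
    chunks, current = [], ""
    for line in text.split("\n"):
        if current and len(current) + len(line) > max_chars:
            chunks.append(current.strip())
            current = ""
        current += line + "\n"
    if current.strip():
        chunks.append(current.strip())
    return chunks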
# Load model and tokenizer
model_name = "scb10x/llama-3-typhoon-v1.5x-8b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Half precision keeps the 8B model within a single-GPU memory budget;
# device_map="auto" assumes accelerate is installed
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, device_map="auto"
)
# Load a sentence transformer model for embedding generation
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Extract text from the provided PDF
pdf_path = "/home/user/app/TOPF 2564.pdf" # Ensure this path is correct
pdf_text = extract_text_from_pdf(pdf_path)
passages = [{"title": "", "text": line} for line in pdf_text.split('\n') if line.strip()]
# Convert text to embeddings
embeddings = embedding_model.encode([passage["text"] for passage in passages])
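# Optional: L2 distance over unnormalized vectors does not rank like cosine
# similarity. For unit vectors, ||a - b||^2 = 2 - 2*cos(a, b), so normalizing
# makes the L2 ranking match the cosine ranking; sentence-transformers
# supports this directly (uncomment to use):
# embeddings = embedding_model.encode(
#     [p["text"] for p in passages], normalize_embeddings=True
# )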
# Create a Dataset with embeddings
dataset = Dataset.from_dict({
    "title": [p["title"] for p in passages],
    "text": [p["text"] for p in passages],
    "embeddings": embeddings.tolist(),
})
# Paths for the saved dataset and its FAISS index
dataset_path = "/home/user/app/rag_document_dataset"
index_path = "/home/user/app/rag_document_index"
# Ensure the target directories exist
os.makedirs(dataset_path, exist_ok=True)
os.makedirs(index_path, exist_ok=True)
# Save the dataset to disk and reload the on-disk copy
dataset.save_to_disk(dataset_path)
dataset = load_from_disk(dataset_path)
# Add a FAISS index over the embeddings column
def add_faiss_index(dataset, column):
    import faiss  # requires faiss-cpu or faiss-gpu

    # Supply an explicit flat L2 index; datasets fills it with the column's
    # vectors (cast to float32) when the index is attached
    dim = len(dataset[column][0])
    index = faiss.IndexFlatL2(dim)
    dataset.add_faiss_index(column=column, custom_index=index)
    return dataset

dataset = add_faiss_index(dataset, column="embeddings")
# Dataset has no .save(); persist the FAISS index explicitly
dataset.save_faiss_index("embeddings", os.path.join(index_path, "embeddings.faiss"))
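# Optional helper (not called below): later sessions can rebuild the
# retriever from disk without re-encoding the PDF, assuming the paths above
def load_retriever(dataset_path, index_path):
    ds = load_from_disk(dataset_path)
    ds.load_faiss_index("embeddings", os.path.join(index_path, "embeddings.faiss"))
    return ds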
# Custom retriever
def retrieve(query):
    # get_nearest_examples expects a single 1-D float32 query vector
    query_embedding = embedding_model.encode(query).astype(np.float32)
    scores, samples = dataset.get_nearest_examples("embeddings", query_embedding, k=5)
    # samples is a dict of columns (e.g. samples["text"]), not a list of rows
    return " ".join(samples["text"])
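# Quick sanity check with a hypothetical query (uncomment to try):
# print(retrieve("What topics does the document cover?"))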
# Define the chat function
def answer_question(question):
    retrieved_context = retrieve(question)
    inputs = tokenizer(question + " " + retrieved_context, return_tensors="pt").to(model.device)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    # Generate the answer; without max_new_tokens, generate() truncates at
    # the model's short default max_length
    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=256)
    # Decode only the newly generated tokens, not the echoed prompt
    answer = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)
    return answer
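# The Typhoon checkpoint is Llama-3-based and instruction-tuned; wrapping the
# question and retrieved context in the tokenizer's chat template (assuming
# the checkpoint ships one) tends to produce cleaner answers than raw
# concatenation. A minimal sketch, not wired into the interface below:
def answer_question_chat(question):
    context = retrieve(question)
    messages = [
        {"role": "system", "content": "Answer using only the provided context."},
        {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"},
    ]
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    outputs = model.generate(input_ids, max_new_tokens=256)
    return tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)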
# Gradio interface setup
def ask(question):
    return answer_question(question)

demo = gr.Interface(
    fn=ask,
    # gr.inputs.Textbox was removed in Gradio 3.x; use gr.Textbox
    inputs=gr.Textbox(lines=2, placeholder="Ask something..."),
    outputs="text",
    title="Document QA with RAG",
    description="Ask questions based on the provided document.",
)
if __name__ == "__main__":
    demo.launch()