import os

import gradio as gr

# These import paths match the older monolithic langchain package; newer
# releases relocate them to langchain_community / langchain_text_splitters.
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from PyPDF2 import PdfReader

current_directory = os.getcwd()
print("Current Working Directory:", current_directory)


def get_pdf_text(pdf_docs):
    """
    Extract text from a PDF document.

    Parameters
    ----------
    pdf_docs : str or file-like
        Path to, or file-like object for, the PDF to read.

    Returns
    -------
    str
        Text extracted from every page of the PDF.
    """
    text = ""
    pdf_reader = PdfReader(pdf_docs)
    for page in pdf_reader.pages:
        # extract_text() may return None for pages without extractable text.
        text += page.extract_text() or ""
    return text


def get_text_chunks(text):
    """
    Split the input text into chunks.

    Parameters
    ----------
    text : str
        The input text to be split.

    Returns
    -------
    list
        List of text chunks.
    """
    text_splitter = CharacterTextSplitter(
        separator="\n", chunk_size=1500, chunk_overlap=300, length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks


def get_vectorstore(text_chunks):
    """
    Generate a vector store from a list of text chunks using HuggingFaceBgeEmbeddings.

    Parameters
    ----------
    text_chunks : list
        List of text chunks to be embedded.

    Returns
    -------
    FAISS
        A FAISS vector store containing the embeddings of the text chunks.
    """
    model = "BAAI/bge-base-en-v1.5"
    # BGE models are intended to be used with normalized embeddings, so that
    # similarity scores behave like cosine similarity.
    encode_kwargs = {"normalize_embeddings": True}
    embeddings = HuggingFaceBgeEmbeddings(
        model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
    )
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    # Quick sanity check that the index returns results for a known query.
    print("-----")
    print(vectorstore.similarity_search("What is ALiBi?"))
    print("-----")
    return vectorstore


# Build the index once at startup from the bundled ALiBi paper.
pdf_path = r"new_papers/ALiBi.pdf"
pdf_text = get_pdf_text(pdf_path)

text_chunks = get_text_chunks(pdf_text)
api_db = get_vectorstore(text_chunks)

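
# Optional: the FAISS index could be persisted and reloaded instead of being
# rebuilt on every launch (save_local/load_local are LangChain FAISS helpers;
# "faiss_index" below is just an illustrative directory name):
#
#     api_db.save_local("faiss_index")
#     api_db = FAISS.load_local("faiss_index", embeddings)
#
# load_local needs the same embeddings object, and newer LangChain versions may
# also require allow_dangerous_deserialization=True.

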
def pdf_retrieval(query):
    """Return the documents most similar to the query from the indexed PDF."""
    response = api_db.similarity_search(query)
    print(response)
    # Gradio converts the returned list of Documents to a string for the Textbox.
    return response


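# A small optional helper (not wired into the interface below): it joins the
# page_content of each retrieved Document so results could be shown as plain
# text instead of the raw list repr. "format_results" is an illustrative name.
def format_results(docs):
    return "\n\n---\n\n".join(doc.page_content for doc in docs)

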
api_tool = gr.Interface(
    fn=pdf_retrieval,
    inputs=[gr.Textbox()],
    outputs=gr.Textbox(),
    live=True,
    title="API PDF Retrieval Tool",
    description="This tool indexes PDF documents and retrieves relevant answers based on a given query (HuggingFaceBgeEmbeddings).",
)

api_tool.launch()
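
# launch() also accepts optional kwargs such as share=True (temporary public
# link) or server_port=<port> if the default port is already in use.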