import os
import gradio as gr
from dotenv import load_dotenv
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
from pypdf import PdfReader  # PdfReader is used in get_pdf_text(); assumes the pypdf package is installed
from langchain.text_splitter import CharacterTextSplitter
# Load environment variables
#load_dotenv()
def get_pdf_text(pdf_docs):
    """
    Extract text from a list of PDF documents.

    Parameters
    ----------
    pdf_docs : list
        List of PDF documents to extract text from.

    Returns
    -------
    str
        Extracted text from all the PDF documents.
    """
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() may return None for pages with no extractable text
            text += page.extract_text() or ""
    return text

def get_text_chunks(text):
    """
    Split the input text into chunks.

    Parameters
    ----------
    text : str
        The input text to be split.

    Returns
    -------
    list
        List of text chunks.
    """
    text_splitter = CharacterTextSplitter(
        separator="\n", chunk_size=1500, chunk_overlap=300, length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks

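# Rough sanity check for the splitter (hypothetical input, not part of the app):
# with chunk_size=1500 and chunk_overlap=300, consecutive chunks should share
# roughly 300 characters of context, e.g.
#   chunks = get_text_chunks("some line\n" * 1000)
#   print(len(chunks), [len(c) for c in chunks[:3]])
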
def get_vectorstore(text_chunks):
    """
    Generate a vector store from a list of text chunks using HuggingFace BgeEmbeddings.

    Parameters
    ----------
    text_chunks : list
        List of text chunks to be embedded.

    Returns
    -------
    FAISS
        A FAISS vector store containing the embeddings of the text chunks.
    """
    model = "BAAI/bge-base-en-v1.5"
    encode_kwargs = {
        "normalize_embeddings": True
    }  # set True to compute cosine similarity
    embeddings = HuggingFaceBgeEmbeddings(
        model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
    )
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    print("-----")
    # Sanity-check the store with a sample similarity search
    print(vectorstore.similarity_search("What is ALiBi?"))
    print("-----")
    return vectorstore

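# Note (sketch, not part of the original flow): the FAISS index is rebuilt from
# scratch on every run. LangChain's FAISS wrapper can persist it if desired:
#   vectorstore.save_local("faiss_index")                      # hypothetical folder name
#   vectorstore = FAISS.load_local("faiss_index", embeddings)
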
# Build the vector store from the ALiBi paper (get_pdf_text expects a list of PDFs)
pdf_text = get_pdf_text(["./new_papers/ALiBi.pdf"])
text_chunks = get_text_chunks(pdf_text)
api_db = get_vectorstore(text_chunks)

# Define the PDF retrieval function
def pdf_retrieval(query):
    # Run the query through the retriever
    docs = api_db.similarity_search(query)
    print(docs)
    # Return the matched chunks as plain text so the Gradio Textbox renders them cleanly
    return "\n\n".join(doc.page_content for doc in docs)

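# Quick local check (hypothetical query), bypassing the Gradio UI:
#   print(pdf_retrieval("What is ALiBi and how does it encode positions?"))
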
# Create Gradio interface for the API retriever
api_tool = gr.Interface(
    fn=pdf_retrieval,
    inputs=[gr.Textbox()],
    outputs=gr.Textbox(),
    live=True,
    title="API PDF Retrieval Tool",
    description="This tool indexes PDF documents and retrieves relevant answers based on a given query (HuggingFaceBgeEmbeddings).",
)

# Launch the Gradio interface
api_tool.launch()