File size: 2,553 Bytes
cae3cb9 6663cb2 018fb30 e5633a7 018fb30 e5633a7 018fb30 f7493dd 037c950 66fc16c e5633a7 1f5e9cb 037c950 008f20f 037c950 018fb30 38e2fac cbed288 c7297e1 f8472cb c7297e1 18cb8f3 142d17f ea07eae e181ae7 ea07eae f8472cb 037c950 ea07eae 142d17f 68b31c9 018fb30 68b31c9 037c950 68b31c9 018fb30 037c950 018fb30 42d7c62 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
import os
#!pip install -q gradio langchain pypdf chromadb
import gradio as gr
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from htmlTemplates import css, bot_template, user_template
from langchain.llms import HuggingFaceHub
# Hugging Face Inference API embeddings. The token is read from the `HF`
# environment variable; a missing variable raises KeyError right here.
# NOTE(review): `api_hf_embeddings` is not used by the index built below,
# which relies on the local BGE model instead — confirm it is still needed.
inference_api_key = os.environ['HF']
api_hf_embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=inference_api_key,
    model_name="sentence-transformers/all-MiniLM-l6-v2"
)

# Load the PDF and print the first loaded document as a quick sanity check.
loader = PyPDFLoader("./new_papers/ALiBi.pdf")
documents = loader.load()
print("-----------")
print(documents[0])
print("-----------")

# Split the loaded documents into small chunks (no overlap between chunks).
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
vdocuments = text_splitter.split_documents(documents)

# Embed the chunks with a BGE model running on the CPU.
model = "BAAI/bge-base-en-v1.5"
encode_kwargs = {
    "normalize_embeddings": True
}  # set True to compute cosine similarity
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
)

# `split_documents` yields Document objects, not plain strings, so the index
# must be built with `from_documents`; `from_texts` expects a list of `str`.
api_db = FAISS.from_documents(documents=vdocuments, embedding=embeddings)

# Smoke-test the index once before the UI starts. The result is discarded.
# (`as_retriever` is a method, so attribute access on it cannot run a query.)
api_db.similarity_search("What is ICD?")
# Extract the embedding arrays from the PDF documents
#embeddings = []
#for doc in vdocuments:
# embeddings.extend(api_hf_embeddings.get_embeddings(doc))
# Create Chroma vector store for API embeddings
#api_db = Chroma.from_documents(vdocuments, HfApiEmbeddingRetriever, collection_name="api-collection")
# Define the PDF retrieval function
def pdf_retrieval(query):
    """Look up ``query`` in the module-level vector store ``api_db``.

    Args:
        query: The search text entered by the user.

    Returns:
        Whatever ``api_db.similarity_search`` returns for ``query``,
        passed through unchanged.
    """
    return api_db.similarity_search(query)
# Create the Gradio interface for the PDF retriever: one text box in, one text
# box out, wired to `pdf_retrieval`. `live=True` is passed so the interface
# re-runs on input changes rather than only on an explicit submit.
api_tool = gr.Interface(
    fn=pdf_retrieval,
    inputs=[gr.Textbox()],
    outputs=gr.Textbox(),
    live=True,
    title="API PDF Retrieval Tool",
    description="This tool indexes PDF documents and retrieves relevant answers based on a given query (HF Inference API Embeddings).",
)

# Launch the Gradio interface
api_tool.launch()