Last commit not found
import os | |
#!pip install -q gradio langchain pypdf chromadb | |
import gradio as gr | |
from langchain.vectorstores import Chroma | |
from langchain.document_loaders import PyPDFLoader | |
from langchain.text_splitter import CharacterTextSplitter | |
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings | |
# Embeddings are computed remotely through the Hugging Face Inference API.
# The access token is read from the HF environment variable; if the variable
# is not set, the lookup raises KeyError at startup.
inference_api_key = os.environ['HF']
api_hf_embeddings = HuggingFaceInferenceAPIEmbeddings(
    model_name="sentence-transformers/all-MiniLM-l6-v2",
    api_key=inference_api_key,
)
# Load the source PDF; the loader returns a list of documents.
loader = PyPDFLoader("./new_papers/ALiBi.pdf")
documents = loader.load()

# Show the first loaded document as a quick sanity check of the extraction.
print("-----------")
print(documents[0])
print("-----------")

# Split the loaded documents into chunks; each chunk is embedded and stored
# when the vector store is built below.
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
vdocuments = text_splitter.split_documents(documents)

# Report how many documents were loaded and how many chunks they produced.
# print() does not interpolate "%s" placeholders, so f-strings are used.
print(f"Length of documents: {len(documents)}")
print(f"Length of vdocuments: {len(vdocuments)}")

# Embed every chunk with the Inference API embeddings and index it in Chroma.
api_db = Chroma.from_documents(
    vdocuments,
    api_hf_embeddings,
    collection_name="api-collection",
)
# Define the PDF retrieval function | |
def pdf_retrieval(query):
    """Look up the indexed PDF chunks most similar to ``query``.

    Args:
        query: Search text, forwarded unchanged to the vector store.

    Returns:
        Whatever ``api_db.similarity_search`` returns for ``query``.
    """
    return api_db.similarity_search(query)
# Gradio front end for the API-embeddings retriever: one text box in, one
# text box out, with pdf_retrieval as the handler. live=True is passed
# through to gr.Interface as in the rest of the configuration.
api_tool = gr.Interface(
    title="API PDF Retrieval Tool",
    description="This tool indexes PDF documents and retrieves relevant answers based on a given query (HF Inference API Embeddings).",
    fn=pdf_retrieval,
    inputs=[gr.Textbox()],
    outputs=gr.Textbox(),
    live=True,
)

# Start serving the interface.
api_tool.launch()