In [2]:
import os
from pathlib import Path
import numpy as np
from openai import OpenAI
from haystack import Pipeline, component
from haystack.components.converters import PyPDFToDocument
from haystack.components.preprocessors import DocumentSplitter
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
from haystack.components.writers import DocumentWriter
from haystack.components.preprocessors import DocumentCleaner
from haystack.document_stores.types import DuplicatePolicy
from haystack_integrations.components.embedders.fastembed import (
 FastembedSparseDocumentEmbedder
)
from haystack.dataclasses import Document
from typing import List


In [None]:
import os
from pathlib import Path
import numpy as np
from openai import OpenAI
from haystack import Pipeline, component
from haystack.components.converters import PyPDFToDocument
from haystack.components.preprocessors import DocumentSplitter
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
from haystack.components.writers import DocumentWriter
from haystack.components.preprocessors import DocumentCleaner
from haystack.document_stores.types import DuplicatePolicy
from haystack_integrations.components.embedders.fastembed import (
 FastembedSparseDocumentEmbedder,
 FastembedDocumentEmbedder
)
from haystack.dataclasses import Document
from typing import List

@component
class DeepInfraEmbeddings:
 def __init__(
 self,
 api_key: str = "XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL",
 model_name: str = "BAAI/bge-base-en-v1.5",
 base_url: str = "https://api.deepinfra.com/v1/openai"
 ):
 self.client = OpenAI(
 api_key=api_key,
 base_url=base_url
 )
 self.model_name = model_name

 @component.output_types(documents=List[Document])
 def run(self, documents: List[Document]) -> dict:
 texts = [doc.content for doc in documents]
 
 response = self.client.embeddings.create(
 model=self.model_name,
 input=texts,
 encoding_format="float"
 )
 
 embeddings = [np.array(embedding.embedding) for embedding in response.data]
 
 for doc, embedding in zip(documents, embeddings):
 doc.embedding = embedding
 
 return {"documents": documents}

 def to_dict(self):
 return {
 "api_key": self.client.api_key,
 "model_name": self.model_name,
 "base_url": self.client.base_url
 }

 @classmethod
 def from_dict(cls, data):
 return cls(
 api_key=data["api_key"],
 model_name=data["model_name"],
 base_url=data["base_url"]
 )

# Initialize Qdrant document store
document_store = QdrantDocumentStore(
 host="0.0.0.0",
 port=6333,
 index="aaa_test",
 recreate_index=True,
 use_sparse_embeddings=True, # Enable hybrid search
 sparse_idf=True, # Enable IDF calculation for sparse embeddings
 embedding_dim=768 # Adjust based on your DeepInfra model's dimension
)

cleaner = DocumentCleaner(
 ascii_only=True,
 remove_empty_lines=True,
 remove_extra_whitespaces=True,
 remove_repeated_substrings=False
)

# Create pipeline components
converter = PyPDFToDocument()

document_splitter = DocumentSplitter(
 split_by="word",
 split_length=300,
 split_overlap=30
)

# Configure embedders
deep_infra_embedder = DeepInfraEmbeddings()
sparse_embedder = FastembedSparseDocumentEmbedder(model="prithvida/Splade_PP_en_v1")

# Create document writer
writer = DocumentWriter(
 document_store=document_store,
 policy=DuplicatePolicy.OVERWRITE
)

# Create and configure the pipeline
indexing_pipeline = Pipeline()
indexing_pipeline.add_component("converter", converter)
indexing_pipeline.add_component("cleaner", cleaner)
indexing_pipeline.add_component("splitter", document_splitter)
indexing_pipeline.add_component("deep_infra_embedder", deep_infra_embedder)
indexing_pipeline.add_component("sparse_embedder", sparse_embedder)
indexing_pipeline.add_component("writer", writer)

# Connect the components
indexing_pipeline.connect("converter", "cleaner")
indexing_pipeline.connect("cleaner", "splitter")
indexing_pipeline.connect("splitter", "sparse_embedder")
indexing_pipeline.connect("sparse_embedder", "deep_infra_embedder")
indexing_pipeline.connect("deep_infra_embedder", "writer")

# Get list of PDF files and prepare metadata
pdf_folder = "/root/hbr"
pdf_files = []
metadata_list = []

for filename in os.listdir(pdf_folder):
 if filename.endswith('.pdf'):
 file_path = Path(os.path.join(pdf_folder, filename))
 pdf_files.append(file_path)
 metadata_list.append({
 "file_name": filename,
 "file_path": str(file_path),
 "source_type": "pdf"
 })

# Run the pipeline with metadata
indexing_pipeline.run({
 "converter": {
 "sources": pdf_files,
 "meta": metadata_list
 }
})

print(f"Indexed documents in Qdrant. Total documents: {document_store.count_documents()}")

# Optional: Verify metadata
documents = document_store.filter_documents(filters={})
if documents:
 print("\nSample document metadata:")
 print(f"File name: {documents[0].meta.get('file_name')}")
 print(f"File path: {documents[0].meta.get('file_path')}")


Calculating sparse embeddings: 100%|██████████| 393/393 [02:01<00:00, 3.25it/s]
400it [00:02, 161.43it/s] 


Indexed documents in Qdrant. Total documents: 393

Sample document metadata:
File name: HBR_Tech.pdf
File path: /root/hbr/HBR_Tech.pdf


In [23]:
PyPDFToDocument()


Inputs:
 - sources: List[Union[str, Path, ByteStream]]
 - meta: Union[Dict[str, Any], List[Dict[str, Any]]]
Outputs:
 - documents: List[Document]

In [None]:
from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever
from haystack_integrations.components.embedders.fastembed import FastembedTextEmbedder
from haystack import Pipeline
from haystack.utils import Secret
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
from haystack.components.generators import OpenAIGenerator
from haystack.components.builders import PromptBuilder
from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever
from haystack_integrations.components.embedders.fastembed import FastembedSparseTextEmbedder
from openai import OpenAI
import numpy as np
from haystack import component
from haystack.dataclasses import Document
from typing import List

@component
class DeepInfraEmbeddings:
 def __init__(
 self,
 api_key: str = "XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL",
 model_name: str = "BAAI/bge-base-en-v1.5",
 base_url: str = "https://api.deepinfra.com/v1/openai"
 ):
 self.client = OpenAI(
 api_key=api_key,
 base_url=base_url
 )
 self.model_name = model_name

 @component.output_types(embedding=List[float])
 def run(self, text: str) -> dict:
 response = self.client.embeddings.create(
 model=self.model_name,
 input=[text],
 encoding_format="float"
 )
 embedding = np.array(response.data[0].embedding)
 return {"embedding": embedding}


prompt_template = """
I am a search engine bot. My only purpose is to locate and point to relevant information within the provided documents. I do not provide interpretations or answers - I only help you find where the information exists.

Context:
{% for doc in documents %}
Content: {{ doc.content }}
Source: {{ doc.meta.file_name }}, Page: {{ doc.meta.page_number }}
---
{% endfor %}

Search Query: {{question}}

Relevant Matches:
Here are the exact matches found in the documents:

"[exact quote]" 
Location: [filename, page X]

"[exact quote]" 
Location: [filename, page X]

If no relevant matches are found, I will respond:
"No matching information found in the provided documents."
"""
prompt_builder = PromptBuilder(template=prompt_template)
llm = OpenAIGenerator(
 api_key=Secret.from_token("XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL"),
 api_base_url="https://api.deepinfra.com/v1/openai",
 model="meta-llama/Meta-Llama-3.1-8B-Instruct",
 generation_kwargs={
 "max_tokens": 512,
 "temperature": 0.7,
 }
)

deep_infra_embedder = DeepInfraEmbeddings()

hybrid_query = Pipeline()
hybrid_query.add_component('sparse_text_embedder',FastembedSparseTextEmbedder(model="prithvida/Splade_PP_en_v1"))
hybrid_query.add_component('dense_text_embedder',deep_infra_embedder)
hybrid_query.add_component("retriever", QdrantHybridRetriever(document_store=document_store))
hybrid_query.add_component("prompt_builder", prompt_builder)
hybrid_query.add_component("llm", llm)


hybrid_query.connect('sparse_text_embedder.sparse_embedding','retriever.query_sparse_embedding')
hybrid_query.connect("dense_text_embedder.embedding", "retriever.query_embedding")
hybrid_query.connect("retriever", "prompt_builder.documents")
hybrid_query.connect("prompt_builder", "llm")


question = "what is this document about ?"
response = hybrid_query.run({
 "dense_text_embedder": {"text": question},
 "sparse_text_embedder": {"text": question},
 "retriever": {"top_k": 5},
 "prompt_builder": {"question": question}
})

Calculating sparse embeddings: 100%|██████████| 1/1 [00:00<00:00, 13.05it/s]


In [18]:
response

{'llm': {'replies': ['Here are the relevant matches:\n\n* "Synthetic data is a tool that addresses many data challenges, particularly artificial intelligence and analytics issues such as privacy protection, regulatory compliance, accessibility, data scarcity, and bias, as well as data sharing and time to data (and therefore time to market)." \nLocation: HBR_Synthetic_Data.pdf, Page: 2\n* "Synthetic data is a tool that addresses many data challenges, particularly artificial intelligence and analytics issues such as privacy protection, regulatory compliance, accessibility, data scarcity, and bias, as well as data sharing and time to data (and therefore time to market)." \nLocation: HBR_Synthetic_Data.pdf, Page: 2\n* "The synthetic data opportunity means something different for every organization, but new revenue streams; faster, easier, GDPR-compliant data access; better pricing models; and scalable, ethical, and explainable AI all are within reach for those business leaders ready to rem

In [32]:
from haystack import Pipeline
from haystack.utils import Secret
from haystack.components.generators import OpenAIGenerator
from haystack.components.builders import PromptBuilder
from openai import OpenAI
import numpy as np
from haystack import component
from haystack.dataclasses import Document
from typing import List
from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever
from haystack_integrations.components.embedders.fastembed import FastembedTextEmbedder
from haystack import Pipeline
from haystack.utils import Secret
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
from haystack.components.generators import OpenAIGenerator
from haystack.components.builders import PromptBuilder
from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever
from haystack_integrations.components.embedders.fastembed import FastembedSparseTextEmbedder
from openai import OpenAI
import numpy as np
from haystack import component
from haystack.dataclasses import Document
from typing import List


# Connect to existing ChromaDB document store
document_store = QdrantDocumentStore(
 host="0.0.0.0",
 port=6333,
 index="aaa_test",
 recreate_index=False,
 use_sparse_embeddings=True, # Enable hybrid search
 sparse_idf=True, # Enable IDF calculation for sparse embeddings
 embedding_dim=768, # Adjust based on your DeepInfra model's dimension
 similarity="cosine"
)

@component
class DeepInfraEmbeddings:
 def __init__(
 self,
 api_key: str = "XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL",
 model_name: str = "BAAI/bge-base-en-v1.5",
 base_url: str = "https://api.deepinfra.com/v1/openai"
 ):
 self.client = OpenAI(
 api_key=api_key,
 base_url=base_url
 )
 self.model_name = model_name

 @component.output_types(embedding=List[float])
 def run(self, text: str) -> dict:
 response = self.client.embeddings.create(
 model=self.model_name,
 input=[text],
 encoding_format="float"
 )
 embedding = np.array(response.data[0].embedding)
 return {"embedding": embedding}

# Create prompt template for QA
# Modified prompt template to include metadata
prompt_template = """
I am a search engine bot. My only purpose is to locate and point to relevant information within the provided documents. I do not provide interpretations or answers - I only help you find where the information exists.

For each search query, I will:
1. Identify matching sentences/passages from the documents
2. Show the exact quotes with their source locations
3. Provide file names and page numbers where the information can be found

Context:
{% for doc in documents %}
Content: {{ doc.content }}
Source: {{ doc.meta.file_name }}, Page: {{ doc.meta.page_number }}
---
{% endfor %}

Search Query: {{question}}

Relevant Matches:
Here are the exact matches found in the documents:

"[exact quote]" 
Location: [filename, page X]

"[exact quote]" 
Location: [filename, page X]

If no relevant matches are found, I will respond:
"No matching information found in the provided documents."

Note: I do not provide answers or interpretations. I only help locate where information exists within the documents.
"""

def run_pipeline(question, top_k=10):
 """
 Run the pipeline and return both the answer and retrieved documents.
 """
 # Initialize components
 embedder = DeepInfraEmbeddings()
 # embedder = FastembedTextEmbedder(model="BAAI/bge-small-en-v1.5", prefix="Represent this sentence for searching relevant passages: ")
 sparse_embedder = FastembedSparseTextEmbedder(model="prithvida/Splade_PP_en_v1")
 retriever = QdrantHybridRetriever(document_store=document_store,
 top_k=10
 )
 prompt_builder = PromptBuilder(template=prompt_template)
 llm = OpenAIGenerator(
 api_key=Secret.from_token("XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL"),
 api_base_url="https://api.deepinfra.com/v1/openai",
 model="meta-llama/Meta-Llama-3.1-8B-Instruct",
 generation_kwargs={
 "max_tokens": 512,
 "temperature": 0.7,
 }
 )

 # Create and connect pipeline components
 pipeline = Pipeline()
 pipeline.add_component("embedder", embedder)
 pipeline.add_component("sparse_embedder", sparse_embedder)
 pipeline.add_component("retriever", retriever)
 pipeline.add_component("prompt_builder", prompt_builder)
 pipeline.add_component("llm", llm)

 pipeline.connect("sparse_embedder.sparse_embedding", "retriever.query_sparse_embedding")
 pipeline.connect("embedder.embedding", "retriever.query_embedding")
 pipeline.connect("retriever", "prompt_builder.documents")
 pipeline.connect("prompt_builder", "llm")

 # Run pipeline
 response = pipeline.run({
 "sparse_embedder": {"text": question},
 "embedder": {"text": question},
 "retriever": {"top_k": top_k},
 "prompt_builder": {"question": question}
 })

 # Get documents with metadata - Corrected retriever call
 dense_embedding = embedder.run(text=question)["embedding"]
 sparse_embedding = sparse_embedder.run(text=question)["sparse_embedding"]
 
 retriever_response = retriever.run(
 query_embedding=dense_embedding,
 query_sparse_embedding=sparse_embedding,
 top_k=top_k
 )
 
 documents = retriever_response.get("documents", [])

 # Extract answer
 answer = response["llm"]["replies"][0]

 print("\nRetrieved Documents with Metadata:")
 print("=" * 50)
 if documents:
 for i, doc in enumerate(documents, 1):
 print(f"\nDocument {i}:")
 print("-" * 30)
 print(f"Content: {doc.content}")
 if hasattr(doc, 'meta') and doc.meta:
 print("\nMetadata:")
 for key, value in doc.meta.items():
 print(f"- {key}: {value}")

 return answer, documents


def display_results(question):
 """
 Display the results in a formatted way
 """
 print(f"\nQuestion: {question}")
 print(f"\nDocument Store Status:")
 print(f"Total documents: {document_store.count_documents()}")
 
 answer, documents = run_pipeline(question)
 
 print(f"\nAnswer: {answer}")
 print(f"\nNumber of documents retrieved: {len(documents)}")

# Usage
if __name__ == "__main__":
 question = """
 where are the instructions ??
 
 """
 display_results(question)




Question: 
 where are the instructions ??
 
 

Document Store Status:
Total documents: 80


Calculating sparse embeddings: 100%|██████████| 1/1 [00:00<00:00, 16.56it/s]
Calculating sparse embeddings: 100%|██████████| 1/1 [00:00<00:00, 26.54it/s]


Retrieved Documents with Metadata:

Document 1:
------------------------------
Content: at that time. NEW SAMPLE INSTRUCTIONS: Sample instructions have been revised to include changes to the bottle and sampling form. The instructions include detailed information on how to collect the sample using the new bottle, how to complete the new sample collection form, how to best ship samples to the MSPHL using the free MSPHL courier system, and how to register for the new MSPHL web portal. A copy of these instructions is attached. NEW WEB PORTAL FOR RESULTS REPORTS The OE LIMS provides a web portal that may be used by systems to view and print their test result reports, check status of samples, download sample information into Excel, and receive automated emails when samples are received at the laboratory, and when sample results are ready to be viewed. For information on how to gain access to this portal, please contact Shondra Johnson, LIMS Administrator at Shondra.Johnson@health.mo.gov or 




In [31]:
from haystack import Pipeline
from haystack.utils import Secret
# from haystack_integrations.document_stores.chroma import ChromaDocumentStore
from haystack.components.generators import OpenAIGenerator
from haystack.components.builders import PromptBuilder
# from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever
from openai import OpenAI
import numpy as np
from haystack import component
from haystack.dataclasses import Document
from typing import List
from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever
from haystack_integrations.components.embedders.fastembed import FastembedTextEmbedder
from haystack import Pipeline
from haystack.utils import Secret
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
from haystack.components.generators import OpenAIGenerator
from haystack.components.builders import PromptBuilder
from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever
from haystack_integrations.components.embedders.fastembed import FastembedSparseTextEmbedder
from openai import OpenAI
import numpy as np
from haystack import component
from haystack.dataclasses import Document
from typing import List



from haystack_integrations.components.embedders.fastembed import (
 FastembedTextEmbedder,
 FastembedSparseTextEmbedder
)

# Connect to existing ChromaDB document store
document_store = QdrantDocumentStore(
 host="0.0.0.0",
 port=6333,
 index="aaa_test",
 recreate_index=False,
 use_sparse_embeddings=True, # Enable hybrid search
 sparse_idf=True, # Enable IDF calculation for sparse embeddings
 embedding_dim=768, # Adjust based on your DeepInfra model's dimension
 similarity="cosine"
)

@component
class DeepInfraEmbeddings:
 def __init__(
 self,
 api_key: str = "XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL",
 model_name: str = "BAAI/bge-base-en-v1.5",
 base_url: str = "https://api.deepinfra.com/v1/openai"
 ):
 self.client = OpenAI(
 api_key=api_key,
 base_url=base_url
 )
 self.model_name = model_name

 @component.output_types(embedding=List[float])
 def run(self, text: str) -> dict:
 response = self.client.embeddings.create(
 model=self.model_name,
 input=[text],
 encoding_format="float"
 )
 embedding = np.array(response.data[0].embedding)
 return {"embedding": embedding}

# Create prompt template for QA
# Modified prompt template to include metadata
prompt_template = """
I am a search engine bot. My only purpose is to locate and point to relevant information within the provided documents. I do not provide interpretations or answers - I only help you find where the information exists.

For each search query, I will:
1. Identify matching sentences/passages from the documents
2. Show the exact quotes with their source locations
3. Provide file names and page numbers where the information can be found

Context:
{% for doc in documents %}
Content: {{ doc.content }}
Source: {{ doc.meta.file_name }}, Page: {{ doc.meta.page_number }}
---
{% endfor %}

Search Query: {{question}}

Relevant Matches:
Here are the exact matches found in the documents:

"[exact quote]" 
Location: [filename, page X]

"[exact quote]" 
Location: [filename, page X]

If no relevant matches are found, I will respond:
"No matching information found in the provided documents."

Note: I do not provide answers or interpretations. I only help locate where information exists within the documents.
"""

def run_pipeline(question, top_k=10):
 """
 Run the pipeline and return both the answer and retrieved documents.
 """
 # Initialize components
 embedder = FastembedTextEmbedder(
 model="BAAI/bge-base-en-v1.5",
 prefix="Represent this sentence for searching relevant passages: "
 )
 sparse_embedder = FastembedSparseTextEmbedder(
 model="prithvida/Splade_PP_en_v1"
 )
 retriever = QdrantHybridRetriever(
 document_store=document_store,
 top_k=top_k
 # sparse_weight=0.5, # Add weights for hybrid search
 # dense_weight=0.5
 )
 prompt_builder = PromptBuilder(template=prompt_template)
 llm = OpenAIGenerator(
 api_key=Secret.from_token("XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL"),
 api_base_url="https://api.deepinfra.com/v1/openai",
 model="meta-llama/Meta-Llama-3.1-8B-Instruct",
 generation_kwargs={
 "max_tokens": 512,
 "temperature": 0.7,
 }
 )

 # Create pipeline
 pipeline = Pipeline()
 pipeline.add_component("embedder", embedder)
 pipeline.add_component("sparse_embedder", sparse_embedder)
 pipeline.add_component("retriever", retriever)
 pipeline.add_component("prompt_builder", prompt_builder)
 pipeline.add_component("llm", llm)

 # Connect components
 pipeline.connect("sparse_embedder.sparse_embedding", "retriever.query_sparse_embedding")
 pipeline.connect("embedder.embedding", "retriever.query_embedding")
 pipeline.connect("retriever", "prompt_builder.documents")
 pipeline.connect("prompt_builder", "llm")

 # Run pipeline
 response = pipeline.run({
 "sparse_embedder": {"text": question},
 "embedder": {"text": question},
 "retriever": {"top_k": top_k},
 "prompt_builder": {"question": question}
 })

 # Get documents with metadata
 dense_embedding = embedder.run(text=question)["embedding"]
 sparse_embedding = sparse_embedder.run(text=question)["sparse_embedding"]
 
 retriever_response = retriever.run(
 query_embedding=dense_embedding,
 query_sparse_embedding=sparse_embedding,
 top_k=top_k
 )
 
 documents = retriever_response.get("documents", [])

 # Extract answer
 answer = response["llm"]["replies"][0]

 print("\nRetrieved Documents with Metadata:")
 print("=" * 50)
 if documents:
 for i, doc in enumerate(documents, 1):
 print(f"\nDocument {i}:")
 print("-" * 30)
 print(f"Content: {doc.content}")
 if hasattr(doc, 'meta') and doc.meta:
 print("\nMetadata:")
 for key, value in doc.meta.items():
 print(f"- {key}: {value}")

 return answer, documents


def display_results(question):
 """
 Display the results in a formatted way
 """
 print(f"\nQuestion: {question}")
 print(f"\nDocument Store Status:")
 print(f"Total documents: {document_store.count_documents()}")
 
 answer, documents = run_pipeline(question)
 
 print(f"\nAnswer: {answer}")
 print(f"\nNumber of documents retrieved: {len(documents)}")

# Usage
if __name__ == "__main__":
 question = """
 where are the instructions ?
 
 """
 display_results(question)






Question: 
 where are the instructions ?
 
 

Document Store Status:
Total documents: 80


Fetching 5 files: 100%|██████████| 5/5 [00:06<00:00, 1.25s/it]
Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00, 1.22it/s]
Calculating sparse embeddings: 100%|██████████| 1/1 [00:00<00:00, 17.84it/s]
Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00, 1.51it/s]
Calculating sparse embeddings: 100%|██████████| 1/1 [00:00<00:00, 22.32it/s]


Retrieved Documents with Metadata:

Document 1:
------------------------------
Content: at that time. NEW SAMPLE INSTRUCTIONS: Sample instructions have been revised to include changes to the bottle and sampling form. The instructions include detailed information on how to collect the sample using the new bottle, how to complete the new sample collection form, how to best ship samples to the MSPHL using the free MSPHL courier system, and how to register for the new MSPHL web portal. A copy of these instructions is attached. NEW WEB PORTAL FOR RESULTS REPORTS The OE LIMS provides a web portal that may be used by systems to view and print their test result reports, check status of samples, download sample information into Excel, and receive automated emails when samples are received at the laboratory, and when sample results are ready to be viewed. For information on how to gain access to this portal, please contact Shondra Johnson, LIMS Administrator at Shondra.Johnson@health.mo.gov or 




## ocr

In [16]:
from PyPDF2 import PdfReader
from pdf2image import convert_from_path
import base64
from openai import OpenAI

# Create an OpenAI client with your deepinfra token and endpoint
openai = OpenAI(
 api_key="XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL",
 base_url="https://api.deepinfra.com/v1/openai",
)

# Function to get the total number of pages in a PDF file
def get_pdf_page_count(pdf_path):
 """Get the total number of pages in a PDF file."""
 with open(pdf_path, "rb") as f:
 pdf_reader = PdfReader(f)
 return len(pdf_reader.pages)

# Function to convert PDF pages to images, limiting to actual page count
def convert_pdf_to_images(pdf_path, dpi=100):
 """Convert PDF pages to images, up to the actual number of pages."""
 total_pages = get_pdf_page_count(pdf_path)
 return convert_from_path(pdf_path, dpi=dpi, last_page=total_pages)

def encode_image_to_base64(image):
 from io import BytesIO
 buffered = BytesIO()
 image.save(buffered, format="JPEG")
 return base64.b64encode(buffered.getvalue()).decode('utf-8')

# Path to your scanned PDF
pdf_path = "/root/PublicWaterMassMailing.pdf"

# Convert scanned PDF pages into images (limiting to actual page count)
images = convert_pdf_to_images(pdf_path)

# Process each page image
for i, image in enumerate(images):
 # Save each image temporarily (optional: you can skip saving and directly encode)
 image_path = f"page_{i+1}.jpg"
 image.save(image_path, 'JPEG')
 
 # Encode image to base64
 base64_image = encode_image_to_base64(image)
 
 # Send the base64-encoded image to DeepInfra's Vision model for OCR
 chat_completion = openai.chat.completions.create(
 model="meta-llama/Llama-3.2-11B-Vision-Instruct",
 messages=[
 {
 "role": "user",
 "content": [
 {
 "type": "image_url",
 "image_url": {
 "url": f"data:image/jpeg;base64,{base64_image}"
 }
 },
 {
 "type": "text",
 "text": "only response with all the extracted text from the document and nothing else"
 }
 ]
 }
 ]
 )
 
 # Print or process OCR result for each page
 print(f"Text from page {i+1}:", chat_completion.choices[0].message.content)

Text from page 1: **Missouri Department of Health and Senior Services**

P.O. Box 570, Jefferson City, MO 65102-0570 Phone: 573-751-5400 FAX: 573-751-6010 RELAY MISSOURI for Hearing and Speech Impaired 1-800-735-2966 VOICE 1-800-735-2466

**Peter Lyskowski**

Acting Director

**Jeremiah W. (Jay) Nixon**

Governor

**Missouri Public Water Systems**

November 10, 2015

Dear Public Water System Owners/Operators:

The Missouri State Public Health Laboratory (MSPHL) is in the process of implementing a new Laboratory Information Management System (LIMS) in its drinking water bacteriology testing laboratory. The OpenELIS (OE) LIMS will provide the laboratory with improved sample management capability, improved data integrity and reduced potential for human data entry error. In addition, the system will provide improved reporting capabilities, including direct electronic data exchange with the Missouri Department of Natural Resources' (MDNR) Safe Drinking Water Information System (SDWIS). SDWI

In [10]:
from PyPDF2 import PdfReader

pdf_path = "/root/PublicWaterMassMailing.pdf"

def get_pdf_page_count(pdf_path):
 """Get the total number of pages in a PDF file."""
 with open(pdf_path, "rb") as f:
 pdf_reader = PdfReader(f)
 return len(pdf_reader.pages)
 
print(get_pdf_page_count(pdf_path))

8


In [None]:
from pdf2image import convert_from_path
from PIL import Image

# Convert PDF pages to images
def convert_pdf_to_images(pdf_path, dpi=300):
 """Convert PDF pages to images."""
 return convert_from_path(pdf_path, dpi=dpi)

# Display each page as an image
def display_pdf_pages_as_images(pdf_path):
 """Display each page of the PDF as an image."""
 images = convert_pdf_to_images(pdf_path)
 
 for i, image in enumerate(images):
 # Display the image using PIL's show method
 print(f"Displaying page {i+1}")
 image.show()

# Path to your scanned PDF
pdf_path = "/root/PublicWaterMassMailing.pdf"

# Display all pages
display_pdf_pages_as_images(pdf_path)


Displaying page 1
Displaying page 2


Error: no "view" mailcap rules found for type "image/png"


Displaying page 3


Error: no "view" mailcap rules found for type "image/png"


Displaying page 4


Error: no "view" mailcap rules found for type "image/png"


Displaying page 5


Error: no "view" mailcap rules found for type "image/png"


Displaying page 6


Error: no "view" mailcap rules found for type "image/png"


Displaying page 7


Error: no "view" mailcap rules found for type "image/png"


Displaying page 8


Error: no "view" mailcap rules found for type "image/png"


Error: no "view" mailcap rules found for type "image/png"


In [19]:
from haystack import Pipeline, component
from haystack.components.converters import PyPDFToDocument
from haystack.components.preprocessors import DocumentSplitter
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
from haystack.components.writers import DocumentWriter
from haystack.components.preprocessors import DocumentCleaner
from haystack.document_stores.types import DuplicatePolicy
from haystack_integrations.components.embedders.fastembed import (
 FastembedSparseDocumentEmbedder,
 # FastembedDocumentEmbedder
)
from haystack.dataclasses import Document
from typing import List
import os
from pathlib import Path
import numpy as np
from openai import OpenAI
import base64
from pdf2image import convert_from_path
from PyPDF2 import PdfReader
from io import BytesIO

# Initialize OpenAI client for DeepInfra embeddings and OCR
openai = OpenAI(
 api_key="XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL",
 base_url="https://api.deepinfra.com/v1/openai",
)

# Function to check if PDF is scanned (no extractable text)
def is_scanned_pdf(pdf_path):
 try:
 with open(pdf_path, "rb") as f:
 pdf_reader = PdfReader(f)
 for page in pdf_reader.pages:
 if page.extract_text():
 return False # Text found, not a scanned document
 return True # No extractable text found, likely a scanned document
 except Exception as e:
 raise ValueError(f"Error reading PDF file: {e}")

# Function to convert PDF pages to images and perform OCR using DeepInfra Vision model
def perform_ocr_on_pdf(pdf_path):
 try:
 images = convert_from_path(pdf_path)
 ocr_text = ""
 ocr_metadata = []

 for i, image in enumerate(images):
 # Convert image to base64 for OCR processing
 buffered = BytesIO()
 image.save(buffered, format="JPEG")
 base64_image = base64.b64encode(buffered.getvalue()).decode('utf-8')
 
 # Perform OCR using DeepInfra Vision model
 chat_completion = openai.chat.completions.create(
 model="meta-llama/Llama-3.2-11B-Vision-Instruct",
 messages=[
 {
 "role": "user",
 "content": [
 {
 "type": "image_url",
 "image_url": {
 "url": f"data:image/jpeg;base64,{base64_image}"
 }
 },
 {
 "type": "text",
 "text": "only respond with all the extracted text from the document and nothing else"
 }
 ]
 }
 ]
 )
 
 # Append OCR result for each page along with page number metadata
 ocr_text += chat_completion.choices[0].message.content + "\n"
 ocr_metadata.append({
 "page_number": i + 1,
 "file_name": Path(pdf_path).name,
 "file_path": str(pdf_path)
 })
 
 return ocr_text, ocr_metadata
 
 except Exception as e:
 raise ValueError(f"Error during OCR processing: {e}")

@component
class PdfToTextOrOCR:
 def run(self, documents: List[Path]) -> List[Document]:
 processed_documents = []
 
 for pdf_path in documents:
 if is_scanned_pdf(pdf_path):
 print(f"Performing OCR on scanned PDF: {pdf_path}")
 extracted_text, ocr_metadata = perform_ocr_on_pdf(pdf_path)
 
 # Create Haystack Document object with extracted content and metadata per page.
 for meta in ocr_metadata:
 doc = Document(content=extracted_text, meta=meta)
 processed_documents.append(doc)
 else:
 print(f"Extracting text from non-scanned PDF: {pdf_path}")
 with open(pdf_path, "rb") as f:
 reader = PdfReader(f)
 extracted_text = "\n".join([page.extract_text() for page in reader.pages])
 
 # Create Haystack Document object with extracted content and metadata.
 doc = Document(content=extracted_text, meta={
 "file_name": Path(pdf_path).name,
 "file_path": str(pdf_path),
 "source_type": "pdf"
 })
 processed_documents.append(doc)
 
 return processed_documents # Return list of Document objects directly
 
@component
class DeepInfraEmbeddings:
 def __init__(
 self,
 api_key: str = "XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL",
 model_name: str = "BAAI/bge-base-en-v1.5",
 base_url: str = "https://api.deepinfra.com/v1/openai"
 ):
 self.client = OpenAI(
 api_key=api_key,
 base_url=base_url
 )
 self.model_name = model_name

 @component.output_types(documents=List[Document])
 def run(self, documents: List[Document]) -> dict:
 texts = [doc.content for doc in documents]
 
 response = self.client.embeddings.create(
 model=self.model_name,
 input=texts,
 encoding_format="float"
 )
 
 embeddings = [np.array(embedding.embedding) for embedding in response.data]
 
 for doc, embedding in zip(documents, embeddings):
 doc.embedding = embedding
 
 return {"documents": documents}

# Initialize Qdrant document store as before
document_store = QdrantDocumentStore(
 host="0.0.0.0",
 port=6333,
 index="aaa_test",
 recreate_index=True,
 use_sparse_embeddings=True,
 sparse_idf=True,
 embedding_dim=768 # Adjust based on your DeepInfra model's dimension
)

# Initialize other components as before
cleaner = DocumentCleaner(
 ascii_only=True,
 remove_empty_lines=True,
 remove_extra_whitespaces=True,
)

document_splitter = DocumentSplitter(
 split_by="word",
 split_length=300,
 split_overlap=30,
)

# Initialize embedders (DeepInfra and Fastembed)
deep_infra_embedder = DeepInfraEmbeddings()
sparse_embedder = FastembedSparseDocumentEmbedder(model="prithvida/Splade_PP_en_v1")

writer = DocumentWriter(
 document_store=document_store,
 policy=DuplicatePolicy.OVERWRITE,
)

# Create and configure the pipeline with new PdfToTextOrOCR component
indexing_pipeline = Pipeline()
indexing_pipeline.add_component("pdf_to_text_or_ocr", PdfToTextOrOCR())
indexing_pipeline.add_component("cleaner", cleaner)
indexing_pipeline.add_component("splitter", document_splitter)
indexing_pipeline.add_component("deep_infra_embedder", deep_infra_embedder)
indexing_pipeline.add_component("sparse_embedder", sparse_embedder)
indexing_pipeline.add_component("writer", writer)

# Connect components in the pipeline flow 
indexing_pipeline.connect("pdf_to_text_or_ocr", "cleaner")
indexing_pipeline.connect("cleaner", "splitter")
indexing_pipeline.connect("splitter", "sparse_embedder")
indexing_pipeline.connect("sparse_embedder", "deep_infra_embedder")
indexing_pipeline.connect("deep_infra_embedder", "writer")

# Get list of PDF files and prepare metadata as before
pdf_folder = "/root/hbr"
pdf_files = [Path(os.path.join(pdf_folder, filename)) for filename in os.listdir(pdf_folder) if filename.endswith('.pdf')]

# Run the pipeline on all PDFs in folder (with automatic OCR handling)
indexing_pipeline.run({"pdf_to_text_or_ocr": pdf_files})

print(f"Indexed documents in Qdrant. Total documents: {document_store.count_documents()}")



PipelineConnectError: Cannot connect 'pdf_to_text_or_ocr' with 'cleaner': no matching connections available.
'pdf_to_text_or_ocr':

'cleaner':
 - documents: List[Document] (available)

In [52]:
from haystack import Pipeline, component
from haystack.components.converters import PyPDFToDocument
from haystack.components.preprocessors import DocumentSplitter
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
from haystack.components.writers import DocumentWriter
from haystack.components.preprocessors import DocumentCleaner
from haystack.document_stores.types import DuplicatePolicy
from haystack_integrations.components.embedders.fastembed import (
 FastembedSparseDocumentEmbedder,
 FastembedDocumentEmbedder
)
from haystack.dataclasses import Document
from typing import List
import os
from pathlib import Path
import numpy as np
from openai import OpenAI
import base64
from pdf2image import convert_from_path
from PyPDF2 import PdfReader
from io import BytesIO

# Initialize OpenAI client for DeepInfra embeddings and OCR
openai = OpenAI(
 api_key="XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL",
 base_url="https://api.deepinfra.com/v1/openai",
)

# Function to check if PDF is scanned (no extractable text)
def is_scanned_pdf(pdf_path):
 try:
 with open(pdf_path, "rb") as f:
 pdf_reader = PdfReader(f)
 for page in pdf_reader.pages:
 if page.extract_text():
 return False # Text found, not a scanned document
 return True # No extractable text found, likely a scanned document
 except Exception as e:
 raise ValueError(f"Error reading PDF file: {e}")

# Function to convert PDF pages to images and perform OCR using DeepInfra Vision model
def perform_ocr_on_pdf(pdf_path):
 try:
 images = convert_from_path(pdf_path)
 ocr_text = ""
 ocr_metadata = []

 for i, image in enumerate(images):
 # Convert image to base64 for OCR processing
 buffered = BytesIO()
 image.save(buffered, format="JPEG")
 base64_image = base64.b64encode(buffered.getvalue()).decode('utf-8')
 
 # Perform OCR using DeepInfra Vision model
 chat_completion = openai.chat.completions.create(
 model="meta-llama/Llama-3.2-11B-Vision-Instruct",
 messages=[
 {
 "role": "user",
 "content": [
 {
 "type": "image_url",
 "image_url": {
 "url": f"data:image/jpeg;base64,{base64_image}"
 }
 },
 {
 "type": "text",
 "text": "only respond with all the extracted text from the document and nothing else"
 }
 ]
 }
 ]
 )
 
 # Append OCR result for each page along with page number metadata
 ocr_text += chat_completion.choices[0].message.content + "\n"
 ocr_metadata.append({
 "page_number": i + 1,
 "file_name": Path(pdf_path).name,
 "file_path": str(pdf_path),
 "content": ocr_text
 })
 
 return ocr_metadata
 
 except Exception as e:
 raise ValueError(f"Error during OCR processing: {e}")

@component
class PdfToTextOrOCR:
 @component.output_types(documents=List[Document])
 def run(self, documents: List[Path]) -> List[Document]:
 processed_documents = []
 
 for pdf_path in documents:
 if is_scanned_pdf(pdf_path):
 print(f"Performing OCR on scanned PDF: {pdf_path}")
 ocr_metadata = perform_ocr_on_pdf(pdf_path)
 
 # Create Haystack Document object with extracted content and metadata per page.
 for meta in ocr_metadata:
 meta_without_content = {key: value for key, value in meta.items() if key != 'content'}
 doc = Document(content=meta['content'], meta=meta_without_content)
 processed_documents.append(doc)
 else:
 print(f"Extracting text from non-scanned PDF: {pdf_path}")
 with open(pdf_path, "rb") as f:
 reader = PdfReader(f)
 extracted_text = "\n".join([page.extract_text() for page in reader.pages])
 
 # Create Haystack Document object with extracted content and metadata.
 doc = Document(content=extracted_text, meta={
 "file_name": Path(pdf_path).name,
 "file_path": str(pdf_path),
 "source_type": "pdf"
 })
 processed_documents.append(doc)
 
 return {'documents':processed_documents} # Return list of Document objects directly

@component
class DeepInfraEmbeddings:
 def __init__(
 self,
 api_key: str = "XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL",
 model_name: str = "BAAI/bge-base-en-v1.5",
 base_url: str = "https://api.deepinfra.com/v1/openai"
 ):
 self.client = OpenAI(
 api_key=api_key,
 base_url=base_url
 )
 self.model_name = model_name

 @component.output_types(documents=List[Document])
 def run(self, documents: List[Document]) -> dict:
 texts = [doc.content for doc in documents]
 
 response = self.client.embeddings.create(
 model=self.model_name,
 input=texts,
 encoding_format="float"
 )
 
 embeddings = [np.array(embedding.embedding) for embedding in response.data]
 
 for doc, embedding in zip(documents, embeddings):
 doc.embedding = embedding
 
 return {"documents": documents}

 def to_dict(self):
 return {
 "api_key": self.client.api_key,
 "model_name": self.model_name,
 "base_url": self.client.base_url
 }

 @classmethod
 def from_dict(cls, data):
 return cls(
 api_key=data["api_key"],
 model_name=data["model_name"],
 base_url=data["base_url"]
 )

# Initialize Qdrant document store as before
document_store = QdrantDocumentStore(
 host="0.0.0.0",
 port=6333,
 index="_test",
 recreate_index=False,
 use_sparse_embeddings=True,
 sparse_idf=True,
 embedding_dim=768 # Adjust based on your DeepInfra model's dimension
)

# Initialize other components as before
cleaner = DocumentCleaner(
 ascii_only=True,
 remove_empty_lines=True,
 remove_extra_whitespaces=True,
)

document_splitter = DocumentSplitter(
 split_by="word",
 split_length=300,
 split_overlap=30,
)

# Initialize embedders (DeepInfra and Fastembed)
deep_infra_embedder = DeepInfraEmbeddings()
sparse_embedder = FastembedSparseDocumentEmbedder(model="prithvida/Splade_PP_en_v1")

writer = DocumentWriter(
 document_store=document_store,
 policy=DuplicatePolicy.OVERWRITE,
)

# Create and configure the pipeline with new PdfToTextOrOCR component
indexing_pipeline = Pipeline()
indexing_pipeline.add_component("pdf_to_text_or_ocr", PdfToTextOrOCR())
indexing_pipeline.add_component("cleaner", cleaner)
indexing_pipeline.add_component("splitter", document_splitter)
indexing_pipeline.add_component("deep_infra_embedder", deep_infra_embedder)
indexing_pipeline.add_component("sparse_embedder", sparse_embedder)
indexing_pipeline.add_component("writer", writer)

# Connect components in the pipeline flow 
indexing_pipeline.connect("pdf_to_text_or_ocr", "cleaner")
indexing_pipeline.connect("cleaner", "splitter")
indexing_pipeline.connect("splitter", "sparse_embedder")
indexing_pipeline.connect("sparse_embedder", "deep_infra_embedder")
indexing_pipeline.connect("deep_infra_embedder", "writer")

# Get list of PDF files and prepare metadata as before
pdf_folder = "/root/hbr"
pdf_files = [Path(os.path.join(pdf_folder, filename)) for filename in os.listdir(pdf_folder) if filename.endswith('.pdf')]

# Run the pipeline on all PDFs in folder (with automatic OCR handling)
indexing_pipeline.run({"documents": pdf_files})

print(f"Indexed documents in Qdrant. Total documents: {document_store.count_documents()}")


Performing OCR on scanned PDF: /root/hbr/PublicWaterMassMailing-1_removed.pdf


Extracting text from non-scanned PDF: /root/hbr/HBR_Synthetic_Data.pdf


Calculating sparse embeddings: 100%|██████████| 18/18 [00:08<00:00, 2.02it/s]
100it [00:00, 577.69it/s] 

Indexed documents in Qdrant. Total documents: 18





In [24]:
@component
class PdfToTextOrOCR:
 def run(self, documents: List[Path]) -> List[Document]:
 processed_documents = []
 
 for pdf_path in documents:
 if is_scanned_pdf(pdf_path):
 print(f"Performing OCR on scanned PDF: {pdf_path}")
 extracted_text, ocr_metadata = perform_ocr_on_pdf(pdf_path)
 
 # Create Haystack Document object with extracted content and metadata per page.
 for meta in ocr_metadata:
 doc = Document(content=extracted_text, meta=meta)
 processed_documents.append(doc)
 else:
 print(f"Extracting text from non-scanned PDF: {pdf_path}")
 with open(pdf_path, "rb") as f:
 reader = PdfReader(f)
 extracted_text = "\n".join([page.extract_text() for page in reader.pages])
 
 # Create Haystack Document object with extracted content and metadata.
 doc = Document(content=extracted_text, meta={
 "file_name": Path(pdf_path).name,
 "file_path": str(pdf_path),
 "source_type": "pdf"
 })
 processed_documents.append(doc)
 
 return processed_documents # Return list of Document objects directly

# Test function for PdfToTextOrOCR
def test_pdf_to_text_or_ocr():
 # Sample PDF file paths (replace with actual file paths on your system)
 pdf_files = [Path("/root/hbr/PublicWaterMassMailing.pdf")]

 # Initialize the component
 pdf_to_text_or_ocr = PdfToTextOrOCR()

 # Run the component
 documents = pdf_to_text_or_ocr.run(pdf_files)

 # Print results
 for doc in documents:
 print(f"Document Content: {doc.content}")
 print(f"Document Metadata: {doc.meta}")

# Run the test
test_pdf_to_text_or_ocr()

Performing OCR on scanned PDF: /root/hbr/PublicWaterMassMailing.pdf
Document Content: Missouri Department of Health and Senior Services
P.O. Box 570, Jefferson City, MO 65102-0570
Phone: 573-751-6400
FAX: 573-751-6010
RELAY MISSOURI for Hearing and Speech Impaired 1-800-735-2966 VOICE 1-800-735-2466

Peter Lyskowski
Acting Director

Jeremiah W. (Jay) Nixon
Governor

Missouri Public Water Systems

November 10, 2015

Dear Public Water System Owners/Operators:

The Missouri State Public Health Laboratory (MSPHL) is in the process of implementing a new Laboratory Information Management System (LIMS) in its drinking water bacteriology testing laboratory. The OpenELIS (OE) LIMS will provide the laboratory with improved sample management capability, improved data integrity and reduced potential for human data entry error. In addition, the system will provide improved reporting capabilities, including direct electronic data exchange with the Missouri Department of Natural Resources' (MDNR) Saf

In [62]:

document_store = QdrantDocumentStore(
 host="0.0.0.0",
 port=6333,
 index="_test",
 recreate_index=False,
 use_sparse_embeddings=True,
 sparse_idf=True,
 embedding_dim=768 # Adjust based on your DeepInfra model's dimension
)


In [63]:
# Retrieve and print the top 100 records from Qdrant using get_documents_generator
top_documents = list(document_store.get_documents_generator())[:100]

# Print the content and metadata of each document
for i, doc in enumerate(top_documents):
 print(f"Document {i+1}:")
 print(f"Content: {doc}")


Document 1:
Content: Document(id=37d0166926cefc87821ad1ae66ecada0d2a72324f90e8816e6efd78283bc993d, content: 'Harvard Business Review Analytic Services is an independent commercial research unit within Harvard ...', meta: {'file_name': 'HBR_Synthetic_Data.pdf', 'file_path': '/root/hbr/HBR_Synthetic_Data.pdf', 'page_number': 1, 'source_id': 'c5673577ed0630f0cb66291cb909ff1eacbeae86d81bea949471539268058aa5', 'split_id': 0, 'split_idx_start': 0, '_split_overlap': []}, embedding: vector of size 768, sparse_embedding: vector with 143 non-zero elements)
Document 2:
Content: Document(id=07b6945c2a65348c4c02255c196391acba22d36171e9f0b45109b161ce414535, content: 'Harvard Business Review Analytic Services
7Briefng Paper | The Executives Guide to Accelerating Arti...', meta: {'file_name': 'HBR_Synthetic_Data.pdf', 'file_path': '/root/hbr/HBR_Synthetic_Data.pdf', 'page_number': 1, 'source_id': '632cdeded3f3fa7ea36f7598134a4c0323cd063ded84537e9bdf3ba16e963394', 'split_id': 0, 'split_idx_start': 0, '

In [None]:
from haystack import Pipeline, component
from haystack.components.converters import PyPDFToDocument
from haystack.components.preprocessors import DocumentSplitter
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
from haystack.components.writers import DocumentWriter
from haystack.components.preprocessors import DocumentCleaner
from haystack.document_stores.types import DuplicatePolicy
from haystack_integrations.components.embedders.fastembed import (
 FastembedSparseDocumentEmbedder,
 FastembedDocumentEmbedder
)
from haystack.dataclasses import Document
from typing import List
import os
from pathlib import Path
import numpy as np
from openai import OpenAI
import base64
from pdf2image import convert_from_path
from PyPDF2 import PdfReader
from io import BytesIO

# Initialize OpenAI client for DeepInfra embeddings and OCR
openai = OpenAI(
 api_key="XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL",
 base_url="https://api.deepinfra.com/v1/openai",
)

# Function to check if PDF is scanned (no extractable text)
def is_scanned_pdf(pdf_path):
 try:
 with open(pdf_path, "rb") as f:
 pdf_reader = PdfReader(f)
 for page in pdf_reader.pages:
 if page.extract_text():
 return False # Text found, not a scanned document
 return True # No extractable text found, likely a scanned document
 except Exception as e:
 raise ValueError(f"Error reading PDF file: {e}")

# Function to convert PDF pages to images and perform OCR using DeepInfra Vision model
def perform_ocr_on_pdf(pdf_path):
 try:
 images = convert_from_path(pdf_path)
 ocr_metadata = []

 for i, image in enumerate(images):
 # Convert image to base64 for OCR processing
 buffered = BytesIO()
 image.save(buffered, format="JPEG")
 base64_image = base64.b64encode(buffered.getvalue()).decode('utf-8')

 # Perform OCR using DeepInfra Vision model
 chat_completion = openai.chat.completions.create(
 model="meta-llama/Llama-3.2-11B-Vision-Instruct",
 messages=[
 {
 "role": "user",
 "content": [
 {
 "type": "image_url",
 "image_url": {
 "url": f"data:image/jpeg;base64,{base64_image}"
 }
 },
 {
 "type": "text",
 "text": "only respond with all the extracted text from the document and nothing else"
 }
 ]
 }
 ]
 )

 # Extract text for this specific page
 ocr_page_text = chat_completion.choices[0].message.content

 # Append OCR result for this page along with correct page number metadata
 ocr_metadata.append({
 "page_number": i + 1, # Correctly set page number (i starts at 0)
 "file_name": Path(pdf_path).name,
 "file_path": str(pdf_path),
 "content": ocr_page_text # Store the text per page
 })

 return ocr_metadata
 
 except Exception as e:
 raise ValueError(f"Error during OCR processing: {e}")
 
 
 
def extract_text_from_non_scanned_pdf(pdf_path):
 try:
 with open(pdf_path, "rb") as f:
 reader = PdfReader(f)
 extracted_pages = []
 
 for i, page in enumerate(reader.pages):
 extracted_text = page.extract_text()
 
 # Append extracted text along with correct page number metadata
 extracted_pages.append({
 "page_number": i + 1, # Correctly set page number (i starts at 0)
 "file_name": Path(pdf_path).name,
 "file_path": str(pdf_path),
 "content": extracted_text
 })
 
 return extracted_pages
 
 except Exception as e:
 raise ValueError(f"Error during PDF text extraction: {e}")
 
 

@component
class PdfToTextOrOCR:
 @component.output_types(documents=List[Document])
 def run(self, documents: List[Path]) -> dict:
 processed_documents = []
 
 for pdf_path in documents:
 if is_scanned_pdf(pdf_path):
 ocr_metadata = perform_ocr_on_pdf(pdf_path)
 for meta in ocr_metadata:
 doc = Document(content=meta['content'], meta={
 "file_name": meta["file_name"],
 "file_path": meta["file_path"],
 "page_number": meta["page_number"] # Include page number here
 })
 processed_documents.append(doc)
 
 else:
 extracted_pages = extract_text_from_non_scanned_pdf(pdf_path)
 for meta in extracted_pages:
 doc = Document(content=meta['content'], meta={
 "file_name": meta["file_name"],
 "file_path": meta["file_path"],
 "page_number": meta["page_number"] # Include page number here
 })
 processed_documents.append(doc)
 
 return {"documents": processed_documents}

@component
class DeepInfraEmbeddings:
 def __init__(
 self,
 api_key: str = "XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL",
 model_name: str = "BAAI/bge-base-en-v1.5",
 base_url: str = "https://api.deepinfra.com/v1/openai"
 ):
 self.client = OpenAI(
 api_key=api_key,
 base_url=base_url
 )
 self.model_name = model_name

 @component.output_types(documents=List[Document])
 def run(self, documents: List[Document]) -> dict:
 texts = [doc.content for doc in documents]
 
 response = self.client.embeddings.create(
 model=self.model_name,
 input=texts,
 encoding_format="float"
 )
 
 embeddings = [np.array(embedding.embedding) for embedding in response.data]
 
 for doc, embedding in zip(documents, embeddings):
 doc.embedding = embedding
 
 return {"documents": documents}

 def to_dict(self):
 return {
 "api_key": self.client.api_key,
 "model_name": self.model_name,
 "base_url": self.client.base_url
 }

 @classmethod
 def from_dict(cls, data):
 return cls(
 api_key=data["api_key"],
 model_name=data["model_name"],
 base_url=data["base_url"]
 )

# Initialize Qdrant document store as before
document_store = QdrantDocumentStore(
 host="0.0.0.0",
 port=6333,
 index="_test",
 recreate_index=False,
 use_sparse_embeddings=True,
 sparse_idf=True,
 embedding_dim=768 # Adjust based on your DeepInfra model's dimension
)

# Initialize other components as before
cleaner = DocumentCleaner(
 ascii_only=True,
 remove_empty_lines=True,
 remove_extra_whitespaces=True,
)

document_splitter = DocumentSplitter(
 split_by="word",
 split_length=300,
 split_overlap=30,
)

# Initialize embedders (DeepInfra and Fastembed)
deep_infra_embedder = DeepInfraEmbeddings()
sparse_embedder = FastembedSparseDocumentEmbedder(model="prithvida/Splade_PP_en_v1")

writer = DocumentWriter(
 document_store=document_store,
 policy=DuplicatePolicy.OVERWRITE,
)

# Create and configure the pipeline with new PdfToTextOrOCR component
indexing_pipeline = Pipeline()
indexing_pipeline.add_component("pdf_to_text_or_ocr", PdfToTextOrOCR())
indexing_pipeline.add_component("cleaner", cleaner)
indexing_pipeline.add_component("splitter", document_splitter)
indexing_pipeline.add_component("deep_infra_embedder", deep_infra_embedder)
indexing_pipeline.add_component("sparse_embedder", sparse_embedder)
indexing_pipeline.add_component("writer", writer)

# Connect components in the pipeline flow 
indexing_pipeline.connect("pdf_to_text_or_ocr", "cleaner")
indexing_pipeline.connect("cleaner", "splitter")
indexing_pipeline.connect("splitter", "sparse_embedder")
indexing_pipeline.connect("sparse_embedder", "deep_infra_embedder")
indexing_pipeline.connect("deep_infra_embedder", "writer")

# Get list of PDF files and prepare metadata as before
pdf_folder = "/root/hbr"
pdf_files = [Path(os.path.join(pdf_folder, filename)) for filename in os.listdir(pdf_folder) if filename.endswith('.pdf')]

# Run the pipeline on all PDFs in folder (with automatic OCR handling)
indexing_pipeline.run({"documents": pdf_files})

print(f"Indexed documents in Qdrant. Total documents: {document_store.count_documents()}")


Calculating sparse embeddings: 100%|██████████| 21/21 [00:06<00:00, 3.10it/s]
100it [00:00, 532.35it/s] 

Indexed documents in Qdrant. Total documents: 21





In [67]:
documents = list(document_store.get_documents_generator())
if documents:
 for doc in documents[:5]: # Check a few documents
 print(f"Document ID: {doc.id}")
 print(f"File Name: {doc.meta.get('file_name')}")
 print(f"Page Number: {doc.meta.get('page_number')}")
 print("Content:", doc.content[:200])

Document ID: 37d0166926cefc87821ad1ae66ecada0d2a72324f90e8816e6efd78283bc993d
File Name: HBR_Synthetic_Data.pdf
Page Number: 1
Content: Harvard Business Review Analytic Services is an independent commercial research unit within Harvard Business Review Group, conducting research and comparative analysis on important management challeng
Document ID: 07b6945c2a65348c4c02255c196391acba22d36171e9f0b45109b161ce414535
File Name: HBR_Synthetic_Data.pdf
Page Number: 1
Content: Harvard Business Review Analytic Services
7Briefng Paper | The Executives Guide to Accelerating Artifcial Intelligence and Data Innovation with Synthetic DataWith recognition of synthetic datas abilit
Document ID: 064bb1421bdb620a4f74f54002927b8dd4d802b74b62abe6f78eace2fbe5dac6
File Name: HBR_Synthetic_Data.pdf
Page Number: 1
Content: synthetic data generation project have learned and mastered new technology. This is how we unlock further innovation, and it helps us to attract top talent to join and stay at Humana. Top ta

# fast api non ocr code

In [4]:
import os
from pathlib import Path
from typing import List
import numpy as np
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
from haystack import Pipeline, component
from haystack.components.converters import PyPDFToDocument
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
from haystack.dataclasses import Document
from openai import OpenAI

# Define the FastAPI app instance
app = FastAPI()

# Define custom embedding component using DeepInfra API
@component
class DeepInfraEmbeddings:
 def __init__(
 self,
 api_key: str = "XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL", # Replace with your actual API key
 model_name: str = "BAAI/bge-base-en-v1.5",
 base_url: str = "https://api.deepinfra.com/v1/openai"
 ):
 self.client = OpenAI(
 api_key=api_key,
 base_url=base_url
 )
 self.model_name = model_name

 @component.output_types(documents=List[Document])
 def run(self, documents: List[Document]) -> dict:
 texts = [doc.content for doc in documents]
 
 # Get embeddings from DeepInfra API
 response = self.client.embeddings.create(
 model=self.model_name,
 input=texts,
 encoding_format="float"
 )
 
 embeddings = [np.array(embedding.embedding) for embedding in response.data]
 
 # Assign embeddings to each document
 for doc, embedding in zip(documents, embeddings):
 doc.embedding = embedding
 
 return {"documents": documents}

 def to_dict(self):
 return {
 "api_key": self.client.api_key,
 "model_name": self.model_name,
 "base_url": self.client.base_url
 }

 @classmethod
 def from_dict(cls, data):
 return cls(
 api_key=data["api_key"],
 model_name=data["model_name"],
 base_url=data["base_url"]
 )

# Initialize Qdrant document store for indexing documents
document_store = QdrantDocumentStore(
 host="0.0.0.0", # Adjust host as needed (e.g., localhost or container IP)
 port=6333,
 index="aaa_test", # Name of the index in Qdrant
 recreate_index=True, # Recreate index if it exists (useful for development)
 use_sparse_embeddings=True, # Enable hybrid search with sparse embeddings
 sparse_idf=True, # Enable IDF calculation for sparse embeddings
 embedding_dim=768 # Adjust according to your model's embedding dimensions (DeepInfra model)
)

# Initialize document cleaner to preprocess text data
cleaner = DocumentCleaner(
 ascii_only=True,
 remove_empty_lines=True,
 remove_extra_whitespaces=True,
 remove_repeated_substrings=False # Keep repeated substrings if necessary for context retention
)

# Initialize document converter for PDF files (can be expanded to other formats)
converter = PyPDFToDocument()

# Split large documents into smaller chunks for better indexing and retrieval performance
document_splitter = DocumentSplitter(
 split_by="word",
 split_length=300, # Split after every 300 words 
 split_overlap=30 # Overlap by 30 words between chunks to maintain context continuity 
)

# Configure embedders: DeepInfra for dense embeddings and Fastembed for sparse embeddings.
deep_infra_embedder = DeepInfraEmbeddings()
sparse_embedder = FastembedSparseDocumentEmbedder(model="prithvida/Splade_PP_en_v1")

# Create a document writer to write processed documents into the Qdrant document store.
writer = DocumentWriter(
 document_store=document_store,
 policy=DuplicatePolicy.OVERWRITE # Overwrite existing documents with the same ID if they exist.
)

# Create and configure the pipeline with all components.
indexing_pipeline = Pipeline()
indexing_pipeline.add_component("converter", converter)
indexing_pipeline.add_component("cleaner", cleaner)
indexing_pipeline.add_component("splitter", document_splitter)
indexing_pipeline.add_component("deep_infra_embedder", deep_infra_embedder)
indexing_pipeline.add_component("sparse_embedder", sparse_embedder)
indexing_pipeline.add_component("writer", writer)

# Connect components in the pipeline.
indexing_pipeline.connect("converter", "cleaner")
indexing_pipeline.connect("cleaner", "splitter")
indexing_pipeline.connect("splitter", "sparse_embedder")
indexing_pipeline.connect("sparse_embedder", "deep_infra_embedder")
indexing_pipeline.connect("deep_infra_embedder", "writer")

# Define a temporary folder to store uploaded files before processing.
TEMP_UPLOAD_DIR = "/tmp/uploaded_files"
os.makedirs(TEMP_UPLOAD_DIR, exist_ok=True)

@app.post("/upload")
async def upload_files(files: List[UploadFile] = File(...)):
 """
 Endpoint to handle file uploads from the frontend.
 
 Args:
 files (List[UploadFile]): List of files uploaded by the user.

 Returns:
 JSONResponse: Success or failure message.
 """
 
 try:
 file_paths = []
 metadata_list = []

 # Save each uploaded file temporarily and prepare metadata.
 for file in files:
 file_path = Path(TEMP_UPLOAD_DIR) / file.filename
 
 with open(file_path, "wb") as f:
 f.write(await file.read()) # Save file contents
 
 file_paths.append(file_path)
 
 metadata_list.append({
 "file_name": file.filename,
 "file_path": str(file_path),
 "source_type": "pdf" if file.filename.endswith('.pdf') else 'other'
 })

 # Run the ingestion pipeline with the uploaded files and their metadata.
 indexing_pipeline.run({
 "converter": {
 "sources": file_paths,
 "meta": metadata_list # Pass metadata along with documents for indexing.
 }
 })

 return JSONResponse(content={"message": f"Successfully indexed {len(files)} documents."})

 except Exception as e:
 raise HTTPException(status_code=500, detail=str(e))

@app.get("/documents")
async def get_documents():
 """
 Endpoint to retrieve all indexed documents from Qdrant.

 Returns:
 JSONResponse: List of indexed documents with metadata.
 """
 
 try:
 documents = document_store.filter_documents(filters={})
 
 if not documents:
 return JSONResponse(content={"message": "No documents found."})
 
 return JSONResponse(content={
 "documents": [{
 "file_name": doc.meta.get("file_name"),
 "file_path": doc.meta.get("file_path")
 } for doc in documents]
 })
 
 except Exception as e:
 raise HTTPException(status_code=500, detail=str(e))



In [10]:
import os
from pathlib import Path
from typing import List
import numpy as np
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
from haystack import Pipeline, component
from haystack.components.converters import PyPDFToDocument
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
from haystack.dataclasses import Document
from openai import OpenAI

# Import necessary libraries for running FastAPI inside Jupyter Notebook
import nest_asyncio
import uvicorn

# Patch asyncio event loop for Jupyter Notebook compatibility
nest_asyncio.apply()

# Define the FastAPI app instance
app = FastAPI()

# Define custom embedding component using DeepInfra API
@component
class DeepInfraEmbeddings:
 def __init__(
 self,
 api_key: str = "XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL", # Replace with your actual API key
 model_name: str = "BAAI/bge-base-en-v1.5",
 base_url: str = "https://api.deepinfra.com/v1/openai"
 ):
 self.client = OpenAI(
 api_key=api_key,
 base_url=base_url
 )
 self.model_name = model_name

 @component.output_types(documents=List[Document])
 def run(self, documents: List[Document]) -> dict:
 texts = [doc.content for doc in documents]
 
 # Get embeddings from DeepInfra API
 response = self.client.embeddings.create(
 model=self.model_name,
 input=texts,
 encoding_format="float"
 )
 
 embeddings = [np.array(embedding.embedding) for embedding in response.data]
 
 # Assign embeddings to each document
 for doc, embedding in zip(documents, embeddings):
 doc.embedding = embedding
 
 return {"documents": documents}

 def to_dict(self):
 return {
 "api_key": self.client.api_key,
 "model_name": self.model_name,
 "base_url": self.client.base_url
 }

 @classmethod
 def from_dict(cls, data):
 return cls(
 api_key=data["api_key"],
 model_name=data["model_name"],
 base_url=data["base_url"]
 )

# Initialize Qdrant document store for indexing documents
document_store = QdrantDocumentStore(
 host="0.0.0.0", # Adjust host as needed (e.g., localhost or container IP)
 port=6333,
 index="aaa_test", # Name of the index in Qdrant
 recreate_index=True, # Recreate index if it exists (useful for development)
 use_sparse_embeddings=True, # Enable hybrid search with sparse embeddings
 sparse_idf=True, # Enable IDF calculation for sparse embeddings
 embedding_dim=768 # Adjust according to your model's embedding dimensions (DeepInfra model)
)

# Initialize document cleaner to preprocess text data
cleaner = DocumentCleaner(
 ascii_only=True,
 remove_empty_lines=True,
 remove_extra_whitespaces=True,
 remove_repeated_substrings=False # Keep repeated substrings if necessary for context retention
)

# Initialize document converter for PDF files (can be expanded to other formats)
converter = PyPDFToDocument()

# Split large documents into smaller chunks for better indexing and retrieval performance
document_splitter = DocumentSplitter(
 split_by="word",
 split_length=300, # Split after every 300 words 
 split_overlap=30 # Overlap by 30 words between chunks to maintain context continuity 
)

# Configure embedders: DeepInfra for dense embeddings and Fastembed for sparse embeddings.
deep_infra_embedder = DeepInfraEmbeddings()
sparse_embedder = FastembedSparseDocumentEmbedder(model="prithvida/Splade_PP_en_v1")

# Create a document writer to write processed documents into the Qdrant document store.
writer = DocumentWriter(
 document_store=document_store,
 policy=DuplicatePolicy.OVERWRITE # Overwrite existing documents with the same ID if they exist.
)

# Create and configure the pipeline with all components.
indexing_pipeline = Pipeline()
indexing_pipeline.add_component("converter", converter)
indexing_pipeline.add_component("cleaner", cleaner)
indexing_pipeline.add_component("splitter", document_splitter)
indexing_pipeline.add_component("deep_infra_embedder", deep_infra_embedder)
indexing_pipeline.add_component("sparse_embedder", sparse_embedder)
indexing_pipeline.add_component("writer", writer)

# Connect components in the pipeline.
indexing_pipeline.connect("converter", "cleaner")
indexing_pipeline.connect("cleaner", "splitter")
indexing_pipeline.connect("splitter", "sparse_embedder")
indexing_pipeline.connect("sparse_embedder", "deep_infra_embedder")
indexing_pipeline.connect("deep_infra_embedder", "writer")

# Define a temporary folder to store uploaded files before processing.
TEMP_UPLOAD_DIR = "/tmp/uploaded_files"
os.makedirs(TEMP_UPLOAD_DIR, exist_ok=True)

@app.post("/upload")
async def upload_files(files: List[UploadFile] = File(...)):
 """
 Endpoint to handle file uploads from the frontend.
 
 Args:
 files (List[UploadFile]): List of files uploaded by the user.

 Returns:
 JSONResponse: Success or failure message.
 """
 
 try:
 file_paths = []
 metadata_list = []

 # Save each uploaded file temporarily and prepare metadata.
 for file in files:
 file_path = Path(TEMP_UPLOAD_DIR) / file.filename
 
 with open(file_path, "wb") as f:
 f.write(await file.read()) # Save file contents
 
 file_paths.append(file_path)
 
 metadata_list.append({
 "file_name": file.filename,
 "file_path": str(file_path),
 "source_type": "pdf" if file.filename.endswith('.pdf') else 'other'
 })

 # Run the ingestion pipeline with the uploaded files and their metadata.
 indexing_pipeline.run({
 "converter": {
 "sources": file_paths,
 "meta": metadata_list # Pass metadata along with documents for indexing.
 }
 })

 return JSONResponse(content={"message": f"Successfully indexed {len(files)} documents."})

 except Exception as e:
 raise HTTPException(status_code=500, detail=str(e))

@app.get("/documents")
async def get_documents():
 """
 Endpoint to retrieve all indexed documents from Qdrant.

 Returns:
 JSONResponse: List of indexed documents with metadata.
 """
 
 try:
 documents = document_store.filter_documents(filters={})
 
 if not documents:
 return JSONResponse(content={"message": "No documents found."})
 
 return JSONResponse(content={
 "documents": [{
 "file_name": doc.meta.get("file_name"),
 "file_path": doc.meta.get("file_path")
 } for doc in documents]
 })
 
 except Exception as e:
 raise HTTPException(status_code=500, detail=str(e))


# Start Uvicorn server within Jupyter Notebook on port 8000.
if __name__ == "__main__":
 uvicorn.run(app, host="0.0.0.0", port=8000)


ERROR:asyncio:Task exception was never retrieved
future: exception=KeyboardInterrupt()>
Traceback (most recent call last):
 File "/opt/conda/envs/py38/lib/python3.10/site-packages/uvicorn/main.py", line 579, in run
 server.run()
 File "/opt/conda/envs/py38/lib/python3.10/site-packages/uvicorn/server.py", line 65, in run
 return asyncio.run(self.serve(sockets=sockets))
 File "/opt/conda/envs/py38/lib/python3.10/site-packages/nest_asyncio.py", line 30, in run
 return loop.run_until_complete(task)
 File "/opt/conda/envs/py38/lib/python3.10/site-packages/nest_asyncio.py", line 92, in run_until_complete
 self._run_once()
 File "/opt/conda/envs/py38/lib/python3.10/site-packages/nest_asyncio.py", line 133, in _run_once
 handle._run()
 File "/opt/conda/envs/py38/lib/python3.10/asyncio/events.py", line 80, in _run
 self._context.run(self._callback, *self._args)
 File "/opt/conda/envs/py38/lib/python3.10/asyncio/tasks.py", line 315, in __wakeup
 self.__step()
 File "/opt/conda/envs/py38/lib/pyth

INFO: 10.240.1.166:0 - "GET / HTTP/1.1" 404 Not Found
INFO: 10.240.1.166:0 - "GET /docs HTTP/1.1" 200 OK
INFO: 10.240.1.166:0 - "GET /openapi.json HTTP/1.1" 200 OK
INFO: 10.240.1.166:0 - "POST /upload HTTP/1.1" 400 Bad Request


INFO: Shutting down
INFO: Waiting for application shutdown.
INFO: Application shutdown complete.
INFO: Finished server process [45225]


In [None]:
%%bash
uvicorn main:app --reload --host 0.0.0.0 --port 8000
