{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import os\n", "from pathlib import Path\n", "import numpy as np\n", "from openai import OpenAI\n", "from haystack import Pipeline, component\n", "from haystack.components.converters import PyPDFToDocument\n", "from haystack.components.preprocessors import DocumentSplitter\n", "from haystack_integrations.document_stores.qdrant import QdrantDocumentStore\n", "from haystack.components.writers import DocumentWriter\n", "from haystack.components.preprocessors import DocumentCleaner\n", "from haystack.document_stores.types import DuplicatePolicy\n", "from haystack_integrations.components.embedders.fastembed import (\n", " FastembedSparseDocumentEmbedder\n", ")\n", "from haystack.dataclasses import Document\n", "from typing import List\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Calculating sparse embeddings: 100%|██████████| 393/393 [02:01<00:00, 3.25it/s]\n", "400it [00:02, 161.43it/s] \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Indexed documents in Qdrant. Total documents: 393\n", "\n", "Sample document metadata:\n", "File name: HBR_Tech.pdf\n", "File path: /root/hbr/HBR_Tech.pdf\n" ] } ], "source": [ "import os\n", "from pathlib import Path\n", "import numpy as np\n", "from openai import OpenAI\n", "from haystack import Pipeline, component\n", "from haystack.components.converters import PyPDFToDocument\n", "from haystack.components.preprocessors import DocumentSplitter\n", "from haystack_integrations.document_stores.qdrant import QdrantDocumentStore\n", "from haystack.components.writers import DocumentWriter\n", "from haystack.components.preprocessors import DocumentCleaner\n", "from haystack.document_stores.types import DuplicatePolicy\n", "from haystack_integrations.components.embedders.fastembed import (\n", " FastembedSparseDocumentEmbedder,\n", " FastembedDocumentEmbedder\n", ")\n", "from haystack.dataclasses import Document\n", "from typing import List\n", "\n", "@component\n", "class DeepInfraEmbeddings:\n", " def __init__(\n", " self,\n", " api_key: str = \"XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL\",\n", " model_name: str = \"BAAI/bge-base-en-v1.5\",\n", " base_url: str = \"https://api.deepinfra.com/v1/openai\"\n", " ):\n", " self.client = OpenAI(\n", " api_key=api_key,\n", " base_url=base_url\n", " )\n", " self.model_name = model_name\n", "\n", " @component.output_types(documents=List[Document])\n", " def run(self, documents: List[Document]) -> dict:\n", " texts = [doc.content for doc in documents]\n", " \n", " response = self.client.embeddings.create(\n", " model=self.model_name,\n", " input=texts,\n", " encoding_format=\"float\"\n", " )\n", " \n", " embeddings = [np.array(embedding.embedding) for embedding in response.data]\n", " \n", " for doc, embedding in zip(documents, embeddings):\n", " doc.embedding = embedding\n", " \n", " return {\"documents\": documents}\n", "\n", " def to_dict(self):\n", " return {\n", " \"api_key\": self.client.api_key,\n", " \"model_name\": self.model_name,\n", " \"base_url\": self.client.base_url\n", " }\n", "\n", " @classmethod\n", " def from_dict(cls, data):\n", " return cls(\n", " api_key=data[\"api_key\"],\n", " model_name=data[\"model_name\"],\n", " base_url=data[\"base_url\"]\n", " )\n", "\n", "# Initialize Qdrant document store\n", "document_store = QdrantDocumentStore(\n", " host=\"0.0.0.0\",\n", " port=6333,\n", " index=\"aaa_test\",\n", " recreate_index=True,\n", " use_sparse_embeddings=True, # Enable hybrid search\n", " sparse_idf=True, # Enable IDF calculation for sparse embeddings\n", " embedding_dim=768 # Adjust based on your DeepInfra model's dimension\n", ")\n", "\n", "cleaner = DocumentCleaner(\n", " ascii_only=True,\n", " remove_empty_lines=True,\n", " remove_extra_whitespaces=True,\n", " remove_repeated_substrings=False\n", ")\n", "\n", "# Create pipeline components\n", "converter = PyPDFToDocument()\n", "\n", "document_splitter = DocumentSplitter(\n", " split_by=\"word\",\n", " split_length=300,\n", " split_overlap=30\n", ")\n", "\n", "# Configure embedders\n", "deep_infra_embedder = DeepInfraEmbeddings()\n", "sparse_embedder = FastembedSparseDocumentEmbedder(model=\"prithvida/Splade_PP_en_v1\")\n", "\n", "# Create document writer\n", "writer = DocumentWriter(\n", " document_store=document_store,\n", " policy=DuplicatePolicy.OVERWRITE\n", ")\n", "\n", "# Create and configure the pipeline\n", "indexing_pipeline = Pipeline()\n", "indexing_pipeline.add_component(\"converter\", converter)\n", "indexing_pipeline.add_component(\"cleaner\", cleaner)\n", "indexing_pipeline.add_component(\"splitter\", document_splitter)\n", "indexing_pipeline.add_component(\"deep_infra_embedder\", deep_infra_embedder)\n", "indexing_pipeline.add_component(\"sparse_embedder\", sparse_embedder)\n", "indexing_pipeline.add_component(\"writer\", writer)\n", "\n", "# Connect the components\n", "indexing_pipeline.connect(\"converter\", \"cleaner\")\n", "indexing_pipeline.connect(\"cleaner\", \"splitter\")\n", "indexing_pipeline.connect(\"splitter\", \"sparse_embedder\")\n", "indexing_pipeline.connect(\"sparse_embedder\", \"deep_infra_embedder\")\n", "indexing_pipeline.connect(\"deep_infra_embedder\", \"writer\")\n", "\n", "# Get list of PDF files and prepare metadata\n", "pdf_folder = \"/root/hbr\"\n", "pdf_files = []\n", "metadata_list = []\n", "\n", "for filename in os.listdir(pdf_folder):\n", " if filename.endswith('.pdf'):\n", " file_path = Path(os.path.join(pdf_folder, filename))\n", " pdf_files.append(file_path)\n", " metadata_list.append({\n", " \"file_name\": filename,\n", " \"file_path\": str(file_path),\n", " \"source_type\": \"pdf\"\n", " })\n", "\n", "# Run the pipeline with metadata\n", "indexing_pipeline.run({\n", " \"converter\": {\n", " \"sources\": pdf_files,\n", " \"meta\": metadata_list\n", " }\n", "})\n", "\n", "print(f\"Indexed documents in Qdrant. Total documents: {document_store.count_documents()}\")\n", "\n", "# Optional: Verify metadata\n", "documents = document_store.filter_documents(filters={})\n", "if documents:\n", " print(\"\\nSample document metadata:\")\n", " print(f\"File name: {documents[0].meta.get('file_name')}\")\n", " print(f\"File path: {documents[0].meta.get('file_path')}\")\n" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\n", "Inputs:\n", " - sources: List[Union[str, Path, ByteStream]]\n", " - meta: Union[Dict[str, Any], List[Dict[str, Any]]]\n", "Outputs:\n", " - documents: List[Document]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "PyPDFToDocument()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Calculating sparse embeddings: 100%|██████████| 1/1 [00:00<00:00, 13.05it/s]\n" ] } ], "source": [ "from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever\n", "from haystack_integrations.components.embedders.fastembed import FastembedTextEmbedder\n", "from haystack import Pipeline\n", "from haystack.utils import Secret\n", "from haystack_integrations.document_stores.qdrant import QdrantDocumentStore\n", "from haystack.components.generators import OpenAIGenerator\n", "from haystack.components.builders import PromptBuilder\n", "from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever\n", "from haystack_integrations.components.embedders.fastembed import FastembedSparseTextEmbedder\n", "from openai import OpenAI\n", "import numpy as np\n", "from haystack import component\n", "from haystack.dataclasses import Document\n", "from typing import List\n", "\n", "@component\n", "class DeepInfraEmbeddings:\n", " def __init__(\n", " self,\n", " api_key: str = \"XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL\",\n", " model_name: str = \"BAAI/bge-base-en-v1.5\",\n", " base_url: str = \"https://api.deepinfra.com/v1/openai\"\n", " ):\n", " self.client = OpenAI(\n", " api_key=api_key,\n", " base_url=base_url\n", " )\n", " self.model_name = model_name\n", "\n", " @component.output_types(embedding=List[float])\n", " def run(self, text: str) -> dict:\n", " response = self.client.embeddings.create(\n", " model=self.model_name,\n", " input=[text],\n", " encoding_format=\"float\"\n", " )\n", " embedding = np.array(response.data[0].embedding)\n", " return {\"embedding\": embedding}\n", "\n", "\n", "prompt_template = \"\"\"\n", "I am a search engine bot. My only purpose is to locate and point to relevant information within the provided documents. I do not provide interpretations or answers - I only help you find where the information exists.\n", "\n", "Context:\n", "{% for doc in documents %}\n", "Content: {{ doc.content }}\n", "Source: {{ doc.meta.file_name }}, Page: {{ doc.meta.page_number }}\n", "---\n", "{% endfor %}\n", "\n", "Search Query: {{question}}\n", "\n", "Relevant Matches:\n", "Here are the exact matches found in the documents:\n", "\n", "\"[exact quote]\" \n", "Location: [filename, page X]\n", "\n", "\"[exact quote]\" \n", "Location: [filename, page X]\n", "\n", "If no relevant matches are found, I will respond:\n", "\"No matching information found in the provided documents.\"\n", "\"\"\"\n", "prompt_builder = PromptBuilder(template=prompt_template)\n", "llm = OpenAIGenerator(\n", " api_key=Secret.from_token(\"XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL\"),\n", " api_base_url=\"https://api.deepinfra.com/v1/openai\",\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", " generation_kwargs={\n", " \"max_tokens\": 512,\n", " \"temperature\": 0.7,\n", " }\n", ")\n", "\n", "deep_infra_embedder = DeepInfraEmbeddings()\n", "\n", "hybrid_query = Pipeline()\n", "hybrid_query.add_component('sparse_text_embedder',FastembedSparseTextEmbedder(model=\"prithvida/Splade_PP_en_v1\"))\n", "hybrid_query.add_component('dense_text_embedder',deep_infra_embedder)\n", "hybrid_query.add_component(\"retriever\", QdrantHybridRetriever(document_store=document_store))\n", "hybrid_query.add_component(\"prompt_builder\", prompt_builder)\n", "hybrid_query.add_component(\"llm\", llm)\n", "\n", "\n", "hybrid_query.connect('sparse_text_embedder.sparse_embedding','retriever.query_sparse_embedding')\n", "hybrid_query.connect(\"dense_text_embedder.embedding\", \"retriever.query_embedding\")\n", "hybrid_query.connect(\"retriever\", \"prompt_builder.documents\")\n", "hybrid_query.connect(\"prompt_builder\", \"llm\")\n", "\n", "\n", "question = \"what is this document about ?\"\n", "response = hybrid_query.run({\n", " \"dense_text_embedder\": {\"text\": question},\n", " \"sparse_text_embedder\": {\"text\": question},\n", " \"retriever\": {\"top_k\": 5},\n", " \"prompt_builder\": {\"question\": question}\n", "})" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'llm': {'replies': ['Here are the relevant matches:\\n\\n* \"Synthetic data is a tool that addresses many data challenges, particularly artificial intelligence and analytics issues such as privacy protection, regulatory compliance, accessibility, data scarcity, and bias, as well as data sharing and time to data (and therefore time to market).\" \\nLocation: HBR_Synthetic_Data.pdf, Page: 2\\n* \"Synthetic data is a tool that addresses many data challenges, particularly artificial intelligence and analytics issues such as privacy protection, regulatory compliance, accessibility, data scarcity, and bias, as well as data sharing and time to data (and therefore time to market).\" \\nLocation: HBR_Synthetic_Data.pdf, Page: 2\\n* \"The synthetic data opportunity means something different for every organization, but new revenue streams; faster, easier, GDPR-compliant data access; better pricing models; and scalable, ethical, and explainable AI all are within reach for those business leaders ready to remove their data privacy blind spots.\" \\nLocation: HBR_Synthetic_Data.pdf, Page: 10'],\n", " 'meta': [{'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct',\n", " 'index': 0,\n", " 'finish_reason': 'stop',\n", " 'usage': {'completion_tokens': 215,\n", " 'prompt_tokens': 2080,\n", " 'total_tokens': 2295,\n", " 'completion_tokens_details': None,\n", " 'prompt_tokens_details': None,\n", " 'estimated_cost': 0.000126225}}]}}" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "response" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Question: \n", " where are the instructions ??\n", " \n", " \n", "\n", "Document Store Status:\n", "Total documents: 80\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Calculating sparse embeddings: 100%|██████████| 1/1 [00:00<00:00, 16.56it/s]\n", "Calculating sparse embeddings: 100%|██████████| 1/1 [00:00<00:00, 26.54it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Retrieved Documents with Metadata:\n", "==================================================\n", "\n", "Document 1:\n", "------------------------------\n", "Content: at that time. NEW SAMPLE INSTRUCTIONS: Sample instructions have been revised to include changes to the bottle and sampling form. The instructions include detailed information on how to collect the sample using the new bottle, how to complete the new sample collection form, how to best ship samples to the MSPHL using the free MSPHL courier system, and how to register for the new MSPHL web portal. A copy of these instructions is attached. NEW WEB PORTAL FOR RESULTS REPORTS The OE LIMS provides a web portal that may be used by systems to view and print their test result reports, check status of samples, download sample information into Excel, and receive automated emails when samples are received at the laboratory, and when sample results are ready to be viewed. For information on how to gain access to this portal, please contact Shondra Johnson, LIMS Administrator at Shondra.Johnson@health.mo.gov or at 573-751-3334. IMPLEMENTATION DATES: The MSPHL intends to implement the OpenELIS LIMS on December 1, 2015. There will be a two week testing period in which laboratory staff will run the new LIMS in conjunction with our current manual, paper-based system to ensure the OE LIMS is operating properly. You may continue to submit samples as you currently do, using the old sample information card, throughout this time. On December 16, 2015,\n", "If you have any questions, please contact the MSPHL Environmental Bacteriology Unit at 573-751-3334. You may also contact your MDNR Regional Office for additional information on sample collection. Once again, thank you for your patience and understanding as we implement these changes. Patrick R. Shannon Manager, Environmental Bacteriology Unit Missouri Department of Health and Senior Services State Public Health Laboratory 101 North Chestnut St. P.O. Box 570 Jefferson City, MO 65102 Phone: 573-751-3334 Email: Pat.Shannon@health.mo.gov Web: www.health.mo.gov/Lab\n", "**Order #:** 984\n", "**REPORT TO:** \n", "\n", "Metadata:\n", "- page_number: 1\n", "- file_name: PublicWaterMassMailing.pdf\n", "- file_path: /root/hbr/PublicWaterMassMailing.pdf\n", "- source_id: 90c553d0a17897f53e0ec7e38d993db75c45d2052734f05c28543845e13cd0c3\n", "- split_id: 2\n", "- split_idx_start: 3553\n", "- _split_overlap: [{'doc_id': 'f2aead5aa2b985ecbf87a5bd31b738d5d23714fb58deca9ec44d0431a26c9649', 'range': [1680, 1877]}, {'doc_id': 'e6fce9d3a8e725f69c2d30337af28c9cef05ab67d264d098001962437f721421', 'range': [0, 245]}]\n", "\n", "Document 2:\n", "------------------------------\n", "Content: wrap still attached.** **5. Grasp cap along top edge and remove carefully. Do not touch the inside with your fingers. Hold the bottle in one hand and the cap in the other. Do not lay the cap down or put it in a pocket. Also, take care not to contaminate the sterile bottle or cap with your fingers or permit the faucet to touch the inside of the bottle.** **6. Hold the bottle so that water entering the bottle will not come in contact with your hands or the outside of the bottle.** **7. Fill the bottle until the water sample level is BETWEEN THE TWO LINES on the bottle (100 - 120 ml). Preferably, the sample level should be at\n", "**INSTRUCTIONS FOR COMPLETING ENVIRONMENTAL SAMPLE COLLECTION FORM**\n", "**Public Drinking Water Bacterial Analysis** PRINT LEGIBLY using water proof ink. A standard ink pen is sufficient. Complete ALL sample information lines on the form. Some sections of the form may already be completed by the laboratory computer system when the forms are printed. To make corrections, please draw a single line through the inaccurate information and print the corrected information behind it. The sections of the form and directions for completing each line are as follows: Order #: For Missouri State Public Health Lab (MSPHL) purposes only. Pages in Order and Containers in Order indicate number of forms and sample bottles shipped in the sample kit order. REPORT TO: Public water system's name and shipping address on file with Missouri Department of Natural Resources (MDNR). Please review and correct if necessary. Result reports will be mailed to this address. BILL TO: Section defaulted to the MDNR. There are no charges for public water testing at the MSPHL. Requested Analysis/Tests: This section will state PUBLIC DRINKING WATER BACTERIAL ANALYSIS. If it does not, you \n", "\n", "Metadata:\n", "- page_number: 1\n", "- file_name: PublicWaterMassMailing.pdf\n", "- file_path: /root/hbr/PublicWaterMassMailing.pdf\n", "- source_id: 257cd8875c7e1ece07cc168760d24e67f4092162f11c1a4668031f94ac15f214\n", "- split_id: 6\n", "- split_idx_start: 10807\n", "- _split_overlap: [{'doc_id': 'ca28eca71d842ec2436696f55adb8fd21c9817d4b1c9dbb5a427277bac9028b4', 'range': [1690, 1851]}, {'doc_id': '17ede6e88153e6a8dad40bdd1993b61f76f41f24cc6c046ee874f01725ed64ec', 'range': [0, 185]}]\n", "\n", "Document 3:\n", "------------------------------\n", "Content: at that time. NEW SAMPLE INSTRUCTIONS: Sample instructions have been revised to include changes to the bottle and sampling form. The instructions include detailed information on how to collect the sample using the new bottle, how to complete the new sample collection form, how to best ship samples to the MSPHL using the free MSPHL courier system, and how to register for the new MSPHL web portal. A copy of these instructions is attached. NEW WEB PORTAL FOR RESULTS REPORTS The OE LIMS provides a web portal that may be used by systems to view and print their test result reports, check status of samples, download sample information into Excel, and receive automated emails when samples are received at the laboratory, and when sample results are ready to be viewed. For information on how to gain access to this portal, please contact Shondra Johnson, LIMS Administrator at Shondra.Johnson@health.mo.gov or at 573-751-3334. IMPLEMENTATION DATES: The MSPHL intends to implement the OpenELIS LIMS on December 1, 2015. There will be a two week testing period in which laboratory staff will run the new LIMS in conjunction with our current manual, paper-based system to ensure the OE LIMS is operating properly. You may continue to submit samples as you currently do, using the old sample information card, throughout this time. On December 16, 2015,\n", "If you have any questions, please contact the MSPHL Environmental Bacteriology Unit at 573-751-3334. You may also contact your MDNR Regional Office for additional information on sample collection. Once again, thank you for your patience and understanding as we implement these changes. Patrick R. Shannon Manager, Environmental Bacteriology Unit Missouri Department of Health and Senior Services State Public Health Laboratory 101 North Chestnut St. P.O. Box 570 Jefferson City, MO 65102 Phone: 573-751-3334 Email: Pat.Shannon@health.mo.gov Web: www.health.mo.gov/Lab\n", "**Order #:** 984\n", "**REPORT TO:** \n", "\n", "Metadata:\n", "- page_number: 1\n", "- file_name: PublicWaterMassMailing.pdf\n", "- file_path: /root/hbr/PublicWaterMassMailing.pdf\n", "- source_id: 257cd8875c7e1ece07cc168760d24e67f4092162f11c1a4668031f94ac15f214\n", "- split_id: 2\n", "- split_idx_start: 3553\n", "- _split_overlap: [{'doc_id': '9901a7e78e5aac9d75f9b78c1129b916f02c30dd761ca44dbc1d60ebb625dd0f', 'range': [1680, 1877]}, {'doc_id': '6c7db9520ffeeb3462383f2f499e7fde093a95d16376f5925b90cb449950b408', 'range': [0, 245]}]\n", "\n", "Document 4:\n", "------------------------------\n", "Content: wrap still attached.** **5. Grasp cap along top edge and remove carefully. Do not touch the inside with your fingers. Hold the bottle in one hand and the cap in the other. Do not lay the cap down or put it in a pocket. Also, take care not to contaminate the sterile bottle or cap with your fingers or permit the faucet to touch the inside of the bottle.** **6. Hold the bottle so that water entering the bottle will not come in contact with your hands or the outside of the bottle.** **7. Fill the bottle until the water sample level is BETWEEN THE TWO LINES on the bottle (100 - 120 ml). Preferably, the sample level should be at\n", "**INSTRUCTIONS FOR COMPLETING ENVIRONMENTAL SAMPLE COLLECTION FORM**\n", "**Public Drinking Water Bacterial Analysis** PRINT LEGIBLY using water proof ink. A standard ink pen is sufficient. Complete ALL sample information lines on the form. Some sections of the form may already be completed by the laboratory computer system when the forms are printed. To make corrections, please draw a single line through the inaccurate information and print the corrected information behind it. The sections of the form and directions for completing each line are as follows: Order #: For Missouri State Public Health Lab (MSPHL) purposes only. Pages in Order and Containers in Order indicate number of forms and sample bottles shipped in the sample kit order. REPORT TO: Public water system's name and shipping address on file with Missouri Department of Natural Resources (MDNR). Please review and correct if necessary. Result reports will be mailed to this address. BILL TO: Section defaulted to the MDNR. There are no charges for public water testing at the MSPHL. Requested Analysis/Tests: This section will state PUBLIC DRINKING WATER BACTERIAL ANALYSIS. If it does not, you \n", "\n", "Metadata:\n", "- page_number: 1\n", "- file_name: PublicWaterMassMailing.pdf\n", "- file_path: /root/hbr/PublicWaterMassMailing.pdf\n", "- source_id: 5b8f14ab85d3358b47f6a54dc9e36a03b0dbb5c837033ba5938916cdcbaa0545\n", "- split_id: 6\n", "- split_idx_start: 10807\n", "- _split_overlap: [{'doc_id': '4c29d2945aab4698daaecc66f3b97ba45b5765c13dce43227f936517135b942d', 'range': [1690, 1851]}, {'doc_id': '7d679b94a9e08477707652821a3611805ce8803dc6125a1e6f264b863d3cf0e1', 'range': [0, 185]}]\n", "\n", "Document 5:\n", "------------------------------\n", "Content: wrap still attached.** **5. Grasp cap along top edge and remove carefully. Do not touch the inside with your fingers. Hold the bottle in one hand and the cap in the other. Do not lay the cap down or put it in a pocket. Also, take care not to contaminate the sterile bottle or cap with your fingers or permit the faucet to touch the inside of the bottle.** **6. Hold the bottle so that water entering the bottle will not come in contact with your hands or the outside of the bottle.** **7. Fill the bottle until the water sample level is BETWEEN THE TWO LINES on the bottle (100 - 120 ml). Preferably, the sample level should be at\n", "**INSTRUCTIONS FOR COMPLETING ENVIRONMENTAL SAMPLE COLLECTION FORM**\n", "**Public Drinking Water Bacterial Analysis** PRINT LEGIBLY using water proof ink. A standard ink pen is sufficient. Complete ALL sample information lines on the form. Some sections of the form may already be completed by the laboratory computer system when the forms are printed. To make corrections, please draw a single line through the inaccurate information and print the corrected information behind it. The sections of the form and directions for completing each line are as follows: Order #: For Missouri State Public Health Lab (MSPHL) purposes only. Pages in Order and Containers in Order indicate number of forms and sample bottles shipped in the sample kit order. REPORT TO: Public water system's name and shipping address on file with Missouri Department of Natural Resources (MDNR). Please review and correct if necessary. Result reports will be mailed to this address. BILL TO: Section defaulted to the MDNR. There are no charges for public water testing at the MSPHL. Requested Analysis/Tests: This section will state PUBLIC DRINKING WATER BACTERIAL ANALYSIS. If it does not, you \n", "\n", "Metadata:\n", "- page_number: 1\n", "- file_name: PublicWaterMassMailing.pdf\n", "- file_path: /root/hbr/PublicWaterMassMailing.pdf\n", "- source_id: 90c553d0a17897f53e0ec7e38d993db75c45d2052734f05c28543845e13cd0c3\n", "- split_id: 6\n", "- split_idx_start: 10807\n", "- _split_overlap: [{'doc_id': '9e43e7a58e5d0d3ef05f021ddc9b11f01ddeddce2d17141f8ebca1d9cc07a10b', 'range': [1690, 1851]}, {'doc_id': '77b7ace51d743763367f8206bb374ab8df92e3049db99a92b4562fb150c9789b', 'range': [0, 185]}]\n", "\n", "Document 6:\n", "------------------------------\n", "Content: at that time. NEW SAMPLE INSTRUCTIONS: Sample instructions have been revised to include changes to the bottle and sampling form. The instructions include detailed information on how to collect the sample using the new bottle, how to complete the new sample collection form, how to best ship samples to the MSPHL using the free MSPHL courier system, and how to register for the new MSPHL web portal. A copy of these instructions is attached. NEW WEB PORTAL FOR RESULTS REPORTS The OE LIMS provides a web portal that may be used by systems to view and print their test result reports, check status of samples, download sample information into Excel, and receive automated emails when samples are received at the laboratory, and when sample results are ready to be viewed. For information on how to gain access to this portal, please contact Shondra Johnson, LIMS Administrator at Shondra.Johnson@health.mo.gov or at 573-751-3334. IMPLEMENTATION DATES: The MSPHL intends to implement the OpenELIS LIMS on December 1, 2015. There will be a two week testing period in which laboratory staff will run the new LIMS in conjunction with our current manual, paper-based system to ensure the OE LIMS is operating properly. You may continue to submit samples as you currently do, using the old sample information card, throughout this time. On December 16, 2015,\n", "If you have any questions, please contact the MSPHL Environmental Bacteriology Unit at 573-751-3334. You may also contact your MDNR Regional Office for additional information on sample collection. Once again, thank you for your patience and understanding as we implement these changes. Patrick R. Shannon Manager, Environmental Bacteriology Unit Missouri Department of Health and Senior Services State Public Health Laboratory 101 North Chestnut St. P.O. Box 570 Jefferson City, MO 65102 Phone: 573-751-3334 Email: Pat.Shannon@health.mo.gov Web: www.health.mo.gov/Lab\n", "**Order #:** 984\n", "**REPORT TO:** \n", "\n", "Metadata:\n", "- page_number: 1\n", "- file_name: PublicWaterMassMailing.pdf\n", "- file_path: /root/hbr/PublicWaterMassMailing.pdf\n", "- source_id: 5b8f14ab85d3358b47f6a54dc9e36a03b0dbb5c837033ba5938916cdcbaa0545\n", "- split_id: 2\n", "- split_idx_start: 3553\n", "- _split_overlap: [{'doc_id': 'f405db29529fc22f7287425b30c5c7459e46a8240b8be2c3d0ce836ea900c54f', 'range': [1680, 1877]}, {'doc_id': '703f0bf9f802a76306255a7f1ada1e1b6dedef6c51e2fdde0de39971d8435f52', 'range': [0, 245]}]\n", "\n", "Document 7:\n", "------------------------------\n", "Content: wrap still attached.** **5. Grasp cap along top edge and remove carefully. Do not touch the inside with your fingers. Hold the bottle in one hand and the cap in the other. Do not lay the cap down or put it in a pocket. Also, take care not to contaminate the sterile bottle or cap with your fingers or permit the faucet to touch the inside of the bottle.** **6. Hold the bottle so that water entering the bottle will not come in contact with your hands or the outside of the bottle.** **7. Fill the bottle until the water sample level is BETWEEN THE TWO LINES on the bottle (100 - 120 ml). Preferably, the sample level should be at\n", "**INSTRUCTIONS FOR COMPLETING ENVIRONMENTAL SAMPLE COLLECTION FORM**\n", "**Public Drinking Water Bacterial Analysis** PRINT LEGIBLY using water proof ink. A standard ink pen is sufficient. Complete ALL sample information lines on the form. Some sections of the form may already be completed by the laboratory computer system when the forms are printed. To make corrections, please draw a single line through the inaccurate information and print the corrected information behind it. The sections of the form and directions for completing each line are as follows: Order #: For Missouri State Public Health Lab (MSPHL) purposes only. Pages in Order and Containers in Order indicate number of forms and sample bottles shipped in the sample kit order. REPORT TO: Public water system's name and shipping address on file with Missouri Department of Natural Resources (MDNR). Please review and correct if necessary. Result reports will be mailed to this address. BILL TO: Section defaulted to the MDNR. There are no charges for public water testing at the MSPHL. Requested Analysis/Tests: This section will state PUBLIC DRINKING WATER BACTERIAL ANALYSIS. If it does not, you \n", "\n", "Metadata:\n", "- page_number: 1\n", "- file_name: PublicWaterMassMailing.pdf\n", "- file_path: /root/hbr/PublicWaterMassMailing.pdf\n", "- source_id: 898fd2fdbf851dfe0c9790abb176e264b4cf0f62d9679d78e6c5e924a749a654\n", "- split_id: 6\n", "- split_idx_start: 10807\n", "- _split_overlap: [{'doc_id': 'f9e4ee8ee3fc7a197a660305c0e4db9231cea3360d67e0f09d82f9a0ee544017', 'range': [1690, 1851]}, {'doc_id': '40ce900b0f1d6f2a323e521ba0eefd722e37e17b7a0d56130cbf4737cd44e8d1', 'range': [0, 185]}]\n", "\n", "Document 8:\n", "------------------------------\n", "Content: at that time. NEW SAMPLE INSTRUCTIONS: Sample instructions have been revised to include changes to the bottle and sampling form. The instructions include detailed information on how to collect the sample using the new bottle, how to complete the new sample collection form, how to best ship samples to the MSPHL using the free MSPHL courier system, and how to register for the new MSPHL web portal. A copy of these instructions is attached. NEW WEB PORTAL FOR RESULTS REPORTS The OE LIMS provides a web portal that may be used by systems to view and print their test result reports, check status of samples, download sample information into Excel, and receive automated emails when samples are received at the laboratory, and when sample results are ready to be viewed. For information on how to gain access to this portal, please contact Shondra Johnson, LIMS Administrator at Shondra.Johnson@health.mo.gov or at 573-751-3334. IMPLEMENTATION DATES: The MSPHL intends to implement the OpenELIS LIMS on December 1, 2015. There will be a two week testing period in which laboratory staff will run the new LIMS in conjunction with our current manual, paper-based system to ensure the OE LIMS is operating properly. You may continue to submit samples as you currently do, using the old sample information card, throughout this time. On December 16, 2015,\n", "If you have any questions, please contact the MSPHL Environmental Bacteriology Unit at 573-751-3334. You may also contact your MDNR Regional Office for additional information on sample collection. Once again, thank you for your patience and understanding as we implement these changes. Patrick R. Shannon Manager, Environmental Bacteriology Unit Missouri Department of Health and Senior Services State Public Health Laboratory 101 North Chestnut St. P.O. Box 570 Jefferson City, MO 65102 Phone: 573-751-3334 Email: Pat.Shannon@health.mo.gov Web: www.health.mo.gov/Lab\n", "**Order #:** 984\n", "**REPORT TO:** \n", "\n", "Metadata:\n", "- page_number: 1\n", "- file_name: PublicWaterMassMailing.pdf\n", "- file_path: /root/hbr/PublicWaterMassMailing.pdf\n", "- source_id: 65526833eefc1600857ac3797dd95da1195532c5ced7932126e9ae54ff3f63fb\n", "- split_id: 2\n", "- split_idx_start: 3553\n", "- _split_overlap: [{'doc_id': '537be6b09a464bfe4751c34b636fd623fd061cce661fd26f5e6b39118a96408a', 'range': [1680, 1877]}, {'doc_id': '197be13bdb75f471549866b7f3b12830dd3f66186e640ba9f7dbb2ce767e245f', 'range': [0, 245]}]\n", "\n", "Document 9:\n", "------------------------------\n", "Content: wrap still attached.** **5. Grasp cap along top edge and remove carefully. Do not touch the inside with your fingers. Hold the bottle in one hand and the cap in the other. Do not lay the cap down or put it in a pocket. Also, take care not to contaminate the sterile bottle or cap with your fingers or permit the faucet to touch the inside of the bottle.** **6. Hold the bottle so that water entering the bottle will not come in contact with your hands or the outside of the bottle.** **7. Fill the bottle until the water sample level is BETWEEN THE TWO LINES on the bottle (100 - 120 ml). Preferably, the sample level should be at\n", "**INSTRUCTIONS FOR COMPLETING ENVIRONMENTAL SAMPLE COLLECTION FORM**\n", "**Public Drinking Water Bacterial Analysis** PRINT LEGIBLY using water proof ink. A standard ink pen is sufficient. Complete ALL sample information lines on the form. Some sections of the form may already be completed by the laboratory computer system when the forms are printed. To make corrections, please draw a single line through the inaccurate information and print the corrected information behind it. The sections of the form and directions for completing each line are as follows: Order #: For Missouri State Public Health Lab (MSPHL) purposes only. Pages in Order and Containers in Order indicate number of forms and sample bottles shipped in the sample kit order. REPORT TO: Public water system's name and shipping address on file with Missouri Department of Natural Resources (MDNR). Please review and correct if necessary. Result reports will be mailed to this address. BILL TO: Section defaulted to the MDNR. There are no charges for public water testing at the MSPHL. Requested Analysis/Tests: This section will state PUBLIC DRINKING WATER BACTERIAL ANALYSIS. If it does not, you \n", "\n", "Metadata:\n", "- page_number: 1\n", "- file_name: PublicWaterMassMailing.pdf\n", "- file_path: /root/hbr/PublicWaterMassMailing.pdf\n", "- source_id: 982a734f5b663b8a9f5e4a79f0e24182e6215dc8e459a062aff5ce84135aba81\n", "- split_id: 6\n", "- split_idx_start: 10807\n", "- _split_overlap: [{'doc_id': 'f990c1b804eb9df09f9d815aaa00638e4b2111815802ac4146815a5e508a0629', 'range': [1690, 1851]}, {'doc_id': '3bf24235fdf21322ae4ba3d5e8f5abb2b51e2ecad760d034c12616abfcd30777', 'range': [0, 185]}]\n", "\n", "Document 10:\n", "------------------------------\n", "Content: at that time. NEW SAMPLE INSTRUCTIONS: Sample instructions have been revised to include changes to the bottle and sampling form. The instructions include detailed information on how to collect the sample using the new bottle, how to complete the new sample collection form, how to best ship samples to the MSPHL using the free MSPHL courier system, and how to register for the new MSPHL web portal. A copy of these instructions is attached. NEW WEB PORTAL FOR RESULTS REPORTS The OE LIMS provides a web portal that may be used by systems to view and print their test result reports, check status of samples, download sample information into Excel, and receive automated emails when samples are received at the laboratory, and when sample results are ready to be viewed. For information on how to gain access to this portal, please contact Shondra Johnson, LIMS Administrator at Shondra.Johnson@health.mo.gov or at 573-751-3334. IMPLEMENTATION DATES: The MSPHL intends to implement the OpenELIS LIMS on December 1, 2015. There will be a two week testing period in which laboratory staff will run the new LIMS in conjunction with our current manual, paper-based system to ensure the OE LIMS is operating properly. You may continue to submit samples as you currently do, using the old sample information card, throughout this time. On December 16, 2015,\n", "If you have any questions, please contact the MSPHL Environmental Bacteriology Unit at 573-751-3334. You may also contact your MDNR Regional Office for additional information on sample collection. Once again, thank you for your patience and understanding as we implement these changes. Patrick R. Shannon Manager, Environmental Bacteriology Unit Missouri Department of Health and Senior Services State Public Health Laboratory 101 North Chestnut St. P.O. Box 570 Jefferson City, MO 65102 Phone: 573-751-3334 Email: Pat.Shannon@health.mo.gov Web: www.health.mo.gov/Lab\n", "**Order #:** 984\n", "**REPORT TO:** \n", "\n", "Metadata:\n", "- page_number: 1\n", "- file_name: PublicWaterMassMailing.pdf\n", "- file_path: /root/hbr/PublicWaterMassMailing.pdf\n", "- source_id: 982a734f5b663b8a9f5e4a79f0e24182e6215dc8e459a062aff5ce84135aba81\n", "- split_id: 2\n", "- split_idx_start: 3553\n", "- _split_overlap: [{'doc_id': 'f394a4042a97e46f19ba562d556ee483514f29f37eebe854e464ed5c6e0b4c56', 'range': [1680, 1877]}, {'doc_id': 'e0b7af576d04b71c5b5f63cd65021c41291c07013e34010f496c5fdbf343dfda', 'range': [0, 245]}]\n", "\n", "Answer: Here are the relevant matches for the search query \"where are the instructions\":\n", "\n", "**NEW SAMPLE INSTRUCTIONS: Sample instructions have been revised to include changes to the bottle and sampling form. The instructions include detailed information on how to collect the sample using the new bottle, how to complete the new sample collection form, how to best ship samples to the MSPHL using the free MSPHL courier system, and how to register for the new MSPHL web portal. A copy of these instructions is attached.**\n", "Location: PublicWaterMassMailing.pdf, Page: 1\n", "\n", "**INSTRUCTIONS FOR COMPLETING ENVIRONMENTAL SAMPLE COLLECTION FORM**\n", "**Public Drinking Water Bacterial Analysis** PRINT LEGIBLY using water proof ink. A standard ink pen is sufficient. Complete ALL sample information lines on the form. Some sections of the form may already be completed by the laboratory computer system when the forms are printed. To make corrections, please draw a single line through the inaccurate information and print the corrected information behind it. The sections of the form and directions for completing each line are as follows: \n", "Location: PublicWaterMassMailing.pdf, Page: 1\n", "\n", "**NEW WEB PORTAL FOR RESULTS REPORTS The OE LIMS provides a web portal that may be used by systems to view and print their test result reports, check status of samples, download sample information into Excel, and receive automated emails when samples are received at the laboratory, and when sample results are ready to be viewed. For information on how to gain access to this portal, please contact Shondra Johnson, LIMS Administrator at Shondra.Johnson@health.mo.gov or at 573-751-3334.**\n", "Location: PublicWaterMassMailing.pdf, Page: 1\n", "\n", "Number of documents retrieved: 10\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "from haystack import Pipeline\n", "from haystack.utils import Secret\n", "from haystack.components.generators import OpenAIGenerator\n", "from haystack.components.builders import PromptBuilder\n", "from openai import OpenAI\n", "import numpy as np\n", "from haystack import component\n", "from haystack.dataclasses import Document\n", "from typing import List\n", "from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever\n", "from haystack_integrations.components.embedders.fastembed import FastembedTextEmbedder\n", "from haystack import Pipeline\n", "from haystack.utils import Secret\n", "from haystack_integrations.document_stores.qdrant import QdrantDocumentStore\n", "from haystack.components.generators import OpenAIGenerator\n", "from haystack.components.builders import PromptBuilder\n", "from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever\n", "from haystack_integrations.components.embedders.fastembed import FastembedSparseTextEmbedder\n", "from openai import OpenAI\n", "import numpy as np\n", "from haystack import component\n", "from haystack.dataclasses import Document\n", "from typing import List\n", "\n", "\n", "# Connect to existing ChromaDB document store\n", "document_store = QdrantDocumentStore(\n", " host=\"0.0.0.0\",\n", " port=6333,\n", " index=\"aaa_test\",\n", " recreate_index=False,\n", " use_sparse_embeddings=True, # Enable hybrid search\n", " sparse_idf=True, # Enable IDF calculation for sparse embeddings\n", " embedding_dim=768, # Adjust based on your DeepInfra model's dimension\n", " similarity=\"cosine\"\n", ")\n", "\n", "@component\n", "class DeepInfraEmbeddings:\n", " def __init__(\n", " self,\n", " api_key: str = \"XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL\",\n", " model_name: str = \"BAAI/bge-base-en-v1.5\",\n", " base_url: str = \"https://api.deepinfra.com/v1/openai\"\n", " ):\n", " self.client = OpenAI(\n", " api_key=api_key,\n", " base_url=base_url\n", " )\n", " self.model_name = model_name\n", "\n", " @component.output_types(embedding=List[float])\n", " def run(self, text: str) -> dict:\n", " response = self.client.embeddings.create(\n", " model=self.model_name,\n", " input=[text],\n", " encoding_format=\"float\"\n", " )\n", " embedding = np.array(response.data[0].embedding)\n", " return {\"embedding\": embedding}\n", "\n", "# Create prompt template for QA\n", "# Modified prompt template to include metadata\n", "prompt_template = \"\"\"\n", "I am a search engine bot. My only purpose is to locate and point to relevant information within the provided documents. I do not provide interpretations or answers - I only help you find where the information exists.\n", "\n", "For each search query, I will:\n", "1. Identify matching sentences/passages from the documents\n", "2. Show the exact quotes with their source locations\n", "3. Provide file names and page numbers where the information can be found\n", "\n", "Context:\n", "{% for doc in documents %}\n", "Content: {{ doc.content }}\n", "Source: {{ doc.meta.file_name }}, Page: {{ doc.meta.page_number }}\n", "---\n", "{% endfor %}\n", "\n", "Search Query: {{question}}\n", "\n", "Relevant Matches:\n", "Here are the exact matches found in the documents:\n", "\n", "\"[exact quote]\" \n", "Location: [filename, page X]\n", "\n", "\"[exact quote]\" \n", "Location: [filename, page X]\n", "\n", "If no relevant matches are found, I will respond:\n", "\"No matching information found in the provided documents.\"\n", "\n", "Note: I do not provide answers or interpretations. I only help locate where information exists within the documents.\n", "\"\"\"\n", "\n", "def run_pipeline(question, top_k=10):\n", " \"\"\"\n", " Run the pipeline and return both the answer and retrieved documents.\n", " \"\"\"\n", " # Initialize components\n", " embedder = DeepInfraEmbeddings()\n", " # embedder = FastembedTextEmbedder(model=\"BAAI/bge-small-en-v1.5\", prefix=\"Represent this sentence for searching relevant passages: \")\n", " sparse_embedder = FastembedSparseTextEmbedder(model=\"prithvida/Splade_PP_en_v1\")\n", " retriever = QdrantHybridRetriever(document_store=document_store,\n", " top_k=10\n", " )\n", " prompt_builder = PromptBuilder(template=prompt_template)\n", " llm = OpenAIGenerator(\n", " api_key=Secret.from_token(\"XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL\"),\n", " api_base_url=\"https://api.deepinfra.com/v1/openai\",\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", " generation_kwargs={\n", " \"max_tokens\": 512,\n", " \"temperature\": 0.7,\n", " }\n", " )\n", "\n", " # Create and connect pipeline components\n", " pipeline = Pipeline()\n", " pipeline.add_component(\"embedder\", embedder)\n", " pipeline.add_component(\"sparse_embedder\", sparse_embedder)\n", " pipeline.add_component(\"retriever\", retriever)\n", " pipeline.add_component(\"prompt_builder\", prompt_builder)\n", " pipeline.add_component(\"llm\", llm)\n", "\n", " pipeline.connect(\"sparse_embedder.sparse_embedding\", \"retriever.query_sparse_embedding\")\n", " pipeline.connect(\"embedder.embedding\", \"retriever.query_embedding\")\n", " pipeline.connect(\"retriever\", \"prompt_builder.documents\")\n", " pipeline.connect(\"prompt_builder\", \"llm\")\n", "\n", " # Run pipeline\n", " response = pipeline.run({\n", " \"sparse_embedder\": {\"text\": question},\n", " \"embedder\": {\"text\": question},\n", " \"retriever\": {\"top_k\": top_k},\n", " \"prompt_builder\": {\"question\": question}\n", " })\n", "\n", " # Get documents with metadata - Corrected retriever call\n", " dense_embedding = embedder.run(text=question)[\"embedding\"]\n", " sparse_embedding = sparse_embedder.run(text=question)[\"sparse_embedding\"]\n", " \n", " retriever_response = retriever.run(\n", " query_embedding=dense_embedding,\n", " query_sparse_embedding=sparse_embedding,\n", " top_k=top_k\n", " )\n", " \n", " documents = retriever_response.get(\"documents\", [])\n", "\n", " # Extract answer\n", " answer = response[\"llm\"][\"replies\"][0]\n", "\n", " print(\"\\nRetrieved Documents with Metadata:\")\n", " print(\"=\" * 50)\n", " if documents:\n", " for i, doc in enumerate(documents, 1):\n", " print(f\"\\nDocument {i}:\")\n", " print(\"-\" * 30)\n", " print(f\"Content: {doc.content}\")\n", " if hasattr(doc, 'meta') and doc.meta:\n", " print(\"\\nMetadata:\")\n", " for key, value in doc.meta.items():\n", " print(f\"- {key}: {value}\")\n", "\n", " return answer, documents\n", "\n", "\n", "def display_results(question):\n", " \"\"\"\n", " Display the results in a formatted way\n", " \"\"\"\n", " print(f\"\\nQuestion: {question}\")\n", " print(f\"\\nDocument Store Status:\")\n", " print(f\"Total documents: {document_store.count_documents()}\")\n", " \n", " answer, documents = run_pipeline(question)\n", " \n", " print(f\"\\nAnswer: {answer}\")\n", " print(f\"\\nNumber of documents retrieved: {len(documents)}\")\n", "\n", "# Usage\n", "if __name__ == \"__main__\":\n", " question = \"\"\"\n", " where are the instructions ??\n", " \n", " \"\"\"\n", " display_results(question)\n", "\n" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Question: \n", " where are the instructions ?\n", " \n", " \n", "\n", "Document Store Status:\n", "Total documents: 80\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Fetching 5 files: 100%|██████████| 5/5 [00:06<00:00, 1.25s/it]\n", "Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00, 1.22it/s]\n", "Calculating sparse embeddings: 100%|██████████| 1/1 [00:00<00:00, 17.84it/s]\n", "Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00, 1.51it/s]\n", "Calculating sparse embeddings: 100%|██████████| 1/1 [00:00<00:00, 22.32it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Retrieved Documents with Metadata:\n", "==================================================\n", "\n", "Document 1:\n", "------------------------------\n", "Content: at that time. NEW SAMPLE INSTRUCTIONS: Sample instructions have been revised to include changes to the bottle and sampling form. The instructions include detailed information on how to collect the sample using the new bottle, how to complete the new sample collection form, how to best ship samples to the MSPHL using the free MSPHL courier system, and how to register for the new MSPHL web portal. A copy of these instructions is attached. NEW WEB PORTAL FOR RESULTS REPORTS The OE LIMS provides a web portal that may be used by systems to view and print their test result reports, check status of samples, download sample information into Excel, and receive automated emails when samples are received at the laboratory, and when sample results are ready to be viewed. For information on how to gain access to this portal, please contact Shondra Johnson, LIMS Administrator at Shondra.Johnson@health.mo.gov or at 573-751-3334. IMPLEMENTATION DATES: The MSPHL intends to implement the OpenELIS LIMS on December 1, 2015. There will be a two week testing period in which laboratory staff will run the new LIMS in conjunction with our current manual, paper-based system to ensure the OE LIMS is operating properly. You may continue to submit samples as you currently do, using the old sample information card, throughout this time. On December 16, 2015,\n", "If you have any questions, please contact the MSPHL Environmental Bacteriology Unit at 573-751-3334. You may also contact your MDNR Regional Office for additional information on sample collection. Once again, thank you for your patience and understanding as we implement these changes. Patrick R. Shannon Manager, Environmental Bacteriology Unit Missouri Department of Health and Senior Services State Public Health Laboratory 101 North Chestnut St. P.O. Box 570 Jefferson City, MO 65102 Phone: 573-751-3334 Email: Pat.Shannon@health.mo.gov Web: www.health.mo.gov/Lab\n", "**Order #:** 984\n", "**REPORT TO:** \n", "\n", "Metadata:\n", "- page_number: 1\n", "- file_name: PublicWaterMassMailing.pdf\n", "- file_path: /root/hbr/PublicWaterMassMailing.pdf\n", "- source_id: 90c553d0a17897f53e0ec7e38d993db75c45d2052734f05c28543845e13cd0c3\n", "- split_id: 2\n", "- split_idx_start: 3553\n", "- _split_overlap: [{'doc_id': 'f2aead5aa2b985ecbf87a5bd31b738d5d23714fb58deca9ec44d0431a26c9649', 'range': [1680, 1877]}, {'doc_id': 'e6fce9d3a8e725f69c2d30337af28c9cef05ab67d264d098001962437f721421', 'range': [0, 245]}]\n", "\n", "Document 2:\n", "------------------------------\n", "Content: wrap still attached.** **5. Grasp cap along top edge and remove carefully. Do not touch the inside with your fingers. Hold the bottle in one hand and the cap in the other. Do not lay the cap down or put it in a pocket. Also, take care not to contaminate the sterile bottle or cap with your fingers or permit the faucet to touch the inside of the bottle.** **6. Hold the bottle so that water entering the bottle will not come in contact with your hands or the outside of the bottle.** **7. Fill the bottle until the water sample level is BETWEEN THE TWO LINES on the bottle (100 - 120 ml). Preferably, the sample level should be at\n", "**INSTRUCTIONS FOR COMPLETING ENVIRONMENTAL SAMPLE COLLECTION FORM**\n", "**Public Drinking Water Bacterial Analysis** PRINT LEGIBLY using water proof ink. A standard ink pen is sufficient. Complete ALL sample information lines on the form. Some sections of the form may already be completed by the laboratory computer system when the forms are printed. To make corrections, please draw a single line through the inaccurate information and print the corrected information behind it. The sections of the form and directions for completing each line are as follows: Order #: For Missouri State Public Health Lab (MSPHL) purposes only. Pages in Order and Containers in Order indicate number of forms and sample bottles shipped in the sample kit order. REPORT TO: Public water system's name and shipping address on file with Missouri Department of Natural Resources (MDNR). Please review and correct if necessary. Result reports will be mailed to this address. BILL TO: Section defaulted to the MDNR. There are no charges for public water testing at the MSPHL. Requested Analysis/Tests: This section will state PUBLIC DRINKING WATER BACTERIAL ANALYSIS. If it does not, you \n", "\n", "Metadata:\n", "- page_number: 1\n", "- file_name: PublicWaterMassMailing.pdf\n", "- file_path: /root/hbr/PublicWaterMassMailing.pdf\n", "- source_id: 257cd8875c7e1ece07cc168760d24e67f4092162f11c1a4668031f94ac15f214\n", "- split_id: 6\n", "- split_idx_start: 10807\n", "- _split_overlap: [{'doc_id': 'ca28eca71d842ec2436696f55adb8fd21c9817d4b1c9dbb5a427277bac9028b4', 'range': [1690, 1851]}, {'doc_id': '17ede6e88153e6a8dad40bdd1993b61f76f41f24cc6c046ee874f01725ed64ec', 'range': [0, 185]}]\n", "\n", "Document 3:\n", "------------------------------\n", "Content: wrap still attached.** **5. Grasp cap along top edge and remove carefully. Do not touch the inside with your fingers. Hold the bottle in one hand and the cap in the other. Do not lay the cap down or put it in a pocket. Also, take care not to contaminate the sterile bottle or cap with your fingers or permit the faucet to touch the inside of the bottle.** **6. Hold the bottle so that water entering the bottle will not come in contact with your hands or the outside of the bottle.** **7. Fill the bottle until the water sample level is BETWEEN THE TWO LINES on the bottle (100 - 120 ml). Preferably, the sample level should be at\n", "**INSTRUCTIONS FOR COMPLETING ENVIRONMENTAL SAMPLE COLLECTION FORM**\n", "**Public Drinking Water Bacterial Analysis** PRINT LEGIBLY using water proof ink. A standard ink pen is sufficient. Complete ALL sample information lines on the form. Some sections of the form may already be completed by the laboratory computer system when the forms are printed. To make corrections, please draw a single line through the inaccurate information and print the corrected information behind it. The sections of the form and directions for completing each line are as follows: Order #: For Missouri State Public Health Lab (MSPHL) purposes only. Pages in Order and Containers in Order indicate number of forms and sample bottles shipped in the sample kit order. REPORT TO: Public water system's name and shipping address on file with Missouri Department of Natural Resources (MDNR). Please review and correct if necessary. Result reports will be mailed to this address. BILL TO: Section defaulted to the MDNR. There are no charges for public water testing at the MSPHL. Requested Analysis/Tests: This section will state PUBLIC DRINKING WATER BACTERIAL ANALYSIS. If it does not, you \n", "\n", "Metadata:\n", "- page_number: 1\n", "- file_name: PublicWaterMassMailing.pdf\n", "- file_path: /root/hbr/PublicWaterMassMailing.pdf\n", "- source_id: 5b8f14ab85d3358b47f6a54dc9e36a03b0dbb5c837033ba5938916cdcbaa0545\n", "- split_id: 6\n", "- split_idx_start: 10807\n", "- _split_overlap: [{'doc_id': '4c29d2945aab4698daaecc66f3b97ba45b5765c13dce43227f936517135b942d', 'range': [1690, 1851]}, {'doc_id': '7d679b94a9e08477707652821a3611805ce8803dc6125a1e6f264b863d3cf0e1', 'range': [0, 185]}]\n", "\n", "Document 4:\n", "------------------------------\n", "Content: at that time. NEW SAMPLE INSTRUCTIONS: Sample instructions have been revised to include changes to the bottle and sampling form. The instructions include detailed information on how to collect the sample using the new bottle, how to complete the new sample collection form, how to best ship samples to the MSPHL using the free MSPHL courier system, and how to register for the new MSPHL web portal. A copy of these instructions is attached. NEW WEB PORTAL FOR RESULTS REPORTS The OE LIMS provides a web portal that may be used by systems to view and print their test result reports, check status of samples, download sample information into Excel, and receive automated emails when samples are received at the laboratory, and when sample results are ready to be viewed. For information on how to gain access to this portal, please contact Shondra Johnson, LIMS Administrator at Shondra.Johnson@health.mo.gov or at 573-751-3334. IMPLEMENTATION DATES: The MSPHL intends to implement the OpenELIS LIMS on December 1, 2015. There will be a two week testing period in which laboratory staff will run the new LIMS in conjunction with our current manual, paper-based system to ensure the OE LIMS is operating properly. You may continue to submit samples as you currently do, using the old sample information card, throughout this time. On December 16, 2015,\n", "If you have any questions, please contact the MSPHL Environmental Bacteriology Unit at 573-751-3334. You may also contact your MDNR Regional Office for additional information on sample collection. Once again, thank you for your patience and understanding as we implement these changes. Patrick R. Shannon Manager, Environmental Bacteriology Unit Missouri Department of Health and Senior Services State Public Health Laboratory 101 North Chestnut St. P.O. Box 570 Jefferson City, MO 65102 Phone: 573-751-3334 Email: Pat.Shannon@health.mo.gov Web: www.health.mo.gov/Lab\n", "**Order #:** 984\n", "**REPORT TO:** \n", "\n", "Metadata:\n", "- page_number: 1\n", "- file_name: PublicWaterMassMailing.pdf\n", "- file_path: /root/hbr/PublicWaterMassMailing.pdf\n", "- source_id: 257cd8875c7e1ece07cc168760d24e67f4092162f11c1a4668031f94ac15f214\n", "- split_id: 2\n", "- split_idx_start: 3553\n", "- _split_overlap: [{'doc_id': '9901a7e78e5aac9d75f9b78c1129b916f02c30dd761ca44dbc1d60ebb625dd0f', 'range': [1680, 1877]}, {'doc_id': '6c7db9520ffeeb3462383f2f499e7fde093a95d16376f5925b90cb449950b408', 'range': [0, 245]}]\n", "\n", "Document 5:\n", "------------------------------\n", "Content: at that time. NEW SAMPLE INSTRUCTIONS: Sample instructions have been revised to include changes to the bottle and sampling form. The instructions include detailed information on how to collect the sample using the new bottle, how to complete the new sample collection form, how to best ship samples to the MSPHL using the free MSPHL courier system, and how to register for the new MSPHL web portal. A copy of these instructions is attached. NEW WEB PORTAL FOR RESULTS REPORTS The OE LIMS provides a web portal that may be used by systems to view and print their test result reports, check status of samples, download sample information into Excel, and receive automated emails when samples are received at the laboratory, and when sample results are ready to be viewed. For information on how to gain access to this portal, please contact Shondra Johnson, LIMS Administrator at Shondra.Johnson@health.mo.gov or at 573-751-3334. IMPLEMENTATION DATES: The MSPHL intends to implement the OpenELIS LIMS on December 1, 2015. There will be a two week testing period in which laboratory staff will run the new LIMS in conjunction with our current manual, paper-based system to ensure the OE LIMS is operating properly. You may continue to submit samples as you currently do, using the old sample information card, throughout this time. On December 16, 2015,\n", "If you have any questions, please contact the MSPHL Environmental Bacteriology Unit at 573-751-3334. You may also contact your MDNR Regional Office for additional information on sample collection. Once again, thank you for your patience and understanding as we implement these changes. Patrick R. Shannon Manager, Environmental Bacteriology Unit Missouri Department of Health and Senior Services State Public Health Laboratory 101 North Chestnut St. P.O. Box 570 Jefferson City, MO 65102 Phone: 573-751-3334 Email: Pat.Shannon@health.mo.gov Web: www.health.mo.gov/Lab\n", "**Order #:** 984\n", "**REPORT TO:** \n", "\n", "Metadata:\n", "- page_number: 1\n", "- file_name: PublicWaterMassMailing.pdf\n", "- file_path: /root/hbr/PublicWaterMassMailing.pdf\n", "- source_id: 5b8f14ab85d3358b47f6a54dc9e36a03b0dbb5c837033ba5938916cdcbaa0545\n", "- split_id: 2\n", "- split_idx_start: 3553\n", "- _split_overlap: [{'doc_id': 'f405db29529fc22f7287425b30c5c7459e46a8240b8be2c3d0ce836ea900c54f', 'range': [1680, 1877]}, {'doc_id': '703f0bf9f802a76306255a7f1ada1e1b6dedef6c51e2fdde0de39971d8435f52', 'range': [0, 245]}]\n", "\n", "Document 6:\n", "------------------------------\n", "Content: wrap still attached.** **5. Grasp cap along top edge and remove carefully. Do not touch the inside with your fingers. Hold the bottle in one hand and the cap in the other. Do not lay the cap down or put it in a pocket. Also, take care not to contaminate the sterile bottle or cap with your fingers or permit the faucet to touch the inside of the bottle.** **6. Hold the bottle so that water entering the bottle will not come in contact with your hands or the outside of the bottle.** **7. Fill the bottle until the water sample level is BETWEEN THE TWO LINES on the bottle (100 - 120 ml). Preferably, the sample level should be at\n", "**INSTRUCTIONS FOR COMPLETING ENVIRONMENTAL SAMPLE COLLECTION FORM**\n", "**Public Drinking Water Bacterial Analysis** PRINT LEGIBLY using water proof ink. A standard ink pen is sufficient. Complete ALL sample information lines on the form. Some sections of the form may already be completed by the laboratory computer system when the forms are printed. To make corrections, please draw a single line through the inaccurate information and print the corrected information behind it. The sections of the form and directions for completing each line are as follows: Order #: For Missouri State Public Health Lab (MSPHL) purposes only. Pages in Order and Containers in Order indicate number of forms and sample bottles shipped in the sample kit order. REPORT TO: Public water system's name and shipping address on file with Missouri Department of Natural Resources (MDNR). Please review and correct if necessary. Result reports will be mailed to this address. BILL TO: Section defaulted to the MDNR. There are no charges for public water testing at the MSPHL. Requested Analysis/Tests: This section will state PUBLIC DRINKING WATER BACTERIAL ANALYSIS. If it does not, you \n", "\n", "Metadata:\n", "- page_number: 1\n", "- file_name: PublicWaterMassMailing.pdf\n", "- file_path: /root/hbr/PublicWaterMassMailing.pdf\n", "- source_id: 90c553d0a17897f53e0ec7e38d993db75c45d2052734f05c28543845e13cd0c3\n", "- split_id: 6\n", "- split_idx_start: 10807\n", "- _split_overlap: [{'doc_id': '9e43e7a58e5d0d3ef05f021ddc9b11f01ddeddce2d17141f8ebca1d9cc07a10b', 'range': [1690, 1851]}, {'doc_id': '77b7ace51d743763367f8206bb374ab8df92e3049db99a92b4562fb150c9789b', 'range': [0, 185]}]\n", "\n", "Document 7:\n", "------------------------------\n", "Content: wrap still attached.** **5. Grasp cap along top edge and remove carefully. Do not touch the inside with your fingers. Hold the bottle in one hand and the cap in the other. Do not lay the cap down or put it in a pocket. Also, take care not to contaminate the sterile bottle or cap with your fingers or permit the faucet to touch the inside of the bottle.** **6. Hold the bottle so that water entering the bottle will not come in contact with your hands or the outside of the bottle.** **7. Fill the bottle until the water sample level is BETWEEN THE TWO LINES on the bottle (100 - 120 ml). Preferably, the sample level should be at\n", "**INSTRUCTIONS FOR COMPLETING ENVIRONMENTAL SAMPLE COLLECTION FORM**\n", "**Public Drinking Water Bacterial Analysis** PRINT LEGIBLY using water proof ink. A standard ink pen is sufficient. Complete ALL sample information lines on the form. Some sections of the form may already be completed by the laboratory computer system when the forms are printed. To make corrections, please draw a single line through the inaccurate information and print the corrected information behind it. The sections of the form and directions for completing each line are as follows: Order #: For Missouri State Public Health Lab (MSPHL) purposes only. Pages in Order and Containers in Order indicate number of forms and sample bottles shipped in the sample kit order. REPORT TO: Public water system's name and shipping address on file with Missouri Department of Natural Resources (MDNR). Please review and correct if necessary. Result reports will be mailed to this address. BILL TO: Section defaulted to the MDNR. There are no charges for public water testing at the MSPHL. Requested Analysis/Tests: This section will state PUBLIC DRINKING WATER BACTERIAL ANALYSIS. If it does not, you \n", "\n", "Metadata:\n", "- page_number: 1\n", "- file_name: PublicWaterMassMailing.pdf\n", "- file_path: /root/hbr/PublicWaterMassMailing.pdf\n", "- source_id: 898fd2fdbf851dfe0c9790abb176e264b4cf0f62d9679d78e6c5e924a749a654\n", "- split_id: 6\n", "- split_idx_start: 10807\n", "- _split_overlap: [{'doc_id': 'f9e4ee8ee3fc7a197a660305c0e4db9231cea3360d67e0f09d82f9a0ee544017', 'range': [1690, 1851]}, {'doc_id': '40ce900b0f1d6f2a323e521ba0eefd722e37e17b7a0d56130cbf4737cd44e8d1', 'range': [0, 185]}]\n", "\n", "Document 8:\n", "------------------------------\n", "Content: at that time. NEW SAMPLE INSTRUCTIONS: Sample instructions have been revised to include changes to the bottle and sampling form. The instructions include detailed information on how to collect the sample using the new bottle, how to complete the new sample collection form, how to best ship samples to the MSPHL using the free MSPHL courier system, and how to register for the new MSPHL web portal. A copy of these instructions is attached. NEW WEB PORTAL FOR RESULTS REPORTS The OE LIMS provides a web portal that may be used by systems to view and print their test result reports, check status of samples, download sample information into Excel, and receive automated emails when samples are received at the laboratory, and when sample results are ready to be viewed. For information on how to gain access to this portal, please contact Shondra Johnson, LIMS Administrator at Shondra.Johnson@health.mo.gov or at 573-751-3334. IMPLEMENTATION DATES: The MSPHL intends to implement the OpenELIS LIMS on December 1, 2015. There will be a two week testing period in which laboratory staff will run the new LIMS in conjunction with our current manual, paper-based system to ensure the OE LIMS is operating properly. You may continue to submit samples as you currently do, using the old sample information card, throughout this time. On December 16, 2015,\n", "If you have any questions, please contact the MSPHL Environmental Bacteriology Unit at 573-751-3334. You may also contact your MDNR Regional Office for additional information on sample collection. Once again, thank you for your patience and understanding as we implement these changes. Patrick R. Shannon Manager, Environmental Bacteriology Unit Missouri Department of Health and Senior Services State Public Health Laboratory 101 North Chestnut St. P.O. Box 570 Jefferson City, MO 65102 Phone: 573-751-3334 Email: Pat.Shannon@health.mo.gov Web: www.health.mo.gov/Lab\n", "**Order #:** 984\n", "**REPORT TO:** \n", "\n", "Metadata:\n", "- page_number: 1\n", "- file_name: PublicWaterMassMailing.pdf\n", "- file_path: /root/hbr/PublicWaterMassMailing.pdf\n", "- source_id: 65526833eefc1600857ac3797dd95da1195532c5ced7932126e9ae54ff3f63fb\n", "- split_id: 2\n", "- split_idx_start: 3553\n", "- _split_overlap: [{'doc_id': '537be6b09a464bfe4751c34b636fd623fd061cce661fd26f5e6b39118a96408a', 'range': [1680, 1877]}, {'doc_id': '197be13bdb75f471549866b7f3b12830dd3f66186e640ba9f7dbb2ce767e245f', 'range': [0, 245]}]\n", "\n", "Document 9:\n", "------------------------------\n", "Content: at that time. NEW SAMPLE INSTRUCTIONS: Sample instructions have been revised to include changes to the bottle and sampling form. The instructions include detailed information on how to collect the sample using the new bottle, how to complete the new sample collection form, how to best ship samples to the MSPHL using the free MSPHL courier system, and how to register for the new MSPHL web portal. A copy of these instructions is attached. NEW WEB PORTAL FOR RESULTS REPORTS The OE LIMS provides a web portal that may be used by systems to view and print their test result reports, check status of samples, download sample information into Excel, and receive automated emails when samples are received at the laboratory, and when sample results are ready to be viewed. For information on how to gain access to this portal, please contact Shondra Johnson, LIMS Administrator at Shondra.Johnson@health.mo.gov or at 573-751-3334. IMPLEMENTATION DATES: The MSPHL intends to implement the OpenELIS LIMS on December 1, 2015. There will be a two week testing period in which laboratory staff will run the new LIMS in conjunction with our current manual, paper-based system to ensure the OE LIMS is operating properly. You may continue to submit samples as you currently do, using the old sample information card, throughout this time. On December 16, 2015,\n", "If you have any questions, please contact the MSPHL Environmental Bacteriology Unit at 573-751-3334. You may also contact your MDNR Regional Office for additional information on sample collection. Once again, thank you for your patience and understanding as we implement these changes. Patrick R. Shannon Manager, Environmental Bacteriology Unit Missouri Department of Health and Senior Services State Public Health Laboratory 101 North Chestnut St. P.O. Box 570 Jefferson City, MO 65102 Phone: 573-751-3334 Email: Pat.Shannon@health.mo.gov Web: www.health.mo.gov/Lab\n", "**Order #:** 984\n", "**REPORT TO:** \n", "\n", "Metadata:\n", "- page_number: 1\n", "- file_name: PublicWaterMassMailing.pdf\n", "- file_path: /root/hbr/PublicWaterMassMailing.pdf\n", "- source_id: 982a734f5b663b8a9f5e4a79f0e24182e6215dc8e459a062aff5ce84135aba81\n", "- split_id: 2\n", "- split_idx_start: 3553\n", "- _split_overlap: [{'doc_id': 'f394a4042a97e46f19ba562d556ee483514f29f37eebe854e464ed5c6e0b4c56', 'range': [1680, 1877]}, {'doc_id': 'e0b7af576d04b71c5b5f63cd65021c41291c07013e34010f496c5fdbf343dfda', 'range': [0, 245]}]\n", "\n", "Document 10:\n", "------------------------------\n", "Content: wrap still attached.** **5. Grasp cap along top edge and remove carefully. Do not touch the inside with your fingers. Hold the bottle in one hand and the cap in the other. Do not lay the cap down or put it in a pocket. Also, take care not to contaminate the sterile bottle or cap with your fingers or permit the faucet to touch the inside of the bottle.** **6. Hold the bottle so that water entering the bottle will not come in contact with your hands or the outside of the bottle.** **7. Fill the bottle until the water sample level is BETWEEN THE TWO LINES on the bottle (100 - 120 ml). Preferably, the sample level should be at\n", "**INSTRUCTIONS FOR COMPLETING ENVIRONMENTAL SAMPLE COLLECTION FORM**\n", "**Public Drinking Water Bacterial Analysis** PRINT LEGIBLY using water proof ink. A standard ink pen is sufficient. Complete ALL sample information lines on the form. Some sections of the form may already be completed by the laboratory computer system when the forms are printed. To make corrections, please draw a single line through the inaccurate information and print the corrected information behind it. The sections of the form and directions for completing each line are as follows: Order #: For Missouri State Public Health Lab (MSPHL) purposes only. Pages in Order and Containers in Order indicate number of forms and sample bottles shipped in the sample kit order. REPORT TO: Public water system's name and shipping address on file with Missouri Department of Natural Resources (MDNR). Please review and correct if necessary. Result reports will be mailed to this address. BILL TO: Section defaulted to the MDNR. There are no charges for public water testing at the MSPHL. Requested Analysis/Tests: This section will state PUBLIC DRINKING WATER BACTERIAL ANALYSIS. If it does not, you \n", "\n", "Metadata:\n", "- page_number: 1\n", "- file_name: PublicWaterMassMailing.pdf\n", "- file_path: /root/hbr/PublicWaterMassMailing.pdf\n", "- source_id: 982a734f5b663b8a9f5e4a79f0e24182e6215dc8e459a062aff5ce84135aba81\n", "- split_id: 6\n", "- split_idx_start: 10807\n", "- _split_overlap: [{'doc_id': 'f990c1b804eb9df09f9d815aaa00638e4b2111815802ac4146815a5e508a0629', 'range': [1690, 1851]}, {'doc_id': '3bf24235fdf21322ae4ba3d5e8f5abb2b51e2ecad760d034c12616abfcd30777', 'range': [0, 185]}]\n", "\n", "Answer: Here are the relevant matches found in the documents:\n", "\n", "**INSTRUCTIONS FOR COMPLETING ENVIRONMENTAL SAMPLE COLLECTION FORM**\n", "**Public Drinking Water Bacterial Analysis** PRINT LEGIBLY using water proof ink. A standard ink pen is sufficient. Complete ALL sample information lines on the form. Some sections of the form may already be completed by the laboratory computer system when the forms are printed. To make corrections, please draw a single line through the inaccurate information and print the corrected information behind it. The sections of the form and directions for completing each line are as follows: Order #: For Missouri State Public Health Lab (MSPHL) purposes only. Pages in Order and Containers in Order indicate number of forms and sample bottles shipped in the sample kit order. REPORT TO: Public water system's name and shipping address on file with Missouri Department of Natural Resources (MDNR). Please review and correct if necessary. Result reports will be mailed to this address. BILL TO: Section defaulted to the MDNR. There are no charges for public water testing at the MSPHL. Requested Analysis/Tests: This section will state PUBLIC DRINKING WATER BACTERIAL ANALYSIS. If it does not, you \n", "Location: PublicWaterMassMailing.pdf, Page: 1\n", "\n", "Number of documents retrieved: 10\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "from haystack import Pipeline\n", "from haystack.utils import Secret\n", "# from haystack_integrations.document_stores.chroma import ChromaDocumentStore\n", "from haystack.components.generators import OpenAIGenerator\n", "from haystack.components.builders import PromptBuilder\n", "# from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever\n", "from openai import OpenAI\n", "import numpy as np\n", "from haystack import component\n", "from haystack.dataclasses import Document\n", "from typing import List\n", "from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever\n", "from haystack_integrations.components.embedders.fastembed import FastembedTextEmbedder\n", "from haystack import Pipeline\n", "from haystack.utils import Secret\n", "from haystack_integrations.document_stores.qdrant import QdrantDocumentStore\n", "from haystack.components.generators import OpenAIGenerator\n", "from haystack.components.builders import PromptBuilder\n", "from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever\n", "from haystack_integrations.components.embedders.fastembed import FastembedSparseTextEmbedder\n", "from openai import OpenAI\n", "import numpy as np\n", "from haystack import component\n", "from haystack.dataclasses import Document\n", "from typing import List\n", "\n", "\n", "\n", "from haystack_integrations.components.embedders.fastembed import (\n", " FastembedTextEmbedder,\n", " FastembedSparseTextEmbedder\n", ")\n", "\n", "# Connect to existing ChromaDB document store\n", "document_store = QdrantDocumentStore(\n", " host=\"0.0.0.0\",\n", " port=6333,\n", " index=\"aaa_test\",\n", " recreate_index=False,\n", " use_sparse_embeddings=True, # Enable hybrid search\n", " sparse_idf=True, # Enable IDF calculation for sparse embeddings\n", " embedding_dim=768, # Adjust based on your DeepInfra model's dimension\n", " similarity=\"cosine\"\n", ")\n", "\n", "@component\n", "class DeepInfraEmbeddings:\n", " def __init__(\n", " self,\n", " api_key: str = \"XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL\",\n", " model_name: str = \"BAAI/bge-base-en-v1.5\",\n", " base_url: str = \"https://api.deepinfra.com/v1/openai\"\n", " ):\n", " self.client = OpenAI(\n", " api_key=api_key,\n", " base_url=base_url\n", " )\n", " self.model_name = model_name\n", "\n", " @component.output_types(embedding=List[float])\n", " def run(self, text: str) -> dict:\n", " response = self.client.embeddings.create(\n", " model=self.model_name,\n", " input=[text],\n", " encoding_format=\"float\"\n", " )\n", " embedding = np.array(response.data[0].embedding)\n", " return {\"embedding\": embedding}\n", "\n", "# Create prompt template for QA\n", "# Modified prompt template to include metadata\n", "prompt_template = \"\"\"\n", "I am a search engine bot. My only purpose is to locate and point to relevant information within the provided documents. I do not provide interpretations or answers - I only help you find where the information exists.\n", "\n", "For each search query, I will:\n", "1. Identify matching sentences/passages from the documents\n", "2. Show the exact quotes with their source locations\n", "3. Provide file names and page numbers where the information can be found\n", "\n", "Context:\n", "{% for doc in documents %}\n", "Content: {{ doc.content }}\n", "Source: {{ doc.meta.file_name }}, Page: {{ doc.meta.page_number }}\n", "---\n", "{% endfor %}\n", "\n", "Search Query: {{question}}\n", "\n", "Relevant Matches:\n", "Here are the exact matches found in the documents:\n", "\n", "\"[exact quote]\" \n", "Location: [filename, page X]\n", "\n", "\"[exact quote]\" \n", "Location: [filename, page X]\n", "\n", "If no relevant matches are found, I will respond:\n", "\"No matching information found in the provided documents.\"\n", "\n", "Note: I do not provide answers or interpretations. I only help locate where information exists within the documents.\n", "\"\"\"\n", "\n", "def run_pipeline(question, top_k=10):\n", " \"\"\"\n", " Run the pipeline and return both the answer and retrieved documents.\n", " \"\"\"\n", " # Initialize components\n", " embedder = FastembedTextEmbedder(\n", " model=\"BAAI/bge-base-en-v1.5\",\n", " prefix=\"Represent this sentence for searching relevant passages: \"\n", " )\n", " sparse_embedder = FastembedSparseTextEmbedder(\n", " model=\"prithvida/Splade_PP_en_v1\"\n", " )\n", " retriever = QdrantHybridRetriever(\n", " document_store=document_store,\n", " top_k=top_k\n", " # sparse_weight=0.5, # Add weights for hybrid search\n", " # dense_weight=0.5\n", " )\n", " prompt_builder = PromptBuilder(template=prompt_template)\n", " llm = OpenAIGenerator(\n", " api_key=Secret.from_token(\"XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL\"),\n", " api_base_url=\"https://api.deepinfra.com/v1/openai\",\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", " generation_kwargs={\n", " \"max_tokens\": 512,\n", " \"temperature\": 0.7,\n", " }\n", " )\n", "\n", " # Create pipeline\n", " pipeline = Pipeline()\n", " pipeline.add_component(\"embedder\", embedder)\n", " pipeline.add_component(\"sparse_embedder\", sparse_embedder)\n", " pipeline.add_component(\"retriever\", retriever)\n", " pipeline.add_component(\"prompt_builder\", prompt_builder)\n", " pipeline.add_component(\"llm\", llm)\n", "\n", " # Connect components\n", " pipeline.connect(\"sparse_embedder.sparse_embedding\", \"retriever.query_sparse_embedding\")\n", " pipeline.connect(\"embedder.embedding\", \"retriever.query_embedding\")\n", " pipeline.connect(\"retriever\", \"prompt_builder.documents\")\n", " pipeline.connect(\"prompt_builder\", \"llm\")\n", "\n", " # Run pipeline\n", " response = pipeline.run({\n", " \"sparse_embedder\": {\"text\": question},\n", " \"embedder\": {\"text\": question},\n", " \"retriever\": {\"top_k\": top_k},\n", " \"prompt_builder\": {\"question\": question}\n", " })\n", "\n", " # Get documents with metadata\n", " dense_embedding = embedder.run(text=question)[\"embedding\"]\n", " sparse_embedding = sparse_embedder.run(text=question)[\"sparse_embedding\"]\n", " \n", " retriever_response = retriever.run(\n", " query_embedding=dense_embedding,\n", " query_sparse_embedding=sparse_embedding,\n", " top_k=top_k\n", " )\n", " \n", " documents = retriever_response.get(\"documents\", [])\n", "\n", " # Extract answer\n", " answer = response[\"llm\"][\"replies\"][0]\n", "\n", " print(\"\\nRetrieved Documents with Metadata:\")\n", " print(\"=\" * 50)\n", " if documents:\n", " for i, doc in enumerate(documents, 1):\n", " print(f\"\\nDocument {i}:\")\n", " print(\"-\" * 30)\n", " print(f\"Content: {doc.content}\")\n", " if hasattr(doc, 'meta') and doc.meta:\n", " print(\"\\nMetadata:\")\n", " for key, value in doc.meta.items():\n", " print(f\"- {key}: {value}\")\n", "\n", " return answer, documents\n", "\n", "\n", "def display_results(question):\n", " \"\"\"\n", " Display the results in a formatted way\n", " \"\"\"\n", " print(f\"\\nQuestion: {question}\")\n", " print(f\"\\nDocument Store Status:\")\n", " print(f\"Total documents: {document_store.count_documents()}\")\n", " \n", " answer, documents = run_pipeline(question)\n", " \n", " print(f\"\\nAnswer: {answer}\")\n", " print(f\"\\nNumber of documents retrieved: {len(documents)}\")\n", "\n", "# Usage\n", "if __name__ == \"__main__\":\n", " question = \"\"\"\n", " where are the instructions ?\n", " \n", " \"\"\"\n", " display_results(question)\n", "\n", "\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## ocr" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Text from page 1: **Missouri Department of Health and Senior Services**\n", "\n", "P.O. Box 570, Jefferson City, MO 65102-0570 Phone: 573-751-5400 FAX: 573-751-6010 RELAY MISSOURI for Hearing and Speech Impaired 1-800-735-2966 VOICE 1-800-735-2466\n", "\n", "**Peter Lyskowski**\n", "\n", "Acting Director\n", "\n", "**Jeremiah W. (Jay) Nixon**\n", "\n", "Governor\n", "\n", "**Missouri Public Water Systems**\n", "\n", "November 10, 2015\n", "\n", "Dear Public Water System Owners/Operators:\n", "\n", "The Missouri State Public Health Laboratory (MSPHL) is in the process of implementing a new Laboratory Information Management System (LIMS) in its drinking water bacteriology testing laboratory. The OpenELIS (OE) LIMS will provide the laboratory with improved sample management capability, improved data integrity and reduced potential for human data entry error. In addition, the system will provide improved reporting capabilities, including direct electronic data exchange with the Missouri Department of Natural Resources' (MDNR) Safe Drinking Water Information System (SDWIS). SDWIS is the computer system MDNR uses to store regulatory water testing data and report testing results to you and the U.S. Environmental Protection Agency. In addition, the new OE LIMS will provide a web portal that MSPHL clients can use to access their own test results in real time.\n", "\n", "As the MSPHL implements this new computer system, several changes will be made in the way you collect and submit water samples for testing. This letter and information packet will provide you with information to help educate you on these changes.\n", "\n", "**NEW SAMPLE BOTTLES:**\n", "\n", "Beginning in August 2015, the MSPHL began using a larger sample bottle for water bacterial testing. This bottle has a shrink wrap seal and two lines to indicate the proper sample volume. Please read the attached \"SAMPLE COLLECTION INSTRUCTIONS\" for details on how to use these new bottles.\n", "\n", "Sample volume MUST be within the two lines on the bottle (100 - 120 mL) to be acceptable for testing. You may continue to use your old bottles until the MSPHL can ship you new ones. Once you have received the new bottles, please discard or recycle the old bottles.\n", "\n", "**NEW SAMPLE INFORMATION FORMS:**\n", "\n", "The traditional sample information \"card\" that has been used for more than twenty years is being replaced by the Environmental Sample Collection Form. An example form is attached. Please read the attached instructions for information on properly completing the new form.\n", "\n", "\n", "Text from page 2: Contract operators will be provided with forms for all the supplies they operate. Blank forms will be available for MDNR Regional Office staff use.\n", "\n", "The form requires all requested information to be printed by the collector. There are no longer check boxes for Sample Type or Repeat Location.\n", "\n", "Facility ID, Sample Collection Point ID and Location for the sampling site MUST be provided by the collector. This information is available from your MDNR approved PWS sampling plan. MDNR will be providing all public water systems with a current copy of their approved sampling plan. This information is required by SDWIS and is used by MDNR to ensure regulatory compliance requirements have been met. Failure to complete this information on the sample collection form may result in a non-compliance report from MDNR.\n", "\n", "A Collector Signature line has been added. The sample collector must sign the form to attest the information provided is accurate to the best of their knowledge.\n", "\n", "The MSPHL will begin shipping the new forms to public water systems in late November or early December. Please begin using the new forms December 16, 2015. Discard all the old forms (\"cards\") at that time.\n", "\n", "NEW SAMPLE INSTRUCTIONS:\n", "\n", "Sample instructions have been revised to include changes to the bottle and sampling form. The instructions include detailed information on how to collect the sample using the new bottle, how to complete the new sample collection form, how to best ship samples to the MSPHL using the free MSPHL courier system, and how to register for the new MSPHL web portal. A copy of these instructions is attached.\n", "\n", "NEW WEB PORTAL FOR RESULTS REPORTS\n", "\n", "The OE LIMS provides a web portal that may be used by systems to view and print their test result reports, check status of samples, download sample information into Excel, and receive automated emails when samples are received at the laboratory, and when sample results are ready to be viewed. For information on how to gain access to this portal, please contact Shondra Johnson, LIMS Administrator at Shondra.Johnson@health.mo.gov or at 573-751-3334.\n", "\n", "IMPLEMENTATION DATES:\n", "\n", "The MSPHL intends to implement the OpenELIS LIMS on December 1, 2015. There will be a two week testing period in which laboratory staff will run the new LIMS in conjunction with our current manual, paper-based system to ensure the OE LIMS is operating properly. You may continue to submit samples as you currently do, using the old sample information card, throughout this time.\n", "\n", "On December 16, 2015,\n", "Text from page 3: If you have any questions, please contact the MSPHL Environmental Bacteriology Unit at 573-751-3334. You may also contact your MDNR Regional Office for additional information on sample collection.\n", "\n", "Once again, thank you for your patience and understanding as we implement these changes.\n", "\n", "Patrick R. Shannon\n", "\n", "Manager, Environmental Bacteriology Unit\n", "\n", "Missouri Department of Health and Senior Services\n", "\n", "State Public Health Laboratory\n", "\n", "101 North Chestnut St.\n", "\n", "P.O. Box 570\n", "\n", "Jefferson City, MO 65102\n", "\n", "Phone: 573-751-3334\n", "\n", "Email: Pat.Shannon@health.mo.gov\n", "\n", "Web: www.health.mo.gov/Lab\n", "Text from page 4: **Order #**: 984\n", "\n", "**Pages in Order**: 1 of 1\n", "\n", "**Containers in Order**: 1\n", "\n", "**REPORT TO**: ADRIAN\n", "16 E 5TH ST\n", "ADRIAN, MO 64720\n", "\n", "**BILL TO**: MO DEPARTMENT OF NATURAL RESOURCES\n", "1101 RIVERSIDE DRIVE\n", "JEFFERSON CITY, MO 65102\n", "\n", "**Requested Analyses/Tests**\n", "\n", "**PUBLIC DRINKING WATER BACTERIAL ANALYSIS**\n", "\n", "Total Coliform Bacteria and E. coli (Present/Absent Test)\n", "\n", "**PRINT LEGIBLY. Instructions for completing form are supplied in the Collection Kit. For compliance monitoring questions, contact the Missouri Department of Natural Resources-Public Drinking Water Branch at (573) 751-5331 or your regional office. For laboratory test results or testing questions, contact the Missouri State Public Health Laboratory at (573) 751-3334.**\n", "\n", "**Complete or correct the following information**\n", "\n", "**Collected Date**: XXXX-XX-XX\n", "\n", "**PWS Id**: MO1010001\n", "\n", "**Sample Type**: routine, repeat, special, replacement, sources\n", "\n", "**Location**: address or name of sampling point\n", "\n", "**Collector Phone**: 000-111-2222\n", "\n", "**Repeat Location**: upstream, downstream, original source, other\n", "\n", "**Free Chlorine**: mg/L\n", "\n", "**Collector Signature**: I attest the information provided is accurate\n", "\n", "**Collected Time**: 24 hour format hits mm:\n", "\n", "**Facility Id**: DS\n", "\n", "**Sample Collection Point Id**: sampling point is from sample site plan\n", "\n", "**Collector**: last name, first name\n", "\n", "**Sample Category**: Bacterial\n", "\n", "**Bottle Number**: Bottle Number\n", "\n", "**Total Chlorine**: mg/L\n", "\n", "**County**: BATES\n", "\n", "**Received By**: Evidence of Tampering: Yes No\n", "\n", "**Date Printed**: 2015-11-06\n", "\n", "**Bottles Received**: Bottles Received\n", "\n", "**BUILD ID**: BUILD ID\n", "\n", "**For Laboratory Use Only -- Please do not write below this line**\n", "\n", "**Evidence of Cooling**: Yes No\n", "\n", "**Temperature ( Celsius ):** Temperature ( Celsius )\n", "\n", "**Thermometer ID**: Thermometer ID\n", "\n", "**PLACE THE ACCESSION LABEL WITHIN THIS BOX**\n", "Text from page 5: This sample kit and collection method is for public drinking water regulatory compliance and special samples. Only samples collected in bottles supplied by the Missouri State Public Health Laboratory (MSPHL) and collected in accordance with these instructions will be accepted for testing. PLEASE READ THESE INSTRUCTIONS COMPLETELY BEFORE COLLECTING SAMPLES.\n", "\n", "Sample Containers:\n", "\n", "Sample bottles from the MSPHL contain a chlorine neutralizer that is present in powder or liquid form. The bottles are sterile and ready for use when shipped. Do not rinse the contents from the container and keep the bottle closed until it is to be filled.\n", "\n", "Shrink Wrap Seal:\n", "\n", "Remove the seal by pulling down on the red strip and pealing shrink wrap from both the cap and bottle. Discard all shrink wrap. Do not attempt to reseal lid with shrink wrap still attached.\n", "\n", "Two Fill Lines:\n", "\n", "Fill the bottle until the water sample level is BETWEEN THE TWO LINES. Place the bottle on a level surface to check the sample level. Samples below the 100 mL (lower) line WILL NOT BE TESTED due to insufficient sample volume. Samples above the 120 mL (upper) line WILL NOT BE TESTED due to overfilled bottle. Technical protocol and EPA requirements dictate that bottles must have sufficient air space to add testing reagents and to mix the sample properly.\n", "\n", "If the bottle is overfilled past the 120 mL line, pour off water until the sample volume is between the two lines before shipping to MSPHL. MSPHL WILL NOT adjust sample volume once the sample is received at the lab.\n", "\n", "No Paper Label:\n", "\n", "There is no longer a label to record sample information on the bottle. DO NOT WRITE ON THE BOTTLE. Please complete a sample information form for each sample submitted for testing. DATE AND TIME OF SAMPLE COLLECTION and the BOTTLE NUMBER (from sticker on bottle) ARE REQUIRED. A form for each bottle is included in this sample kit.\n", "\n", "For More Information, please contact:\n", "\n", "Missouri Department of Health and Senior Services\n", "State Public Health Laboratory\n", "Environmental Bacteriology Unit\n", "101 North Chestnut St., P.O. Box 570\n", "Jefferson City, MO 65102\n", "Phone: 573-751-3334\n", "FAX: 573-522-4032\n", "Email: labweb1@health.mo.gov\n", "Website: www.health.mo.gov/Lab\n", "Text from page 6: **Bacteriological Sample Collection Procedures**\n", "\n", "**Assemble all of the sampling supplies. Before you begin, wash your hands thoroughly before handling supplies. Go to the sampling location(s) specified in your Missouri Department of Natural Resources (MDNR) approved sampling site plan. The sample should be taken from a clean, smooth-nosed cold water faucet if possible. Avoid drinking fountains, leaky faucets, hot/cold mixing faucets and frost-proof yard hydrants since it is not practical to sterilize these fixtures. If possible, remove any aerators, strainers or hoses that are present because they may harbor bacteria. Follow the procedures below when collecting the sample. Instructions for completing the environmental sampling form are on the following page.**\n", "\n", "**1. Open the cold water tap for about 3 minutes before collecting the sample. This should adequately flush the water line of any debris.**\n", "\n", "**2. Flame-sterilize the tap and/or chemically disinfect the tap. Do not flame-sterilize if tap is plastic or if aerators are attached. Disinfect tap by thoroughly rinsing both the inside and outside of the tap with a mixture of 50% house-hold bleach (NaOCl) and 50% tap water. Take extreme care with strong bleach (oxidizing) solutions.**\n", "\n", "**3. Flush the tap for an additional 3 minutes with cold water, and then reduce to a gentle flow to about the width of a pencil. Do not change the water flow once you have started sampling as this could dislodge contaminants in the tap.**\n", "\n", "**4. Remove the plastic shrink wrap seal by pulling down on the red strip and pealing the shrink wrap from both the cap and bottle. Discard the shrink wrap. Do not attempt to reseal the lid with shrink wrap still attached.**\n", "\n", "**5. Grasp cap along top edge and remove carefully. Do not touch the inside with your fingers. Hold the bottle in one hand and the cap in the other. Do not lay the cap down or put it in a pocket. Also, take care not to contaminate the sterile bottle or cap with your fingers or permit the faucet to touch the inside of the bottle.**\n", "\n", "**6. Hold the bottle so that water entering the bottle will not come in contact with your hands or the outside of the bottle.**\n", "\n", "**7. Fill the bottle until the water sample level is BETWEEN THE TWO LINES on the bottle (100 - 120 ml). Preferably, the sample level should\n", "Text from page 7: Print Legibly using water proof ink. A standard ink pen is sufficient. Complete ALL sample information lines on the form. Some sections of the form may already be completed by the laboratory computer system when the forms are printed. To make corrections, please draw a single line through the inaccurate information and print the corrected information behind it. The sections of the form and directions for completing each line are as follows:\n", "\n", "Order #: For Missouri State Public Health Lab (MSPHL) purposes only. Pages in Order and Containers in Order indicate number of forms and sample bottles shipped in the sample kit order.\n", "\n", "REPORT TO: Public water system's name and shipping address on file with Missouri Department of Natural Resources (MDNR). Please review and correct if necessary. Result reports will be mailed to this address.\n", "\n", "BILL TO: Section defaulted to the MDNR. There are no charges for public water testing at the MSPHL.\n", "\n", "Requested Analysis/Tests: This section will state PUBLIC DRINKING WATER BACTERIAL ANALYSIS. If it does not, you may have the wrong collection form. Please contact the MSPHL or MDNR for the proper form. Do not use forms from a local county health agency as those forms are for private well water samples. Your MDNR Regional Office can provide blank forms for your use.\n", "\n", "Complete or correct the following information:\n", "\n", "All lines are considered required information. Failure to complete a line may result in an invalid sample.\n", "\n", "Collected Date: Enter the date of sample collection in the format YYYY-MM-DD. Use 4 digits for year and 2 digits for month and date. November 1, 2015 would be written as 2015-11-01.\n", "\n", "Collected Time: Enter the time of sample collection using 24-hour military format hhmm.\n", "\n", "PWS ID: If blank, enter your 7-digit Public Water System ID number as assigned by MDNR (MO#########).\n", "\n", "Facility ID: Defaulted to DS (Distribution System) for routine samples. If submitting a sample type other than Routine, enter the Facility ID number from your system's MDNR approved sample site plan (for example DS#, WL#, WTF#).\n", "\n", "Sample Type: Enter one of the following options:\n", "\n", "Routine - Regular monthly monitoring samples.\n", "\n", "Repeat – A series of 3 or 4 repeat samples (4 if you only take 1 routine per month) must be taken for each routine sample that tests positive (Present) for coliform bacteria. All repeats must be taken on the same day, within 24 hours of being notified of the coliform positive\n", "Text from page 8: **Shipping Instructions**\n", "\n", "Per U.S. Environmental Protection Agency requirements, public water samples must be received by the laboratory and tested within 30 hours of the date and time of collection. The MSPHL and MDNR recommend you use the free Department of Health and Senior Services (DHSS) contact courier for overnight delivery to the MSPHL. This courier picks up at most local public health agency offices and hospitals (Note: Not all hospitals will accept water samples for courier pick up). For sample drop off locations and times, please go to http://www.health.mo.gov/lab/courierservices.php and click on the interactive map or the listing of drop off locations by county; or you may call the MSPHL courier liaison at (573) 751-4830, or the MDNR Public Drinking Water Branch (PDWB) at (573) 526-1124.\n", "\n", "Please note the courier is allowed to pick up samples within one hour of the scheduled time (before or after). The earliest pick up time is at 10:30 a.m. To ensure your samples meet the transit time requirement of 30 hours, it is important that you collect your samples in the morning and have them dropped off at the courier pickup point one hour prior to the scheduled time.\n", "\n", "Use of the U.S. Postal Service or other commercial carriers such as FedEx or UPS will require additional charges and may not meet the 30 hour transit time requirement.\n", "\n", "Samples should not be en route to the laboratory over a weekend or state holiday (New Year's Day, Martin Luther King Day, Lincoln's Birthday, Washington's Birthday, Truman's Birthday, Memorial Day, Independence Day, Labor Day, Columbus Day, Veteran's Day, Thanksgiving Day, and Christmas.)\n", "\n", "Public water supplies may use the new MSPHL Test Results Web Portal to retrieve preliminary test results on-line. For information on how to register as a user for the web portal and to receive email notifications, please contact the MSPHL LIMS Administrator at shondra.johnson@health.mo.gov or call 573-751-3334. These preliminary test results are for informational purposes only. Official test results are available on-line within 2 or 3 business days at the MDNR Drinking Water Watch website http://dnr.mo.gov/DWW/. In addition, the official bacteriological sample reports will be mailed by MDNR within 4 or 5 business days.\n", "\n", "Additional sample bottles can be ordered on-line at http://www.health.mo.gov/lab/specimentestforms.php or by calling the MSPHL Central Services Unit\n" ] } ], "source": [ "from PyPDF2 import PdfReader\n", "from pdf2image import convert_from_path\n", "import base64\n", "from openai import OpenAI\n", "\n", "# Create an OpenAI client with your deepinfra token and endpoint\n", "openai = OpenAI(\n", " api_key=\"XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL\",\n", " base_url=\"https://api.deepinfra.com/v1/openai\",\n", ")\n", "\n", "# Function to get the total number of pages in a PDF file\n", "def get_pdf_page_count(pdf_path):\n", " \"\"\"Get the total number of pages in a PDF file.\"\"\"\n", " with open(pdf_path, \"rb\") as f:\n", " pdf_reader = PdfReader(f)\n", " return len(pdf_reader.pages)\n", "\n", "# Function to convert PDF pages to images, limiting to actual page count\n", "def convert_pdf_to_images(pdf_path, dpi=100):\n", " \"\"\"Convert PDF pages to images, up to the actual number of pages.\"\"\"\n", " total_pages = get_pdf_page_count(pdf_path)\n", " return convert_from_path(pdf_path, dpi=dpi, last_page=total_pages)\n", "\n", "def encode_image_to_base64(image):\n", " from io import BytesIO\n", " buffered = BytesIO()\n", " image.save(buffered, format=\"JPEG\")\n", " return base64.b64encode(buffered.getvalue()).decode('utf-8')\n", "\n", "# Path to your scanned PDF\n", "pdf_path = \"/root/PublicWaterMassMailing.pdf\"\n", "\n", "# Convert scanned PDF pages into images (limiting to actual page count)\n", "images = convert_pdf_to_images(pdf_path)\n", "\n", "# Process each page image\n", "for i, image in enumerate(images):\n", " # Save each image temporarily (optional: you can skip saving and directly encode)\n", " image_path = f\"page_{i+1}.jpg\"\n", " image.save(image_path, 'JPEG')\n", " \n", " # Encode image to base64\n", " base64_image = encode_image_to_base64(image)\n", " \n", " # Send the base64-encoded image to DeepInfra's Vision model for OCR\n", " chat_completion = openai.chat.completions.create(\n", " model=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n", " messages=[\n", " {\n", " \"role\": \"user\",\n", " \"content\": [\n", " {\n", " \"type\": \"image_url\",\n", " \"image_url\": {\n", " \"url\": f\"data:image/jpeg;base64,{base64_image}\"\n", " }\n", " },\n", " {\n", " \"type\": \"text\",\n", " \"text\": \"only response with all the extracted text from the document and nothing else\"\n", " }\n", " ]\n", " }\n", " ]\n", " )\n", " \n", " # Print or process OCR result for each page\n", " print(f\"Text from page {i+1}:\", chat_completion.choices[0].message.content)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "8\n" ] } ], "source": [ "from PyPDF2 import PdfReader\n", "\n", "pdf_path = \"/root/PublicWaterMassMailing.pdf\"\n", "\n", "def get_pdf_page_count(pdf_path):\n", " \"\"\"Get the total number of pages in a PDF file.\"\"\"\n", " with open(pdf_path, \"rb\") as f:\n", " pdf_reader = PdfReader(f)\n", " return len(pdf_reader.pages)\n", " \n", "print(get_pdf_page_count(pdf_path))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Displaying page 1\n", "Displaying page 2\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Error: no \"view\" mailcap rules found for type \"image/png\"\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Displaying page 3\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Error: no \"view\" mailcap rules found for type \"image/png\"\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Displaying page 4\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Error: no \"view\" mailcap rules found for type \"image/png\"\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Displaying page 5\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Error: no \"view\" mailcap rules found for type \"image/png\"\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Displaying page 6\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Error: no \"view\" mailcap rules found for type \"image/png\"\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Displaying page 7\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Error: no \"view\" mailcap rules found for type \"image/png\"\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Displaying page 8\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Error: no \"view\" mailcap rules found for type \"image/png\"\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Error: no \"view\" mailcap rules found for type \"image/png\"\n" ] } ], "source": [ "from pdf2image import convert_from_path\n", "from PIL import Image\n", "\n", "# Convert PDF pages to images\n", "def convert_pdf_to_images(pdf_path, dpi=300):\n", " \"\"\"Convert PDF pages to images.\"\"\"\n", " return convert_from_path(pdf_path, dpi=dpi)\n", "\n", "# Display each page as an image\n", "def display_pdf_pages_as_images(pdf_path):\n", " \"\"\"Display each page of the PDF as an image.\"\"\"\n", " images = convert_pdf_to_images(pdf_path)\n", " \n", " for i, image in enumerate(images):\n", " # Display the image using PIL's show method\n", " print(f\"Displaying page {i+1}\")\n", " image.show()\n", "\n", "# Path to your scanned PDF\n", "pdf_path = \"/root/PublicWaterMassMailing.pdf\"\n", "\n", "# Display all pages\n", "display_pdf_pages_as_images(pdf_path)\n" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "ename": "PipelineConnectError", "evalue": "Cannot connect 'pdf_to_text_or_ocr' with 'cleaner': no matching connections available.\n'pdf_to_text_or_ocr':\n\n'cleaner':\n - documents: List[Document] (available)", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mPipelineConnectError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[19], line 193\u001b[0m\n\u001b[1;32m 190\u001b[0m indexing_pipeline\u001b[38;5;241m.\u001b[39madd_component(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwriter\u001b[39m\u001b[38;5;124m\"\u001b[39m, writer)\n\u001b[1;32m 192\u001b[0m \u001b[38;5;66;03m# Connect components in the pipeline flow \u001b[39;00m\n\u001b[0;32m--> 193\u001b[0m \u001b[43mindexing_pipeline\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconnect\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mpdf_to_text_or_ocr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcleaner\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 194\u001b[0m indexing_pipeline\u001b[38;5;241m.\u001b[39mconnect(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcleaner\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msplitter\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 195\u001b[0m indexing_pipeline\u001b[38;5;241m.\u001b[39mconnect(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msplitter\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msparse_embedder\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", "File \u001b[0;32m/opt/conda/envs/py38/lib/python3.10/site-packages/haystack/core/pipeline/base.py:526\u001b[0m, in \u001b[0;36mPipelineBase.connect\u001b[0;34m(self, sender, receiver)\u001b[0m\n\u001b[1;32m 521\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 522\u001b[0m msg \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 523\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCannot connect \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msender_component_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m with \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mreceiver_component_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m: \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 524\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mno matching connections available.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00mstatus\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 525\u001b[0m )\n\u001b[0;32m--> 526\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m PipelineConnectError(msg)\n\u001b[1;32m 528\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(possible_connections) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 529\u001b[0m \u001b[38;5;66;03m# There's only one possible connection, use it\u001b[39;00m\n\u001b[1;32m 530\u001b[0m sender_socket \u001b[38;5;241m=\u001b[39m possible_connections[\u001b[38;5;241m0\u001b[39m][\u001b[38;5;241m0\u001b[39m]\n", "\u001b[0;31mPipelineConnectError\u001b[0m: Cannot connect 'pdf_to_text_or_ocr' with 'cleaner': no matching connections available.\n'pdf_to_text_or_ocr':\n\n'cleaner':\n - documents: List[Document] (available)" ] } ], "source": [ "from haystack import Pipeline, component\n", "from haystack.components.converters import PyPDFToDocument\n", "from haystack.components.preprocessors import DocumentSplitter\n", "from haystack_integrations.document_stores.qdrant import QdrantDocumentStore\n", "from haystack.components.writers import DocumentWriter\n", "from haystack.components.preprocessors import DocumentCleaner\n", "from haystack.document_stores.types import DuplicatePolicy\n", "from haystack_integrations.components.embedders.fastembed import (\n", " FastembedSparseDocumentEmbedder,\n", " # FastembedDocumentEmbedder\n", ")\n", "from haystack.dataclasses import Document\n", "from typing import List\n", "import os\n", "from pathlib import Path\n", "import numpy as np\n", "from openai import OpenAI\n", "import base64\n", "from pdf2image import convert_from_path\n", "from PyPDF2 import PdfReader\n", "from io import BytesIO\n", "\n", "# Initialize OpenAI client for DeepInfra embeddings and OCR\n", "openai = OpenAI(\n", " api_key=\"XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL\",\n", " base_url=\"https://api.deepinfra.com/v1/openai\",\n", ")\n", "\n", "# Function to check if PDF is scanned (no extractable text)\n", "def is_scanned_pdf(pdf_path):\n", " try:\n", " with open(pdf_path, \"rb\") as f:\n", " pdf_reader = PdfReader(f)\n", " for page in pdf_reader.pages:\n", " if page.extract_text():\n", " return False # Text found, not a scanned document\n", " return True # No extractable text found, likely a scanned document\n", " except Exception as e:\n", " raise ValueError(f\"Error reading PDF file: {e}\")\n", "\n", "# Function to convert PDF pages to images and perform OCR using DeepInfra Vision model\n", "def perform_ocr_on_pdf(pdf_path):\n", " try:\n", " images = convert_from_path(pdf_path)\n", " ocr_text = \"\"\n", " ocr_metadata = []\n", "\n", " for i, image in enumerate(images):\n", " # Convert image to base64 for OCR processing\n", " buffered = BytesIO()\n", " image.save(buffered, format=\"JPEG\")\n", " base64_image = base64.b64encode(buffered.getvalue()).decode('utf-8')\n", " \n", " # Perform OCR using DeepInfra Vision model\n", " chat_completion = openai.chat.completions.create(\n", " model=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n", " messages=[\n", " {\n", " \"role\": \"user\",\n", " \"content\": [\n", " {\n", " \"type\": \"image_url\",\n", " \"image_url\": {\n", " \"url\": f\"data:image/jpeg;base64,{base64_image}\"\n", " }\n", " },\n", " {\n", " \"type\": \"text\",\n", " \"text\": \"only respond with all the extracted text from the document and nothing else\"\n", " }\n", " ]\n", " }\n", " ]\n", " )\n", " \n", " # Append OCR result for each page along with page number metadata\n", " ocr_text += chat_completion.choices[0].message.content + \"\\n\"\n", " ocr_metadata.append({\n", " \"page_number\": i + 1,\n", " \"file_name\": Path(pdf_path).name,\n", " \"file_path\": str(pdf_path)\n", " })\n", " \n", " return ocr_text, ocr_metadata\n", " \n", " except Exception as e:\n", " raise ValueError(f\"Error during OCR processing: {e}\")\n", "\n", "@component\n", "class PdfToTextOrOCR:\n", " def run(self, documents: List[Path]) -> List[Document]:\n", " processed_documents = []\n", " \n", " for pdf_path in documents:\n", " if is_scanned_pdf(pdf_path):\n", " print(f\"Performing OCR on scanned PDF: {pdf_path}\")\n", " extracted_text, ocr_metadata = perform_ocr_on_pdf(pdf_path)\n", " \n", " # Create Haystack Document object with extracted content and metadata per page.\n", " for meta in ocr_metadata:\n", " doc = Document(content=extracted_text, meta=meta)\n", " processed_documents.append(doc)\n", " else:\n", " print(f\"Extracting text from non-scanned PDF: {pdf_path}\")\n", " with open(pdf_path, \"rb\") as f:\n", " reader = PdfReader(f)\n", " extracted_text = \"\\n\".join([page.extract_text() for page in reader.pages])\n", " \n", " # Create Haystack Document object with extracted content and metadata.\n", " doc = Document(content=extracted_text, meta={\n", " \"file_name\": Path(pdf_path).name,\n", " \"file_path\": str(pdf_path),\n", " \"source_type\": \"pdf\"\n", " })\n", " processed_documents.append(doc)\n", " \n", " return processed_documents # Return list of Document objects directly\n", " \n", "@component\n", "class DeepInfraEmbeddings:\n", " def __init__(\n", " self,\n", " api_key: str = \"XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL\",\n", " model_name: str = \"BAAI/bge-base-en-v1.5\",\n", " base_url: str = \"https://api.deepinfra.com/v1/openai\"\n", " ):\n", " self.client = OpenAI(\n", " api_key=api_key,\n", " base_url=base_url\n", " )\n", " self.model_name = model_name\n", "\n", " @component.output_types(documents=List[Document])\n", " def run(self, documents: List[Document]) -> dict:\n", " texts = [doc.content for doc in documents]\n", " \n", " response = self.client.embeddings.create(\n", " model=self.model_name,\n", " input=texts,\n", " encoding_format=\"float\"\n", " )\n", " \n", " embeddings = [np.array(embedding.embedding) for embedding in response.data]\n", " \n", " for doc, embedding in zip(documents, embeddings):\n", " doc.embedding = embedding\n", " \n", " return {\"documents\": documents}\n", "\n", "# Initialize Qdrant document store as before\n", "document_store = QdrantDocumentStore(\n", " host=\"0.0.0.0\",\n", " port=6333,\n", " index=\"aaa_test\",\n", " recreate_index=True,\n", " use_sparse_embeddings=True,\n", " sparse_idf=True,\n", " embedding_dim=768 # Adjust based on your DeepInfra model's dimension\n", ")\n", "\n", "# Initialize other components as before\n", "cleaner = DocumentCleaner(\n", " ascii_only=True,\n", " remove_empty_lines=True,\n", " remove_extra_whitespaces=True,\n", ")\n", "\n", "document_splitter = DocumentSplitter(\n", " split_by=\"word\",\n", " split_length=300,\n", " split_overlap=30,\n", ")\n", "\n", "# Initialize embedders (DeepInfra and Fastembed)\n", "deep_infra_embedder = DeepInfraEmbeddings()\n", "sparse_embedder = FastembedSparseDocumentEmbedder(model=\"prithvida/Splade_PP_en_v1\")\n", "\n", "writer = DocumentWriter(\n", " document_store=document_store,\n", " policy=DuplicatePolicy.OVERWRITE,\n", ")\n", "\n", "# Create and configure the pipeline with new PdfToTextOrOCR component\n", "indexing_pipeline = Pipeline()\n", "indexing_pipeline.add_component(\"pdf_to_text_or_ocr\", PdfToTextOrOCR())\n", "indexing_pipeline.add_component(\"cleaner\", cleaner)\n", "indexing_pipeline.add_component(\"splitter\", document_splitter)\n", "indexing_pipeline.add_component(\"deep_infra_embedder\", deep_infra_embedder)\n", "indexing_pipeline.add_component(\"sparse_embedder\", sparse_embedder)\n", "indexing_pipeline.add_component(\"writer\", writer)\n", "\n", "# Connect components in the pipeline flow \n", "indexing_pipeline.connect(\"pdf_to_text_or_ocr\", \"cleaner\")\n", "indexing_pipeline.connect(\"cleaner\", \"splitter\")\n", "indexing_pipeline.connect(\"splitter\", \"sparse_embedder\")\n", "indexing_pipeline.connect(\"sparse_embedder\", \"deep_infra_embedder\")\n", "indexing_pipeline.connect(\"deep_infra_embedder\", \"writer\")\n", "\n", "# Get list of PDF files and prepare metadata as before\n", "pdf_folder = \"/root/hbr\"\n", "pdf_files = [Path(os.path.join(pdf_folder, filename)) for filename in os.listdir(pdf_folder) if filename.endswith('.pdf')]\n", "\n", "# Run the pipeline on all PDFs in folder (with automatic OCR handling)\n", "indexing_pipeline.run({\"pdf_to_text_or_ocr\": pdf_files})\n", "\n", "print(f\"Indexed documents in Qdrant. Total documents: {document_store.count_documents()}\")\n", "\n" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Performing OCR on scanned PDF: /root/hbr/PublicWaterMassMailing-1_removed.pdf\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Extracting text from non-scanned PDF: /root/hbr/HBR_Synthetic_Data.pdf\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Calculating sparse embeddings: 100%|██████████| 18/18 [00:08<00:00, 2.02it/s]\n", "100it [00:00, 577.69it/s] " ] }, { "name": "stdout", "output_type": "stream", "text": [ "Indexed documents in Qdrant. Total documents: 18\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "from haystack import Pipeline, component\n", "from haystack.components.converters import PyPDFToDocument\n", "from haystack.components.preprocessors import DocumentSplitter\n", "from haystack_integrations.document_stores.qdrant import QdrantDocumentStore\n", "from haystack.components.writers import DocumentWriter\n", "from haystack.components.preprocessors import DocumentCleaner\n", "from haystack.document_stores.types import DuplicatePolicy\n", "from haystack_integrations.components.embedders.fastembed import (\n", " FastembedSparseDocumentEmbedder,\n", " FastembedDocumentEmbedder\n", ")\n", "from haystack.dataclasses import Document\n", "from typing import List\n", "import os\n", "from pathlib import Path\n", "import numpy as np\n", "from openai import OpenAI\n", "import base64\n", "from pdf2image import convert_from_path\n", "from PyPDF2 import PdfReader\n", "from io import BytesIO\n", "\n", "# Initialize OpenAI client for DeepInfra embeddings and OCR\n", "openai = OpenAI(\n", " api_key=\"XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL\",\n", " base_url=\"https://api.deepinfra.com/v1/openai\",\n", ")\n", "\n", "# Function to check if PDF is scanned (no extractable text)\n", "def is_scanned_pdf(pdf_path):\n", " try:\n", " with open(pdf_path, \"rb\") as f:\n", " pdf_reader = PdfReader(f)\n", " for page in pdf_reader.pages:\n", " if page.extract_text():\n", " return False # Text found, not a scanned document\n", " return True # No extractable text found, likely a scanned document\n", " except Exception as e:\n", " raise ValueError(f\"Error reading PDF file: {e}\")\n", "\n", "# Function to convert PDF pages to images and perform OCR using DeepInfra Vision model\n", "def perform_ocr_on_pdf(pdf_path):\n", " try:\n", " images = convert_from_path(pdf_path)\n", " ocr_text = \"\"\n", " ocr_metadata = []\n", "\n", " for i, image in enumerate(images):\n", " # Convert image to base64 for OCR processing\n", " buffered = BytesIO()\n", " image.save(buffered, format=\"JPEG\")\n", " base64_image = base64.b64encode(buffered.getvalue()).decode('utf-8')\n", " \n", " # Perform OCR using DeepInfra Vision model\n", " chat_completion = openai.chat.completions.create(\n", " model=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n", " messages=[\n", " {\n", " \"role\": \"user\",\n", " \"content\": [\n", " {\n", " \"type\": \"image_url\",\n", " \"image_url\": {\n", " \"url\": f\"data:image/jpeg;base64,{base64_image}\"\n", " }\n", " },\n", " {\n", " \"type\": \"text\",\n", " \"text\": \"only respond with all the extracted text from the document and nothing else\"\n", " }\n", " ]\n", " }\n", " ]\n", " )\n", " \n", " # Append OCR result for each page along with page number metadata\n", " ocr_text += chat_completion.choices[0].message.content + \"\\n\"\n", " ocr_metadata.append({\n", " \"page_number\": i + 1,\n", " \"file_name\": Path(pdf_path).name,\n", " \"file_path\": str(pdf_path),\n", " \"content\": ocr_text\n", " })\n", " \n", " return ocr_metadata\n", " \n", " except Exception as e:\n", " raise ValueError(f\"Error during OCR processing: {e}\")\n", "\n", "@component\n", "class PdfToTextOrOCR:\n", " @component.output_types(documents=List[Document])\n", " def run(self, documents: List[Path]) -> List[Document]:\n", " processed_documents = []\n", " \n", " for pdf_path in documents:\n", " if is_scanned_pdf(pdf_path):\n", " print(f\"Performing OCR on scanned PDF: {pdf_path}\")\n", " ocr_metadata = perform_ocr_on_pdf(pdf_path)\n", " \n", " # Create Haystack Document object with extracted content and metadata per page.\n", " for meta in ocr_metadata:\n", " meta_without_content = {key: value for key, value in meta.items() if key != 'content'}\n", " doc = Document(content=meta['content'], meta=meta_without_content)\n", " processed_documents.append(doc)\n", " else:\n", " print(f\"Extracting text from non-scanned PDF: {pdf_path}\")\n", " with open(pdf_path, \"rb\") as f:\n", " reader = PdfReader(f)\n", " extracted_text = \"\\n\".join([page.extract_text() for page in reader.pages])\n", " \n", " # Create Haystack Document object with extracted content and metadata.\n", " doc = Document(content=extracted_text, meta={\n", " \"file_name\": Path(pdf_path).name,\n", " \"file_path\": str(pdf_path),\n", " \"source_type\": \"pdf\"\n", " })\n", " processed_documents.append(doc)\n", " \n", " return {'documents':processed_documents} # Return list of Document objects directly\n", "\n", "@component\n", "class DeepInfraEmbeddings:\n", " def __init__(\n", " self,\n", " api_key: str = \"XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL\",\n", " model_name: str = \"BAAI/bge-base-en-v1.5\",\n", " base_url: str = \"https://api.deepinfra.com/v1/openai\"\n", " ):\n", " self.client = OpenAI(\n", " api_key=api_key,\n", " base_url=base_url\n", " )\n", " self.model_name = model_name\n", "\n", " @component.output_types(documents=List[Document])\n", " def run(self, documents: List[Document]) -> dict:\n", " texts = [doc.content for doc in documents]\n", " \n", " response = self.client.embeddings.create(\n", " model=self.model_name,\n", " input=texts,\n", " encoding_format=\"float\"\n", " )\n", " \n", " embeddings = [np.array(embedding.embedding) for embedding in response.data]\n", " \n", " for doc, embedding in zip(documents, embeddings):\n", " doc.embedding = embedding\n", " \n", " return {\"documents\": documents}\n", "\n", " def to_dict(self):\n", " return {\n", " \"api_key\": self.client.api_key,\n", " \"model_name\": self.model_name,\n", " \"base_url\": self.client.base_url\n", " }\n", "\n", " @classmethod\n", " def from_dict(cls, data):\n", " return cls(\n", " api_key=data[\"api_key\"],\n", " model_name=data[\"model_name\"],\n", " base_url=data[\"base_url\"]\n", " )\n", "\n", "# Initialize Qdrant document store as before\n", "document_store = QdrantDocumentStore(\n", " host=\"0.0.0.0\",\n", " port=6333,\n", " index=\"_test\",\n", " recreate_index=False,\n", " use_sparse_embeddings=True,\n", " sparse_idf=True,\n", " embedding_dim=768 # Adjust based on your DeepInfra model's dimension\n", ")\n", "\n", "# Initialize other components as before\n", "cleaner = DocumentCleaner(\n", " ascii_only=True,\n", " remove_empty_lines=True,\n", " remove_extra_whitespaces=True,\n", ")\n", "\n", "document_splitter = DocumentSplitter(\n", " split_by=\"word\",\n", " split_length=300,\n", " split_overlap=30,\n", ")\n", "\n", "# Initialize embedders (DeepInfra and Fastembed)\n", "deep_infra_embedder = DeepInfraEmbeddings()\n", "sparse_embedder = FastembedSparseDocumentEmbedder(model=\"prithvida/Splade_PP_en_v1\")\n", "\n", "writer = DocumentWriter(\n", " document_store=document_store,\n", " policy=DuplicatePolicy.OVERWRITE,\n", ")\n", "\n", "# Create and configure the pipeline with new PdfToTextOrOCR component\n", "indexing_pipeline = Pipeline()\n", "indexing_pipeline.add_component(\"pdf_to_text_or_ocr\", PdfToTextOrOCR())\n", "indexing_pipeline.add_component(\"cleaner\", cleaner)\n", "indexing_pipeline.add_component(\"splitter\", document_splitter)\n", "indexing_pipeline.add_component(\"deep_infra_embedder\", deep_infra_embedder)\n", "indexing_pipeline.add_component(\"sparse_embedder\", sparse_embedder)\n", "indexing_pipeline.add_component(\"writer\", writer)\n", "\n", "# Connect components in the pipeline flow \n", "indexing_pipeline.connect(\"pdf_to_text_or_ocr\", \"cleaner\")\n", "indexing_pipeline.connect(\"cleaner\", \"splitter\")\n", "indexing_pipeline.connect(\"splitter\", \"sparse_embedder\")\n", "indexing_pipeline.connect(\"sparse_embedder\", \"deep_infra_embedder\")\n", "indexing_pipeline.connect(\"deep_infra_embedder\", \"writer\")\n", "\n", "# Get list of PDF files and prepare metadata as before\n", "pdf_folder = \"/root/hbr\"\n", "pdf_files = [Path(os.path.join(pdf_folder, filename)) for filename in os.listdir(pdf_folder) if filename.endswith('.pdf')]\n", "\n", "# Run the pipeline on all PDFs in folder (with automatic OCR handling)\n", "indexing_pipeline.run({\"documents\": pdf_files})\n", "\n", "print(f\"Indexed documents in Qdrant. Total documents: {document_store.count_documents()}\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Performing OCR on scanned PDF: /root/hbr/PublicWaterMassMailing.pdf\n", "Document Content: Missouri Department of Health and Senior Services\n", "P.O. Box 570, Jefferson City, MO 65102-0570\n", "Phone: 573-751-6400\n", "FAX: 573-751-6010\n", "RELAY MISSOURI for Hearing and Speech Impaired 1-800-735-2966 VOICE 1-800-735-2466\n", "\n", "Peter Lyskowski\n", "Acting Director\n", "\n", "Jeremiah W. (Jay) Nixon\n", "Governor\n", "\n", "Missouri Public Water Systems\n", "\n", "November 10, 2015\n", "\n", "Dear Public Water System Owners/Operators:\n", "\n", "The Missouri State Public Health Laboratory (MSPHL) is in the process of implementing a new Laboratory Information Management System (LIMS) in its drinking water bacteriology testing laboratory. The OpenELIS (OE) LIMS will provide the laboratory with improved sample management capability, improved data integrity and reduced potential for human data entry error. In addition, the system will provide improved reporting capabilities, including direct electronic data exchange with the Missouri Department of Natural Resources' (MDNR) Safe Drinking Water Information System (SDWIS). SDWIS is the computer system MDNR uses to store regulatory water testing data and report testing results to you and the U.S. Environmental Protection Agency. In addition, the new OE LIMS will provide a web portal that MSPHL clients can use to access their own test results in real time.\n", "\n", "As the MSPHL implements this new computer system, several changes will be made in the way you collect and submit water samples for testing. This letter and information packet will provide you with information to help educate you on these changes.\n", "\n", "NEW SAMPLE BOTTLES:\n", "\n", "Beginning in August 2015, the MSPHL began using a larger sample bottle for water bacterial testing. This bottle has a shrink wrap seal and two lines to indicate the proper sample volume. Please read the attached \"SAMPLE COLLECTION INSTRUCTIONS\" for details on how to use these new bottles. Sample volume MUST be within the two lines on the bottle (100 - 120 mL) to be acceptable for testing. You may continue to use your old bottles until the MSPHL can ship you new ones. Once you have received the new bottles, please discard or recycle the old bottles.\n", "\n", "NEW SAMPLE INFORMATION FORMS:\n", "\n", "The traditional sample information \"card\" that has been used for more than twenty years is being replaced by the Environmental Sample Collection Form. An example form is attached. Please read the attached instructions for information on properly completing the new form.\n", "\n", "Changes to the form include\n", "Contract operators will be provided with forms for all the supplies they operate. Blank forms will be available for MDNR Regional Office staff use.\n", "\n", "The form requires all requested information to be printed by the collector. There are no longer check boxes for Sample Type or Repeat Location.\n", "\n", "Facility ID, Sample Collection Point ID and Location for the sampling site MUST be provided by the collector. This information is available from your MDNR approved PWS sampling plan. MDNR will be providing all public water systems with a current copy of their approved sampling plan. This information is required by SDWIS and is used by MDNR to ensure regulatory compliance requirements have been met. Failure to complete this information on the sample collection form may result in a non-compliance report from MDNR.\n", "\n", "A Collector Signature line has been added. The sample collector must sign the form to attest the information provided is accurate to the best of their knowledge.\n", "\n", "The MSPHL will begin shipping the new forms to public water systems in late November or early December. Please begin using the new forms December 16, 2015. Discard all the old forms (\"cards\") at that time.\n", "\n", "NEW SAMPLE INSTRUCTIONS:\n", "\n", "Sample instructions have been revised to include changes to the bottle and sampling form. The instructions include detailed information on how to collect the sample using the new bottle, how to complete the new sample collection form, how to best ship samples to the MSPHL using the free MSPHL courier system, and how to register for the new MSPHL web portal. A copy of these instructions is attached.\n", "\n", "NEW WEB PORTAL FOR RESULTS REPORTS\n", "\n", "The OE LIMS provides a web portal that may be used by systems to view and print their test result reports, check status of samples, download sample information into Excel, and receive automated emails when samples are received at the laboratory, and when sample results are ready to be viewed. For information on how to gain access to this portal, please contact Shondra Johnson, LIMS Administrator at Shondra.Johnson@health.mo.gov or at 573-751-3334.\n", "\n", "IMPLEMENTATION DATES:\n", "\n", "The MSPHL intends to implement the OpenELIS LIMS on December 1, 2015. There will be a two week testing period in which laboratory staff will run the new LIMS in conjunction with our current manual, paper-based system to ensure the OE LIMS is operating properly. You may continue to submit samples as you currently do, using the old sample information card, throughout this time.\n", "\n", "On December 16, 2015,\n", "If you have any questions, please contact the MSPHL Environmental Bacteriology Unit at 573-751-3334. You may also contact your MDNR Regional Office for additional information on sample collection.\n", "\n", "Once again, thank you for your patience and understanding as we implement these changes.\n", "\n", "Patrick R. Shannon\n", "\n", "Manager, Environmental Bacteriology Unit\n", "\n", "Missouri Department of Health and Senior Services\n", "\n", "State Public Health Laboratory\n", "\n", "101 North Chestnut St.\n", "\n", "P.O. Box 570\n", "\n", "Jefferson City, MO 65102\n", "\n", "Phone: 573-751-3334\n", "\n", "Email: Pat.Shannon@health.mo.gov\n", "\n", "Web: www.health.mo.gov/Lab\n", "**Order**: 984\n", "**Pages in Order**: 1 of 1\n", "**Containers in Order**: 1\n", "**REPORT TO**: ADRIAN 16 E 5TH ST ADRIAN, MO 64720\n", "**BILL TO**: MO DEPARTMENT OF NATURAL RESOURCES 1101 RIVERSIDE DRIVE JEFFERSON CITY, MO 65102\n", "**PUBLIC DRINKING WATER BACTERIAL ANALYSIS**\n", "**Total Coliform Bacteria and E. coli (Present/Absent Test)**\n", "**PRINT LEGIBLY**\n", "Instructions for completing form are supplied in the Collection Kit. For compliance monitoring questions, contact the Missouri Department of Natural Resources-Public Drinking Water Branch at (573) 751-5331 or your regional office. For laboratory test results or testing questions, contact the Missouri State Public Health Laboratory at (573) 751-3334.\n", "**Complete or correct the following information**\n", "**Collected Date**: XXXX-XX-XX\n", "**PWS Id**: MO1010001\n", "**Collected Time**: 24 hour format hhs.mm:\n", "**Facility Id**: DS\n", "**Sample Type**: routine, repeat, special, replacement, sources\n", "**Sample Collection Point Id**: sampling point id from sample site plan\n", "**Collector**: last name, first name\n", "**Sample Category**: Bacterial\n", "**Collector Phone**: 000/111-2222\n", "**Sample Location**: upstream, downstream, original source, other\n", "**Bottle Number**: \n", "**Free Chlorine**: mg/L\n", "**Total Chlorine**: \n", "**Collector Signature**: \n", "**Repeat Location**: \n", "**County**: BATES\n", "**For Laboratory Use Only -- Please do not write below this line**\n", "**Received By**: \n", "**Evidence of Tampering**: \n", "**Date Printed**: 2015-11-06\n", "**Bottles Received**: \n", "**pH**: \n", "**Evidence of Cooling**: \n", "**Temperature (Celsius)**: \n", "**Thermometer ID**: \n", "**BUILD ID**: \n", "**PLACE THE ACCESSION LABEL WITHIN THIS BOX**\n", "SAMPLE COLLECTION INSTRUCTIONS PUBLIC DRINKING WATER FOR COLIFORM BACTERIA ANALYSIS This sample kit and collection method is for public drinking water regulatory compliance and special samples. Only samples collected in bottles supplied by the Missouri State Public Health Laboratory (MSPHL) and collected in accordance with these instructions will be accepted for testing. PLEASE READ THESE INSTRUCTIONS COMPLETELY BEFORE COLLECTING SAMPLES. Sample Containers: Sample bottles from the MSPHL contain a chlorine neutralizer that is present in powder or liquid form. The bottles are sterile and ready for use when shipped. Do not rinse the contents from the container and keep the bottle closed until it is to be filled. Shrink Wrap Seal: Remove the seal by pulling down on the red strip and pealing shrink wrap from both the cap and bottle. Discard all shrink wrap. Do not attempt to reseal lid with shrink wrap still attached. Two Fill Lines: Fill the bottle until the water sample level is BETWEEN THE TWO LINES. Place the bottle on a level surface to check the sample level. Samples below the 100 mL (lower) line WILL NOT BE TESTED due to insufficient sample volume. Samples above the 120 mL (upper) line WILL NOT BE TESTED due to overfilled bottle. Technical protocol and EPA requirements dictate that bottles must have sufficient air space to add testing reagents and to mix the sample properly. If the bottle is overfilled past the 120 mL line, pour off water until the sample volume is between the two lines before shipping to MSPHL. MSPHL WILL NOT adjust sample volume once the sample is received at the lab. No Paper Label: There is no longer a label to record sample information on the bottle. DO NOT WRITE ON THE BOTTLE. Please complete a sample information form for each sample submitted for testing. DATE AND TIME OF SAMPLE COLLECTION and the BOTTLE NUMBER (from sticker on bottle) ARE REQUIRED. A form for each bottle is included in this sample kit. For More Information, please contact: Missouri Department of Health and Senior Services State Public Health Laboratory Environmental Bacteriology Unit 101 North Chestnut St., P.O. Box 570 Jefferson City, MO 65102 Phone: 573-751-3334 FAX: 573-522-4032 Email: Website: www.health.mo.gov/Lab LAB 34 Public Water (R10-2015)\n", "Bacteriological Sample Collection Procedures\n", "\n", "Assemble all of the sampling supplies. Before you begin, wash your hands thoroughly before handling supplies. Go to the sampling location(s) specified in your Missouri Department of Natural Resources (MDNR) approved sampling site plan. The sample should be taken from a clean, smooth-nosed cold water faucet if possible. Avoid drinking fountains, leaky faucets, hot/cold mixing faucets and frost-proof yard hydrants since it is not practical to sterilize these fixtures. If possible, remove any aerators, strainers or hoses that are present because they may harbor bacteria. Follow the procedures below when collecting the sample. Instructions for completing the environmental sampling form are on the following page.\n", "\n", "1. Open the cold water tap for about 3 minutes before collecting the sample. This should adequately flush the water line of any debris.\n", "\n", "2. Flame-sterilize the tap and/or chemically disinfect the tap. Do not flame-sterilize if tap is plastic or if aerators are attached. Disinfect tap by thoroughly rinsing both the inside and outside of the tap with a mixture of 50% house-hold bleach (NaOCl) and 50% tap water. Take extreme care with strong bleach (oxidizing) solutions.\n", "\n", "3. Flush the tap for an additional 3 minutes with cold water, and then reduce to a gentle flow to about the width of a pencil. Do not change the water flow once you have started sampling as this could dislodge contaminants in the tap.\n", "\n", "4. Remove the plastic shrink wrap seal by pulling down on the red strip and pealing the shrink wrap from both the cap and bottle. Discard the shrink wrap. Do not attempt to reseal the lid with shrink wrap still attached.\n", "\n", "5. Grasp cap along top edge and remove carefully. Do not touch the inside with your fingers. Hold the bottle in one hand and the cap in the other. Do not lay the cap down or put it in a pocket. Also, take care not to contaminate the sterile bottle or cap with your fingers or permit the faucet to touch the inside of the bottle.\n", "\n", "6. Hold the bottle so that water entering the bottle will not come in contact with your hands or the outside of the bottle.\n", "\n", "7. Fill the bottle until the water sample level is BETWEEN THE TWO LINES on the bottle (100 – 120 ml). Preferably, the sample level should be at or just slightly above the 100 ml line. Sample levels below the\n", "**INSTRUCTIONS FOR COMPLETING ENVIRONMENTAL SAMPLE COLLECTION FORM**\n", "\n", "**Public Drinking Water Bacterial Analysis**\n", "\n", "**PRINT LEGIBLY** using water proof ink. A standard ink pen is sufficient. Complete ALL sample information lines on the form. Some sections of the form may already be completed by the laboratory computer system when the forms are printed. To make corrections, please draw a single line through the inaccurate information and print the corrected information behind it. The sections of the form and directions for completing each line are as follows:\n", "\n", "**Order #:** For Missouri State Public Health Lab (MSPHL) purposes only. Pages in Order and Containers in Order indicate number of forms and sample bottles shipped in the sample kit order.\n", "\n", "**REPORT TO:** Public water system's name and shipping address on file with Missouri Department of Natural Resources (MDNR). Please review and correct if necessary. Result reports will be mailed to this address.\n", "\n", "**BILL TO:** Section defaulted to the MDNR. There are no charges for public water testing at the MSPHL.\n", "\n", "**Requested Analysis/Tests:** This section will state PUBLIC DRINKING WATER BACTERIAL ANALYSIS. If it does not, you may have the wrong collection form. Please contact the MSPHL or MDNR for the proper form. Do not use forms from a local county health agency as those forms are for private well water samples. Your MDNR Regional Office can provide blank forms for your use.\n", "\n", "**Complete or correct the following information:** All lines are considered required information. Failure to complete a line may result in an invalid sample.\n", "\n", "**Collected Date:** Enter the date of sample collection in the format YYYY-MM-DD. Use 4 digits for year and 2 digits for month and date. November 1, 2015 would be written as 2015-11-01.\n", "\n", "**Collected Time:** Enter the time of sample collection using 24-hour military format h:mm.\n", "\n", "**PWS ID:** If blank, enter your 7-digit Public Water System ID number as assigned by MDNR (MO############).\n", "\n", "**Facility ID:** Defaulted to DS (Distribution System) for routine samples. If submitting a sample type other than Routine, enter the Facility ID number from your system's MDNR approved sample site plan (for example DS#, WL#, WTF#).\n", "\n", "**Sample Type:** Enter one of the following options:\n", "\n", "* Routine - Regular monthly monitoring samples.\n", "* Repeat - A series of 3 or 4 repeat samples (4 if you only take 1 routine per month) must\n", "Per U.S. Environmental Protection Agency requirements, public water samples must be received by the laboratory and tested within 30 hours of the date and time of collection. The MSPHL and MDNR recommend you use the free Department of Health and Senior Services (DHSS) contract courier for overnight delivery to the MSPHL. This courier picks up at most local public health agency offices and hospitals (Note: Not all hospitals will accept water samples for courier pick up). For sample drop off locations and times, please go to http://www.health.mo.gov/lab/courierservices.php and click on the interactive map or the listing of drop off locations by county; or you may call the MSPHL courier liaison at (573) 751-4830, or the MDNR Public Drinking Water Branch (PDWB) at (573) 526-1124.\n", "\n", "Please note the courier is allowed to pick up samples within one hour of the scheduled time (before or after). The earliest pick up time is at 10:30 a.m. To ensure your samples meet the transit time requirement of 30 hours, it is important that you collect your samples in the morning and have them dropped off at the courier pickup point one hour prior to the scheduled time.\n", "\n", "Use of the U.S. Postal Service or other commercial carriers such as Fed Ex or UPS will require additional charges and may not meet the 30 hour transit time requirement.\n", "\n", "Samples should not be en route to the laboratory over a weekend or state holiday (New Year's Day, Martin Luther King Day, Lincoln's Birthday, Washington's Birthday, Truman's Birthday, Memorial Day, Independence Day, Labor Day, Columbus Day, Veteran's Day, Thanksgiving Day, and Christmas.)\n", "\n", "Public water supplies may use the new MSPHL Test Results Web Portal to retrieve preliminary test results on-line. For information on how to register as a user for the web portal and to receive email notifications, please contact the MSPHL LIMS Administrator at shondra-johnson@health.mo.gov or call 573-751-3334. These preliminary test results are for informational purposes only. Official test results are available on-line within 2 or 3 business days at the MDNR Drinking Water Watch website http://dnr.mo.gov/DWW/ In addition, the official bacteriological sample reports will be mailed by MDNR within 4 or 5 business days.\n", "\n", "Additional sample bottles can be ordered on-line at http://www.health.mo.gov/lab/specimentestforms.php or by calling the MSPHL Central Services Unit at (573\n", "\n", "Document Metadata: {'page_number': 1, 'file_name': 'PublicWaterMassMailing.pdf', 'file_path': '/root/hbr/PublicWaterMassMailing.pdf'}\n", "Document Content: Missouri Department of Health and Senior Services\n", "P.O. Box 570, Jefferson City, MO 65102-0570\n", "Phone: 573-751-6400\n", "FAX: 573-751-6010\n", "RELAY MISSOURI for Hearing and Speech Impaired 1-800-735-2966 VOICE 1-800-735-2466\n", "\n", "Peter Lyskowski\n", "Acting Director\n", "\n", "Jeremiah W. (Jay) Nixon\n", "Governor\n", "\n", "Missouri Public Water Systems\n", "\n", "November 10, 2015\n", "\n", "Dear Public Water System Owners/Operators:\n", "\n", "The Missouri State Public Health Laboratory (MSPHL) is in the process of implementing a new Laboratory Information Management System (LIMS) in its drinking water bacteriology testing laboratory. The OpenELIS (OE) LIMS will provide the laboratory with improved sample management capability, improved data integrity and reduced potential for human data entry error. In addition, the system will provide improved reporting capabilities, including direct electronic data exchange with the Missouri Department of Natural Resources' (MDNR) Safe Drinking Water Information System (SDWIS). SDWIS is the computer system MDNR uses to store regulatory water testing data and report testing results to you and the U.S. Environmental Protection Agency. In addition, the new OE LIMS will provide a web portal that MSPHL clients can use to access their own test results in real time.\n", "\n", "As the MSPHL implements this new computer system, several changes will be made in the way you collect and submit water samples for testing. This letter and information packet will provide you with information to help educate you on these changes.\n", "\n", "NEW SAMPLE BOTTLES:\n", "\n", "Beginning in August 2015, the MSPHL began using a larger sample bottle for water bacterial testing. This bottle has a shrink wrap seal and two lines to indicate the proper sample volume. Please read the attached \"SAMPLE COLLECTION INSTRUCTIONS\" for details on how to use these new bottles. Sample volume MUST be within the two lines on the bottle (100 - 120 mL) to be acceptable for testing. You may continue to use your old bottles until the MSPHL can ship you new ones. Once you have received the new bottles, please discard or recycle the old bottles.\n", "\n", "NEW SAMPLE INFORMATION FORMS:\n", "\n", "The traditional sample information \"card\" that has been used for more than twenty years is being replaced by the Environmental Sample Collection Form. An example form is attached. Please read the attached instructions for information on properly completing the new form.\n", "\n", "Changes to the form include\n", "Contract operators will be provided with forms for all the supplies they operate. Blank forms will be available for MDNR Regional Office staff use.\n", "\n", "The form requires all requested information to be printed by the collector. There are no longer check boxes for Sample Type or Repeat Location.\n", "\n", "Facility ID, Sample Collection Point ID and Location for the sampling site MUST be provided by the collector. This information is available from your MDNR approved PWS sampling plan. MDNR will be providing all public water systems with a current copy of their approved sampling plan. This information is required by SDWIS and is used by MDNR to ensure regulatory compliance requirements have been met. Failure to complete this information on the sample collection form may result in a non-compliance report from MDNR.\n", "\n", "A Collector Signature line has been added. The sample collector must sign the form to attest the information provided is accurate to the best of their knowledge.\n", "\n", "The MSPHL will begin shipping the new forms to public water systems in late November or early December. Please begin using the new forms December 16, 2015. Discard all the old forms (\"cards\") at that time.\n", "\n", "NEW SAMPLE INSTRUCTIONS:\n", "\n", "Sample instructions have been revised to include changes to the bottle and sampling form. The instructions include detailed information on how to collect the sample using the new bottle, how to complete the new sample collection form, how to best ship samples to the MSPHL using the free MSPHL courier system, and how to register for the new MSPHL web portal. A copy of these instructions is attached.\n", "\n", "NEW WEB PORTAL FOR RESULTS REPORTS\n", "\n", "The OE LIMS provides a web portal that may be used by systems to view and print their test result reports, check status of samples, download sample information into Excel, and receive automated emails when samples are received at the laboratory, and when sample results are ready to be viewed. For information on how to gain access to this portal, please contact Shondra Johnson, LIMS Administrator at Shondra.Johnson@health.mo.gov or at 573-751-3334.\n", "\n", "IMPLEMENTATION DATES:\n", "\n", "The MSPHL intends to implement the OpenELIS LIMS on December 1, 2015. There will be a two week testing period in which laboratory staff will run the new LIMS in conjunction with our current manual, paper-based system to ensure the OE LIMS is operating properly. You may continue to submit samples as you currently do, using the old sample information card, throughout this time.\n", "\n", "On December 16, 2015,\n", "If you have any questions, please contact the MSPHL Environmental Bacteriology Unit at 573-751-3334. You may also contact your MDNR Regional Office for additional information on sample collection.\n", "\n", "Once again, thank you for your patience and understanding as we implement these changes.\n", "\n", "Patrick R. Shannon\n", "\n", "Manager, Environmental Bacteriology Unit\n", "\n", "Missouri Department of Health and Senior Services\n", "\n", "State Public Health Laboratory\n", "\n", "101 North Chestnut St.\n", "\n", "P.O. Box 570\n", "\n", "Jefferson City, MO 65102\n", "\n", "Phone: 573-751-3334\n", "\n", "Email: Pat.Shannon@health.mo.gov\n", "\n", "Web: www.health.mo.gov/Lab\n", "**Order**: 984\n", "**Pages in Order**: 1 of 1\n", "**Containers in Order**: 1\n", "**REPORT TO**: ADRIAN 16 E 5TH ST ADRIAN, MO 64720\n", "**BILL TO**: MO DEPARTMENT OF NATURAL RESOURCES 1101 RIVERSIDE DRIVE JEFFERSON CITY, MO 65102\n", "**PUBLIC DRINKING WATER BACTERIAL ANALYSIS**\n", "**Total Coliform Bacteria and E. coli (Present/Absent Test)**\n", "**PRINT LEGIBLY**\n", "Instructions for completing form are supplied in the Collection Kit. For compliance monitoring questions, contact the Missouri Department of Natural Resources-Public Drinking Water Branch at (573) 751-5331 or your regional office. For laboratory test results or testing questions, contact the Missouri State Public Health Laboratory at (573) 751-3334.\n", "**Complete or correct the following information**\n", "**Collected Date**: XXXX-XX-XX\n", "**PWS Id**: MO1010001\n", "**Collected Time**: 24 hour format hhs.mm:\n", "**Facility Id**: DS\n", "**Sample Type**: routine, repeat, special, replacement, sources\n", "**Sample Collection Point Id**: sampling point id from sample site plan\n", "**Collector**: last name, first name\n", "**Sample Category**: Bacterial\n", "**Collector Phone**: 000/111-2222\n", "**Sample Location**: upstream, downstream, original source, other\n", "**Bottle Number**: \n", "**Free Chlorine**: mg/L\n", "**Total Chlorine**: \n", "**Collector Signature**: \n", "**Repeat Location**: \n", "**County**: BATES\n", "**For Laboratory Use Only -- Please do not write below this line**\n", "**Received By**: \n", "**Evidence of Tampering**: \n", "**Date Printed**: 2015-11-06\n", "**Bottles Received**: \n", "**pH**: \n", "**Evidence of Cooling**: \n", "**Temperature (Celsius)**: \n", "**Thermometer ID**: \n", "**BUILD ID**: \n", "**PLACE THE ACCESSION LABEL WITHIN THIS BOX**\n", "SAMPLE COLLECTION INSTRUCTIONS PUBLIC DRINKING WATER FOR COLIFORM BACTERIA ANALYSIS This sample kit and collection method is for public drinking water regulatory compliance and special samples. Only samples collected in bottles supplied by the Missouri State Public Health Laboratory (MSPHL) and collected in accordance with these instructions will be accepted for testing. PLEASE READ THESE INSTRUCTIONS COMPLETELY BEFORE COLLECTING SAMPLES. Sample Containers: Sample bottles from the MSPHL contain a chlorine neutralizer that is present in powder or liquid form. The bottles are sterile and ready for use when shipped. Do not rinse the contents from the container and keep the bottle closed until it is to be filled. Shrink Wrap Seal: Remove the seal by pulling down on the red strip and pealing shrink wrap from both the cap and bottle. Discard all shrink wrap. Do not attempt to reseal lid with shrink wrap still attached. Two Fill Lines: Fill the bottle until the water sample level is BETWEEN THE TWO LINES. Place the bottle on a level surface to check the sample level. Samples below the 100 mL (lower) line WILL NOT BE TESTED due to insufficient sample volume. Samples above the 120 mL (upper) line WILL NOT BE TESTED due to overfilled bottle. Technical protocol and EPA requirements dictate that bottles must have sufficient air space to add testing reagents and to mix the sample properly. If the bottle is overfilled past the 120 mL line, pour off water until the sample volume is between the two lines before shipping to MSPHL. MSPHL WILL NOT adjust sample volume once the sample is received at the lab. No Paper Label: There is no longer a label to record sample information on the bottle. DO NOT WRITE ON THE BOTTLE. Please complete a sample information form for each sample submitted for testing. DATE AND TIME OF SAMPLE COLLECTION and the BOTTLE NUMBER (from sticker on bottle) ARE REQUIRED. A form for each bottle is included in this sample kit. For More Information, please contact: Missouri Department of Health and Senior Services State Public Health Laboratory Environmental Bacteriology Unit 101 North Chestnut St., P.O. Box 570 Jefferson City, MO 65102 Phone: 573-751-3334 FAX: 573-522-4032 Email: Website: www.health.mo.gov/Lab LAB 34 Public Water (R10-2015)\n", "Bacteriological Sample Collection Procedures\n", "\n", "Assemble all of the sampling supplies. Before you begin, wash your hands thoroughly before handling supplies. Go to the sampling location(s) specified in your Missouri Department of Natural Resources (MDNR) approved sampling site plan. The sample should be taken from a clean, smooth-nosed cold water faucet if possible. Avoid drinking fountains, leaky faucets, hot/cold mixing faucets and frost-proof yard hydrants since it is not practical to sterilize these fixtures. If possible, remove any aerators, strainers or hoses that are present because they may harbor bacteria. Follow the procedures below when collecting the sample. Instructions for completing the environmental sampling form are on the following page.\n", "\n", "1. Open the cold water tap for about 3 minutes before collecting the sample. This should adequately flush the water line of any debris.\n", "\n", "2. Flame-sterilize the tap and/or chemically disinfect the tap. Do not flame-sterilize if tap is plastic or if aerators are attached. Disinfect tap by thoroughly rinsing both the inside and outside of the tap with a mixture of 50% house-hold bleach (NaOCl) and 50% tap water. Take extreme care with strong bleach (oxidizing) solutions.\n", "\n", "3. Flush the tap for an additional 3 minutes with cold water, and then reduce to a gentle flow to about the width of a pencil. Do not change the water flow once you have started sampling as this could dislodge contaminants in the tap.\n", "\n", "4. Remove the plastic shrink wrap seal by pulling down on the red strip and pealing the shrink wrap from both the cap and bottle. Discard the shrink wrap. Do not attempt to reseal the lid with shrink wrap still attached.\n", "\n", "5. Grasp cap along top edge and remove carefully. Do not touch the inside with your fingers. Hold the bottle in one hand and the cap in the other. Do not lay the cap down or put it in a pocket. Also, take care not to contaminate the sterile bottle or cap with your fingers or permit the faucet to touch the inside of the bottle.\n", "\n", "6. Hold the bottle so that water entering the bottle will not come in contact with your hands or the outside of the bottle.\n", "\n", "7. Fill the bottle until the water sample level is BETWEEN THE TWO LINES on the bottle (100 – 120 ml). Preferably, the sample level should be at or just slightly above the 100 ml line. Sample levels below the\n", "**INSTRUCTIONS FOR COMPLETING ENVIRONMENTAL SAMPLE COLLECTION FORM**\n", "\n", "**Public Drinking Water Bacterial Analysis**\n", "\n", "**PRINT LEGIBLY** using water proof ink. A standard ink pen is sufficient. Complete ALL sample information lines on the form. Some sections of the form may already be completed by the laboratory computer system when the forms are printed. To make corrections, please draw a single line through the inaccurate information and print the corrected information behind it. The sections of the form and directions for completing each line are as follows:\n", "\n", "**Order #:** For Missouri State Public Health Lab (MSPHL) purposes only. Pages in Order and Containers in Order indicate number of forms and sample bottles shipped in the sample kit order.\n", "\n", "**REPORT TO:** Public water system's name and shipping address on file with Missouri Department of Natural Resources (MDNR). Please review and correct if necessary. Result reports will be mailed to this address.\n", "\n", "**BILL TO:** Section defaulted to the MDNR. There are no charges for public water testing at the MSPHL.\n", "\n", "**Requested Analysis/Tests:** This section will state PUBLIC DRINKING WATER BACTERIAL ANALYSIS. If it does not, you may have the wrong collection form. Please contact the MSPHL or MDNR for the proper form. Do not use forms from a local county health agency as those forms are for private well water samples. Your MDNR Regional Office can provide blank forms for your use.\n", "\n", "**Complete or correct the following information:** All lines are considered required information. Failure to complete a line may result in an invalid sample.\n", "\n", "**Collected Date:** Enter the date of sample collection in the format YYYY-MM-DD. Use 4 digits for year and 2 digits for month and date. November 1, 2015 would be written as 2015-11-01.\n", "\n", "**Collected Time:** Enter the time of sample collection using 24-hour military format h:mm.\n", "\n", "**PWS ID:** If blank, enter your 7-digit Public Water System ID number as assigned by MDNR (MO############).\n", "\n", "**Facility ID:** Defaulted to DS (Distribution System) for routine samples. If submitting a sample type other than Routine, enter the Facility ID number from your system's MDNR approved sample site plan (for example DS#, WL#, WTF#).\n", "\n", "**Sample Type:** Enter one of the following options:\n", "\n", "* Routine - Regular monthly monitoring samples.\n", "* Repeat - A series of 3 or 4 repeat samples (4 if you only take 1 routine per month) must\n", "Per U.S. Environmental Protection Agency requirements, public water samples must be received by the laboratory and tested within 30 hours of the date and time of collection. The MSPHL and MDNR recommend you use the free Department of Health and Senior Services (DHSS) contract courier for overnight delivery to the MSPHL. This courier picks up at most local public health agency offices and hospitals (Note: Not all hospitals will accept water samples for courier pick up). For sample drop off locations and times, please go to http://www.health.mo.gov/lab/courierservices.php and click on the interactive map or the listing of drop off locations by county; or you may call the MSPHL courier liaison at (573) 751-4830, or the MDNR Public Drinking Water Branch (PDWB) at (573) 526-1124.\n", "\n", "Please note the courier is allowed to pick up samples within one hour of the scheduled time (before or after). The earliest pick up time is at 10:30 a.m. To ensure your samples meet the transit time requirement of 30 hours, it is important that you collect your samples in the morning and have them dropped off at the courier pickup point one hour prior to the scheduled time.\n", "\n", "Use of the U.S. Postal Service or other commercial carriers such as Fed Ex or UPS will require additional charges and may not meet the 30 hour transit time requirement.\n", "\n", "Samples should not be en route to the laboratory over a weekend or state holiday (New Year's Day, Martin Luther King Day, Lincoln's Birthday, Washington's Birthday, Truman's Birthday, Memorial Day, Independence Day, Labor Day, Columbus Day, Veteran's Day, Thanksgiving Day, and Christmas.)\n", "\n", "Public water supplies may use the new MSPHL Test Results Web Portal to retrieve preliminary test results on-line. For information on how to register as a user for the web portal and to receive email notifications, please contact the MSPHL LIMS Administrator at shondra-johnson@health.mo.gov or call 573-751-3334. These preliminary test results are for informational purposes only. Official test results are available on-line within 2 or 3 business days at the MDNR Drinking Water Watch website http://dnr.mo.gov/DWW/ In addition, the official bacteriological sample reports will be mailed by MDNR within 4 or 5 business days.\n", "\n", "Additional sample bottles can be ordered on-line at http://www.health.mo.gov/lab/specimentestforms.php or by calling the MSPHL Central Services Unit at (573\n", "\n", "Document Metadata: {'page_number': 2, 'file_name': 'PublicWaterMassMailing.pdf', 'file_path': '/root/hbr/PublicWaterMassMailing.pdf'}\n", "Document Content: Missouri Department of Health and Senior Services\n", "P.O. Box 570, Jefferson City, MO 65102-0570\n", "Phone: 573-751-6400\n", "FAX: 573-751-6010\n", "RELAY MISSOURI for Hearing and Speech Impaired 1-800-735-2966 VOICE 1-800-735-2466\n", "\n", "Peter Lyskowski\n", "Acting Director\n", "\n", "Jeremiah W. (Jay) Nixon\n", "Governor\n", "\n", "Missouri Public Water Systems\n", "\n", "November 10, 2015\n", "\n", "Dear Public Water System Owners/Operators:\n", "\n", "The Missouri State Public Health Laboratory (MSPHL) is in the process of implementing a new Laboratory Information Management System (LIMS) in its drinking water bacteriology testing laboratory. The OpenELIS (OE) LIMS will provide the laboratory with improved sample management capability, improved data integrity and reduced potential for human data entry error. In addition, the system will provide improved reporting capabilities, including direct electronic data exchange with the Missouri Department of Natural Resources' (MDNR) Safe Drinking Water Information System (SDWIS). SDWIS is the computer system MDNR uses to store regulatory water testing data and report testing results to you and the U.S. Environmental Protection Agency. In addition, the new OE LIMS will provide a web portal that MSPHL clients can use to access their own test results in real time.\n", "\n", "As the MSPHL implements this new computer system, several changes will be made in the way you collect and submit water samples for testing. This letter and information packet will provide you with information to help educate you on these changes.\n", "\n", "NEW SAMPLE BOTTLES:\n", "\n", "Beginning in August 2015, the MSPHL began using a larger sample bottle for water bacterial testing. This bottle has a shrink wrap seal and two lines to indicate the proper sample volume. Please read the attached \"SAMPLE COLLECTION INSTRUCTIONS\" for details on how to use these new bottles. Sample volume MUST be within the two lines on the bottle (100 - 120 mL) to be acceptable for testing. You may continue to use your old bottles until the MSPHL can ship you new ones. Once you have received the new bottles, please discard or recycle the old bottles.\n", "\n", "NEW SAMPLE INFORMATION FORMS:\n", "\n", "The traditional sample information \"card\" that has been used for more than twenty years is being replaced by the Environmental Sample Collection Form. An example form is attached. Please read the attached instructions for information on properly completing the new form.\n", "\n", "Changes to the form include\n", "Contract operators will be provided with forms for all the supplies they operate. Blank forms will be available for MDNR Regional Office staff use.\n", "\n", "The form requires all requested information to be printed by the collector. There are no longer check boxes for Sample Type or Repeat Location.\n", "\n", "Facility ID, Sample Collection Point ID and Location for the sampling site MUST be provided by the collector. This information is available from your MDNR approved PWS sampling plan. MDNR will be providing all public water systems with a current copy of their approved sampling plan. This information is required by SDWIS and is used by MDNR to ensure regulatory compliance requirements have been met. Failure to complete this information on the sample collection form may result in a non-compliance report from MDNR.\n", "\n", "A Collector Signature line has been added. The sample collector must sign the form to attest the information provided is accurate to the best of their knowledge.\n", "\n", "The MSPHL will begin shipping the new forms to public water systems in late November or early December. Please begin using the new forms December 16, 2015. Discard all the old forms (\"cards\") at that time.\n", "\n", "NEW SAMPLE INSTRUCTIONS:\n", "\n", "Sample instructions have been revised to include changes to the bottle and sampling form. The instructions include detailed information on how to collect the sample using the new bottle, how to complete the new sample collection form, how to best ship samples to the MSPHL using the free MSPHL courier system, and how to register for the new MSPHL web portal. A copy of these instructions is attached.\n", "\n", "NEW WEB PORTAL FOR RESULTS REPORTS\n", "\n", "The OE LIMS provides a web portal that may be used by systems to view and print their test result reports, check status of samples, download sample information into Excel, and receive automated emails when samples are received at the laboratory, and when sample results are ready to be viewed. For information on how to gain access to this portal, please contact Shondra Johnson, LIMS Administrator at Shondra.Johnson@health.mo.gov or at 573-751-3334.\n", "\n", "IMPLEMENTATION DATES:\n", "\n", "The MSPHL intends to implement the OpenELIS LIMS on December 1, 2015. There will be a two week testing period in which laboratory staff will run the new LIMS in conjunction with our current manual, paper-based system to ensure the OE LIMS is operating properly. You may continue to submit samples as you currently do, using the old sample information card, throughout this time.\n", "\n", "On December 16, 2015,\n", "If you have any questions, please contact the MSPHL Environmental Bacteriology Unit at 573-751-3334. You may also contact your MDNR Regional Office for additional information on sample collection.\n", "\n", "Once again, thank you for your patience and understanding as we implement these changes.\n", "\n", "Patrick R. Shannon\n", "\n", "Manager, Environmental Bacteriology Unit\n", "\n", "Missouri Department of Health and Senior Services\n", "\n", "State Public Health Laboratory\n", "\n", "101 North Chestnut St.\n", "\n", "P.O. Box 570\n", "\n", "Jefferson City, MO 65102\n", "\n", "Phone: 573-751-3334\n", "\n", "Email: Pat.Shannon@health.mo.gov\n", "\n", "Web: www.health.mo.gov/Lab\n", "**Order**: 984\n", "**Pages in Order**: 1 of 1\n", "**Containers in Order**: 1\n", "**REPORT TO**: ADRIAN 16 E 5TH ST ADRIAN, MO 64720\n", "**BILL TO**: MO DEPARTMENT OF NATURAL RESOURCES 1101 RIVERSIDE DRIVE JEFFERSON CITY, MO 65102\n", "**PUBLIC DRINKING WATER BACTERIAL ANALYSIS**\n", "**Total Coliform Bacteria and E. coli (Present/Absent Test)**\n", "**PRINT LEGIBLY**\n", "Instructions for completing form are supplied in the Collection Kit. For compliance monitoring questions, contact the Missouri Department of Natural Resources-Public Drinking Water Branch at (573) 751-5331 or your regional office. For laboratory test results or testing questions, contact the Missouri State Public Health Laboratory at (573) 751-3334.\n", "**Complete or correct the following information**\n", "**Collected Date**: XXXX-XX-XX\n", "**PWS Id**: MO1010001\n", "**Collected Time**: 24 hour format hhs.mm:\n", "**Facility Id**: DS\n", "**Sample Type**: routine, repeat, special, replacement, sources\n", "**Sample Collection Point Id**: sampling point id from sample site plan\n", "**Collector**: last name, first name\n", "**Sample Category**: Bacterial\n", "**Collector Phone**: 000/111-2222\n", "**Sample Location**: upstream, downstream, original source, other\n", "**Bottle Number**: \n", "**Free Chlorine**: mg/L\n", "**Total Chlorine**: \n", "**Collector Signature**: \n", "**Repeat Location**: \n", "**County**: BATES\n", "**For Laboratory Use Only -- Please do not write below this line**\n", "**Received By**: \n", "**Evidence of Tampering**: \n", "**Date Printed**: 2015-11-06\n", "**Bottles Received**: \n", "**pH**: \n", "**Evidence of Cooling**: \n", "**Temperature (Celsius)**: \n", "**Thermometer ID**: \n", "**BUILD ID**: \n", "**PLACE THE ACCESSION LABEL WITHIN THIS BOX**\n", "SAMPLE COLLECTION INSTRUCTIONS PUBLIC DRINKING WATER FOR COLIFORM BACTERIA ANALYSIS This sample kit and collection method is for public drinking water regulatory compliance and special samples. Only samples collected in bottles supplied by the Missouri State Public Health Laboratory (MSPHL) and collected in accordance with these instructions will be accepted for testing. PLEASE READ THESE INSTRUCTIONS COMPLETELY BEFORE COLLECTING SAMPLES. Sample Containers: Sample bottles from the MSPHL contain a chlorine neutralizer that is present in powder or liquid form. The bottles are sterile and ready for use when shipped. Do not rinse the contents from the container and keep the bottle closed until it is to be filled. Shrink Wrap Seal: Remove the seal by pulling down on the red strip and pealing shrink wrap from both the cap and bottle. Discard all shrink wrap. Do not attempt to reseal lid with shrink wrap still attached. Two Fill Lines: Fill the bottle until the water sample level is BETWEEN THE TWO LINES. Place the bottle on a level surface to check the sample level. Samples below the 100 mL (lower) line WILL NOT BE TESTED due to insufficient sample volume. Samples above the 120 mL (upper) line WILL NOT BE TESTED due to overfilled bottle. Technical protocol and EPA requirements dictate that bottles must have sufficient air space to add testing reagents and to mix the sample properly. If the bottle is overfilled past the 120 mL line, pour off water until the sample volume is between the two lines before shipping to MSPHL. MSPHL WILL NOT adjust sample volume once the sample is received at the lab. No Paper Label: There is no longer a label to record sample information on the bottle. DO NOT WRITE ON THE BOTTLE. Please complete a sample information form for each sample submitted for testing. DATE AND TIME OF SAMPLE COLLECTION and the BOTTLE NUMBER (from sticker on bottle) ARE REQUIRED. A form for each bottle is included in this sample kit. For More Information, please contact: Missouri Department of Health and Senior Services State Public Health Laboratory Environmental Bacteriology Unit 101 North Chestnut St., P.O. Box 570 Jefferson City, MO 65102 Phone: 573-751-3334 FAX: 573-522-4032 Email: Website: www.health.mo.gov/Lab LAB 34 Public Water (R10-2015)\n", "Bacteriological Sample Collection Procedures\n", "\n", "Assemble all of the sampling supplies. Before you begin, wash your hands thoroughly before handling supplies. Go to the sampling location(s) specified in your Missouri Department of Natural Resources (MDNR) approved sampling site plan. The sample should be taken from a clean, smooth-nosed cold water faucet if possible. Avoid drinking fountains, leaky faucets, hot/cold mixing faucets and frost-proof yard hydrants since it is not practical to sterilize these fixtures. If possible, remove any aerators, strainers or hoses that are present because they may harbor bacteria. Follow the procedures below when collecting the sample. Instructions for completing the environmental sampling form are on the following page.\n", "\n", "1. Open the cold water tap for about 3 minutes before collecting the sample. This should adequately flush the water line of any debris.\n", "\n", "2. Flame-sterilize the tap and/or chemically disinfect the tap. Do not flame-sterilize if tap is plastic or if aerators are attached. Disinfect tap by thoroughly rinsing both the inside and outside of the tap with a mixture of 50% house-hold bleach (NaOCl) and 50% tap water. Take extreme care with strong bleach (oxidizing) solutions.\n", "\n", "3. Flush the tap for an additional 3 minutes with cold water, and then reduce to a gentle flow to about the width of a pencil. Do not change the water flow once you have started sampling as this could dislodge contaminants in the tap.\n", "\n", "4. Remove the plastic shrink wrap seal by pulling down on the red strip and pealing the shrink wrap from both the cap and bottle. Discard the shrink wrap. Do not attempt to reseal the lid with shrink wrap still attached.\n", "\n", "5. Grasp cap along top edge and remove carefully. Do not touch the inside with your fingers. Hold the bottle in one hand and the cap in the other. Do not lay the cap down or put it in a pocket. Also, take care not to contaminate the sterile bottle or cap with your fingers or permit the faucet to touch the inside of the bottle.\n", "\n", "6. Hold the bottle so that water entering the bottle will not come in contact with your hands or the outside of the bottle.\n", "\n", "7. Fill the bottle until the water sample level is BETWEEN THE TWO LINES on the bottle (100 – 120 ml). Preferably, the sample level should be at or just slightly above the 100 ml line. Sample levels below the\n", "**INSTRUCTIONS FOR COMPLETING ENVIRONMENTAL SAMPLE COLLECTION FORM**\n", "\n", "**Public Drinking Water Bacterial Analysis**\n", "\n", "**PRINT LEGIBLY** using water proof ink. A standard ink pen is sufficient. Complete ALL sample information lines on the form. Some sections of the form may already be completed by the laboratory computer system when the forms are printed. To make corrections, please draw a single line through the inaccurate information and print the corrected information behind it. The sections of the form and directions for completing each line are as follows:\n", "\n", "**Order #:** For Missouri State Public Health Lab (MSPHL) purposes only. Pages in Order and Containers in Order indicate number of forms and sample bottles shipped in the sample kit order.\n", "\n", "**REPORT TO:** Public water system's name and shipping address on file with Missouri Department of Natural Resources (MDNR). Please review and correct if necessary. Result reports will be mailed to this address.\n", "\n", "**BILL TO:** Section defaulted to the MDNR. There are no charges for public water testing at the MSPHL.\n", "\n", "**Requested Analysis/Tests:** This section will state PUBLIC DRINKING WATER BACTERIAL ANALYSIS. If it does not, you may have the wrong collection form. Please contact the MSPHL or MDNR for the proper form. Do not use forms from a local county health agency as those forms are for private well water samples. Your MDNR Regional Office can provide blank forms for your use.\n", "\n", "**Complete or correct the following information:** All lines are considered required information. Failure to complete a line may result in an invalid sample.\n", "\n", "**Collected Date:** Enter the date of sample collection in the format YYYY-MM-DD. Use 4 digits for year and 2 digits for month and date. November 1, 2015 would be written as 2015-11-01.\n", "\n", "**Collected Time:** Enter the time of sample collection using 24-hour military format h:mm.\n", "\n", "**PWS ID:** If blank, enter your 7-digit Public Water System ID number as assigned by MDNR (MO############).\n", "\n", "**Facility ID:** Defaulted to DS (Distribution System) for routine samples. If submitting a sample type other than Routine, enter the Facility ID number from your system's MDNR approved sample site plan (for example DS#, WL#, WTF#).\n", "\n", "**Sample Type:** Enter one of the following options:\n", "\n", "* Routine - Regular monthly monitoring samples.\n", "* Repeat - A series of 3 or 4 repeat samples (4 if you only take 1 routine per month) must\n", "Per U.S. Environmental Protection Agency requirements, public water samples must be received by the laboratory and tested within 30 hours of the date and time of collection. The MSPHL and MDNR recommend you use the free Department of Health and Senior Services (DHSS) contract courier for overnight delivery to the MSPHL. This courier picks up at most local public health agency offices and hospitals (Note: Not all hospitals will accept water samples for courier pick up). For sample drop off locations and times, please go to http://www.health.mo.gov/lab/courierservices.php and click on the interactive map or the listing of drop off locations by county; or you may call the MSPHL courier liaison at (573) 751-4830, or the MDNR Public Drinking Water Branch (PDWB) at (573) 526-1124.\n", "\n", "Please note the courier is allowed to pick up samples within one hour of the scheduled time (before or after). The earliest pick up time is at 10:30 a.m. To ensure your samples meet the transit time requirement of 30 hours, it is important that you collect your samples in the morning and have them dropped off at the courier pickup point one hour prior to the scheduled time.\n", "\n", "Use of the U.S. Postal Service or other commercial carriers such as Fed Ex or UPS will require additional charges and may not meet the 30 hour transit time requirement.\n", "\n", "Samples should not be en route to the laboratory over a weekend or state holiday (New Year's Day, Martin Luther King Day, Lincoln's Birthday, Washington's Birthday, Truman's Birthday, Memorial Day, Independence Day, Labor Day, Columbus Day, Veteran's Day, Thanksgiving Day, and Christmas.)\n", "\n", "Public water supplies may use the new MSPHL Test Results Web Portal to retrieve preliminary test results on-line. For information on how to register as a user for the web portal and to receive email notifications, please contact the MSPHL LIMS Administrator at shondra-johnson@health.mo.gov or call 573-751-3334. These preliminary test results are for informational purposes only. Official test results are available on-line within 2 or 3 business days at the MDNR Drinking Water Watch website http://dnr.mo.gov/DWW/ In addition, the official bacteriological sample reports will be mailed by MDNR within 4 or 5 business days.\n", "\n", "Additional sample bottles can be ordered on-line at http://www.health.mo.gov/lab/specimentestforms.php or by calling the MSPHL Central Services Unit at (573\n", "\n", "Document Metadata: {'page_number': 3, 'file_name': 'PublicWaterMassMailing.pdf', 'file_path': '/root/hbr/PublicWaterMassMailing.pdf'}\n", "Document Content: Missouri Department of Health and Senior Services\n", "P.O. Box 570, Jefferson City, MO 65102-0570\n", "Phone: 573-751-6400\n", "FAX: 573-751-6010\n", "RELAY MISSOURI for Hearing and Speech Impaired 1-800-735-2966 VOICE 1-800-735-2466\n", "\n", "Peter Lyskowski\n", "Acting Director\n", "\n", "Jeremiah W. (Jay) Nixon\n", "Governor\n", "\n", "Missouri Public Water Systems\n", "\n", "November 10, 2015\n", "\n", "Dear Public Water System Owners/Operators:\n", "\n", "The Missouri State Public Health Laboratory (MSPHL) is in the process of implementing a new Laboratory Information Management System (LIMS) in its drinking water bacteriology testing laboratory. The OpenELIS (OE) LIMS will provide the laboratory with improved sample management capability, improved data integrity and reduced potential for human data entry error. In addition, the system will provide improved reporting capabilities, including direct electronic data exchange with the Missouri Department of Natural Resources' (MDNR) Safe Drinking Water Information System (SDWIS). SDWIS is the computer system MDNR uses to store regulatory water testing data and report testing results to you and the U.S. Environmental Protection Agency. In addition, the new OE LIMS will provide a web portal that MSPHL clients can use to access their own test results in real time.\n", "\n", "As the MSPHL implements this new computer system, several changes will be made in the way you collect and submit water samples for testing. This letter and information packet will provide you with information to help educate you on these changes.\n", "\n", "NEW SAMPLE BOTTLES:\n", "\n", "Beginning in August 2015, the MSPHL began using a larger sample bottle for water bacterial testing. This bottle has a shrink wrap seal and two lines to indicate the proper sample volume. Please read the attached \"SAMPLE COLLECTION INSTRUCTIONS\" for details on how to use these new bottles. Sample volume MUST be within the two lines on the bottle (100 - 120 mL) to be acceptable for testing. You may continue to use your old bottles until the MSPHL can ship you new ones. Once you have received the new bottles, please discard or recycle the old bottles.\n", "\n", "NEW SAMPLE INFORMATION FORMS:\n", "\n", "The traditional sample information \"card\" that has been used for more than twenty years is being replaced by the Environmental Sample Collection Form. An example form is attached. Please read the attached instructions for information on properly completing the new form.\n", "\n", "Changes to the form include\n", "Contract operators will be provided with forms for all the supplies they operate. Blank forms will be available for MDNR Regional Office staff use.\n", "\n", "The form requires all requested information to be printed by the collector. There are no longer check boxes for Sample Type or Repeat Location.\n", "\n", "Facility ID, Sample Collection Point ID and Location for the sampling site MUST be provided by the collector. This information is available from your MDNR approved PWS sampling plan. MDNR will be providing all public water systems with a current copy of their approved sampling plan. This information is required by SDWIS and is used by MDNR to ensure regulatory compliance requirements have been met. Failure to complete this information on the sample collection form may result in a non-compliance report from MDNR.\n", "\n", "A Collector Signature line has been added. The sample collector must sign the form to attest the information provided is accurate to the best of their knowledge.\n", "\n", "The MSPHL will begin shipping the new forms to public water systems in late November or early December. Please begin using the new forms December 16, 2015. Discard all the old forms (\"cards\") at that time.\n", "\n", "NEW SAMPLE INSTRUCTIONS:\n", "\n", "Sample instructions have been revised to include changes to the bottle and sampling form. The instructions include detailed information on how to collect the sample using the new bottle, how to complete the new sample collection form, how to best ship samples to the MSPHL using the free MSPHL courier system, and how to register for the new MSPHL web portal. A copy of these instructions is attached.\n", "\n", "NEW WEB PORTAL FOR RESULTS REPORTS\n", "\n", "The OE LIMS provides a web portal that may be used by systems to view and print their test result reports, check status of samples, download sample information into Excel, and receive automated emails when samples are received at the laboratory, and when sample results are ready to be viewed. For information on how to gain access to this portal, please contact Shondra Johnson, LIMS Administrator at Shondra.Johnson@health.mo.gov or at 573-751-3334.\n", "\n", "IMPLEMENTATION DATES:\n", "\n", "The MSPHL intends to implement the OpenELIS LIMS on December 1, 2015. There will be a two week testing period in which laboratory staff will run the new LIMS in conjunction with our current manual, paper-based system to ensure the OE LIMS is operating properly. You may continue to submit samples as you currently do, using the old sample information card, throughout this time.\n", "\n", "On December 16, 2015,\n", "If you have any questions, please contact the MSPHL Environmental Bacteriology Unit at 573-751-3334. You may also contact your MDNR Regional Office for additional information on sample collection.\n", "\n", "Once again, thank you for your patience and understanding as we implement these changes.\n", "\n", "Patrick R. Shannon\n", "\n", "Manager, Environmental Bacteriology Unit\n", "\n", "Missouri Department of Health and Senior Services\n", "\n", "State Public Health Laboratory\n", "\n", "101 North Chestnut St.\n", "\n", "P.O. Box 570\n", "\n", "Jefferson City, MO 65102\n", "\n", "Phone: 573-751-3334\n", "\n", "Email: Pat.Shannon@health.mo.gov\n", "\n", "Web: www.health.mo.gov/Lab\n", "**Order**: 984\n", "**Pages in Order**: 1 of 1\n", "**Containers in Order**: 1\n", "**REPORT TO**: ADRIAN 16 E 5TH ST ADRIAN, MO 64720\n", "**BILL TO**: MO DEPARTMENT OF NATURAL RESOURCES 1101 RIVERSIDE DRIVE JEFFERSON CITY, MO 65102\n", "**PUBLIC DRINKING WATER BACTERIAL ANALYSIS**\n", "**Total Coliform Bacteria and E. coli (Present/Absent Test)**\n", "**PRINT LEGIBLY**\n", "Instructions for completing form are supplied in the Collection Kit. For compliance monitoring questions, contact the Missouri Department of Natural Resources-Public Drinking Water Branch at (573) 751-5331 or your regional office. For laboratory test results or testing questions, contact the Missouri State Public Health Laboratory at (573) 751-3334.\n", "**Complete or correct the following information**\n", "**Collected Date**: XXXX-XX-XX\n", "**PWS Id**: MO1010001\n", "**Collected Time**: 24 hour format hhs.mm:\n", "**Facility Id**: DS\n", "**Sample Type**: routine, repeat, special, replacement, sources\n", "**Sample Collection Point Id**: sampling point id from sample site plan\n", "**Collector**: last name, first name\n", "**Sample Category**: Bacterial\n", "**Collector Phone**: 000/111-2222\n", "**Sample Location**: upstream, downstream, original source, other\n", "**Bottle Number**: \n", "**Free Chlorine**: mg/L\n", "**Total Chlorine**: \n", "**Collector Signature**: \n", "**Repeat Location**: \n", "**County**: BATES\n", "**For Laboratory Use Only -- Please do not write below this line**\n", "**Received By**: \n", "**Evidence of Tampering**: \n", "**Date Printed**: 2015-11-06\n", "**Bottles Received**: \n", "**pH**: \n", "**Evidence of Cooling**: \n", "**Temperature (Celsius)**: \n", "**Thermometer ID**: \n", "**BUILD ID**: \n", "**PLACE THE ACCESSION LABEL WITHIN THIS BOX**\n", "SAMPLE COLLECTION INSTRUCTIONS PUBLIC DRINKING WATER FOR COLIFORM BACTERIA ANALYSIS This sample kit and collection method is for public drinking water regulatory compliance and special samples. Only samples collected in bottles supplied by the Missouri State Public Health Laboratory (MSPHL) and collected in accordance with these instructions will be accepted for testing. PLEASE READ THESE INSTRUCTIONS COMPLETELY BEFORE COLLECTING SAMPLES. Sample Containers: Sample bottles from the MSPHL contain a chlorine neutralizer that is present in powder or liquid form. The bottles are sterile and ready for use when shipped. Do not rinse the contents from the container and keep the bottle closed until it is to be filled. Shrink Wrap Seal: Remove the seal by pulling down on the red strip and pealing shrink wrap from both the cap and bottle. Discard all shrink wrap. Do not attempt to reseal lid with shrink wrap still attached. Two Fill Lines: Fill the bottle until the water sample level is BETWEEN THE TWO LINES. Place the bottle on a level surface to check the sample level. Samples below the 100 mL (lower) line WILL NOT BE TESTED due to insufficient sample volume. Samples above the 120 mL (upper) line WILL NOT BE TESTED due to overfilled bottle. Technical protocol and EPA requirements dictate that bottles must have sufficient air space to add testing reagents and to mix the sample properly. If the bottle is overfilled past the 120 mL line, pour off water until the sample volume is between the two lines before shipping to MSPHL. MSPHL WILL NOT adjust sample volume once the sample is received at the lab. No Paper Label: There is no longer a label to record sample information on the bottle. DO NOT WRITE ON THE BOTTLE. Please complete a sample information form for each sample submitted for testing. DATE AND TIME OF SAMPLE COLLECTION and the BOTTLE NUMBER (from sticker on bottle) ARE REQUIRED. A form for each bottle is included in this sample kit. For More Information, please contact: Missouri Department of Health and Senior Services State Public Health Laboratory Environmental Bacteriology Unit 101 North Chestnut St., P.O. Box 570 Jefferson City, MO 65102 Phone: 573-751-3334 FAX: 573-522-4032 Email: Website: www.health.mo.gov/Lab LAB 34 Public Water (R10-2015)\n", "Bacteriological Sample Collection Procedures\n", "\n", "Assemble all of the sampling supplies. Before you begin, wash your hands thoroughly before handling supplies. Go to the sampling location(s) specified in your Missouri Department of Natural Resources (MDNR) approved sampling site plan. The sample should be taken from a clean, smooth-nosed cold water faucet if possible. Avoid drinking fountains, leaky faucets, hot/cold mixing faucets and frost-proof yard hydrants since it is not practical to sterilize these fixtures. If possible, remove any aerators, strainers or hoses that are present because they may harbor bacteria. Follow the procedures below when collecting the sample. Instructions for completing the environmental sampling form are on the following page.\n", "\n", "1. Open the cold water tap for about 3 minutes before collecting the sample. This should adequately flush the water line of any debris.\n", "\n", "2. Flame-sterilize the tap and/or chemically disinfect the tap. Do not flame-sterilize if tap is plastic or if aerators are attached. Disinfect tap by thoroughly rinsing both the inside and outside of the tap with a mixture of 50% house-hold bleach (NaOCl) and 50% tap water. Take extreme care with strong bleach (oxidizing) solutions.\n", "\n", "3. Flush the tap for an additional 3 minutes with cold water, and then reduce to a gentle flow to about the width of a pencil. Do not change the water flow once you have started sampling as this could dislodge contaminants in the tap.\n", "\n", "4. Remove the plastic shrink wrap seal by pulling down on the red strip and pealing the shrink wrap from both the cap and bottle. Discard the shrink wrap. Do not attempt to reseal the lid with shrink wrap still attached.\n", "\n", "5. Grasp cap along top edge and remove carefully. Do not touch the inside with your fingers. Hold the bottle in one hand and the cap in the other. Do not lay the cap down or put it in a pocket. Also, take care not to contaminate the sterile bottle or cap with your fingers or permit the faucet to touch the inside of the bottle.\n", "\n", "6. Hold the bottle so that water entering the bottle will not come in contact with your hands or the outside of the bottle.\n", "\n", "7. Fill the bottle until the water sample level is BETWEEN THE TWO LINES on the bottle (100 – 120 ml). Preferably, the sample level should be at or just slightly above the 100 ml line. Sample levels below the\n", "**INSTRUCTIONS FOR COMPLETING ENVIRONMENTAL SAMPLE COLLECTION FORM**\n", "\n", "**Public Drinking Water Bacterial Analysis**\n", "\n", "**PRINT LEGIBLY** using water proof ink. A standard ink pen is sufficient. Complete ALL sample information lines on the form. Some sections of the form may already be completed by the laboratory computer system when the forms are printed. To make corrections, please draw a single line through the inaccurate information and print the corrected information behind it. The sections of the form and directions for completing each line are as follows:\n", "\n", "**Order #:** For Missouri State Public Health Lab (MSPHL) purposes only. Pages in Order and Containers in Order indicate number of forms and sample bottles shipped in the sample kit order.\n", "\n", "**REPORT TO:** Public water system's name and shipping address on file with Missouri Department of Natural Resources (MDNR). Please review and correct if necessary. Result reports will be mailed to this address.\n", "\n", "**BILL TO:** Section defaulted to the MDNR. There are no charges for public water testing at the MSPHL.\n", "\n", "**Requested Analysis/Tests:** This section will state PUBLIC DRINKING WATER BACTERIAL ANALYSIS. If it does not, you may have the wrong collection form. Please contact the MSPHL or MDNR for the proper form. Do not use forms from a local county health agency as those forms are for private well water samples. Your MDNR Regional Office can provide blank forms for your use.\n", "\n", "**Complete or correct the following information:** All lines are considered required information. Failure to complete a line may result in an invalid sample.\n", "\n", "**Collected Date:** Enter the date of sample collection in the format YYYY-MM-DD. Use 4 digits for year and 2 digits for month and date. November 1, 2015 would be written as 2015-11-01.\n", "\n", "**Collected Time:** Enter the time of sample collection using 24-hour military format h:mm.\n", "\n", "**PWS ID:** If blank, enter your 7-digit Public Water System ID number as assigned by MDNR (MO############).\n", "\n", "**Facility ID:** Defaulted to DS (Distribution System) for routine samples. If submitting a sample type other than Routine, enter the Facility ID number from your system's MDNR approved sample site plan (for example DS#, WL#, WTF#).\n", "\n", "**Sample Type:** Enter one of the following options:\n", "\n", "* Routine - Regular monthly monitoring samples.\n", "* Repeat - A series of 3 or 4 repeat samples (4 if you only take 1 routine per month) must\n", "Per U.S. Environmental Protection Agency requirements, public water samples must be received by the laboratory and tested within 30 hours of the date and time of collection. The MSPHL and MDNR recommend you use the free Department of Health and Senior Services (DHSS) contract courier for overnight delivery to the MSPHL. This courier picks up at most local public health agency offices and hospitals (Note: Not all hospitals will accept water samples for courier pick up). For sample drop off locations and times, please go to http://www.health.mo.gov/lab/courierservices.php and click on the interactive map or the listing of drop off locations by county; or you may call the MSPHL courier liaison at (573) 751-4830, or the MDNR Public Drinking Water Branch (PDWB) at (573) 526-1124.\n", "\n", "Please note the courier is allowed to pick up samples within one hour of the scheduled time (before or after). The earliest pick up time is at 10:30 a.m. To ensure your samples meet the transit time requirement of 30 hours, it is important that you collect your samples in the morning and have them dropped off at the courier pickup point one hour prior to the scheduled time.\n", "\n", "Use of the U.S. Postal Service or other commercial carriers such as Fed Ex or UPS will require additional charges and may not meet the 30 hour transit time requirement.\n", "\n", "Samples should not be en route to the laboratory over a weekend or state holiday (New Year's Day, Martin Luther King Day, Lincoln's Birthday, Washington's Birthday, Truman's Birthday, Memorial Day, Independence Day, Labor Day, Columbus Day, Veteran's Day, Thanksgiving Day, and Christmas.)\n", "\n", "Public water supplies may use the new MSPHL Test Results Web Portal to retrieve preliminary test results on-line. For information on how to register as a user for the web portal and to receive email notifications, please contact the MSPHL LIMS Administrator at shondra-johnson@health.mo.gov or call 573-751-3334. These preliminary test results are for informational purposes only. Official test results are available on-line within 2 or 3 business days at the MDNR Drinking Water Watch website http://dnr.mo.gov/DWW/ In addition, the official bacteriological sample reports will be mailed by MDNR within 4 or 5 business days.\n", "\n", "Additional sample bottles can be ordered on-line at http://www.health.mo.gov/lab/specimentestforms.php or by calling the MSPHL Central Services Unit at (573\n", "\n", "Document Metadata: {'page_number': 4, 'file_name': 'PublicWaterMassMailing.pdf', 'file_path': '/root/hbr/PublicWaterMassMailing.pdf'}\n", "Document Content: Missouri Department of Health and Senior Services\n", "P.O. Box 570, Jefferson City, MO 65102-0570\n", "Phone: 573-751-6400\n", "FAX: 573-751-6010\n", "RELAY MISSOURI for Hearing and Speech Impaired 1-800-735-2966 VOICE 1-800-735-2466\n", "\n", "Peter Lyskowski\n", "Acting Director\n", "\n", "Jeremiah W. (Jay) Nixon\n", "Governor\n", "\n", "Missouri Public Water Systems\n", "\n", "November 10, 2015\n", "\n", "Dear Public Water System Owners/Operators:\n", "\n", "The Missouri State Public Health Laboratory (MSPHL) is in the process of implementing a new Laboratory Information Management System (LIMS) in its drinking water bacteriology testing laboratory. The OpenELIS (OE) LIMS will provide the laboratory with improved sample management capability, improved data integrity and reduced potential for human data entry error. In addition, the system will provide improved reporting capabilities, including direct electronic data exchange with the Missouri Department of Natural Resources' (MDNR) Safe Drinking Water Information System (SDWIS). SDWIS is the computer system MDNR uses to store regulatory water testing data and report testing results to you and the U.S. Environmental Protection Agency. In addition, the new OE LIMS will provide a web portal that MSPHL clients can use to access their own test results in real time.\n", "\n", "As the MSPHL implements this new computer system, several changes will be made in the way you collect and submit water samples for testing. This letter and information packet will provide you with information to help educate you on these changes.\n", "\n", "NEW SAMPLE BOTTLES:\n", "\n", "Beginning in August 2015, the MSPHL began using a larger sample bottle for water bacterial testing. This bottle has a shrink wrap seal and two lines to indicate the proper sample volume. Please read the attached \"SAMPLE COLLECTION INSTRUCTIONS\" for details on how to use these new bottles. Sample volume MUST be within the two lines on the bottle (100 - 120 mL) to be acceptable for testing. You may continue to use your old bottles until the MSPHL can ship you new ones. Once you have received the new bottles, please discard or recycle the old bottles.\n", "\n", "NEW SAMPLE INFORMATION FORMS:\n", "\n", "The traditional sample information \"card\" that has been used for more than twenty years is being replaced by the Environmental Sample Collection Form. An example form is attached. Please read the attached instructions for information on properly completing the new form.\n", "\n", "Changes to the form include\n", "Contract operators will be provided with forms for all the supplies they operate. Blank forms will be available for MDNR Regional Office staff use.\n", "\n", "The form requires all requested information to be printed by the collector. There are no longer check boxes for Sample Type or Repeat Location.\n", "\n", "Facility ID, Sample Collection Point ID and Location for the sampling site MUST be provided by the collector. This information is available from your MDNR approved PWS sampling plan. MDNR will be providing all public water systems with a current copy of their approved sampling plan. This information is required by SDWIS and is used by MDNR to ensure regulatory compliance requirements have been met. Failure to complete this information on the sample collection form may result in a non-compliance report from MDNR.\n", "\n", "A Collector Signature line has been added. The sample collector must sign the form to attest the information provided is accurate to the best of their knowledge.\n", "\n", "The MSPHL will begin shipping the new forms to public water systems in late November or early December. Please begin using the new forms December 16, 2015. Discard all the old forms (\"cards\") at that time.\n", "\n", "NEW SAMPLE INSTRUCTIONS:\n", "\n", "Sample instructions have been revised to include changes to the bottle and sampling form. The instructions include detailed information on how to collect the sample using the new bottle, how to complete the new sample collection form, how to best ship samples to the MSPHL using the free MSPHL courier system, and how to register for the new MSPHL web portal. A copy of these instructions is attached.\n", "\n", "NEW WEB PORTAL FOR RESULTS REPORTS\n", "\n", "The OE LIMS provides a web portal that may be used by systems to view and print their test result reports, check status of samples, download sample information into Excel, and receive automated emails when samples are received at the laboratory, and when sample results are ready to be viewed. For information on how to gain access to this portal, please contact Shondra Johnson, LIMS Administrator at Shondra.Johnson@health.mo.gov or at 573-751-3334.\n", "\n", "IMPLEMENTATION DATES:\n", "\n", "The MSPHL intends to implement the OpenELIS LIMS on December 1, 2015. There will be a two week testing period in which laboratory staff will run the new LIMS in conjunction with our current manual, paper-based system to ensure the OE LIMS is operating properly. You may continue to submit samples as you currently do, using the old sample information card, throughout this time.\n", "\n", "On December 16, 2015,\n", "If you have any questions, please contact the MSPHL Environmental Bacteriology Unit at 573-751-3334. You may also contact your MDNR Regional Office for additional information on sample collection.\n", "\n", "Once again, thank you for your patience and understanding as we implement these changes.\n", "\n", "Patrick R. Shannon\n", "\n", "Manager, Environmental Bacteriology Unit\n", "\n", "Missouri Department of Health and Senior Services\n", "\n", "State Public Health Laboratory\n", "\n", "101 North Chestnut St.\n", "\n", "P.O. Box 570\n", "\n", "Jefferson City, MO 65102\n", "\n", "Phone: 573-751-3334\n", "\n", "Email: Pat.Shannon@health.mo.gov\n", "\n", "Web: www.health.mo.gov/Lab\n", "**Order**: 984\n", "**Pages in Order**: 1 of 1\n", "**Containers in Order**: 1\n", "**REPORT TO**: ADRIAN 16 E 5TH ST ADRIAN, MO 64720\n", "**BILL TO**: MO DEPARTMENT OF NATURAL RESOURCES 1101 RIVERSIDE DRIVE JEFFERSON CITY, MO 65102\n", "**PUBLIC DRINKING WATER BACTERIAL ANALYSIS**\n", "**Total Coliform Bacteria and E. coli (Present/Absent Test)**\n", "**PRINT LEGIBLY**\n", "Instructions for completing form are supplied in the Collection Kit. For compliance monitoring questions, contact the Missouri Department of Natural Resources-Public Drinking Water Branch at (573) 751-5331 or your regional office. For laboratory test results or testing questions, contact the Missouri State Public Health Laboratory at (573) 751-3334.\n", "**Complete or correct the following information**\n", "**Collected Date**: XXXX-XX-XX\n", "**PWS Id**: MO1010001\n", "**Collected Time**: 24 hour format hhs.mm:\n", "**Facility Id**: DS\n", "**Sample Type**: routine, repeat, special, replacement, sources\n", "**Sample Collection Point Id**: sampling point id from sample site plan\n", "**Collector**: last name, first name\n", "**Sample Category**: Bacterial\n", "**Collector Phone**: 000/111-2222\n", "**Sample Location**: upstream, downstream, original source, other\n", "**Bottle Number**: \n", "**Free Chlorine**: mg/L\n", "**Total Chlorine**: \n", "**Collector Signature**: \n", "**Repeat Location**: \n", "**County**: BATES\n", "**For Laboratory Use Only -- Please do not write below this line**\n", "**Received By**: \n", "**Evidence of Tampering**: \n", "**Date Printed**: 2015-11-06\n", "**Bottles Received**: \n", "**pH**: \n", "**Evidence of Cooling**: \n", "**Temperature (Celsius)**: \n", "**Thermometer ID**: \n", "**BUILD ID**: \n", "**PLACE THE ACCESSION LABEL WITHIN THIS BOX**\n", "SAMPLE COLLECTION INSTRUCTIONS PUBLIC DRINKING WATER FOR COLIFORM BACTERIA ANALYSIS This sample kit and collection method is for public drinking water regulatory compliance and special samples. Only samples collected in bottles supplied by the Missouri State Public Health Laboratory (MSPHL) and collected in accordance with these instructions will be accepted for testing. PLEASE READ THESE INSTRUCTIONS COMPLETELY BEFORE COLLECTING SAMPLES. Sample Containers: Sample bottles from the MSPHL contain a chlorine neutralizer that is present in powder or liquid form. The bottles are sterile and ready for use when shipped. Do not rinse the contents from the container and keep the bottle closed until it is to be filled. Shrink Wrap Seal: Remove the seal by pulling down on the red strip and pealing shrink wrap from both the cap and bottle. Discard all shrink wrap. Do not attempt to reseal lid with shrink wrap still attached. Two Fill Lines: Fill the bottle until the water sample level is BETWEEN THE TWO LINES. Place the bottle on a level surface to check the sample level. Samples below the 100 mL (lower) line WILL NOT BE TESTED due to insufficient sample volume. Samples above the 120 mL (upper) line WILL NOT BE TESTED due to overfilled bottle. Technical protocol and EPA requirements dictate that bottles must have sufficient air space to add testing reagents and to mix the sample properly. If the bottle is overfilled past the 120 mL line, pour off water until the sample volume is between the two lines before shipping to MSPHL. MSPHL WILL NOT adjust sample volume once the sample is received at the lab. No Paper Label: There is no longer a label to record sample information on the bottle. DO NOT WRITE ON THE BOTTLE. Please complete a sample information form for each sample submitted for testing. DATE AND TIME OF SAMPLE COLLECTION and the BOTTLE NUMBER (from sticker on bottle) ARE REQUIRED. A form for each bottle is included in this sample kit. For More Information, please contact: Missouri Department of Health and Senior Services State Public Health Laboratory Environmental Bacteriology Unit 101 North Chestnut St., P.O. Box 570 Jefferson City, MO 65102 Phone: 573-751-3334 FAX: 573-522-4032 Email: Website: www.health.mo.gov/Lab LAB 34 Public Water (R10-2015)\n", "Bacteriological Sample Collection Procedures\n", "\n", "Assemble all of the sampling supplies. Before you begin, wash your hands thoroughly before handling supplies. Go to the sampling location(s) specified in your Missouri Department of Natural Resources (MDNR) approved sampling site plan. The sample should be taken from a clean, smooth-nosed cold water faucet if possible. Avoid drinking fountains, leaky faucets, hot/cold mixing faucets and frost-proof yard hydrants since it is not practical to sterilize these fixtures. If possible, remove any aerators, strainers or hoses that are present because they may harbor bacteria. Follow the procedures below when collecting the sample. Instructions for completing the environmental sampling form are on the following page.\n", "\n", "1. Open the cold water tap for about 3 minutes before collecting the sample. This should adequately flush the water line of any debris.\n", "\n", "2. Flame-sterilize the tap and/or chemically disinfect the tap. Do not flame-sterilize if tap is plastic or if aerators are attached. Disinfect tap by thoroughly rinsing both the inside and outside of the tap with a mixture of 50% house-hold bleach (NaOCl) and 50% tap water. Take extreme care with strong bleach (oxidizing) solutions.\n", "\n", "3. Flush the tap for an additional 3 minutes with cold water, and then reduce to a gentle flow to about the width of a pencil. Do not change the water flow once you have started sampling as this could dislodge contaminants in the tap.\n", "\n", "4. Remove the plastic shrink wrap seal by pulling down on the red strip and pealing the shrink wrap from both the cap and bottle. Discard the shrink wrap. Do not attempt to reseal the lid with shrink wrap still attached.\n", "\n", "5. Grasp cap along top edge and remove carefully. Do not touch the inside with your fingers. Hold the bottle in one hand and the cap in the other. Do not lay the cap down or put it in a pocket. Also, take care not to contaminate the sterile bottle or cap with your fingers or permit the faucet to touch the inside of the bottle.\n", "\n", "6. Hold the bottle so that water entering the bottle will not come in contact with your hands or the outside of the bottle.\n", "\n", "7. Fill the bottle until the water sample level is BETWEEN THE TWO LINES on the bottle (100 – 120 ml). Preferably, the sample level should be at or just slightly above the 100 ml line. Sample levels below the\n", "**INSTRUCTIONS FOR COMPLETING ENVIRONMENTAL SAMPLE COLLECTION FORM**\n", "\n", "**Public Drinking Water Bacterial Analysis**\n", "\n", "**PRINT LEGIBLY** using water proof ink. A standard ink pen is sufficient. Complete ALL sample information lines on the form. Some sections of the form may already be completed by the laboratory computer system when the forms are printed. To make corrections, please draw a single line through the inaccurate information and print the corrected information behind it. The sections of the form and directions for completing each line are as follows:\n", "\n", "**Order #:** For Missouri State Public Health Lab (MSPHL) purposes only. Pages in Order and Containers in Order indicate number of forms and sample bottles shipped in the sample kit order.\n", "\n", "**REPORT TO:** Public water system's name and shipping address on file with Missouri Department of Natural Resources (MDNR). Please review and correct if necessary. Result reports will be mailed to this address.\n", "\n", "**BILL TO:** Section defaulted to the MDNR. There are no charges for public water testing at the MSPHL.\n", "\n", "**Requested Analysis/Tests:** This section will state PUBLIC DRINKING WATER BACTERIAL ANALYSIS. If it does not, you may have the wrong collection form. Please contact the MSPHL or MDNR for the proper form. Do not use forms from a local county health agency as those forms are for private well water samples. Your MDNR Regional Office can provide blank forms for your use.\n", "\n", "**Complete or correct the following information:** All lines are considered required information. Failure to complete a line may result in an invalid sample.\n", "\n", "**Collected Date:** Enter the date of sample collection in the format YYYY-MM-DD. Use 4 digits for year and 2 digits for month and date. November 1, 2015 would be written as 2015-11-01.\n", "\n", "**Collected Time:** Enter the time of sample collection using 24-hour military format h:mm.\n", "\n", "**PWS ID:** If blank, enter your 7-digit Public Water System ID number as assigned by MDNR (MO############).\n", "\n", "**Facility ID:** Defaulted to DS (Distribution System) for routine samples. If submitting a sample type other than Routine, enter the Facility ID number from your system's MDNR approved sample site plan (for example DS#, WL#, WTF#).\n", "\n", "**Sample Type:** Enter one of the following options:\n", "\n", "* Routine - Regular monthly monitoring samples.\n", "* Repeat - A series of 3 or 4 repeat samples (4 if you only take 1 routine per month) must\n", "Per U.S. Environmental Protection Agency requirements, public water samples must be received by the laboratory and tested within 30 hours of the date and time of collection. The MSPHL and MDNR recommend you use the free Department of Health and Senior Services (DHSS) contract courier for overnight delivery to the MSPHL. This courier picks up at most local public health agency offices and hospitals (Note: Not all hospitals will accept water samples for courier pick up). For sample drop off locations and times, please go to http://www.health.mo.gov/lab/courierservices.php and click on the interactive map or the listing of drop off locations by county; or you may call the MSPHL courier liaison at (573) 751-4830, or the MDNR Public Drinking Water Branch (PDWB) at (573) 526-1124.\n", "\n", "Please note the courier is allowed to pick up samples within one hour of the scheduled time (before or after). The earliest pick up time is at 10:30 a.m. To ensure your samples meet the transit time requirement of 30 hours, it is important that you collect your samples in the morning and have them dropped off at the courier pickup point one hour prior to the scheduled time.\n", "\n", "Use of the U.S. Postal Service or other commercial carriers such as Fed Ex or UPS will require additional charges and may not meet the 30 hour transit time requirement.\n", "\n", "Samples should not be en route to the laboratory over a weekend or state holiday (New Year's Day, Martin Luther King Day, Lincoln's Birthday, Washington's Birthday, Truman's Birthday, Memorial Day, Independence Day, Labor Day, Columbus Day, Veteran's Day, Thanksgiving Day, and Christmas.)\n", "\n", "Public water supplies may use the new MSPHL Test Results Web Portal to retrieve preliminary test results on-line. For information on how to register as a user for the web portal and to receive email notifications, please contact the MSPHL LIMS Administrator at shondra-johnson@health.mo.gov or call 573-751-3334. These preliminary test results are for informational purposes only. Official test results are available on-line within 2 or 3 business days at the MDNR Drinking Water Watch website http://dnr.mo.gov/DWW/ In addition, the official bacteriological sample reports will be mailed by MDNR within 4 or 5 business days.\n", "\n", "Additional sample bottles can be ordered on-line at http://www.health.mo.gov/lab/specimentestforms.php or by calling the MSPHL Central Services Unit at (573\n", "\n", "Document Metadata: {'page_number': 5, 'file_name': 'PublicWaterMassMailing.pdf', 'file_path': '/root/hbr/PublicWaterMassMailing.pdf'}\n", "Document Content: Missouri Department of Health and Senior Services\n", "P.O. Box 570, Jefferson City, MO 65102-0570\n", "Phone: 573-751-6400\n", "FAX: 573-751-6010\n", "RELAY MISSOURI for Hearing and Speech Impaired 1-800-735-2966 VOICE 1-800-735-2466\n", "\n", "Peter Lyskowski\n", "Acting Director\n", "\n", "Jeremiah W. (Jay) Nixon\n", "Governor\n", "\n", "Missouri Public Water Systems\n", "\n", "November 10, 2015\n", "\n", "Dear Public Water System Owners/Operators:\n", "\n", "The Missouri State Public Health Laboratory (MSPHL) is in the process of implementing a new Laboratory Information Management System (LIMS) in its drinking water bacteriology testing laboratory. The OpenELIS (OE) LIMS will provide the laboratory with improved sample management capability, improved data integrity and reduced potential for human data entry error. In addition, the system will provide improved reporting capabilities, including direct electronic data exchange with the Missouri Department of Natural Resources' (MDNR) Safe Drinking Water Information System (SDWIS). SDWIS is the computer system MDNR uses to store regulatory water testing data and report testing results to you and the U.S. Environmental Protection Agency. In addition, the new OE LIMS will provide a web portal that MSPHL clients can use to access their own test results in real time.\n", "\n", "As the MSPHL implements this new computer system, several changes will be made in the way you collect and submit water samples for testing. This letter and information packet will provide you with information to help educate you on these changes.\n", "\n", "NEW SAMPLE BOTTLES:\n", "\n", "Beginning in August 2015, the MSPHL began using a larger sample bottle for water bacterial testing. This bottle has a shrink wrap seal and two lines to indicate the proper sample volume. Please read the attached \"SAMPLE COLLECTION INSTRUCTIONS\" for details on how to use these new bottles. Sample volume MUST be within the two lines on the bottle (100 - 120 mL) to be acceptable for testing. You may continue to use your old bottles until the MSPHL can ship you new ones. Once you have received the new bottles, please discard or recycle the old bottles.\n", "\n", "NEW SAMPLE INFORMATION FORMS:\n", "\n", "The traditional sample information \"card\" that has been used for more than twenty years is being replaced by the Environmental Sample Collection Form. An example form is attached. Please read the attached instructions for information on properly completing the new form.\n", "\n", "Changes to the form include\n", "Contract operators will be provided with forms for all the supplies they operate. Blank forms will be available for MDNR Regional Office staff use.\n", "\n", "The form requires all requested information to be printed by the collector. There are no longer check boxes for Sample Type or Repeat Location.\n", "\n", "Facility ID, Sample Collection Point ID and Location for the sampling site MUST be provided by the collector. This information is available from your MDNR approved PWS sampling plan. MDNR will be providing all public water systems with a current copy of their approved sampling plan. This information is required by SDWIS and is used by MDNR to ensure regulatory compliance requirements have been met. Failure to complete this information on the sample collection form may result in a non-compliance report from MDNR.\n", "\n", "A Collector Signature line has been added. The sample collector must sign the form to attest the information provided is accurate to the best of their knowledge.\n", "\n", "The MSPHL will begin shipping the new forms to public water systems in late November or early December. Please begin using the new forms December 16, 2015. Discard all the old forms (\"cards\") at that time.\n", "\n", "NEW SAMPLE INSTRUCTIONS:\n", "\n", "Sample instructions have been revised to include changes to the bottle and sampling form. The instructions include detailed information on how to collect the sample using the new bottle, how to complete the new sample collection form, how to best ship samples to the MSPHL using the free MSPHL courier system, and how to register for the new MSPHL web portal. A copy of these instructions is attached.\n", "\n", "NEW WEB PORTAL FOR RESULTS REPORTS\n", "\n", "The OE LIMS provides a web portal that may be used by systems to view and print their test result reports, check status of samples, download sample information into Excel, and receive automated emails when samples are received at the laboratory, and when sample results are ready to be viewed. For information on how to gain access to this portal, please contact Shondra Johnson, LIMS Administrator at Shondra.Johnson@health.mo.gov or at 573-751-3334.\n", "\n", "IMPLEMENTATION DATES:\n", "\n", "The MSPHL intends to implement the OpenELIS LIMS on December 1, 2015. There will be a two week testing period in which laboratory staff will run the new LIMS in conjunction with our current manual, paper-based system to ensure the OE LIMS is operating properly. You may continue to submit samples as you currently do, using the old sample information card, throughout this time.\n", "\n", "On December 16, 2015,\n", "If you have any questions, please contact the MSPHL Environmental Bacteriology Unit at 573-751-3334. You may also contact your MDNR Regional Office for additional information on sample collection.\n", "\n", "Once again, thank you for your patience and understanding as we implement these changes.\n", "\n", "Patrick R. Shannon\n", "\n", "Manager, Environmental Bacteriology Unit\n", "\n", "Missouri Department of Health and Senior Services\n", "\n", "State Public Health Laboratory\n", "\n", "101 North Chestnut St.\n", "\n", "P.O. Box 570\n", "\n", "Jefferson City, MO 65102\n", "\n", "Phone: 573-751-3334\n", "\n", "Email: Pat.Shannon@health.mo.gov\n", "\n", "Web: www.health.mo.gov/Lab\n", "**Order**: 984\n", "**Pages in Order**: 1 of 1\n", "**Containers in Order**: 1\n", "**REPORT TO**: ADRIAN 16 E 5TH ST ADRIAN, MO 64720\n", "**BILL TO**: MO DEPARTMENT OF NATURAL RESOURCES 1101 RIVERSIDE DRIVE JEFFERSON CITY, MO 65102\n", "**PUBLIC DRINKING WATER BACTERIAL ANALYSIS**\n", "**Total Coliform Bacteria and E. coli (Present/Absent Test)**\n", "**PRINT LEGIBLY**\n", "Instructions for completing form are supplied in the Collection Kit. For compliance monitoring questions, contact the Missouri Department of Natural Resources-Public Drinking Water Branch at (573) 751-5331 or your regional office. For laboratory test results or testing questions, contact the Missouri State Public Health Laboratory at (573) 751-3334.\n", "**Complete or correct the following information**\n", "**Collected Date**: XXXX-XX-XX\n", "**PWS Id**: MO1010001\n", "**Collected Time**: 24 hour format hhs.mm:\n", "**Facility Id**: DS\n", "**Sample Type**: routine, repeat, special, replacement, sources\n", "**Sample Collection Point Id**: sampling point id from sample site plan\n", "**Collector**: last name, first name\n", "**Sample Category**: Bacterial\n", "**Collector Phone**: 000/111-2222\n", "**Sample Location**: upstream, downstream, original source, other\n", "**Bottle Number**: \n", "**Free Chlorine**: mg/L\n", "**Total Chlorine**: \n", "**Collector Signature**: \n", "**Repeat Location**: \n", "**County**: BATES\n", "**For Laboratory Use Only -- Please do not write below this line**\n", "**Received By**: \n", "**Evidence of Tampering**: \n", "**Date Printed**: 2015-11-06\n", "**Bottles Received**: \n", "**pH**: \n", "**Evidence of Cooling**: \n", "**Temperature (Celsius)**: \n", "**Thermometer ID**: \n", "**BUILD ID**: \n", "**PLACE THE ACCESSION LABEL WITHIN THIS BOX**\n", "SAMPLE COLLECTION INSTRUCTIONS PUBLIC DRINKING WATER FOR COLIFORM BACTERIA ANALYSIS This sample kit and collection method is for public drinking water regulatory compliance and special samples. Only samples collected in bottles supplied by the Missouri State Public Health Laboratory (MSPHL) and collected in accordance with these instructions will be accepted for testing. PLEASE READ THESE INSTRUCTIONS COMPLETELY BEFORE COLLECTING SAMPLES. Sample Containers: Sample bottles from the MSPHL contain a chlorine neutralizer that is present in powder or liquid form. The bottles are sterile and ready for use when shipped. Do not rinse the contents from the container and keep the bottle closed until it is to be filled. Shrink Wrap Seal: Remove the seal by pulling down on the red strip and pealing shrink wrap from both the cap and bottle. Discard all shrink wrap. Do not attempt to reseal lid with shrink wrap still attached. Two Fill Lines: Fill the bottle until the water sample level is BETWEEN THE TWO LINES. Place the bottle on a level surface to check the sample level. Samples below the 100 mL (lower) line WILL NOT BE TESTED due to insufficient sample volume. Samples above the 120 mL (upper) line WILL NOT BE TESTED due to overfilled bottle. Technical protocol and EPA requirements dictate that bottles must have sufficient air space to add testing reagents and to mix the sample properly. If the bottle is overfilled past the 120 mL line, pour off water until the sample volume is between the two lines before shipping to MSPHL. MSPHL WILL NOT adjust sample volume once the sample is received at the lab. No Paper Label: There is no longer a label to record sample information on the bottle. DO NOT WRITE ON THE BOTTLE. Please complete a sample information form for each sample submitted for testing. DATE AND TIME OF SAMPLE COLLECTION and the BOTTLE NUMBER (from sticker on bottle) ARE REQUIRED. A form for each bottle is included in this sample kit. For More Information, please contact: Missouri Department of Health and Senior Services State Public Health Laboratory Environmental Bacteriology Unit 101 North Chestnut St., P.O. Box 570 Jefferson City, MO 65102 Phone: 573-751-3334 FAX: 573-522-4032 Email: Website: www.health.mo.gov/Lab LAB 34 Public Water (R10-2015)\n", "Bacteriological Sample Collection Procedures\n", "\n", "Assemble all of the sampling supplies. Before you begin, wash your hands thoroughly before handling supplies. Go to the sampling location(s) specified in your Missouri Department of Natural Resources (MDNR) approved sampling site plan. The sample should be taken from a clean, smooth-nosed cold water faucet if possible. Avoid drinking fountains, leaky faucets, hot/cold mixing faucets and frost-proof yard hydrants since it is not practical to sterilize these fixtures. If possible, remove any aerators, strainers or hoses that are present because they may harbor bacteria. Follow the procedures below when collecting the sample. Instructions for completing the environmental sampling form are on the following page.\n", "\n", "1. Open the cold water tap for about 3 minutes before collecting the sample. This should adequately flush the water line of any debris.\n", "\n", "2. Flame-sterilize the tap and/or chemically disinfect the tap. Do not flame-sterilize if tap is plastic or if aerators are attached. Disinfect tap by thoroughly rinsing both the inside and outside of the tap with a mixture of 50% house-hold bleach (NaOCl) and 50% tap water. Take extreme care with strong bleach (oxidizing) solutions.\n", "\n", "3. Flush the tap for an additional 3 minutes with cold water, and then reduce to a gentle flow to about the width of a pencil. Do not change the water flow once you have started sampling as this could dislodge contaminants in the tap.\n", "\n", "4. Remove the plastic shrink wrap seal by pulling down on the red strip and pealing the shrink wrap from both the cap and bottle. Discard the shrink wrap. Do not attempt to reseal the lid with shrink wrap still attached.\n", "\n", "5. Grasp cap along top edge and remove carefully. Do not touch the inside with your fingers. Hold the bottle in one hand and the cap in the other. Do not lay the cap down or put it in a pocket. Also, take care not to contaminate the sterile bottle or cap with your fingers or permit the faucet to touch the inside of the bottle.\n", "\n", "6. Hold the bottle so that water entering the bottle will not come in contact with your hands or the outside of the bottle.\n", "\n", "7. Fill the bottle until the water sample level is BETWEEN THE TWO LINES on the bottle (100 – 120 ml). Preferably, the sample level should be at or just slightly above the 100 ml line. Sample levels below the\n", "**INSTRUCTIONS FOR COMPLETING ENVIRONMENTAL SAMPLE COLLECTION FORM**\n", "\n", "**Public Drinking Water Bacterial Analysis**\n", "\n", "**PRINT LEGIBLY** using water proof ink. A standard ink pen is sufficient. Complete ALL sample information lines on the form. Some sections of the form may already be completed by the laboratory computer system when the forms are printed. To make corrections, please draw a single line through the inaccurate information and print the corrected information behind it. The sections of the form and directions for completing each line are as follows:\n", "\n", "**Order #:** For Missouri State Public Health Lab (MSPHL) purposes only. Pages in Order and Containers in Order indicate number of forms and sample bottles shipped in the sample kit order.\n", "\n", "**REPORT TO:** Public water system's name and shipping address on file with Missouri Department of Natural Resources (MDNR). Please review and correct if necessary. Result reports will be mailed to this address.\n", "\n", "**BILL TO:** Section defaulted to the MDNR. There are no charges for public water testing at the MSPHL.\n", "\n", "**Requested Analysis/Tests:** This section will state PUBLIC DRINKING WATER BACTERIAL ANALYSIS. If it does not, you may have the wrong collection form. Please contact the MSPHL or MDNR for the proper form. Do not use forms from a local county health agency as those forms are for private well water samples. Your MDNR Regional Office can provide blank forms for your use.\n", "\n", "**Complete or correct the following information:** All lines are considered required information. Failure to complete a line may result in an invalid sample.\n", "\n", "**Collected Date:** Enter the date of sample collection in the format YYYY-MM-DD. Use 4 digits for year and 2 digits for month and date. November 1, 2015 would be written as 2015-11-01.\n", "\n", "**Collected Time:** Enter the time of sample collection using 24-hour military format h:mm.\n", "\n", "**PWS ID:** If blank, enter your 7-digit Public Water System ID number as assigned by MDNR (MO############).\n", "\n", "**Facility ID:** Defaulted to DS (Distribution System) for routine samples. If submitting a sample type other than Routine, enter the Facility ID number from your system's MDNR approved sample site plan (for example DS#, WL#, WTF#).\n", "\n", "**Sample Type:** Enter one of the following options:\n", "\n", "* Routine - Regular monthly monitoring samples.\n", "* Repeat - A series of 3 or 4 repeat samples (4 if you only take 1 routine per month) must\n", "Per U.S. Environmental Protection Agency requirements, public water samples must be received by the laboratory and tested within 30 hours of the date and time of collection. The MSPHL and MDNR recommend you use the free Department of Health and Senior Services (DHSS) contract courier for overnight delivery to the MSPHL. This courier picks up at most local public health agency offices and hospitals (Note: Not all hospitals will accept water samples for courier pick up). For sample drop off locations and times, please go to http://www.health.mo.gov/lab/courierservices.php and click on the interactive map or the listing of drop off locations by county; or you may call the MSPHL courier liaison at (573) 751-4830, or the MDNR Public Drinking Water Branch (PDWB) at (573) 526-1124.\n", "\n", "Please note the courier is allowed to pick up samples within one hour of the scheduled time (before or after). The earliest pick up time is at 10:30 a.m. To ensure your samples meet the transit time requirement of 30 hours, it is important that you collect your samples in the morning and have them dropped off at the courier pickup point one hour prior to the scheduled time.\n", "\n", "Use of the U.S. Postal Service or other commercial carriers such as Fed Ex or UPS will require additional charges and may not meet the 30 hour transit time requirement.\n", "\n", "Samples should not be en route to the laboratory over a weekend or state holiday (New Year's Day, Martin Luther King Day, Lincoln's Birthday, Washington's Birthday, Truman's Birthday, Memorial Day, Independence Day, Labor Day, Columbus Day, Veteran's Day, Thanksgiving Day, and Christmas.)\n", "\n", "Public water supplies may use the new MSPHL Test Results Web Portal to retrieve preliminary test results on-line. For information on how to register as a user for the web portal and to receive email notifications, please contact the MSPHL LIMS Administrator at shondra-johnson@health.mo.gov or call 573-751-3334. These preliminary test results are for informational purposes only. Official test results are available on-line within 2 or 3 business days at the MDNR Drinking Water Watch website http://dnr.mo.gov/DWW/ In addition, the official bacteriological sample reports will be mailed by MDNR within 4 or 5 business days.\n", "\n", "Additional sample bottles can be ordered on-line at http://www.health.mo.gov/lab/specimentestforms.php or by calling the MSPHL Central Services Unit at (573\n", "\n", "Document Metadata: {'page_number': 6, 'file_name': 'PublicWaterMassMailing.pdf', 'file_path': '/root/hbr/PublicWaterMassMailing.pdf'}\n", "Document Content: Missouri Department of Health and Senior Services\n", "P.O. Box 570, Jefferson City, MO 65102-0570\n", "Phone: 573-751-6400\n", "FAX: 573-751-6010\n", "RELAY MISSOURI for Hearing and Speech Impaired 1-800-735-2966 VOICE 1-800-735-2466\n", "\n", "Peter Lyskowski\n", "Acting Director\n", "\n", "Jeremiah W. (Jay) Nixon\n", "Governor\n", "\n", "Missouri Public Water Systems\n", "\n", "November 10, 2015\n", "\n", "Dear Public Water System Owners/Operators:\n", "\n", "The Missouri State Public Health Laboratory (MSPHL) is in the process of implementing a new Laboratory Information Management System (LIMS) in its drinking water bacteriology testing laboratory. The OpenELIS (OE) LIMS will provide the laboratory with improved sample management capability, improved data integrity and reduced potential for human data entry error. In addition, the system will provide improved reporting capabilities, including direct electronic data exchange with the Missouri Department of Natural Resources' (MDNR) Safe Drinking Water Information System (SDWIS). SDWIS is the computer system MDNR uses to store regulatory water testing data and report testing results to you and the U.S. Environmental Protection Agency. In addition, the new OE LIMS will provide a web portal that MSPHL clients can use to access their own test results in real time.\n", "\n", "As the MSPHL implements this new computer system, several changes will be made in the way you collect and submit water samples for testing. This letter and information packet will provide you with information to help educate you on these changes.\n", "\n", "NEW SAMPLE BOTTLES:\n", "\n", "Beginning in August 2015, the MSPHL began using a larger sample bottle for water bacterial testing. This bottle has a shrink wrap seal and two lines to indicate the proper sample volume. Please read the attached \"SAMPLE COLLECTION INSTRUCTIONS\" for details on how to use these new bottles. Sample volume MUST be within the two lines on the bottle (100 - 120 mL) to be acceptable for testing. You may continue to use your old bottles until the MSPHL can ship you new ones. Once you have received the new bottles, please discard or recycle the old bottles.\n", "\n", "NEW SAMPLE INFORMATION FORMS:\n", "\n", "The traditional sample information \"card\" that has been used for more than twenty years is being replaced by the Environmental Sample Collection Form. An example form is attached. Please read the attached instructions for information on properly completing the new form.\n", "\n", "Changes to the form include\n", "Contract operators will be provided with forms for all the supplies they operate. Blank forms will be available for MDNR Regional Office staff use.\n", "\n", "The form requires all requested information to be printed by the collector. There are no longer check boxes for Sample Type or Repeat Location.\n", "\n", "Facility ID, Sample Collection Point ID and Location for the sampling site MUST be provided by the collector. This information is available from your MDNR approved PWS sampling plan. MDNR will be providing all public water systems with a current copy of their approved sampling plan. This information is required by SDWIS and is used by MDNR to ensure regulatory compliance requirements have been met. Failure to complete this information on the sample collection form may result in a non-compliance report from MDNR.\n", "\n", "A Collector Signature line has been added. The sample collector must sign the form to attest the information provided is accurate to the best of their knowledge.\n", "\n", "The MSPHL will begin shipping the new forms to public water systems in late November or early December. Please begin using the new forms December 16, 2015. Discard all the old forms (\"cards\") at that time.\n", "\n", "NEW SAMPLE INSTRUCTIONS:\n", "\n", "Sample instructions have been revised to include changes to the bottle and sampling form. The instructions include detailed information on how to collect the sample using the new bottle, how to complete the new sample collection form, how to best ship samples to the MSPHL using the free MSPHL courier system, and how to register for the new MSPHL web portal. A copy of these instructions is attached.\n", "\n", "NEW WEB PORTAL FOR RESULTS REPORTS\n", "\n", "The OE LIMS provides a web portal that may be used by systems to view and print their test result reports, check status of samples, download sample information into Excel, and receive automated emails when samples are received at the laboratory, and when sample results are ready to be viewed. For information on how to gain access to this portal, please contact Shondra Johnson, LIMS Administrator at Shondra.Johnson@health.mo.gov or at 573-751-3334.\n", "\n", "IMPLEMENTATION DATES:\n", "\n", "The MSPHL intends to implement the OpenELIS LIMS on December 1, 2015. There will be a two week testing period in which laboratory staff will run the new LIMS in conjunction with our current manual, paper-based system to ensure the OE LIMS is operating properly. You may continue to submit samples as you currently do, using the old sample information card, throughout this time.\n", "\n", "On December 16, 2015,\n", "If you have any questions, please contact the MSPHL Environmental Bacteriology Unit at 573-751-3334. You may also contact your MDNR Regional Office for additional information on sample collection.\n", "\n", "Once again, thank you for your patience and understanding as we implement these changes.\n", "\n", "Patrick R. Shannon\n", "\n", "Manager, Environmental Bacteriology Unit\n", "\n", "Missouri Department of Health and Senior Services\n", "\n", "State Public Health Laboratory\n", "\n", "101 North Chestnut St.\n", "\n", "P.O. Box 570\n", "\n", "Jefferson City, MO 65102\n", "\n", "Phone: 573-751-3334\n", "\n", "Email: Pat.Shannon@health.mo.gov\n", "\n", "Web: www.health.mo.gov/Lab\n", "**Order**: 984\n", "**Pages in Order**: 1 of 1\n", "**Containers in Order**: 1\n", "**REPORT TO**: ADRIAN 16 E 5TH ST ADRIAN, MO 64720\n", "**BILL TO**: MO DEPARTMENT OF NATURAL RESOURCES 1101 RIVERSIDE DRIVE JEFFERSON CITY, MO 65102\n", "**PUBLIC DRINKING WATER BACTERIAL ANALYSIS**\n", "**Total Coliform Bacteria and E. coli (Present/Absent Test)**\n", "**PRINT LEGIBLY**\n", "Instructions for completing form are supplied in the Collection Kit. For compliance monitoring questions, contact the Missouri Department of Natural Resources-Public Drinking Water Branch at (573) 751-5331 or your regional office. For laboratory test results or testing questions, contact the Missouri State Public Health Laboratory at (573) 751-3334.\n", "**Complete or correct the following information**\n", "**Collected Date**: XXXX-XX-XX\n", "**PWS Id**: MO1010001\n", "**Collected Time**: 24 hour format hhs.mm:\n", "**Facility Id**: DS\n", "**Sample Type**: routine, repeat, special, replacement, sources\n", "**Sample Collection Point Id**: sampling point id from sample site plan\n", "**Collector**: last name, first name\n", "**Sample Category**: Bacterial\n", "**Collector Phone**: 000/111-2222\n", "**Sample Location**: upstream, downstream, original source, other\n", "**Bottle Number**: \n", "**Free Chlorine**: mg/L\n", "**Total Chlorine**: \n", "**Collector Signature**: \n", "**Repeat Location**: \n", "**County**: BATES\n", "**For Laboratory Use Only -- Please do not write below this line**\n", "**Received By**: \n", "**Evidence of Tampering**: \n", "**Date Printed**: 2015-11-06\n", "**Bottles Received**: \n", "**pH**: \n", "**Evidence of Cooling**: \n", "**Temperature (Celsius)**: \n", "**Thermometer ID**: \n", "**BUILD ID**: \n", "**PLACE THE ACCESSION LABEL WITHIN THIS BOX**\n", "SAMPLE COLLECTION INSTRUCTIONS PUBLIC DRINKING WATER FOR COLIFORM BACTERIA ANALYSIS This sample kit and collection method is for public drinking water regulatory compliance and special samples. Only samples collected in bottles supplied by the Missouri State Public Health Laboratory (MSPHL) and collected in accordance with these instructions will be accepted for testing. PLEASE READ THESE INSTRUCTIONS COMPLETELY BEFORE COLLECTING SAMPLES. Sample Containers: Sample bottles from the MSPHL contain a chlorine neutralizer that is present in powder or liquid form. The bottles are sterile and ready for use when shipped. Do not rinse the contents from the container and keep the bottle closed until it is to be filled. Shrink Wrap Seal: Remove the seal by pulling down on the red strip and pealing shrink wrap from both the cap and bottle. Discard all shrink wrap. Do not attempt to reseal lid with shrink wrap still attached. Two Fill Lines: Fill the bottle until the water sample level is BETWEEN THE TWO LINES. Place the bottle on a level surface to check the sample level. Samples below the 100 mL (lower) line WILL NOT BE TESTED due to insufficient sample volume. Samples above the 120 mL (upper) line WILL NOT BE TESTED due to overfilled bottle. Technical protocol and EPA requirements dictate that bottles must have sufficient air space to add testing reagents and to mix the sample properly. If the bottle is overfilled past the 120 mL line, pour off water until the sample volume is between the two lines before shipping to MSPHL. MSPHL WILL NOT adjust sample volume once the sample is received at the lab. No Paper Label: There is no longer a label to record sample information on the bottle. DO NOT WRITE ON THE BOTTLE. Please complete a sample information form for each sample submitted for testing. DATE AND TIME OF SAMPLE COLLECTION and the BOTTLE NUMBER (from sticker on bottle) ARE REQUIRED. A form for each bottle is included in this sample kit. For More Information, please contact: Missouri Department of Health and Senior Services State Public Health Laboratory Environmental Bacteriology Unit 101 North Chestnut St., P.O. Box 570 Jefferson City, MO 65102 Phone: 573-751-3334 FAX: 573-522-4032 Email: Website: www.health.mo.gov/Lab LAB 34 Public Water (R10-2015)\n", "Bacteriological Sample Collection Procedures\n", "\n", "Assemble all of the sampling supplies. Before you begin, wash your hands thoroughly before handling supplies. Go to the sampling location(s) specified in your Missouri Department of Natural Resources (MDNR) approved sampling site plan. The sample should be taken from a clean, smooth-nosed cold water faucet if possible. Avoid drinking fountains, leaky faucets, hot/cold mixing faucets and frost-proof yard hydrants since it is not practical to sterilize these fixtures. If possible, remove any aerators, strainers or hoses that are present because they may harbor bacteria. Follow the procedures below when collecting the sample. Instructions for completing the environmental sampling form are on the following page.\n", "\n", "1. Open the cold water tap for about 3 minutes before collecting the sample. This should adequately flush the water line of any debris.\n", "\n", "2. Flame-sterilize the tap and/or chemically disinfect the tap. Do not flame-sterilize if tap is plastic or if aerators are attached. Disinfect tap by thoroughly rinsing both the inside and outside of the tap with a mixture of 50% house-hold bleach (NaOCl) and 50% tap water. Take extreme care with strong bleach (oxidizing) solutions.\n", "\n", "3. Flush the tap for an additional 3 minutes with cold water, and then reduce to a gentle flow to about the width of a pencil. Do not change the water flow once you have started sampling as this could dislodge contaminants in the tap.\n", "\n", "4. Remove the plastic shrink wrap seal by pulling down on the red strip and pealing the shrink wrap from both the cap and bottle. Discard the shrink wrap. Do not attempt to reseal the lid with shrink wrap still attached.\n", "\n", "5. Grasp cap along top edge and remove carefully. Do not touch the inside with your fingers. Hold the bottle in one hand and the cap in the other. Do not lay the cap down or put it in a pocket. Also, take care not to contaminate the sterile bottle or cap with your fingers or permit the faucet to touch the inside of the bottle.\n", "\n", "6. Hold the bottle so that water entering the bottle will not come in contact with your hands or the outside of the bottle.\n", "\n", "7. Fill the bottle until the water sample level is BETWEEN THE TWO LINES on the bottle (100 – 120 ml). Preferably, the sample level should be at or just slightly above the 100 ml line. Sample levels below the\n", "**INSTRUCTIONS FOR COMPLETING ENVIRONMENTAL SAMPLE COLLECTION FORM**\n", "\n", "**Public Drinking Water Bacterial Analysis**\n", "\n", "**PRINT LEGIBLY** using water proof ink. A standard ink pen is sufficient. Complete ALL sample information lines on the form. Some sections of the form may already be completed by the laboratory computer system when the forms are printed. To make corrections, please draw a single line through the inaccurate information and print the corrected information behind it. The sections of the form and directions for completing each line are as follows:\n", "\n", "**Order #:** For Missouri State Public Health Lab (MSPHL) purposes only. Pages in Order and Containers in Order indicate number of forms and sample bottles shipped in the sample kit order.\n", "\n", "**REPORT TO:** Public water system's name and shipping address on file with Missouri Department of Natural Resources (MDNR). Please review and correct if necessary. Result reports will be mailed to this address.\n", "\n", "**BILL TO:** Section defaulted to the MDNR. There are no charges for public water testing at the MSPHL.\n", "\n", "**Requested Analysis/Tests:** This section will state PUBLIC DRINKING WATER BACTERIAL ANALYSIS. If it does not, you may have the wrong collection form. Please contact the MSPHL or MDNR for the proper form. Do not use forms from a local county health agency as those forms are for private well water samples. Your MDNR Regional Office can provide blank forms for your use.\n", "\n", "**Complete or correct the following information:** All lines are considered required information. Failure to complete a line may result in an invalid sample.\n", "\n", "**Collected Date:** Enter the date of sample collection in the format YYYY-MM-DD. Use 4 digits for year and 2 digits for month and date. November 1, 2015 would be written as 2015-11-01.\n", "\n", "**Collected Time:** Enter the time of sample collection using 24-hour military format h:mm.\n", "\n", "**PWS ID:** If blank, enter your 7-digit Public Water System ID number as assigned by MDNR (MO############).\n", "\n", "**Facility ID:** Defaulted to DS (Distribution System) for routine samples. If submitting a sample type other than Routine, enter the Facility ID number from your system's MDNR approved sample site plan (for example DS#, WL#, WTF#).\n", "\n", "**Sample Type:** Enter one of the following options:\n", "\n", "* Routine - Regular monthly monitoring samples.\n", "* Repeat - A series of 3 or 4 repeat samples (4 if you only take 1 routine per month) must\n", "Per U.S. Environmental Protection Agency requirements, public water samples must be received by the laboratory and tested within 30 hours of the date and time of collection. The MSPHL and MDNR recommend you use the free Department of Health and Senior Services (DHSS) contract courier for overnight delivery to the MSPHL. This courier picks up at most local public health agency offices and hospitals (Note: Not all hospitals will accept water samples for courier pick up). For sample drop off locations and times, please go to http://www.health.mo.gov/lab/courierservices.php and click on the interactive map or the listing of drop off locations by county; or you may call the MSPHL courier liaison at (573) 751-4830, or the MDNR Public Drinking Water Branch (PDWB) at (573) 526-1124.\n", "\n", "Please note the courier is allowed to pick up samples within one hour of the scheduled time (before or after). The earliest pick up time is at 10:30 a.m. To ensure your samples meet the transit time requirement of 30 hours, it is important that you collect your samples in the morning and have them dropped off at the courier pickup point one hour prior to the scheduled time.\n", "\n", "Use of the U.S. Postal Service or other commercial carriers such as Fed Ex or UPS will require additional charges and may not meet the 30 hour transit time requirement.\n", "\n", "Samples should not be en route to the laboratory over a weekend or state holiday (New Year's Day, Martin Luther King Day, Lincoln's Birthday, Washington's Birthday, Truman's Birthday, Memorial Day, Independence Day, Labor Day, Columbus Day, Veteran's Day, Thanksgiving Day, and Christmas.)\n", "\n", "Public water supplies may use the new MSPHL Test Results Web Portal to retrieve preliminary test results on-line. For information on how to register as a user for the web portal and to receive email notifications, please contact the MSPHL LIMS Administrator at shondra-johnson@health.mo.gov or call 573-751-3334. These preliminary test results are for informational purposes only. Official test results are available on-line within 2 or 3 business days at the MDNR Drinking Water Watch website http://dnr.mo.gov/DWW/ In addition, the official bacteriological sample reports will be mailed by MDNR within 4 or 5 business days.\n", "\n", "Additional sample bottles can be ordered on-line at http://www.health.mo.gov/lab/specimentestforms.php or by calling the MSPHL Central Services Unit at (573\n", "\n", "Document Metadata: {'page_number': 7, 'file_name': 'PublicWaterMassMailing.pdf', 'file_path': '/root/hbr/PublicWaterMassMailing.pdf'}\n", "Document Content: Missouri Department of Health and Senior Services\n", "P.O. Box 570, Jefferson City, MO 65102-0570\n", "Phone: 573-751-6400\n", "FAX: 573-751-6010\n", "RELAY MISSOURI for Hearing and Speech Impaired 1-800-735-2966 VOICE 1-800-735-2466\n", "\n", "Peter Lyskowski\n", "Acting Director\n", "\n", "Jeremiah W. (Jay) Nixon\n", "Governor\n", "\n", "Missouri Public Water Systems\n", "\n", "November 10, 2015\n", "\n", "Dear Public Water System Owners/Operators:\n", "\n", "The Missouri State Public Health Laboratory (MSPHL) is in the process of implementing a new Laboratory Information Management System (LIMS) in its drinking water bacteriology testing laboratory. The OpenELIS (OE) LIMS will provide the laboratory with improved sample management capability, improved data integrity and reduced potential for human data entry error. In addition, the system will provide improved reporting capabilities, including direct electronic data exchange with the Missouri Department of Natural Resources' (MDNR) Safe Drinking Water Information System (SDWIS). SDWIS is the computer system MDNR uses to store regulatory water testing data and report testing results to you and the U.S. Environmental Protection Agency. In addition, the new OE LIMS will provide a web portal that MSPHL clients can use to access their own test results in real time.\n", "\n", "As the MSPHL implements this new computer system, several changes will be made in the way you collect and submit water samples for testing. This letter and information packet will provide you with information to help educate you on these changes.\n", "\n", "NEW SAMPLE BOTTLES:\n", "\n", "Beginning in August 2015, the MSPHL began using a larger sample bottle for water bacterial testing. This bottle has a shrink wrap seal and two lines to indicate the proper sample volume. Please read the attached \"SAMPLE COLLECTION INSTRUCTIONS\" for details on how to use these new bottles. Sample volume MUST be within the two lines on the bottle (100 - 120 mL) to be acceptable for testing. You may continue to use your old bottles until the MSPHL can ship you new ones. Once you have received the new bottles, please discard or recycle the old bottles.\n", "\n", "NEW SAMPLE INFORMATION FORMS:\n", "\n", "The traditional sample information \"card\" that has been used for more than twenty years is being replaced by the Environmental Sample Collection Form. An example form is attached. Please read the attached instructions for information on properly completing the new form.\n", "\n", "Changes to the form include\n", "Contract operators will be provided with forms for all the supplies they operate. Blank forms will be available for MDNR Regional Office staff use.\n", "\n", "The form requires all requested information to be printed by the collector. There are no longer check boxes for Sample Type or Repeat Location.\n", "\n", "Facility ID, Sample Collection Point ID and Location for the sampling site MUST be provided by the collector. This information is available from your MDNR approved PWS sampling plan. MDNR will be providing all public water systems with a current copy of their approved sampling plan. This information is required by SDWIS and is used by MDNR to ensure regulatory compliance requirements have been met. Failure to complete this information on the sample collection form may result in a non-compliance report from MDNR.\n", "\n", "A Collector Signature line has been added. The sample collector must sign the form to attest the information provided is accurate to the best of their knowledge.\n", "\n", "The MSPHL will begin shipping the new forms to public water systems in late November or early December. Please begin using the new forms December 16, 2015. Discard all the old forms (\"cards\") at that time.\n", "\n", "NEW SAMPLE INSTRUCTIONS:\n", "\n", "Sample instructions have been revised to include changes to the bottle and sampling form. The instructions include detailed information on how to collect the sample using the new bottle, how to complete the new sample collection form, how to best ship samples to the MSPHL using the free MSPHL courier system, and how to register for the new MSPHL web portal. A copy of these instructions is attached.\n", "\n", "NEW WEB PORTAL FOR RESULTS REPORTS\n", "\n", "The OE LIMS provides a web portal that may be used by systems to view and print their test result reports, check status of samples, download sample information into Excel, and receive automated emails when samples are received at the laboratory, and when sample results are ready to be viewed. For information on how to gain access to this portal, please contact Shondra Johnson, LIMS Administrator at Shondra.Johnson@health.mo.gov or at 573-751-3334.\n", "\n", "IMPLEMENTATION DATES:\n", "\n", "The MSPHL intends to implement the OpenELIS LIMS on December 1, 2015. There will be a two week testing period in which laboratory staff will run the new LIMS in conjunction with our current manual, paper-based system to ensure the OE LIMS is operating properly. You may continue to submit samples as you currently do, using the old sample information card, throughout this time.\n", "\n", "On December 16, 2015,\n", "If you have any questions, please contact the MSPHL Environmental Bacteriology Unit at 573-751-3334. You may also contact your MDNR Regional Office for additional information on sample collection.\n", "\n", "Once again, thank you for your patience and understanding as we implement these changes.\n", "\n", "Patrick R. Shannon\n", "\n", "Manager, Environmental Bacteriology Unit\n", "\n", "Missouri Department of Health and Senior Services\n", "\n", "State Public Health Laboratory\n", "\n", "101 North Chestnut St.\n", "\n", "P.O. Box 570\n", "\n", "Jefferson City, MO 65102\n", "\n", "Phone: 573-751-3334\n", "\n", "Email: Pat.Shannon@health.mo.gov\n", "\n", "Web: www.health.mo.gov/Lab\n", "**Order**: 984\n", "**Pages in Order**: 1 of 1\n", "**Containers in Order**: 1\n", "**REPORT TO**: ADRIAN 16 E 5TH ST ADRIAN, MO 64720\n", "**BILL TO**: MO DEPARTMENT OF NATURAL RESOURCES 1101 RIVERSIDE DRIVE JEFFERSON CITY, MO 65102\n", "**PUBLIC DRINKING WATER BACTERIAL ANALYSIS**\n", "**Total Coliform Bacteria and E. coli (Present/Absent Test)**\n", "**PRINT LEGIBLY**\n", "Instructions for completing form are supplied in the Collection Kit. For compliance monitoring questions, contact the Missouri Department of Natural Resources-Public Drinking Water Branch at (573) 751-5331 or your regional office. For laboratory test results or testing questions, contact the Missouri State Public Health Laboratory at (573) 751-3334.\n", "**Complete or correct the following information**\n", "**Collected Date**: XXXX-XX-XX\n", "**PWS Id**: MO1010001\n", "**Collected Time**: 24 hour format hhs.mm:\n", "**Facility Id**: DS\n", "**Sample Type**: routine, repeat, special, replacement, sources\n", "**Sample Collection Point Id**: sampling point id from sample site plan\n", "**Collector**: last name, first name\n", "**Sample Category**: Bacterial\n", "**Collector Phone**: 000/111-2222\n", "**Sample Location**: upstream, downstream, original source, other\n", "**Bottle Number**: \n", "**Free Chlorine**: mg/L\n", "**Total Chlorine**: \n", "**Collector Signature**: \n", "**Repeat Location**: \n", "**County**: BATES\n", "**For Laboratory Use Only -- Please do not write below this line**\n", "**Received By**: \n", "**Evidence of Tampering**: \n", "**Date Printed**: 2015-11-06\n", "**Bottles Received**: \n", "**pH**: \n", "**Evidence of Cooling**: \n", "**Temperature (Celsius)**: \n", "**Thermometer ID**: \n", "**BUILD ID**: \n", "**PLACE THE ACCESSION LABEL WITHIN THIS BOX**\n", "SAMPLE COLLECTION INSTRUCTIONS PUBLIC DRINKING WATER FOR COLIFORM BACTERIA ANALYSIS This sample kit and collection method is for public drinking water regulatory compliance and special samples. Only samples collected in bottles supplied by the Missouri State Public Health Laboratory (MSPHL) and collected in accordance with these instructions will be accepted for testing. PLEASE READ THESE INSTRUCTIONS COMPLETELY BEFORE COLLECTING SAMPLES. Sample Containers: Sample bottles from the MSPHL contain a chlorine neutralizer that is present in powder or liquid form. The bottles are sterile and ready for use when shipped. Do not rinse the contents from the container and keep the bottle closed until it is to be filled. Shrink Wrap Seal: Remove the seal by pulling down on the red strip and pealing shrink wrap from both the cap and bottle. Discard all shrink wrap. Do not attempt to reseal lid with shrink wrap still attached. Two Fill Lines: Fill the bottle until the water sample level is BETWEEN THE TWO LINES. Place the bottle on a level surface to check the sample level. Samples below the 100 mL (lower) line WILL NOT BE TESTED due to insufficient sample volume. Samples above the 120 mL (upper) line WILL NOT BE TESTED due to overfilled bottle. Technical protocol and EPA requirements dictate that bottles must have sufficient air space to add testing reagents and to mix the sample properly. If the bottle is overfilled past the 120 mL line, pour off water until the sample volume is between the two lines before shipping to MSPHL. MSPHL WILL NOT adjust sample volume once the sample is received at the lab. No Paper Label: There is no longer a label to record sample information on the bottle. DO NOT WRITE ON THE BOTTLE. Please complete a sample information form for each sample submitted for testing. DATE AND TIME OF SAMPLE COLLECTION and the BOTTLE NUMBER (from sticker on bottle) ARE REQUIRED. A form for each bottle is included in this sample kit. For More Information, please contact: Missouri Department of Health and Senior Services State Public Health Laboratory Environmental Bacteriology Unit 101 North Chestnut St., P.O. Box 570 Jefferson City, MO 65102 Phone: 573-751-3334 FAX: 573-522-4032 Email: Website: www.health.mo.gov/Lab LAB 34 Public Water (R10-2015)\n", "Bacteriological Sample Collection Procedures\n", "\n", "Assemble all of the sampling supplies. Before you begin, wash your hands thoroughly before handling supplies. Go to the sampling location(s) specified in your Missouri Department of Natural Resources (MDNR) approved sampling site plan. The sample should be taken from a clean, smooth-nosed cold water faucet if possible. Avoid drinking fountains, leaky faucets, hot/cold mixing faucets and frost-proof yard hydrants since it is not practical to sterilize these fixtures. If possible, remove any aerators, strainers or hoses that are present because they may harbor bacteria. Follow the procedures below when collecting the sample. Instructions for completing the environmental sampling form are on the following page.\n", "\n", "1. Open the cold water tap for about 3 minutes before collecting the sample. This should adequately flush the water line of any debris.\n", "\n", "2. Flame-sterilize the tap and/or chemically disinfect the tap. Do not flame-sterilize if tap is plastic or if aerators are attached. Disinfect tap by thoroughly rinsing both the inside and outside of the tap with a mixture of 50% house-hold bleach (NaOCl) and 50% tap water. Take extreme care with strong bleach (oxidizing) solutions.\n", "\n", "3. Flush the tap for an additional 3 minutes with cold water, and then reduce to a gentle flow to about the width of a pencil. Do not change the water flow once you have started sampling as this could dislodge contaminants in the tap.\n", "\n", "4. Remove the plastic shrink wrap seal by pulling down on the red strip and pealing the shrink wrap from both the cap and bottle. Discard the shrink wrap. Do not attempt to reseal the lid with shrink wrap still attached.\n", "\n", "5. Grasp cap along top edge and remove carefully. Do not touch the inside with your fingers. Hold the bottle in one hand and the cap in the other. Do not lay the cap down or put it in a pocket. Also, take care not to contaminate the sterile bottle or cap with your fingers or permit the faucet to touch the inside of the bottle.\n", "\n", "6. Hold the bottle so that water entering the bottle will not come in contact with your hands or the outside of the bottle.\n", "\n", "7. Fill the bottle until the water sample level is BETWEEN THE TWO LINES on the bottle (100 – 120 ml). Preferably, the sample level should be at or just slightly above the 100 ml line. Sample levels below the\n", "**INSTRUCTIONS FOR COMPLETING ENVIRONMENTAL SAMPLE COLLECTION FORM**\n", "\n", "**Public Drinking Water Bacterial Analysis**\n", "\n", "**PRINT LEGIBLY** using water proof ink. A standard ink pen is sufficient. Complete ALL sample information lines on the form. Some sections of the form may already be completed by the laboratory computer system when the forms are printed. To make corrections, please draw a single line through the inaccurate information and print the corrected information behind it. The sections of the form and directions for completing each line are as follows:\n", "\n", "**Order #:** For Missouri State Public Health Lab (MSPHL) purposes only. Pages in Order and Containers in Order indicate number of forms and sample bottles shipped in the sample kit order.\n", "\n", "**REPORT TO:** Public water system's name and shipping address on file with Missouri Department of Natural Resources (MDNR). Please review and correct if necessary. Result reports will be mailed to this address.\n", "\n", "**BILL TO:** Section defaulted to the MDNR. There are no charges for public water testing at the MSPHL.\n", "\n", "**Requested Analysis/Tests:** This section will state PUBLIC DRINKING WATER BACTERIAL ANALYSIS. If it does not, you may have the wrong collection form. Please contact the MSPHL or MDNR for the proper form. Do not use forms from a local county health agency as those forms are for private well water samples. Your MDNR Regional Office can provide blank forms for your use.\n", "\n", "**Complete or correct the following information:** All lines are considered required information. Failure to complete a line may result in an invalid sample.\n", "\n", "**Collected Date:** Enter the date of sample collection in the format YYYY-MM-DD. Use 4 digits for year and 2 digits for month and date. November 1, 2015 would be written as 2015-11-01.\n", "\n", "**Collected Time:** Enter the time of sample collection using 24-hour military format h:mm.\n", "\n", "**PWS ID:** If blank, enter your 7-digit Public Water System ID number as assigned by MDNR (MO############).\n", "\n", "**Facility ID:** Defaulted to DS (Distribution System) for routine samples. If submitting a sample type other than Routine, enter the Facility ID number from your system's MDNR approved sample site plan (for example DS#, WL#, WTF#).\n", "\n", "**Sample Type:** Enter one of the following options:\n", "\n", "* Routine - Regular monthly monitoring samples.\n", "* Repeat - A series of 3 or 4 repeat samples (4 if you only take 1 routine per month) must\n", "Per U.S. Environmental Protection Agency requirements, public water samples must be received by the laboratory and tested within 30 hours of the date and time of collection. The MSPHL and MDNR recommend you use the free Department of Health and Senior Services (DHSS) contract courier for overnight delivery to the MSPHL. This courier picks up at most local public health agency offices and hospitals (Note: Not all hospitals will accept water samples for courier pick up). For sample drop off locations and times, please go to http://www.health.mo.gov/lab/courierservices.php and click on the interactive map or the listing of drop off locations by county; or you may call the MSPHL courier liaison at (573) 751-4830, or the MDNR Public Drinking Water Branch (PDWB) at (573) 526-1124.\n", "\n", "Please note the courier is allowed to pick up samples within one hour of the scheduled time (before or after). The earliest pick up time is at 10:30 a.m. To ensure your samples meet the transit time requirement of 30 hours, it is important that you collect your samples in the morning and have them dropped off at the courier pickup point one hour prior to the scheduled time.\n", "\n", "Use of the U.S. Postal Service or other commercial carriers such as Fed Ex or UPS will require additional charges and may not meet the 30 hour transit time requirement.\n", "\n", "Samples should not be en route to the laboratory over a weekend or state holiday (New Year's Day, Martin Luther King Day, Lincoln's Birthday, Washington's Birthday, Truman's Birthday, Memorial Day, Independence Day, Labor Day, Columbus Day, Veteran's Day, Thanksgiving Day, and Christmas.)\n", "\n", "Public water supplies may use the new MSPHL Test Results Web Portal to retrieve preliminary test results on-line. For information on how to register as a user for the web portal and to receive email notifications, please contact the MSPHL LIMS Administrator at shondra-johnson@health.mo.gov or call 573-751-3334. These preliminary test results are for informational purposes only. Official test results are available on-line within 2 or 3 business days at the MDNR Drinking Water Watch website http://dnr.mo.gov/DWW/ In addition, the official bacteriological sample reports will be mailed by MDNR within 4 or 5 business days.\n", "\n", "Additional sample bottles can be ordered on-line at http://www.health.mo.gov/lab/specimentestforms.php or by calling the MSPHL Central Services Unit at (573\n", "\n", "Document Metadata: {'page_number': 8, 'file_name': 'PublicWaterMassMailing.pdf', 'file_path': '/root/hbr/PublicWaterMassMailing.pdf'}\n" ] } ], "source": [ "@component\n", "class PdfToTextOrOCR:\n", " def run(self, documents: List[Path]) -> List[Document]:\n", " processed_documents = []\n", " \n", " for pdf_path in documents:\n", " if is_scanned_pdf(pdf_path):\n", " print(f\"Performing OCR on scanned PDF: {pdf_path}\")\n", " extracted_text, ocr_metadata = perform_ocr_on_pdf(pdf_path)\n", " \n", " # Create Haystack Document object with extracted content and metadata per page.\n", " for meta in ocr_metadata:\n", " doc = Document(content=extracted_text, meta=meta)\n", " processed_documents.append(doc)\n", " else:\n", " print(f\"Extracting text from non-scanned PDF: {pdf_path}\")\n", " with open(pdf_path, \"rb\") as f:\n", " reader = PdfReader(f)\n", " extracted_text = \"\\n\".join([page.extract_text() for page in reader.pages])\n", " \n", " # Create Haystack Document object with extracted content and metadata.\n", " doc = Document(content=extracted_text, meta={\n", " \"file_name\": Path(pdf_path).name,\n", " \"file_path\": str(pdf_path),\n", " \"source_type\": \"pdf\"\n", " })\n", " processed_documents.append(doc)\n", " \n", " return processed_documents # Return list of Document objects directly\n", "\n", "# Test function for PdfToTextOrOCR\n", "def test_pdf_to_text_or_ocr():\n", " # Sample PDF file paths (replace with actual file paths on your system)\n", " pdf_files = [Path(\"/root/hbr/PublicWaterMassMailing.pdf\")]\n", "\n", " # Initialize the component\n", " pdf_to_text_or_ocr = PdfToTextOrOCR()\n", "\n", " # Run the component\n", " documents = pdf_to_text_or_ocr.run(pdf_files)\n", "\n", " # Print results\n", " for doc in documents:\n", " print(f\"Document Content: {doc.content}\")\n", " print(f\"Document Metadata: {doc.meta}\")\n", "\n", "# Run the test\n", "test_pdf_to_text_or_ocr()" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [], "source": [ "\n", "document_store = QdrantDocumentStore(\n", " host=\"0.0.0.0\",\n", " port=6333,\n", " index=\"_test\",\n", " recreate_index=False,\n", " use_sparse_embeddings=True,\n", " sparse_idf=True,\n", " embedding_dim=768 # Adjust based on your DeepInfra model's dimension\n", ")\n" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Document 1:\n", "Content: Document(id=37d0166926cefc87821ad1ae66ecada0d2a72324f90e8816e6efd78283bc993d, content: 'Harvard Business Review Analytic Services is an independent commercial research unit within Harvard ...', meta: {'file_name': 'HBR_Synthetic_Data.pdf', 'file_path': '/root/hbr/HBR_Synthetic_Data.pdf', 'page_number': 1, 'source_id': 'c5673577ed0630f0cb66291cb909ff1eacbeae86d81bea949471539268058aa5', 'split_id': 0, 'split_idx_start': 0, '_split_overlap': []}, embedding: vector of size 768, sparse_embedding: vector with 143 non-zero elements)\n", "Document 2:\n", "Content: Document(id=07b6945c2a65348c4c02255c196391acba22d36171e9f0b45109b161ce414535, content: 'Harvard Business Review Analytic Services\n", "7Briefng Paper | The Executives Guide to Accelerating Arti...', meta: {'file_name': 'HBR_Synthetic_Data.pdf', 'file_path': '/root/hbr/HBR_Synthetic_Data.pdf', 'page_number': 1, 'source_id': '632cdeded3f3fa7ea36f7598134a4c0323cd063ded84537e9bdf3ba16e963394', 'split_id': 0, 'split_idx_start': 0, '_split_overlap': []}, embedding: vector of size 768, sparse_embedding: vector with 149 non-zero elements)\n", "Document 3:\n", "Content: Document(id=064bb1421bdb620a4f74f54002927b8dd4d802b74b62abe6f78eace2fbe5dac6, content: 'synthetic data generation project have learned and mastered new technology. This is how we unlock fu...', meta: {'file_name': 'HBR_Synthetic_Data.pdf', 'file_path': '/root/hbr/HBR_Synthetic_Data.pdf', 'page_number': 1, 'source_id': '2ccd4634b1962d931b5d6267c1c6e2db61d6b406c6c57aeb1296372afefee6af', 'split_id': 1, 'split_idx_start': 1794, '_split_overlap': [{'doc_id': 'fce0983f19329441eb661e7afeddcd86154b443870269436766d7a92e2726b4e', 'range': [1794, 1970]}, {'doc_id': 'cdf1bc90fb9a8a5e5f7b0bd3293c86575c51d02dd2da00facff0f444b75d6c4b', 'range': [0, 167]}]}, embedding: vector of size 768, sparse_embedding: vector with 184 non-zero elements)\n", "Document 4:\n", "Content: Document(id=7a898cf65eb0843c5b8eba0ae13ff1e3afa6971bd2150ef645b8ba27cb3ec604, content: 'But its a process because its a rather new thing. For him, trust goes hand in hand with organization...', meta: {'file_name': 'HBR_Synthetic_Data.pdf', 'file_path': '/root/hbr/HBR_Synthetic_Data.pdf', 'page_number': 1, 'source_id': 'fe672fd1f6c6ce570133bdedff7745ff4d99765beafe60ebc3fefa29c7792a42', 'split_id': 1, 'split_idx_start': 1716, '_split_overlap': [{'doc_id': '1dde0328b3d8ae4b1902886b6c5ebba9e9891a72ff49ec8afc0580a0f7046218', 'range': [1716, 1877]}, {'doc_id': '7bbb109e793d05927e42cde821ebda29fa4cf2db76b150acf1ec56d935f2a54f', 'range': [0, 197]}]}, embedding: vector of size 768, sparse_embedding: vector with 168 non-zero elements)\n", "Document 5:\n", "Content: Document(id=826c2d987c4ebe393a3f3f8a9b08752a0eafea8f188214f125f9151fdbe61b82, content: 'Harvard Business Review Analytic Services\n", "8Briefng Paper | The Executives Guide to Accelerating Arti...', meta: {'file_name': 'HBR_Synthetic_Data.pdf', 'file_path': '/root/hbr/HBR_Synthetic_Data.pdf', 'page_number': 1, 'source_id': '6134dee2da25d7015932dd4eb5186948d2fcc46cda2a3574ccce2f103bd93f1a', 'split_id': 0, 'split_idx_start': 0, '_split_overlap': []}, embedding: vector of size 768, sparse_embedding: vector with 163 non-zero elements)\n", "Document 6:\n", "Content: Document(id=81a378ae43e3f3ad9f0937a57fcba99137dcb262aab5b19680936548574c74ff, content: 'Harvard Business Review Analytic Services\n", "2Briefng Paper | The Executives Guide to Accelerating Arti...', meta: {'file_name': 'HBR_Synthetic_Data.pdf', 'file_path': '/root/hbr/HBR_Synthetic_Data.pdf', 'page_number': 1, 'source_id': '6407fd1614ea07d5717b715cfaed09885fc764f3170f83ee6b8d23b342d64293', 'split_id': 0, 'split_idx_start': 0, '_split_overlap': [{'doc_id': '5f69281a39ad9b9b7ee3894107f84215b61bf6595b5bee40aff0c157976db755', 'range': [0, 184]}]}, embedding: vector of size 768, sparse_embedding: vector with 169 non-zero elements)\n", "Document 7:\n", "Content: Document(id=f229f476aee43a8fa22cdb9d3c7940f1a5633f63a915e1c1d3f92543a4e28895, content: 'well as data sharing and time to data (and therefore time to market). It can speed up the analytics ...', meta: {'file_name': 'HBR_Synthetic_Data.pdf', 'file_path': '/root/hbr/HBR_Synthetic_Data.pdf', 'page_number': 1, 'source_id': '71b660fb882f89326ebe4ecbcfe7bed44549dea1a0ec9b8ac542956df25ddc30', 'split_id': 1, 'split_idx_start': 1834, '_split_overlap': [{'doc_id': 'a3e2782aada9265cd9790893359a02fda4a9c251960c4903193b24e06eee6126', 'range': [1834, 2015]}]}, embedding: vector of size 768, sparse_embedding: vector with 150 non-zero elements)\n", "Document 8:\n", "Content: Document(id=fe65f8a615d9d166331c8b050533315d17615afedbbe6b483cac8f75eebcc33b, content: 'REPORTS The OE LIMS provides a web portal that may be used by systems to view and print their test r...', meta: {'file_name': 'PublicWaterMassMailing-1_removed.pdf', 'file_path': '/root/hbr/PublicWaterMassMailing-1_removed.pdf', 'page_number': 1, 'source_id': 'd67ece661298f799e10ab412ad04a8251a5de49fbaf10d6379df37c87ca6a693', 'split_id': 1, 'split_idx_start': 1632, '_split_overlap': [{'doc_id': '65807fbafe034b4c918dbc80453db48578fafa45e562f3741ec80ea66ee2392a', 'range': [1632, 1805]}]}, embedding: vector of size 768, sparse_embedding: vector with 153 non-zero elements)\n", "Document 9:\n", "Content: Document(id=cdf1bc90fb9a8a5e5f7b0bd3293c86575c51d02dd2da00facff0f444b75d6c4b, content: 'you take phone data, its mostly collected from the younger generation, so if you apply it to seniors...', meta: {'file_name': 'HBR_Synthetic_Data.pdf', 'file_path': '/root/hbr/HBR_Synthetic_Data.pdf', 'page_number': 1, 'source_id': '2ccd4634b1962d931b5d6267c1c6e2db61d6b406c6c57aeb1296372afefee6af', 'split_id': 2, 'split_idx_start': 3455, '_split_overlap': [{'doc_id': '064bb1421bdb620a4f74f54002927b8dd4d802b74b62abe6f78eace2fbe5dac6', 'range': [1661, 1828]}]}, embedding: vector of size 768, sparse_embedding: vector with 194 non-zero elements)\n", "Document 10:\n", "Content: Document(id=1dde0328b3d8ae4b1902886b6c5ebba9e9891a72ff49ec8afc0580a0f7046218, content: 'Harvard Business Review Analytic Services\n", "4Briefng Paper | The Executives Guide to Accelerating Arti...', meta: {'file_name': 'HBR_Synthetic_Data.pdf', 'file_path': '/root/hbr/HBR_Synthetic_Data.pdf', 'page_number': 1, 'source_id': 'fe672fd1f6c6ce570133bdedff7745ff4d99765beafe60ebc3fefa29c7792a42', 'split_id': 0, 'split_idx_start': 0, '_split_overlap': [{'doc_id': '7a898cf65eb0843c5b8eba0ae13ff1e3afa6971bd2150ef645b8ba27cb3ec604', 'range': [0, 161]}]}, embedding: vector of size 768, sparse_embedding: vector with 156 non-zero elements)\n", "Document 11:\n", "Content: Document(id=5f69281a39ad9b9b7ee3894107f84215b61bf6595b5bee40aff0c157976db755, content: 'information about your health care. There is plenty of data to mine in medical records; there are in...', meta: {'file_name': 'HBR_Synthetic_Data.pdf', 'file_path': '/root/hbr/HBR_Synthetic_Data.pdf', 'page_number': 1, 'source_id': '6407fd1614ea07d5717b715cfaed09885fc764f3170f83ee6b8d23b342d64293', 'split_id': 1, 'split_idx_start': 1723, '_split_overlap': [{'doc_id': '81a378ae43e3f3ad9f0937a57fcba99137dcb262aab5b19680936548574c74ff', 'range': [1723, 1907]}, {'doc_id': '54f105fe2f2d3d7d67e347a6b24660ae1feca604fc886b2af5bc61f8336779ca', 'range': [0, 194]}]}, embedding: vector of size 768, sparse_embedding: vector with 150 non-zero elements)\n", "Document 12:\n", "Content: Document(id=41d65df62a595d5d3d38ee1e4528e67379ab7a883e115633a58412fdf250c9b5, content: 'AI and can be used to modify existing datasets, e.g., to correct for present biases. With the upcomi...', meta: {'file_name': 'HBR_Synthetic_Data.pdf', 'file_path': '/root/hbr/HBR_Synthetic_Data.pdf', 'page_number': 1, 'source_id': 'e190887fc3099366dcc8979ba5bc82bc4ca743411656a2fe8dc08fcedc09489e', 'split_id': 1, 'split_idx_start': 1896, '_split_overlap': [{'doc_id': '0e2a87b1d98a159ca80536c750b5936630c8701c36ac6d7a94af171329fcf159', 'range': [1896, 2078]}]}, embedding: vector of size 768, sparse_embedding: vector with 137 non-zero elements)\n", "Document 13:\n", "Content: Document(id=a3e2782aada9265cd9790893359a02fda4a9c251960c4903193b24e06eee6126, content: 'H I G H L I G H T S\n", "1Briefng Paper | The Executives Guide to Accelerating Artifcial Intelligence and...', meta: {'file_name': 'HBR_Synthetic_Data.pdf', 'file_path': '/root/hbr/HBR_Synthetic_Data.pdf', 'page_number': 1, 'source_id': '71b660fb882f89326ebe4ecbcfe7bed44549dea1a0ec9b8ac542956df25ddc30', 'split_id': 0, 'split_idx_start': 0, '_split_overlap': [{'doc_id': 'f229f476aee43a8fa22cdb9d3c7940f1a5633f63a915e1c1d3f92543a4e28895', 'range': [0, 181]}]}, embedding: vector of size 768, sparse_embedding: vector with 162 non-zero elements)\n", "Document 14:\n", "Content: Document(id=7bbb109e793d05927e42cde821ebda29fa4cf2db76b150acf1ec56d935f2a54f, content: 'either between different departments within the organization or with a third party, was not possible...', meta: {'file_name': 'HBR_Synthetic_Data.pdf', 'file_path': '/root/hbr/HBR_Synthetic_Data.pdf', 'page_number': 1, 'source_id': 'fe672fd1f6c6ce570133bdedff7745ff4d99765beafe60ebc3fefa29c7792a42', 'split_id': 2, 'split_idx_start': 3313, '_split_overlap': [{'doc_id': '7a898cf65eb0843c5b8eba0ae13ff1e3afa6971bd2150ef645b8ba27cb3ec604', 'range': [1597, 1794]}]}, embedding: vector of size 768, sparse_embedding: vector with 185 non-zero elements)\n", "Document 15:\n", "Content: Document(id=0e2a87b1d98a159ca80536c750b5936630c8701c36ac6d7a94af171329fcf159, content: 'Tobias Hann\n", "CEO MOSTLY AIOrganizations today face a challenging environment where its easy to fall b...', meta: {'file_name': 'HBR_Synthetic_Data.pdf', 'file_path': '/root/hbr/HBR_Synthetic_Data.pdf', 'page_number': 1, 'source_id': 'e190887fc3099366dcc8979ba5bc82bc4ca743411656a2fe8dc08fcedc09489e', 'split_id': 0, 'split_idx_start': 0, '_split_overlap': [{'doc_id': '41d65df62a595d5d3d38ee1e4528e67379ab7a883e115633a58412fdf250c9b5', 'range': [0, 182]}]}, embedding: vector of size 768, sparse_embedding: vector with 187 non-zero elements)\n", "Document 16:\n", "Content: Document(id=54f105fe2f2d3d7d67e347a6b24660ae1feca604fc886b2af5bc61f8336779ca, content: 'Sicular, a research vice president at Gartner. But as we saw during Covid-19, a lot of models stoppe...', meta: {'file_name': 'HBR_Synthetic_Data.pdf', 'file_path': '/root/hbr/HBR_Synthetic_Data.pdf', 'page_number': 1, 'source_id': '6407fd1614ea07d5717b715cfaed09885fc764f3170f83ee6b8d23b342d64293', 'split_id': 2, 'split_idx_start': 3258, '_split_overlap': [{'doc_id': '5f69281a39ad9b9b7ee3894107f84215b61bf6595b5bee40aff0c157976db755', 'range': [1535, 1729]}]}, embedding: vector of size 768, sparse_embedding: vector with 164 non-zero elements)\n", "Document 17:\n", "Content: Document(id=65807fbafe034b4c918dbc80453db48578fafa45e562f3741ec80ea66ee2392a, content: 'Contract operators will be provided with forms for all the supplies they operate. Blank forms will b...', meta: {'file_name': 'PublicWaterMassMailing-1_removed.pdf', 'file_path': '/root/hbr/PublicWaterMassMailing-1_removed.pdf', 'page_number': 1, 'source_id': 'd67ece661298f799e10ab412ad04a8251a5de49fbaf10d6379df37c87ca6a693', 'split_id': 0, 'split_idx_start': 0, '_split_overlap': [{'doc_id': 'fe65f8a615d9d166331c8b050533315d17615afedbbe6b483cac8f75eebcc33b', 'range': [0, 173]}]}, embedding: vector of size 768, sparse_embedding: vector with 140 non-zero elements)\n", "Document 18:\n", "Content: Document(id=fce0983f19329441eb661e7afeddcd86154b443870269436766d7a92e2726b4e, content: 'Harvard Business Review Analytic Services\n", "6Briefng Paper | The Executives Guide to Accelerating Arti...', meta: {'file_name': 'HBR_Synthetic_Data.pdf', 'file_path': '/root/hbr/HBR_Synthetic_Data.pdf', 'page_number': 1, 'source_id': '2ccd4634b1962d931b5d6267c1c6e2db61d6b406c6c57aeb1296372afefee6af', 'split_id': 0, 'split_idx_start': 0, '_split_overlap': [{'doc_id': '064bb1421bdb620a4f74f54002927b8dd4d802b74b62abe6f78eace2fbe5dac6', 'range': [0, 176]}]}, embedding: vector of size 768, sparse_embedding: vector with 176 non-zero elements)\n", "Document 19:\n", "Content: Document(id=168136a2746a1e693be2b847e5ee6f790489692b762af23110a703f5b7b3b835, content: 'Sponsored byB R I E F I N G P A P E R\n", "The Executives Guide to Accelerating Articial Intelligence and...', meta: {'file_name': 'HBR_Synthetic_Data.pdf', 'file_path': '/root/hbr/HBR_Synthetic_Data.pdf', 'page_number': 1, 'source_id': 'db28019880736a01edd5316e51442eee6d7aa3459b980f9a47cd0623310ce116', 'split_id': 0, 'split_idx_start': 0, '_split_overlap': []}, embedding: vector of size 768, sparse_embedding: vector with 92 non-zero elements)\n", "Document 20:\n", "Content: Document(id=f7a970be01dcdb8060ef9f7919900178b4d8a15c8ec65177776d0d9b87a5c693, content: '**Missouri Department of Health and Senior Services** P.O. Box 570, Jefferson City, MO 65102-0570\n", "Ph...', meta: {'file_name': 'PublicWaterMassMailing-1_removed.pdf', 'file_path': '/root/hbr/PublicWaterMassMailing-1_removed.pdf', 'page_number': 1, 'source_id': '9bdb1812f23e69c3630fa01aa86e152a6869c7615944dcf523c7d2f498e243a1', 'split_id': 0, 'split_idx_start': 0, '_split_overlap': [{'doc_id': 'ac8baa090d17a8b0e945f318cbb33c94ecd2be7b819b46a6c453950a14ec0c2e', 'range': [0, 149]}]}, embedding: vector of size 768, sparse_embedding: vector with 87 non-zero elements)\n", "Document 21:\n", "Content: Document(id=ac8baa090d17a8b0e945f318cbb33c94ecd2be7b819b46a6c453950a14ec0c2e, content: 'two lines on the bottle (100 - 120 mL) to be acceptable for testing. You may continue to use your ol...', meta: {'file_name': 'PublicWaterMassMailing-1_removed.pdf', 'file_path': '/root/hbr/PublicWaterMassMailing-1_removed.pdf', 'page_number': 1, 'source_id': '9bdb1812f23e69c3630fa01aa86e152a6869c7615944dcf523c7d2f498e243a1', 'split_id': 1, 'split_idx_start': 1854, '_split_overlap': [{'doc_id': 'f7a970be01dcdb8060ef9f7919900178b4d8a15c8ec65177776d0d9b87a5c693', 'range': [1854, 2003]}]}, embedding: vector of size 768, sparse_embedding: vector with 138 non-zero elements)\n" ] } ], "source": [ "# Retrieve and print the top 100 records from Qdrant using get_documents_generator\n", "top_documents = list(document_store.get_documents_generator())[:100]\n", "\n", "# Print the content and metadata of each document\n", "for i, doc in enumerate(top_documents):\n", " print(f\"Document {i+1}:\")\n", " print(f\"Content: {doc}\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Calculating sparse embeddings: 100%|██████████| 21/21 [00:06<00:00, 3.10it/s]\n", "100it [00:00, 532.35it/s] " ] }, { "name": "stdout", "output_type": "stream", "text": [ "Indexed documents in Qdrant. Total documents: 21\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "from haystack import Pipeline, component\n", "from haystack.components.converters import PyPDFToDocument\n", "from haystack.components.preprocessors import DocumentSplitter\n", "from haystack_integrations.document_stores.qdrant import QdrantDocumentStore\n", "from haystack.components.writers import DocumentWriter\n", "from haystack.components.preprocessors import DocumentCleaner\n", "from haystack.document_stores.types import DuplicatePolicy\n", "from haystack_integrations.components.embedders.fastembed import (\n", " FastembedSparseDocumentEmbedder,\n", " FastembedDocumentEmbedder\n", ")\n", "from haystack.dataclasses import Document\n", "from typing import List\n", "import os\n", "from pathlib import Path\n", "import numpy as np\n", "from openai import OpenAI\n", "import base64\n", "from pdf2image import convert_from_path\n", "from PyPDF2 import PdfReader\n", "from io import BytesIO\n", "\n", "# Initialize OpenAI client for DeepInfra embeddings and OCR\n", "openai = OpenAI(\n", " api_key=\"XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL\",\n", " base_url=\"https://api.deepinfra.com/v1/openai\",\n", ")\n", "\n", "# Function to check if PDF is scanned (no extractable text)\n", "def is_scanned_pdf(pdf_path):\n", " try:\n", " with open(pdf_path, \"rb\") as f:\n", " pdf_reader = PdfReader(f)\n", " for page in pdf_reader.pages:\n", " if page.extract_text():\n", " return False # Text found, not a scanned document\n", " return True # No extractable text found, likely a scanned document\n", " except Exception as e:\n", " raise ValueError(f\"Error reading PDF file: {e}\")\n", "\n", "# Function to convert PDF pages to images and perform OCR using DeepInfra Vision model\n", "def perform_ocr_on_pdf(pdf_path):\n", " try:\n", " images = convert_from_path(pdf_path)\n", " ocr_metadata = []\n", "\n", " for i, image in enumerate(images):\n", " # Convert image to base64 for OCR processing\n", " buffered = BytesIO()\n", " image.save(buffered, format=\"JPEG\")\n", " base64_image = base64.b64encode(buffered.getvalue()).decode('utf-8')\n", "\n", " # Perform OCR using DeepInfra Vision model\n", " chat_completion = openai.chat.completions.create(\n", " model=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n", " messages=[\n", " {\n", " \"role\": \"user\",\n", " \"content\": [\n", " {\n", " \"type\": \"image_url\",\n", " \"image_url\": {\n", " \"url\": f\"data:image/jpeg;base64,{base64_image}\"\n", " }\n", " },\n", " {\n", " \"type\": \"text\",\n", " \"text\": \"only respond with all the extracted text from the document and nothing else\"\n", " }\n", " ]\n", " }\n", " ]\n", " )\n", "\n", " # Extract text for this specific page\n", " ocr_page_text = chat_completion.choices[0].message.content\n", "\n", " # Append OCR result for this page along with correct page number metadata\n", " ocr_metadata.append({\n", " \"page_number\": i + 1, # Correctly set page number (i starts at 0)\n", " \"file_name\": Path(pdf_path).name,\n", " \"file_path\": str(pdf_path),\n", " \"content\": ocr_page_text # Store the text per page\n", " })\n", "\n", " return ocr_metadata\n", " \n", " except Exception as e:\n", " raise ValueError(f\"Error during OCR processing: {e}\")\n", " \n", " \n", " \n", "def extract_text_from_non_scanned_pdf(pdf_path):\n", " try:\n", " with open(pdf_path, \"rb\") as f:\n", " reader = PdfReader(f)\n", " extracted_pages = []\n", " \n", " for i, page in enumerate(reader.pages):\n", " extracted_text = page.extract_text()\n", " \n", " # Append extracted text along with correct page number metadata\n", " extracted_pages.append({\n", " \"page_number\": i + 1, # Correctly set page number (i starts at 0)\n", " \"file_name\": Path(pdf_path).name,\n", " \"file_path\": str(pdf_path),\n", " \"content\": extracted_text\n", " })\n", " \n", " return extracted_pages\n", " \n", " except Exception as e:\n", " raise ValueError(f\"Error during PDF text extraction: {e}\")\n", " \n", " \n", "\n", "@component\n", "class PdfToTextOrOCR:\n", " @component.output_types(documents=List[Document])\n", " def run(self, documents: List[Path]) -> dict:\n", " processed_documents = []\n", " \n", " for pdf_path in documents:\n", " if is_scanned_pdf(pdf_path):\n", " ocr_metadata = perform_ocr_on_pdf(pdf_path)\n", " for meta in ocr_metadata:\n", " doc = Document(content=meta['content'], meta={\n", " \"file_name\": meta[\"file_name\"],\n", " \"file_path\": meta[\"file_path\"],\n", " \"page_number\": meta[\"page_number\"] # Include page number here\n", " })\n", " processed_documents.append(doc)\n", " \n", " else:\n", " extracted_pages = extract_text_from_non_scanned_pdf(pdf_path)\n", " for meta in extracted_pages:\n", " doc = Document(content=meta['content'], meta={\n", " \"file_name\": meta[\"file_name\"],\n", " \"file_path\": meta[\"file_path\"],\n", " \"page_number\": meta[\"page_number\"] # Include page number here\n", " })\n", " processed_documents.append(doc)\n", " \n", " return {\"documents\": processed_documents}\n", "\n", "@component\n", "class DeepInfraEmbeddings:\n", " def __init__(\n", " self,\n", " api_key: str = \"XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL\",\n", " model_name: str = \"BAAI/bge-base-en-v1.5\",\n", " base_url: str = \"https://api.deepinfra.com/v1/openai\"\n", " ):\n", " self.client = OpenAI(\n", " api_key=api_key,\n", " base_url=base_url\n", " )\n", " self.model_name = model_name\n", "\n", " @component.output_types(documents=List[Document])\n", " def run(self, documents: List[Document]) -> dict:\n", " texts = [doc.content for doc in documents]\n", " \n", " response = self.client.embeddings.create(\n", " model=self.model_name,\n", " input=texts,\n", " encoding_format=\"float\"\n", " )\n", " \n", " embeddings = [np.array(embedding.embedding) for embedding in response.data]\n", " \n", " for doc, embedding in zip(documents, embeddings):\n", " doc.embedding = embedding\n", " \n", " return {\"documents\": documents}\n", "\n", " def to_dict(self):\n", " return {\n", " \"api_key\": self.client.api_key,\n", " \"model_name\": self.model_name,\n", " \"base_url\": self.client.base_url\n", " }\n", "\n", " @classmethod\n", " def from_dict(cls, data):\n", " return cls(\n", " api_key=data[\"api_key\"],\n", " model_name=data[\"model_name\"],\n", " base_url=data[\"base_url\"]\n", " )\n", "\n", "# Initialize Qdrant document store as before\n", "document_store = QdrantDocumentStore(\n", " host=\"0.0.0.0\",\n", " port=6333,\n", " index=\"_test\",\n", " recreate_index=False,\n", " use_sparse_embeddings=True,\n", " sparse_idf=True,\n", " embedding_dim=768 # Adjust based on your DeepInfra model's dimension\n", ")\n", "\n", "# Initialize other components as before\n", "cleaner = DocumentCleaner(\n", " ascii_only=True,\n", " remove_empty_lines=True,\n", " remove_extra_whitespaces=True,\n", ")\n", "\n", "document_splitter = DocumentSplitter(\n", " split_by=\"word\",\n", " split_length=300,\n", " split_overlap=30,\n", ")\n", "\n", "# Initialize embedders (DeepInfra and Fastembed)\n", "deep_infra_embedder = DeepInfraEmbeddings()\n", "sparse_embedder = FastembedSparseDocumentEmbedder(model=\"prithvida/Splade_PP_en_v1\")\n", "\n", "writer = DocumentWriter(\n", " document_store=document_store,\n", " policy=DuplicatePolicy.OVERWRITE,\n", ")\n", "\n", "# Create and configure the pipeline with new PdfToTextOrOCR component\n", "indexing_pipeline = Pipeline()\n", "indexing_pipeline.add_component(\"pdf_to_text_or_ocr\", PdfToTextOrOCR())\n", "indexing_pipeline.add_component(\"cleaner\", cleaner)\n", "indexing_pipeline.add_component(\"splitter\", document_splitter)\n", "indexing_pipeline.add_component(\"deep_infra_embedder\", deep_infra_embedder)\n", "indexing_pipeline.add_component(\"sparse_embedder\", sparse_embedder)\n", "indexing_pipeline.add_component(\"writer\", writer)\n", "\n", "# Connect components in the pipeline flow \n", "indexing_pipeline.connect(\"pdf_to_text_or_ocr\", \"cleaner\")\n", "indexing_pipeline.connect(\"cleaner\", \"splitter\")\n", "indexing_pipeline.connect(\"splitter\", \"sparse_embedder\")\n", "indexing_pipeline.connect(\"sparse_embedder\", \"deep_infra_embedder\")\n", "indexing_pipeline.connect(\"deep_infra_embedder\", \"writer\")\n", "\n", "# Get list of PDF files and prepare metadata as before\n", "pdf_folder = \"/root/hbr\"\n", "pdf_files = [Path(os.path.join(pdf_folder, filename)) for filename in os.listdir(pdf_folder) if filename.endswith('.pdf')]\n", "\n", "# Run the pipeline on all PDFs in folder (with automatic OCR handling)\n", "indexing_pipeline.run({\"documents\": pdf_files})\n", "\n", "print(f\"Indexed documents in Qdrant. Total documents: {document_store.count_documents()}\")\n" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Document ID: 37d0166926cefc87821ad1ae66ecada0d2a72324f90e8816e6efd78283bc993d\n", "File Name: HBR_Synthetic_Data.pdf\n", "Page Number: 1\n", "Content: Harvard Business Review Analytic Services is an independent commercial research unit within Harvard Business Review Group, conducting research and comparative analysis on important management challeng\n", "Document ID: 07b6945c2a65348c4c02255c196391acba22d36171e9f0b45109b161ce414535\n", "File Name: HBR_Synthetic_Data.pdf\n", "Page Number: 1\n", "Content: Harvard Business Review Analytic Services\n", "7Briefng Paper | The Executives Guide to Accelerating Artifcial Intelligence and Data Innovation with Synthetic DataWith recognition of synthetic datas abilit\n", "Document ID: 064bb1421bdb620a4f74f54002927b8dd4d802b74b62abe6f78eace2fbe5dac6\n", "File Name: HBR_Synthetic_Data.pdf\n", "Page Number: 1\n", "Content: synthetic data generation project have learned and mastered new technology. This is how we unlock further innovation, and it helps us to attract top talent to join and stay at Humana. Top talent value\n", "Document ID: 7a898cf65eb0843c5b8eba0ae13ff1e3afa6971bd2150ef645b8ba27cb3ec604\n", "File Name: HBR_Synthetic_Data.pdf\n", "Page Number: 1\n", "Content: But its a process because its a rather new thing. For him, trust goes hand in hand with organizational issues. To make it really work, you need to position this new technology within the existing work\n", "Document ID: 826c2d987c4ebe393a3f3f8a9b08752a0eafea8f188214f125f9151fdbe61b82\n", "File Name: HBR_Synthetic_Data.pdf\n", "Page Number: 1\n", "Content: Harvard Business Review Analytic Services\n", "8Briefng Paper | The Executives Guide to Accelerating Artifcial Intelligence and Data Innovation with Synthetic DataEndnotes\n", "1 Saul Judah, Andrew White, Svetl\n" ] } ], "source": [ "documents = list(document_store.get_documents_generator())\n", "if documents:\n", " for doc in documents[:5]: # Check a few documents\n", " print(f\"Document ID: {doc.id}\")\n", " print(f\"File Name: {doc.meta.get('file_name')}\")\n", " print(f\"Page Number: {doc.meta.get('page_number')}\")\n", " print(\"Content:\", doc.content[:200])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# fast api non ocr code" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import os\n", "from pathlib import Path\n", "from typing import List\n", "import numpy as np\n", "from fastapi import FastAPI, File, UploadFile, HTTPException\n", "from fastapi.responses import JSONResponse\n", "from haystack import Pipeline, component\n", "from haystack.components.converters import PyPDFToDocument\n", "from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner\n", "from haystack_integrations.document_stores.qdrant import QdrantDocumentStore\n", "from haystack.components.writers import DocumentWriter\n", "from haystack.document_stores.types import DuplicatePolicy\n", "from haystack.dataclasses import Document\n", "from openai import OpenAI\n", "\n", "# Define the FastAPI app instance\n", "app = FastAPI()\n", "\n", "# Define custom embedding component using DeepInfra API\n", "@component\n", "class DeepInfraEmbeddings:\n", " def __init__(\n", " self,\n", " api_key: str = \"XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL\", # Replace with your actual API key\n", " model_name: str = \"BAAI/bge-base-en-v1.5\",\n", " base_url: str = \"https://api.deepinfra.com/v1/openai\"\n", " ):\n", " self.client = OpenAI(\n", " api_key=api_key,\n", " base_url=base_url\n", " )\n", " self.model_name = model_name\n", "\n", " @component.output_types(documents=List[Document])\n", " def run(self, documents: List[Document]) -> dict:\n", " texts = [doc.content for doc in documents]\n", " \n", " # Get embeddings from DeepInfra API\n", " response = self.client.embeddings.create(\n", " model=self.model_name,\n", " input=texts,\n", " encoding_format=\"float\"\n", " )\n", " \n", " embeddings = [np.array(embedding.embedding) for embedding in response.data]\n", " \n", " # Assign embeddings to each document\n", " for doc, embedding in zip(documents, embeddings):\n", " doc.embedding = embedding\n", " \n", " return {\"documents\": documents}\n", "\n", " def to_dict(self):\n", " return {\n", " \"api_key\": self.client.api_key,\n", " \"model_name\": self.model_name,\n", " \"base_url\": self.client.base_url\n", " }\n", "\n", " @classmethod\n", " def from_dict(cls, data):\n", " return cls(\n", " api_key=data[\"api_key\"],\n", " model_name=data[\"model_name\"],\n", " base_url=data[\"base_url\"]\n", " )\n", "\n", "# Initialize Qdrant document store for indexing documents\n", "document_store = QdrantDocumentStore(\n", " host=\"0.0.0.0\", # Adjust host as needed (e.g., localhost or container IP)\n", " port=6333,\n", " index=\"aaa_test\", # Name of the index in Qdrant\n", " recreate_index=True, # Recreate index if it exists (useful for development)\n", " use_sparse_embeddings=True, # Enable hybrid search with sparse embeddings\n", " sparse_idf=True, # Enable IDF calculation for sparse embeddings\n", " embedding_dim=768 # Adjust according to your model's embedding dimensions (DeepInfra model)\n", ")\n", "\n", "# Initialize document cleaner to preprocess text data\n", "cleaner = DocumentCleaner(\n", " ascii_only=True,\n", " remove_empty_lines=True,\n", " remove_extra_whitespaces=True,\n", " remove_repeated_substrings=False # Keep repeated substrings if necessary for context retention\n", ")\n", "\n", "# Initialize document converter for PDF files (can be expanded to other formats)\n", "converter = PyPDFToDocument()\n", "\n", "# Split large documents into smaller chunks for better indexing and retrieval performance\n", "document_splitter = DocumentSplitter(\n", " split_by=\"word\",\n", " split_length=300, # Split after every 300 words \n", " split_overlap=30 # Overlap by 30 words between chunks to maintain context continuity \n", ")\n", "\n", "# Configure embedders: DeepInfra for dense embeddings and Fastembed for sparse embeddings.\n", "deep_infra_embedder = DeepInfraEmbeddings()\n", "sparse_embedder = FastembedSparseDocumentEmbedder(model=\"prithvida/Splade_PP_en_v1\")\n", "\n", "# Create a document writer to write processed documents into the Qdrant document store.\n", "writer = DocumentWriter(\n", " document_store=document_store,\n", " policy=DuplicatePolicy.OVERWRITE # Overwrite existing documents with the same ID if they exist.\n", ")\n", "\n", "# Create and configure the pipeline with all components.\n", "indexing_pipeline = Pipeline()\n", "indexing_pipeline.add_component(\"converter\", converter)\n", "indexing_pipeline.add_component(\"cleaner\", cleaner)\n", "indexing_pipeline.add_component(\"splitter\", document_splitter)\n", "indexing_pipeline.add_component(\"deep_infra_embedder\", deep_infra_embedder)\n", "indexing_pipeline.add_component(\"sparse_embedder\", sparse_embedder)\n", "indexing_pipeline.add_component(\"writer\", writer)\n", "\n", "# Connect components in the pipeline.\n", "indexing_pipeline.connect(\"converter\", \"cleaner\")\n", "indexing_pipeline.connect(\"cleaner\", \"splitter\")\n", "indexing_pipeline.connect(\"splitter\", \"sparse_embedder\")\n", "indexing_pipeline.connect(\"sparse_embedder\", \"deep_infra_embedder\")\n", "indexing_pipeline.connect(\"deep_infra_embedder\", \"writer\")\n", "\n", "# Define a temporary folder to store uploaded files before processing.\n", "TEMP_UPLOAD_DIR = \"/tmp/uploaded_files\"\n", "os.makedirs(TEMP_UPLOAD_DIR, exist_ok=True)\n", "\n", "@app.post(\"/upload\")\n", "async def upload_files(files: List[UploadFile] = File(...)):\n", " \"\"\"\n", " Endpoint to handle file uploads from the frontend.\n", " \n", " Args:\n", " files (List[UploadFile]): List of files uploaded by the user.\n", "\n", " Returns:\n", " JSONResponse: Success or failure message.\n", " \"\"\"\n", " \n", " try:\n", " file_paths = []\n", " metadata_list = []\n", "\n", " # Save each uploaded file temporarily and prepare metadata.\n", " for file in files:\n", " file_path = Path(TEMP_UPLOAD_DIR) / file.filename\n", " \n", " with open(file_path, \"wb\") as f:\n", " f.write(await file.read()) # Save file contents\n", " \n", " file_paths.append(file_path)\n", " \n", " metadata_list.append({\n", " \"file_name\": file.filename,\n", " \"file_path\": str(file_path),\n", " \"source_type\": \"pdf\" if file.filename.endswith('.pdf') else 'other'\n", " })\n", "\n", " # Run the ingestion pipeline with the uploaded files and their metadata.\n", " indexing_pipeline.run({\n", " \"converter\": {\n", " \"sources\": file_paths,\n", " \"meta\": metadata_list # Pass metadata along with documents for indexing.\n", " }\n", " })\n", "\n", " return JSONResponse(content={\"message\": f\"Successfully indexed {len(files)} documents.\"})\n", "\n", " except Exception as e:\n", " raise HTTPException(status_code=500, detail=str(e))\n", "\n", "@app.get(\"/documents\")\n", "async def get_documents():\n", " \"\"\"\n", " Endpoint to retrieve all indexed documents from Qdrant.\n", "\n", " Returns:\n", " JSONResponse: List of indexed documents with metadata.\n", " \"\"\"\n", " \n", " try:\n", " documents = document_store.filter_documents(filters={})\n", " \n", " if not documents:\n", " return JSONResponse(content={\"message\": \"No documents found.\"})\n", " \n", " return JSONResponse(content={\n", " \"documents\": [{\n", " \"file_name\": doc.meta.get(\"file_name\"),\n", " \"file_path\": doc.meta.get(\"file_path\")\n", " } for doc in documents]\n", " })\n", " \n", " except Exception as e:\n", " raise HTTPException(status_code=500, detail=str(e))\n", "\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "ERROR:asyncio:Task exception was never retrieved\n", "future: exception=KeyboardInterrupt()>\n", "Traceback (most recent call last):\n", " File \"/opt/conda/envs/py38/lib/python3.10/site-packages/uvicorn/main.py\", line 579, in run\n", " server.run()\n", " File \"/opt/conda/envs/py38/lib/python3.10/site-packages/uvicorn/server.py\", line 65, in run\n", " return asyncio.run(self.serve(sockets=sockets))\n", " File \"/opt/conda/envs/py38/lib/python3.10/site-packages/nest_asyncio.py\", line 30, in run\n", " return loop.run_until_complete(task)\n", " File \"/opt/conda/envs/py38/lib/python3.10/site-packages/nest_asyncio.py\", line 92, in run_until_complete\n", " self._run_once()\n", " File \"/opt/conda/envs/py38/lib/python3.10/site-packages/nest_asyncio.py\", line 133, in _run_once\n", " handle._run()\n", " File \"/opt/conda/envs/py38/lib/python3.10/asyncio/events.py\", line 80, in _run\n", " self._context.run(self._callback, *self._args)\n", " File \"/opt/conda/envs/py38/lib/python3.10/asyncio/tasks.py\", line 315, in __wakeup\n", " self.__step()\n", " File \"/opt/conda/envs/py38/lib/python3.10/asyncio/tasks.py\", line 232, in __step\n", " result = coro.send(None)\n", " File \"/opt/conda/envs/py38/lib/python3.10/site-packages/uvicorn/server.py\", line 68, in serve\n", " with self.capture_signals():\n", " File \"/opt/conda/envs/py38/lib/python3.10/contextlib.py\", line 142, in __exit__\n", " next(self.gen)\n", " File \"/opt/conda/envs/py38/lib/python3.10/site-packages/uvicorn/server.py\", line 332, in capture_signals\n", " signal.raise_signal(captured_signal)\n", "KeyboardInterrupt\n", "INFO: Started server process [45225]\n", "INFO: Waiting for application startup.\n", "INFO: Application startup complete.\n", "INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO: 10.240.1.166:0 - \"GET / HTTP/1.1\" 404 Not Found\n", "INFO: 10.240.1.166:0 - \"GET /docs HTTP/1.1\" 200 OK\n", "INFO: 10.240.1.166:0 - \"GET /openapi.json HTTP/1.1\" 200 OK\n", "INFO: 10.240.1.166:0 - \"POST /upload HTTP/1.1\" 400 Bad Request\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO: Shutting down\n", "INFO: Waiting for application shutdown.\n", "INFO: Application shutdown complete.\n", "INFO: Finished server process [45225]\n" ] } ], "source": [ "import os\n", "from pathlib import Path\n", "from typing import List\n", "import numpy as np\n", "from fastapi import FastAPI, File, UploadFile, HTTPException\n", "from fastapi.responses import JSONResponse\n", "from haystack import Pipeline, component\n", "from haystack.components.converters import PyPDFToDocument\n", "from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner\n", "from haystack_integrations.document_stores.qdrant import QdrantDocumentStore\n", "from haystack.components.writers import DocumentWriter\n", "from haystack.document_stores.types import DuplicatePolicy\n", "from haystack.dataclasses import Document\n", "from openai import OpenAI\n", "\n", "# Import necessary libraries for running FastAPI inside Jupyter Notebook\n", "import nest_asyncio\n", "import uvicorn\n", "\n", "# Patch asyncio event loop for Jupyter Notebook compatibility\n", "nest_asyncio.apply()\n", "\n", "# Define the FastAPI app instance\n", "app = FastAPI()\n", "\n", "# Define custom embedding component using DeepInfra API\n", "@component\n", "class DeepInfraEmbeddings:\n", " def __init__(\n", " self,\n", " api_key: str = \"XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL\", # Replace with your actual API key\n", " model_name: str = \"BAAI/bge-base-en-v1.5\",\n", " base_url: str = \"https://api.deepinfra.com/v1/openai\"\n", " ):\n", " self.client = OpenAI(\n", " api_key=api_key,\n", " base_url=base_url\n", " )\n", " self.model_name = model_name\n", "\n", " @component.output_types(documents=List[Document])\n", " def run(self, documents: List[Document]) -> dict:\n", " texts = [doc.content for doc in documents]\n", " \n", " # Get embeddings from DeepInfra API\n", " response = self.client.embeddings.create(\n", " model=self.model_name,\n", " input=texts,\n", " encoding_format=\"float\"\n", " )\n", " \n", " embeddings = [np.array(embedding.embedding) for embedding in response.data]\n", " \n", " # Assign embeddings to each document\n", " for doc, embedding in zip(documents, embeddings):\n", " doc.embedding = embedding\n", " \n", " return {\"documents\": documents}\n", "\n", " def to_dict(self):\n", " return {\n", " \"api_key\": self.client.api_key,\n", " \"model_name\": self.model_name,\n", " \"base_url\": self.client.base_url\n", " }\n", "\n", " @classmethod\n", " def from_dict(cls, data):\n", " return cls(\n", " api_key=data[\"api_key\"],\n", " model_name=data[\"model_name\"],\n", " base_url=data[\"base_url\"]\n", " )\n", "\n", "# Initialize Qdrant document store for indexing documents\n", "document_store = QdrantDocumentStore(\n", " host=\"0.0.0.0\", # Adjust host as needed (e.g., localhost or container IP)\n", " port=6333,\n", " index=\"aaa_test\", # Name of the index in Qdrant\n", " recreate_index=True, # Recreate index if it exists (useful for development)\n", " use_sparse_embeddings=True, # Enable hybrid search with sparse embeddings\n", " sparse_idf=True, # Enable IDF calculation for sparse embeddings\n", " embedding_dim=768 # Adjust according to your model's embedding dimensions (DeepInfra model)\n", ")\n", "\n", "# Initialize document cleaner to preprocess text data\n", "cleaner = DocumentCleaner(\n", " ascii_only=True,\n", " remove_empty_lines=True,\n", " remove_extra_whitespaces=True,\n", " remove_repeated_substrings=False # Keep repeated substrings if necessary for context retention\n", ")\n", "\n", "# Initialize document converter for PDF files (can be expanded to other formats)\n", "converter = PyPDFToDocument()\n", "\n", "# Split large documents into smaller chunks for better indexing and retrieval performance\n", "document_splitter = DocumentSplitter(\n", " split_by=\"word\",\n", " split_length=300, # Split after every 300 words \n", " split_overlap=30 # Overlap by 30 words between chunks to maintain context continuity \n", ")\n", "\n", "# Configure embedders: DeepInfra for dense embeddings and Fastembed for sparse embeddings.\n", "deep_infra_embedder = DeepInfraEmbeddings()\n", "sparse_embedder = FastembedSparseDocumentEmbedder(model=\"prithvida/Splade_PP_en_v1\")\n", "\n", "# Create a document writer to write processed documents into the Qdrant document store.\n", "writer = DocumentWriter(\n", " document_store=document_store,\n", " policy=DuplicatePolicy.OVERWRITE # Overwrite existing documents with the same ID if they exist.\n", ")\n", "\n", "# Create and configure the pipeline with all components.\n", "indexing_pipeline = Pipeline()\n", "indexing_pipeline.add_component(\"converter\", converter)\n", "indexing_pipeline.add_component(\"cleaner\", cleaner)\n", "indexing_pipeline.add_component(\"splitter\", document_splitter)\n", "indexing_pipeline.add_component(\"deep_infra_embedder\", deep_infra_embedder)\n", "indexing_pipeline.add_component(\"sparse_embedder\", sparse_embedder)\n", "indexing_pipeline.add_component(\"writer\", writer)\n", "\n", "# Connect components in the pipeline.\n", "indexing_pipeline.connect(\"converter\", \"cleaner\")\n", "indexing_pipeline.connect(\"cleaner\", \"splitter\")\n", "indexing_pipeline.connect(\"splitter\", \"sparse_embedder\")\n", "indexing_pipeline.connect(\"sparse_embedder\", \"deep_infra_embedder\")\n", "indexing_pipeline.connect(\"deep_infra_embedder\", \"writer\")\n", "\n", "# Define a temporary folder to store uploaded files before processing.\n", "TEMP_UPLOAD_DIR = \"/tmp/uploaded_files\"\n", "os.makedirs(TEMP_UPLOAD_DIR, exist_ok=True)\n", "\n", "@app.post(\"/upload\")\n", "async def upload_files(files: List[UploadFile] = File(...)):\n", " \"\"\"\n", " Endpoint to handle file uploads from the frontend.\n", " \n", " Args:\n", " files (List[UploadFile]): List of files uploaded by the user.\n", "\n", " Returns:\n", " JSONResponse: Success or failure message.\n", " \"\"\"\n", " \n", " try:\n", " file_paths = []\n", " metadata_list = []\n", "\n", " # Save each uploaded file temporarily and prepare metadata.\n", " for file in files:\n", " file_path = Path(TEMP_UPLOAD_DIR) / file.filename\n", " \n", " with open(file_path, \"wb\") as f:\n", " f.write(await file.read()) # Save file contents\n", " \n", " file_paths.append(file_path)\n", " \n", " metadata_list.append({\n", " \"file_name\": file.filename,\n", " \"file_path\": str(file_path),\n", " \"source_type\": \"pdf\" if file.filename.endswith('.pdf') else 'other'\n", " })\n", "\n", " # Run the ingestion pipeline with the uploaded files and their metadata.\n", " indexing_pipeline.run({\n", " \"converter\": {\n", " \"sources\": file_paths,\n", " \"meta\": metadata_list # Pass metadata along with documents for indexing.\n", " }\n", " })\n", "\n", " return JSONResponse(content={\"message\": f\"Successfully indexed {len(files)} documents.\"})\n", "\n", " except Exception as e:\n", " raise HTTPException(status_code=500, detail=str(e))\n", "\n", "@app.get(\"/documents\")\n", "async def get_documents():\n", " \"\"\"\n", " Endpoint to retrieve all indexed documents from Qdrant.\n", "\n", " Returns:\n", " JSONResponse: List of indexed documents with metadata.\n", " \"\"\"\n", " \n", " try:\n", " documents = document_store.filter_documents(filters={})\n", " \n", " if not documents:\n", " return JSONResponse(content={\"message\": \"No documents found.\"})\n", " \n", " return JSONResponse(content={\n", " \"documents\": [{\n", " \"file_name\": doc.meta.get(\"file_name\"),\n", " \"file_path\": doc.meta.get(\"file_path\")\n", " } for doc in documents]\n", " })\n", " \n", " except Exception as e:\n", " raise HTTPException(status_code=500, detail=str(e))\n", "\n", "\n", "# Start Uvicorn server within Jupyter Notebook on port 8000.\n", "if __name__ == \"__main__\":\n", " uvicorn.run(app, host=\"0.0.0.0\", port=8000)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%bash\n", "uvicorn main:app --reload --host 0.0.0.0 --port 8000\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "py38", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.15" } }, "nbformat": 4, "nbformat_minor": 2 }