Anas Bader committed
Commit 4cbe4e9 · 1 Parent(s): c0d5f87
.gitignore ADDED
@@ -0,0 +1,16 @@
+ .env*
+ myenv*
+ pyproject.toml
+
+ .env*
+ !.env.example
+ myenv*
+ pyproject.toml
+ test.*
+ .conda
+ docs/*
+ __pycache__/
+ .vscode
+ certif_extraction/certifs_md
+
+ *.log
Dockerfile ADDED
@@ -0,0 +1,85 @@
+ FROM ubuntu:22.04
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     curl \
+     openjdk-11-jdk \
+     python3 \
+     python3-pip \
+     wget \
+     apt-transport-https \
+     gnupg \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Install Elasticsearch
+ ENV ES_VERSION=8.8.0
+ RUN curl -O https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-${ES_VERSION}-linux-x86_64.tar.gz && \
+     tar -xzf elasticsearch-${ES_VERSION}-linux-x86_64.tar.gz && \
+     mv elasticsearch-${ES_VERSION} /usr/share/elasticsearch && \
+     rm elasticsearch-${ES_VERSION}-linux-x86_64.tar.gz
+
+ # Create elasticsearch.yml with proper YAML format
+ RUN echo "discovery.type: single-node" > /usr/share/elasticsearch/config/elasticsearch.yml && \
+     echo "xpack.security.enabled: false" >> /usr/share/elasticsearch/config/elasticsearch.yml && \
+     echo "network.host: 0.0.0.0" >> /usr/share/elasticsearch/config/elasticsearch.yml
+
+ # Set Elasticsearch environment variables
+ ENV ES_JAVA_OPTS="-Xms1g -Xmx1g"
+
+ # Create non-root user for running the services
+ RUN useradd -m -u 1000 appuser
+ RUN mkdir -p /app /usr/share/elasticsearch/data && \
+     chown -R appuser:appuser /app /usr/share/elasticsearch
+
+ # Create app directory
+ WORKDIR /app
+
+ # Copy the project files
+ COPY --chown=appuser:appuser app.py streamlit.py requirements.txt ./
+ COPY --chown=appuser:appuser chunking ./chunking
+ COPY --chown=appuser:appuser embeddings ./embeddings
+ COPY --chown=appuser:appuser prompting ./prompting
+ COPY --chown=appuser:appuser elastic ./elastic
+ COPY --chown=appuser:appuser file_processing.py ./
+ COPY --chown=appuser:appuser ingestion.py ./
+
+ # Copy ES data if needed - consider if this is actually necessary
+ COPY --chown=appuser:appuser es_data /usr/share/elasticsearch/data
+
+ # Install Python dependencies
+ RUN pip3 install -r requirements.txt
+
+ # Set environment variables for Streamlit
+ ENV STREAMLIT_SERVER_HEADLESS=true
+ ENV STREAMLIT_SERVER_PORT=7860
+ ENV STREAMLIT_SERVER_ENABLE_CORS=false
+ ENV ES_HOST=localhost
+ ENV ES_PORT=9200
+ ENV ELASTICSEARCH_HOSTS="http://localhost:9200"
+
+ # Expose required ports (Elasticsearch and Streamlit)
+ EXPOSE 9200 7860
+
+ # Switch to non-root user
+ USER appuser
+
+ # Create startup script
+ RUN echo '#!/bin/bash\n\
+ # Start Elasticsearch in the background\n\
+ /usr/share/elasticsearch/bin/elasticsearch &\n\
+ \n\
+ # Wait for Elasticsearch to become available\n\
+ echo "Waiting for Elasticsearch to start..."\n\
+ until curl -s http://localhost:9200 > /dev/null; do\n\
+ sleep 2\n\
+ echo "Still waiting for Elasticsearch..."\n\
+ done\n\
+ echo "Elasticsearch is up and running!"\n\
+ \n\
+ # Start Streamlit\n\
+ echo "Starting Streamlit application..."\n\
+ streamlit run /app/streamlit.py\n\
+ ' > /app/start.sh && chmod +x /app/start.sh
+
+ # Command to run
+ CMD ["/app/start.sh"]
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
- title: Rag Hydro
- emoji: 🚀
- colorFrom: purple
- colorTo: yellow
+ title: Hydro Rag
+ emoji: 🐢
+ colorFrom: indigo
+ colorTo: green
  sdk: docker
  pinned: false
  ---
app.py ADDED
@@ -0,0 +1,113 @@
+ import os
+ from fastapi import FastAPI, HTTPException
+ from langchain.prompts import PromptTemplate
+ from pydantic import BaseModel
+ from dotenv import load_dotenv
+
+ from embeddings.embeddings import generate_embeddings
+ from elastic.retrieval import search_certification_chunks
+ from prompting.rewrite_question import classify_certification, initialize_llms, process_query
+
+ load_dotenv()
+
+ app = FastAPI(
+     title="Hydrogen Certification RAG System",
+     description="API for querying hydrogen certification documents using RAG",
+     version="0.1.0"
+ )
+
+ # Initialize the LLMs once and reuse them for every request
+ llms = initialize_llms()
+ llm = llms["rewrite_llm"]
+
+
+ # Request models
+ class QueryRequest(BaseModel):
+     query: str
+
+
+ # Endpoints
+ @app.post("/query")
+ async def handle_query(request: QueryRequest):
+     """
+     Process a query through the full RAG pipeline:
+     1. Classify certification (if not provided)
+     2. Optimize query based on specificity
+     3. Search relevant chunks
+     """
+     try:
+         # Step 1: Determine certification
+         query = request.query
+         certification = classify_certification(query, llms["rewrite_llm"])
+         if "no certification mentioned" in certification.lower():
+             raise HTTPException(
+                 status_code=400,
+                 detail="No certification specified in query and none provided"
+             )
+
+         # Step 2: Process query
+         processed_query = process_query(query, llms)
+         question_vector = generate_embeddings(processed_query)
+
+         # Step 3: Search both indices with the same hybrid query
+         results = search_certification_chunks(
+             index_name="certif_index",
+             certification_name=certification,
+             text_query=processed_query,
+             vector_query=question_vector,
+         )
+
+         results_ = search_certification_chunks(
+             index_name="certification_index",
+             certification_name=certification,
+             text_query=processed_query,
+             vector_query=question_vector,
+         )
+
+         results_merged = ". ".join([result["text"] for result in results])
+         results_merged_ = ". ".join([result["text"] for result in results_])
+
+         template = """
+         You are an AI assistant tasked with providing answers based on the given context about a specific hydrogen certification.
+
+         Provide a clear, concise response that directly addresses the question without unnecessary information.
+
+         Question: {question}
+         Certification: {certification}
+         Context: {context}
+
+         Answer:
+         """
+         prompt = PromptTemplate(
+             input_variables=["question", "certification", "context"],
+             template=template
+         )
+
+         chain = prompt | llm
+         answer = chain.invoke({"question": processed_query, "certification": certification, "context": results_merged}).content
+         answer_ = chain.invoke({"question": processed_query, "certification": certification, "context": results_merged_}).content
+
+         return {
+             "certification": certification,
+             "certif_index": answer,
+             "certification_index": answer_,
+         }
+
+     except HTTPException:
+         # Re-raise HTTP errors (e.g. the 400 above) instead of converting them to 500s
+         raise
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+ @app.get("/certifications", response_model=list[str])
+ async def list_certifications():
+     """List all available certifications"""
+     try:
+         certs_dir = "docs/processed"
+         return [f for f in os.listdir(certs_dir) if os.path.isdir(os.path.join(certs_dir, f))]
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=8000)
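For reference, a minimal client-side sketch of calling the /query endpoint above once the API is running (not part of the commit; the host, port, and example question are assumptions based on the __main__ block):

# Hypothetical client call against the /query endpoint defined in app.py
import requests

resp = requests.post(
    "http://localhost:8000/query",
    json={"query": "What are the purity requirements in the GH2 Standard?"},
)
resp.raise_for_status()
data = resp.json()
print(data["certification"])        # certification picked by the classifier
print(data["certif_index"])         # answer grounded in the certif_index results
print(data["certification_index"])  # answer grounded in the certification_index results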
chunking/semantic_chunking.py ADDED
@@ -0,0 +1,96 @@
+ import re
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+
+ embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+
+
+ def hybrid_split(text: str, max_len: int = 1024) -> list[str]:
+     """
+     Split text into chunks, respecting sentence boundaries when possible.
+
+     Args:
+         text: The text to split
+         max_len: Maximum length for each chunk
+
+     Returns:
+         List of text chunks
+     """
+     # Normalize text
+     text = text.replace("\r", "").replace("\n", " ").strip()
+
+     # Extract sentences (split on sentence-ending punctuation)
+     sentences = re.split(r"(?<=[.!?])\s+", text)
+
+     chunks = []
+     current_chunk = ""
+
+     for sentence in sentences:
+         if len(sentence) > max_len:
+             # Flush the current chunk first, then keep the long sentence as its own chunk
+             if current_chunk:
+                 chunks.append(current_chunk)
+                 current_chunk = ""
+             chunks.append(sentence)
+
+         # Normal case - see if adding the sentence exceeds max_len
+         elif len(current_chunk) + len(sentence) + 1 > max_len:
+             # Close the current chunk and start a new one with this sentence
+             chunks.append(current_chunk)
+             current_chunk = sentence
+         else:
+             # Add to the current chunk
+             if current_chunk:
+                 current_chunk += " " + sentence
+             else:
+                 current_chunk = sentence
+
+     if current_chunk:
+         chunks.append(current_chunk)
+
+     return chunks
+
+
+ def cosine_similarity(vec1, vec2):
+     """Calculate the cosine similarity between two vectors."""
+     dot_product = np.dot(vec1, vec2)
+     norm_vec1 = np.linalg.norm(vec1)
+     norm_vec2 = np.linalg.norm(vec2)
+     return dot_product / (norm_vec1 * norm_vec2)
+
+
+ def get_embedding(text):
+     """Generate an embedding using SBERT."""
+     return embedding_model.encode(text, convert_to_numpy=True)
+
+
+ def semantic_chunking(text, threshold=0.75, max_chunk_size=8191):
+     """
+     Splits text into semantic chunks based on sentence similarity.
+     - threshold: Lower = more splits, Higher = fewer splits
+     - max_chunk_size: Maximum size of each chunk in characters
+     """
+     text = text.replace("\n", " ").replace("\r", " ").strip()
+     sentences = hybrid_split(text)
+     if not sentences:
+         return []
+     embeddings = [get_embedding(sent) for sent in sentences]
+
+     chunks = []
+     current_chunk = [sentences[0]]
+
+     for i in range(1, len(sentences)):
+         sim = cosine_similarity(embeddings[i - 1], embeddings[i])
+         if (
+             sim < threshold
+             or len(" ".join(current_chunk + [sentences[i]])) > max_chunk_size
+         ):
+             chunks.append(" ".join(current_chunk))
+             current_chunk = [sentences[i]]
+         else:
+             current_chunk.append(sentences[i])
+
+     if current_chunk:
+         chunks.append(" ".join(current_chunk))
+
+     return chunks
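A small usage sketch of the chunking helpers above (not part of the commit; the sample text is made up):

# Hypothetical usage of semantic_chunking on a short passage
from chunking.semantic_chunking import semantic_chunking

sample = (
    "Green hydrogen is produced by electrolysis powered by renewable electricity. "
    "Certification schemes verify the carbon intensity of the production pathway. "
    "An unrelated sentence about shipping logistics and port infrastructure."
)
chunks = semantic_chunking(sample, threshold=0.75)
for i, chunk in enumerate(chunks):
    print(i, chunk[:80])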
elastic/es_client.py ADDED
@@ -0,0 +1,41 @@
+ import os
+ import logging
+ from elasticsearch import Elasticsearch, ConnectionError, AuthenticationException
+
+ # Configure logging at the application level
+ logger = logging.getLogger(__name__)
+ logging.basicConfig(level=logging.INFO)
+
+ # Load environment variables
+ ES_CLIENT_URL = os.getenv("ELASTICSEARCH_HOSTS", "http://localhost:9200")
+
+ class ElasticsearchClientError(Exception):
+     """Custom exception for Elasticsearch client errors."""
+     pass
+
+ def get_es_client() -> Elasticsearch:
+     """
+     Establish connection to Elasticsearch and return the client instance.
+     Raises ElasticsearchClientError if the connection cannot be established.
+     """
+
+     try:
+         logger.info("Connecting to Elasticsearch at %s", ES_CLIENT_URL)
+         # Initialize Elasticsearch client
+         es_client = Elasticsearch(
+             hosts=[ES_CLIENT_URL],
+         )
+
+         # Verify connection
+         if not es_client.ping():
+             error_message = "Elasticsearch cluster is not reachable!"
+             logger.error(error_message)
+             raise ElasticsearchClientError(error_message)
+
+         logger.info("Successfully connected to Elasticsearch")
+         return es_client
+
+     except (ConnectionError, AuthenticationException) as e:
+         error_message = f"Elasticsearch connection error: {e}"
+         logger.error(error_message)
+         raise ElasticsearchClientError(error_message) from e
elastic/es_index.py ADDED
@@ -0,0 +1,41 @@
+ import os
+ import logging
+ from elasticsearch import Elasticsearch, ConnectionError, AuthenticationException
+
+ # Configure logging at the application level
+ logger = logging.getLogger(__name__)
+ logging.basicConfig(level=logging.INFO)
+
+ # Load environment variables
+ ES_CLIENT_URL = os.getenv("ELASTICSEARCH_HOSTS")
+
+ class ElasticsearchClientError(Exception):
+     """Custom exception for Elasticsearch client errors."""
+     pass
+
+ def get_es_client() -> Elasticsearch:
+     """
+     Establish connection to Elasticsearch and return the client instance.
+     Raises ElasticsearchClientError if the connection cannot be established.
+     """
+
+     try:
+         logger.info("Connecting to Elasticsearch at %s", ES_CLIENT_URL)
+         # Initialize Elasticsearch client
+         es_client = Elasticsearch(
+             hosts=[ES_CLIENT_URL],
+         )
+
+         # Verify connection
+         if not es_client.ping():
+             error_message = "Elasticsearch cluster is not reachable!"
+             logger.error(error_message)
+             raise ElasticsearchClientError(error_message)
+
+         logger.info("Successfully connected to Elasticsearch")
+         return es_client
+
+     except (ConnectionError, AuthenticationException) as e:
+         error_message = f"Elasticsearch connection error: {e}"
+         logger.error(error_message)
+         raise ElasticsearchClientError(error_message) from e
elastic/indexing.py ADDED
@@ -0,0 +1,60 @@
+ import logging
+ from elasticsearch import Elasticsearch, exceptions
+ from typing import Dict, Any
+
+
+ logger = logging.getLogger(__name__)
+ logging.basicConfig(level=logging.DEBUG)
+
+ embedding_dimension = 1536
+
+ def create_mapping(properties: Dict[str, Any]) -> Dict[str, Any]:
+     """Helper function to create index mappings with predefined settings."""
+     return {
+         "settings": {"number_of_shards": 1, "number_of_replicas": 1},
+         "mappings": {"properties": properties},
+     }
+
+
+ def retrieval_index() -> Dict[str, Any]:
+     """Returns the Elasticsearch mapping for retrieval indices."""
+     return create_mapping(
+         {
+             "chunk_id": {"type": "keyword"},
+             "chunk": {"type": "text"},
+             "embedding": {
+                 "type": "dense_vector",
+                 "dims": embedding_dimension,
+             },
+             "certification": {"type": "keyword"},
+             "source_file": {"type": "keyword"},
+             "timestamp": {"type": "date"},
+         }
+     )
+
+
+ def create_elasticsearch_index(es_client: Elasticsearch, index_name: str) -> bool:
+     """
+     Create an Elasticsearch index with the appropriate mapping.
+
+     Args:
+         es_client (Elasticsearch): The Elasticsearch client instance.
+         index_name (str): The name of the index to create.
+
+     Returns:
+         bool: True if the index was created successfully, False otherwise.
+     """
+     try:
+         mapping = retrieval_index()
+
+         if es_client.indices.exists(index=index_name):
+             logger.warning(f"Index '{index_name}' already exists. Skipping creation.")
+             return True
+
+         es_client.indices.create(index=index_name, body=mapping)
+         logger.info(f"Index '{index_name}' created successfully.")
+         return True
+
+     except Exception as e:
+         logger.error(f"Unexpected error while creating index '{index_name}': {e}")
+         return False
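A short sketch of wiring the client and the mapping together to create the retrieval index (not part of the commit; the index name mirrors the one used in app.py and is an assumption):

# Hypothetical setup step: create "certif_index" with the mapping above
from elastic.es_client import get_es_client
from elastic.indexing import create_elasticsearch_index

es = get_es_client()
if create_elasticsearch_index(es, "certif_index"):
    print("certif_index is ready")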
elastic/retrieval.py ADDED
@@ -0,0 +1,86 @@
+ from typing import List, Dict, Any
+ import logging
+
+ from elastic.es_client import get_es_client
+
+ logger = logging.getLogger(__name__)
+
+ es_client = get_es_client()
+
+
+ def search_certification_chunks(
+     index_name: str,
+     text_query: str,
+     vector_query: List[float],
+     certification_name: str,
+     es_client=es_client,
+     vector_field: str = "embedding",
+     text_field: str = "chunk",
+     size: int = 5,
+     min_score: float = 0.1,  # Lowered threshold
+     boost_text: float = 1.0,
+     boost_vector: float = 1.0,
+ ) -> List[Dict[str, Any]]:
+     """Hybrid (BM25 + vector) search restricted to a single certification."""
+
+     # First verify the certification value exists
+     cert_check = es_client.search(
+         index=index_name,
+         body={
+             "query": {"term": {"certification": certification_name}},
+             "size": 1,
+         },
+     )
+
+     if not cert_check["hits"]["hits"]:
+         logger.error(f"No documents found with certification: {certification_name}")
+         return []
+
+     # Then run the hybrid search, filtered to the requested certification
+     query_body = {
+         "size": size,
+         "min_score": min_score,
+         "query": {
+             "bool": {
+                 "filter": [{"term": {"certification": certification_name}}],
+                 "should": [
+                     {"match": {text_field: {"query": text_query, "boost": boost_text}}},
+                     {
+                         "script_score": {
+                             "query": {"match_all": {}},
+                             "script": {
+                                 "source": f"cosineSimilarity(params.query_vector, '{vector_field}') + 1.0",
+                                 "params": {"query_vector": vector_query},
+                             },
+                             "boost": boost_vector,
+                         }
+                     },
+                 ],
+             }
+         },
+     }
+     logger.debug(f"Elasticsearch query body: {query_body}")
+
+     logger.info(f"Executing search on index '{index_name}'")
+     response = es_client.search(index=index_name, body=query_body)
+     hits = response.get("hits", {}).get("hits", [])
+     logger.info(f"Found {len(hits)} matching documents")
+
+     # Process results with the field names used at indexing time
+     results = [
+         {
+             "id": hit["_id"],
+             "score": hit["_score"],
+             "text": hit["_source"][text_field],
+             "source_file": hit["_source"]["source_file"],
+         }
+         for hit in hits
+     ]
+
+     if results:
+         logger.debug(f"Top result score: {results[0]['score']}")
+         logger.debug(f"Top result source: {results[0]['source_file']}")
+     else:
+         logger.warning("No results returned from Elasticsearch")
+
+     return results
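A usage sketch for the hybrid search helper (not part of the commit; the question text and certification name are assumptions, the certification name being taken from the list in prompting/rewrite_question.py):

# Hypothetical call combining the embedding helper with the hybrid search
from embeddings.embeddings import generate_embeddings
from elastic.retrieval import search_certification_chunks

question = "What carbon intensity threshold applies under the GH2_Standard?"
hits = search_certification_chunks(
    index_name="certif_index",
    text_query=question,
    vector_query=generate_embeddings(question),
    certification_name="GH2_Standard",
    size=5,
)
for hit in hits:
    print(hit["score"], hit["source_file"])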
embeddings/embeddings.py ADDED
@@ -0,0 +1,35 @@
+ import os
+ import logging
+ from typing import List
+ from openai import OpenAI
+ from dotenv import load_dotenv
+
+ # Configure logging at the application level
+ logger = logging.getLogger(__name__)
+ logging.basicConfig(level=logging.INFO)
+
+ load_dotenv()
+
+ embedding_dimension = 1536
+
+ model = "text-embedding-3-small"
+
+ openai_api_key = (os.getenv("OPENAI_API_KEY") or "").strip()
+ client = OpenAI(api_key=openai_api_key)
+
+
+ def generate_embeddings(text: str) -> List[float]:
+     """Get embeddings from the OpenAI API."""
+     logger.info("Embedding model: %s", model)
+
+     if not text:
+         raise ValueError("Cannot generate embeddings for empty text")
+
+     try:
+         response = client.embeddings.create(
+             model=model, input=text, dimensions=embedding_dimension
+         )
+         return response.data[0].embedding
+     except Exception as e:
+         logger.error(f"OpenAI API error: {e}")
+         raise
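A quick sanity-check sketch (not part of the commit): the embedding length must match the dense_vector dims declared in elastic/indexing.py, since both are set to 1536 here.

# Hypothetical check that the embedding size matches the index mapping
from embeddings.embeddings import generate_embeddings

vec = generate_embeddings("hydrogen certification test sentence")
assert len(vec) == 1536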
es_data/node.lock ADDED
File without changes
es_data/nodes ADDED
@@ -0,0 +1 @@
+ written by Elasticsearch v8.8.0 to prevent a downgrade to a version prior to v8.0.0 which would result in data loss
file_processing.py ADDED
@@ -0,0 +1,103 @@
+ import os
+ import logging
+ from typing import List
+
+ import pdfplumber
+ from docx import Document
+ from openpyxl import load_workbook
+
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ def extract_pdf_content(pdf_path: str) -> List[str]:
+     """
+     Extract text and tables from PDF in their natural reading order.
+     Simplified version without positional processing.
+
+     Args:
+         pdf_path (str): Path to the PDF file
+
+     Returns:
+         List[str]: List of extracted content chunks (text and tables)
+     """
+     if not os.path.exists(pdf_path):
+         logger.error(f"PDF file not found: {pdf_path}")
+         return []
+
+     try:
+         with pdfplumber.open(pdf_path) as pdf:
+             content = []
+
+             for page in pdf.pages:
+                 # First extract tables
+                 tables = page.extract_tables()
+                 for table in tables:
+                     if table:
+                         # Convert table to string representation
+                         table_str = "\n".join(
+                             ["\t".join("" if cell is None else str(cell) for cell in row) for row in table]
+                         )
+                         content.append(f"[TABLE]\n{table_str}\n[/TABLE]")
+
+                 # Then extract regular text
+                 text = page.extract_text()
+                 if text and text.strip():
+                     content.append(text.strip())
+
+             logger.info(f"Successfully extracted content from {pdf_path}")
+             return content
+
+     except Exception as e:
+         logger.error(f"Error processing {pdf_path}: {str(e)}")
+         return []
+
+
+ def extract_docx_content(docx_path: str) -> List[str]:
+     """
+     Extract text and tables from DOCX file with clear table markers.
+
+     Args:
+         docx_path (str): Path to the DOCX file
+
+     Returns:
+         List[str]: List of extracted content chunks with tables marked as [TABLE]...[/TABLE]
+     """
+     if not os.path.exists(docx_path):
+         raise FileNotFoundError(f"DOCX file not found: {docx_path}")
+
+     doc = Document(docx_path)
+     content = []
+
+     # Process all paragraphs first
+     for paragraph in doc.paragraphs:
+         text = paragraph.text.strip()
+         if text:
+             content.append(text)
+
+     # Process all tables after paragraphs
+     for table in doc.tables:
+         table_str = "\n".join(
+             ["\t".join(cell.text.strip() for cell in row.cells)
+              for row in table.rows]
+         )
+         if table_str.strip():
+             content.append(f"[TABLE]\n{table_str}\n[/TABLE]")
+
+     return content
+
+
+ def extract_xlsx_content(file_path: str) -> List[str]:
+     wb = load_workbook(file_path)
+     sheets_text = []
+
+     for sheet in wb:
+         sheet_str = f"--- Sheet: {sheet.title} ---\n"
+         for row in sheet.iter_rows():
+             row_str = "\t".join(str(cell.value) if cell.value is not None else "" for cell in row)
+             sheet_str += row_str + "\n"
+         sheets_text.append(sheet_str.strip())
+
+     return sheets_text
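A small sketch of running the extractors above over a folder of source documents (not part of the commit; the docs/raw path and the extension routing are assumptions):

# Hypothetical driver: route files to the matching extractor by extension
import os
from file_processing import extract_pdf_content, extract_docx_content, extract_xlsx_content

def extract_any(path: str):
    ext = os.path.splitext(path)[1].lower()
    if ext == ".pdf":
        return extract_pdf_content(path)
    if ext == ".docx":
        return extract_docx_content(path)
    if ext in (".xlsx", ".xlsm"):
        return extract_xlsx_content(path)
    return []

for name in os.listdir("docs/raw"):
    chunks = extract_any(os.path.join("docs/raw", name))
    print(name, len(chunks), "chunks")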
ingestion.py ADDED
File without changes
prompting/rewrite_question.py ADDED
@@ -0,0 +1,212 @@
+ import os
+ from dotenv import load_dotenv
+ from langchain.prompts import PromptTemplate
+ from langchain_groq import ChatGroq
+ from typing import Literal
+
+ # Load environment variables
+ load_dotenv()
+
+ # Initialize LLMs
+ def initialize_llms():
+     """Initialize and return the LLM instances"""
+     groq_api_key = os.getenv("GROQ_API_KEY")
+
+     return {
+         "rewrite_llm": ChatGroq(
+             temperature=0.1,
+             model="llama-3.3-70b-versatile",
+             api_key=groq_api_key
+         ),
+         "step_back_llm": ChatGroq(
+             temperature=0,
+             model="Gemma2-9B-IT",
+             api_key=groq_api_key
+         )
+     }
+
+ # Certification classification
+ def classify_certification(
+     query: str,
+     llm: ChatGroq,
+     certs_dir: str = "docs/processed"
+ ) -> str:
+     """
+     Classify which certification a query is referring to.
+     Returns certification name or 'no certification mentioned'.
+     """
+     available_certs = "2BSvs, CertifHy - National Green Certificate (NGC), CertifHy - RFNBO, Certified_Hydrogen_Producer, GH2_Standard, Green_Hydrogen_Certification, ISCC CORSIA, ISCC EU (International Sustainability & Carbon Certification), ISCC PLUS, ISO_19880_Hydrogen_Quality, REDcert-EU, RSB, Scottish Quality Farm Assured Combinable Crops (SQC), TUV Rheinland H2.21, UK RTFO_regulation"
+
+     template = """
+     You are an AI assistant classifying user queries based on the certification they are asking for in a RAG system.
+     Classify the given query into one of the following certifications:
+     - {available_certifications}
+
+     Do not give any explanation; return only the name of the certification.
+
+     Use the exact name of the certification as it appears in the list above.
+     If the query refers to multiple certifications, return the most relevant one.
+
+     If the query doesn't mention any certification, respond with "no certification mentioned".
+
+     Original query: {original_query}
+
+     Classification:
+     """
+
+     prompt = PromptTemplate(
+         input_variables=["original_query", "available_certifications"],
+         template=template
+     )
+
+     chain = prompt | llm
+     response = chain.invoke({
+         "original_query": query,
+         "available_certifications": available_certs
+     }).content.strip()
+
+     return response
+
+ # Query specificity classification
+ def classify_query_specificity(
+     query: str,
+     llm: ChatGroq
+ ) -> Literal["specific", "general", "too narrow"]:
+     """
+     Classify query specificity.
+     Returns one of: 'specific', 'general', or 'too narrow'.
+     """
+     template = """
+     You are an AI assistant classifying user queries based on their specificity for a RAG system.
+     Classify the given query into one of:
+     - "specific" → If it asks for exact values, certifications, or well-defined facts.
+     - "general" → If it is broad and needs refinement for better retrieval.
+     - "too narrow" → If it is very specific and might need broader context.
+     DO NOT output explanations, only return one of: "specific", "general", or "too narrow".
+
+     Original query: {original_query}
+
+     Classification:
+     """
+
+     prompt = PromptTemplate(
+         input_variables=["original_query"],
+         template=template
+     )
+
+     chain = prompt | llm
+     response = chain.invoke({"original_query": query}).content.strip().lower()
+     return response.split("\n")[0].strip()  # type: ignore
+
+ # Query refinement
+ def refine_query(
+     query: str,
+     llm: ChatGroq
+ ) -> str:
+     """Rewrite a query to be clearer and more detailed while keeping the original intent"""
+     template = """
+     You are an AI assistant that improves queries for retrieving precise certification and compliance data.
+     Rewrite the query to be clearer while keeping the intent unchanged.
+
+     Original query: {original_query}
+
+     Refined query:
+     """
+
+     prompt = PromptTemplate(
+         input_variables=["original_query"],
+         template=template
+     )
+
+     chain = prompt | llm
+     return chain.invoke({"original_query": query}).content
+
+ # Step-back query generation
+ def generate_step_back_query(
+     query: str,
+     llm: ChatGroq
+ ) -> str:
+     """Generate a broader step-back query to retrieve relevant background information"""
+     template = """
+     You are an AI assistant generating broader queries to improve retrieval context.
+     Given the original query, generate a more general step-back query to retrieve relevant background information.
+
+     Original query: {original_query}
+
+     Step-back query:
+     """
+
+     prompt = PromptTemplate(
+         input_variables=["original_query"],
+         template=template
+     )
+
+     chain = prompt | llm
+     return chain.invoke({"original_query": query}).content
+
+ # Main query processing pipeline
+ def process_query(
+     original_query: str,
+     llms: dict
+ ) -> str:
+     """
+     Process a query through the full pipeline:
+     1. Classify specificity
+     2. Apply appropriate refinement
+     """
+     specificity = classify_query_specificity(original_query, llms["rewrite_llm"])
+
+     if specificity == "specific":
+         return refine_query(original_query, llms["rewrite_llm"])
+     elif specificity == "general":
+         return refine_query(original_query, llms["rewrite_llm"])
+     elif specificity == "too narrow":
+         return generate_step_back_query(original_query, llms["step_back_llm"])
+     return original_query
+
+ # Test setup
+ def test_hydrogen_certification_functions():
+     # Initialize LLMs
+     llms = initialize_llms()
+
+     # Create a test directory with hydrogen certifications
+     test_certs_dir = "docs/processed"
+     os.makedirs(test_certs_dir, exist_ok=True)
+
+     # Create some dummy certification folders
+     hydrogen_certifications = [
+         "GH2_Standard",
+         "Certified_Hydrogen_Producer",
+         "Green_Hydrogen_Certification",
+         "ISO_19880_Hydrogen_Quality"
+     ]
+
+     for cert in hydrogen_certifications:
+         os.makedirs(os.path.join(test_certs_dir, cert), exist_ok=True)
+
+     # Test queries
+     test_queries = [
+         ("What are the purity requirements in GH2 Standard?", "specific"),
+         ("How does hydrogen certification work?", "general"),
+         ("What's the exact ppm of CO2 allowed in ISO_19880_Hydrogen_Quality section 4.2?", "too narrow"),
+         ("What safety protocols exist for hydrogen storage?", "general")
+     ]
+
+     print("=== Testing Certification Classification ===")
+     for query, _ in test_queries:
+         cert = classify_certification(query, llms["rewrite_llm"], test_certs_dir)
+         print(f"Query: {query}\nClassification: {cert}\n")
+
+     print("\n=== Testing Specificity Classification ===")
+     for query, expected_type in test_queries:
+         specificity = classify_query_specificity(query, llms["rewrite_llm"])
+         print(f"Query: {query}\nExpected: {expected_type}, Got: {specificity}\n")
+
+     print("\n=== Testing Full Query Processing ===")
+     for query, _ in test_queries:
+         processed = process_query(query, llms)
+         print(f"Original: {query}\nProcessed: {processed}\n")
+
+ # Run the tests
+ if __name__ == "__main__":
+     test_hydrogen_certification_functions()
requirements.txt ADDED
@@ -0,0 +1,21 @@
+ langchain
+ langchain-groq
+ langchain-community
+ chromadb
+ jq
+ fastembed
+ python-dotenv
+ langchain_chroma
+ unstructured
+ openai
+ elastic-transport==8.17.0
+ elasticsearch==8.17.1
+ sentence-transformers
+ fastapi
+ pdfplumber
+ pdfminer.six
+ python-docx
+ openpyxl
+ PyPDF2
+ streamlit
+ uvicorn
streamlit.py ADDED
@@ -0,0 +1,49 @@
+ import streamlit as st
+ import logging
+ import asyncio
+ from app import QueryRequest  # Import the request model
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Set page config
+ st.set_page_config(page_title="Certification Chat", layout="centered")
+
+ st.title("🎓 Certification Chat Assistant")
+
+ # Create a function to handle the async call
+ async def async_query(query_text):
+     from app import handle_query  # Import here to avoid circular imports
+     request = QueryRequest(query=query_text)
+     return await handle_query(request)
+
+ # Function to run async code in Streamlit
+ def run_async(coroutine):
+     try:
+         loop = asyncio.get_event_loop()
+     except RuntimeError:
+         loop = asyncio.new_event_loop()
+         asyncio.set_event_loop(loop)
+     return loop.run_until_complete(coroutine)
+
+ # User input
+ user_input = st.text_input("💬 Enter your prompt:")
+
+ if user_input:
+     st.markdown("## 🧠 Response")
+
+     try:
+         # Use try-except to handle errors
+         with st.spinner("Processing your query..."):
+             # Run the async function
+             result = run_async(async_query(user_input))
+
+         # Display output
+         st.write("**Certification:**", result["certification"])
+         st.write("**Answer from certif_index:**", result["certif_index"])
+         st.write("**Answer from certification_index:**", result["certification_index"])
+
+     except Exception as e:
+         st.error(f"An error occurred: {str(e)}")
+         logger.error(f"Error processing query: {e}", exc_info=True)