Spaces:

PhoenixDecim
/

slm_financial_rag

Running

App Files Files Community

PhoenixDecim committed about 1 month ago

Commit

e1e8013

1 Parent(s): a704797

Improved input query filters

Browse files

Files changed (2) hide show

app.py +53 -14
data_filters.py +9 -0

app.py CHANGED Viewed

@@ -21,6 +21,8 @@ from data_filters import (
     restricted_patterns,
     restricted_topics,
     FINANCIAL_DATA_PATTERNS,
     sensitive_terms,
     FINANCIAL_TERMS,
 )
@@ -37,8 +39,8 @@ os.makedirs("data", exist_ok=True)
 # SLM: Microsoft PHI-2 model is loaded
 # It does have higher memory and compute requirements compared to TinyLlama and Falcon
 # But it gives the best results among the three
-DEVICE = "cpu"  # or cuda
-# DEVICE = "cuda"  # or cuda
 # MODEL_NAME = "TinyLlama/TinyLlama_v1.1"
 # MODEL_NAME = "tiiuae/falcon-rw-1b"
 MODEL_NAME = "microsoft/phi-2"
@@ -55,7 +57,7 @@ if tokenizer.pad_token is None:
 # Since the model is to be hosted on a cpu instance, we use float32
 # For GPU, we can use float16 or bfloat16
 model = AutoModelForCausalLM.from_pretrained(
-    MODEL_NAME, torch_dtype=torch.float32, trust_remote_code=True
 ).to(DEVICE)
 model.eval()
 # model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
@@ -234,25 +236,62 @@ def process_files(files, chunk_size=512):
         pickle.dump(bm25_data, f)
     return "Files processed successfully! You can now query."
 def contains_financial_entities(query):
-    """Check if the query has financial entities"""
     doc = nlp(query)
     for ent in doc.ents:
         if ent.label_ in FINANCIAL_ENTITY_LABELS:
             return True
     return False
 # Input guardrail implementation
 # Regex is used to filter queries related to sensitive topics
 # Uses spaCy model's Named Entity Recognition to filter queries for personal details
 # Uses cosine similarity with the embedded query and sensitive topic vectors
 # to filter out queries violating confidential/security rules (additional)
 def is_query_allowed(query):
     """Checks if the query violates security or confidentiality rules"""
     for pattern in restricted_patterns:
         if re.search(pattern, query.lower(), re.IGNORECASE):
             return False, "This query requests sensitive or confidential information."
     doc = nlp(query)
     for ent in doc.ents:
         if ent.label_ == "PERSON":
             for token in ent.subtree:
@@ -265,6 +304,7 @@ def is_query_allowed(query):
     topic_embeddings = embed_model.encode(
         list(restricted_topics), normalize_embeddings=True
     )
     similarities = np.dot(topic_embeddings, query_embedding)
     if np.max(similarities) > 0.85:
         return False, "This query requests sensitive or confidential information."
@@ -368,8 +408,9 @@ def compute_response_confidence(
         normalized_bm25 = 0.0
     logger.info(
         f"Faiss score: {normalized_faiss}, bm25: {normalized_bm25}, "
-        f"Mean Top Token + Entropy Avg: {model_conf_signal}"
     )
     confidence_score = (
         lambda_faiss * normalized_faiss
         + model_conf_signal * lambda_conf
@@ -436,13 +477,10 @@ def query_model(
         "You are a financial analyst. Answer financial queries concisely using only the numerical data "
         "explicitly present in the provided financial context:\n\n"
         f"{context}\n\n"
-        "Strictly use only the given financial data. Do not assume, infer, or generate missing data."
-        " Retain the original format of financial figures exactly as given."
-        " Do not attempt to convert the currency into any other format."
-        " If the requested information is not available in the provided context, respond with "
-        "'No relevant financial data available.'"
-        " Provide exactly one answer in a single sentence."
-        " Do not generate explanations, additional text, or answer multiple queries."
         f"\nQuery: {query}"
     )
     inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(DEVICE)
@@ -463,7 +501,8 @@ def query_model(
         sequences = output["sequences"][0][input_len:]
     execution_time = time.perf_counter() - start_time
     logger.info(f"Query processed in {execution_time:.2f} seconds.")
-    log_probs = output["scores"]  # List of logits per generated token
     token_probs = [torch.softmax(lp, dim=-1) for lp in log_probs]
     # Extract top token probabilities for each step
     token_confidences = [tp.max().item() for tp in token_probs]
@@ -487,7 +526,7 @@ def query_model(
         final_out += f"Context: {context}\nQuery: {query}\n"
     final_out += f"Response: {response}"
     return (
-        response,
         f"Confidence: {confidence_score}%\nTime taken: {execution_time:.2f} seconds",
     )

     restricted_patterns,
     restricted_topics,
     FINANCIAL_DATA_PATTERNS,
+    FINANCIAL_ENTITY_LABELS,
+    GENERAL_KNOWLEDGE_PATTERNS,
     sensitive_terms,
     FINANCIAL_TERMS,
 )
 # SLM: Microsoft PHI-2 model is loaded
 # It does have higher memory and compute requirements compared to TinyLlama and Falcon
 # But it gives the best results among the three
+# DEVICE = "cpu"  # or cuda
+DEVICE = "cuda"  # or cpu
 # MODEL_NAME = "TinyLlama/TinyLlama_v1.1"
 # MODEL_NAME = "tiiuae/falcon-rw-1b"
 MODEL_NAME = "microsoft/phi-2"
 # On a CPU instance the model should be loaded in float32
 # On GPU (the current DEVICE) float16 or bfloat16 can be used; bfloat16 is used below
 model = AutoModelForCausalLM.from_pretrained(
+    MODEL_NAME, torch_dtype=torch.bfloat16, trust_remote_code=True
 ).to(DEVICE)
 model.eval()
 # model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
         pickle.dump(bm25_data, f)
     return "Files processed successfully! You can now query."
 def contains_financial_entities(query):
+    """Check if query contains financial entities.
+
+    Runs the ``nlp`` pipeline on the query and returns True as soon as one
+    recognised entity carries a label listed in FINANCIAL_ENTITY_LABELS;
+    returns False when no entity does.
+    """
     doc = nlp(query)
     for ent in doc.ents:
         if ent.label_ in FINANCIAL_ENTITY_LABELS:
             return True
     return False
+def contains_geographical_entities(query):
+    """Check if the query contains geographical entities"""
+    doc = nlp(query)
+    return any(ent.label_ == "GPE" for ent in doc.ents)
+def contains_financial_terms(query):
+    """Check if the query contains financial terms"""
+    return any(term in query.lower() for term in FINANCIAL_TERMS)
+def is_general_knowledge_query(query):
+    """Check if query contains general knowledge"""
+    query_lower = query.lower()
+    for pattern in GENERAL_KNOWLEDGE_PATTERNS:
+        if re.search(pattern, query_lower):
+            return True
+    return False
+def is_irrelevant_query(query):
+    """Check if the query is not finance related"""
+    # If the query is general knowledge and not finance-related
+    if is_general_knowledge_query(query) and not contains_financial_terms(query):
+        return True
+    # If the query contains only geographical terms without financial entities
+    if contains_geographical_entities(query) and not contains_financial_entities(query):
+        return True
+    return False
 # Input guardrail implementation
+# NER + Regex + List of terms used to filter irrelevant queries
 # Regex is used to filter queries related to sensitive topics
 # Uses spaCy model's Named Entity Recognition to filter queries for personal details
 # Uses cosine similarity with the embedded query and sensitive topic vectors
 # to filter out queries violating confidential/security rules (additional)
 def is_query_allowed(query):
     """Checks if the query violates security or confidentiality rules"""
+    if is_irrelevant_query(query):
+        return False, "Query is not finance-related. Please ask a financial question."
     for pattern in restricted_patterns:
         if re.search(pattern, query.lower(), re.IGNORECASE):
             return False, "This query requests sensitive or confidential information."
     doc = nlp(query)
+    # Check if there's a person entity and contains sensitive terms
     for ent in doc.ents:
         if ent.label_ == "PERSON":
             for token in ent.subtree:
     topic_embeddings = embed_model.encode(
         list(restricted_topics), normalize_embeddings=True
     )
+    # Check similarities between the restricted topics and the query
     similarities = np.dot(topic_embeddings, query_embedding)
     if np.max(similarities) > 0.85:
         return False, "This query requests sensitive or confidential information."
         normalized_bm25 = 0.0
     logger.info(
         f"Faiss score: {normalized_faiss}, bm25: {normalized_bm25}, "
+        f"Mean Top Token + 1-Entropy Avg: {model_conf_signal}"
     )
+    # Weighted sum of all the normalized scores
     confidence_score = (
         lambda_faiss * normalized_faiss
         + model_conf_signal * lambda_conf
         "You are a financial analyst. Answer financial queries concisely using only the numerical data "
         "explicitly present in the provided financial context:\n\n"
         f"{context}\n\n"
+        "Use only the given financial data—do not assume, infer, or generate missing values."
+        " Retain the original format of financial figures without conversion."
+        " If the requested information is unavailable, respond with 'No relevant financial data available.'"
+        " Provide a single-sentence answer without explanations, additional text, or multiple responses."
         f"\nQuery: {query}"
     )
     inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(DEVICE)
         sequences = output["sequences"][0][input_len:]
     execution_time = time.perf_counter() - start_time
     logger.info(f"Query processed in {execution_time:.2f} seconds.")
+    # Get the logits per generated token
+    log_probs = output["scores"]
     token_probs = [torch.softmax(lp, dim=-1) for lp in log_probs]
     # Extract top token probabilities for each step
     token_confidences = [tp.max().item() for tp in token_probs]
         final_out += f"Context: {context}\nQuery: {query}\n"
     final_out += f"Response: {response}"
     return (
+        final_out,
         f"Confidence: {confidence_score}%\nTime taken: {execution_time:.2f} seconds",
     )

data_filters.py CHANGED Viewed

@@ -29,6 +29,15 @@ restricted_topics = {
     "financial package",
 }
 sensitive_terms = {
     "salary",
     "compensation",

     "financial package",
 }
+# Entity labels that contains_financial_entities() in app.py counts as financial.
+# NOTE(review): CARDINAL and ORG look broad (plain numbers, any organisation) -- confirm intended.
+FINANCIAL_ENTITY_LABELS = {"MONEY", "PERCENT", "CARDINAL", "ORG"}
+# Regexes tried by is_general_knowledge_query() in app.py:
+#   1. question / definition lead-ins ("capital of", "who is", "define", ...)
+#   2. general-knowledge topic nouns
+#   3. a capitalised word (optionally possessive) followed by a topic noun
+# The patterns are case-sensitive as written: "DNA" in (2) and the
+# "[A-Z][a-z]+" prefix in (3) cannot match text that has been lowercased.
+GENERAL_KNOWLEDGE_PATTERNS = [
+    r"\b(?:capital of|where is|who is|when did|what is|history of|define|meaning of|synonym of|antonym of|explain|how does|why is)\b",
+    r"\b(?:country|city|continent|leader|president|prime minister|language|currency|population|politics|war|anthem|flag|national animal|national bird|national flower|national sport|monarch|king|queen|ruler|army|military|constitution|government|laws|famous person|historical figure|famous landmark|ocean|mountain|river|lake|climate|weather|culture|tradition|festival|holiday|invention|discovery|science|technology|art|literature|music|religion|mythology|folklore|education|university|school|mathematics|physics|chemistry|biology|philosophy|astronomy|space|planet|star|galaxy|universe|health|medicine|disease|virus|bacteria|genetics|DNA|evolution|ecology|environment|pollution|wildlife|habitat|natural disaster|earthquake|volcano|tsunami|hurricane|storm|flood|drought)\b",
+    r"\b(?:[A-Z][a-z]+(?:'s)?\s+(?:capital|president|prime minister|national animal|national bird|national flower|national sport|anthem|flag|currency|language|leader|government|constitution|laws|monarch|king|queen|army|military|famous person|historical figure|landmark|river|ocean|mountain|religion|festival|holiday))\b",
+]
 sensitive_terms = {
     "salary",
     "compensation",