File size: 7,145 Bytes
39838a2
 
e635ed4
39838a2
 
 
 
 
 
 
74654a8
39838a2
e2c5a01
e635ed4
 
 
e2c5a01
74654a8
 
d8057b0
 
74654a8
e635ed4
39838a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging

# Configure root logging once at import time so the load/startup messages
# below are visible in the hosting console.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_model():
    """Fetch the fine-tuned boolean-search model and its tokenizer.

    Loads in plain float32 with no quantization so the model runs on
    CPU-only hosts (e.g. free Hugging Face Spaces hardware).

    Returns:
        tuple: (model, tokenizer) ready for generation.
    """
    logger.info("Loading model...")

    repo_id = "Zwounds/boolean-search-model"

    # AutoTokenizer resolves the right tokenizer class from the repo config.
    tokenizer = AutoTokenizer.from_pretrained(repo_id)

    # low_cpu_mem_usage streams weights in to keep peak RAM down;
    # float32 is the safe dtype for CPU inference.
    model = AutoModelForCausalLM.from_pretrained(
        repo_id,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float32,
    )

    return model, tokenizer

def format_prompt(query):
    """Embed *query* into the instruction prompt the model was trained on.

    Returns the full Alpaca-style prompt ending with an open
    '### Response:' section for the model to complete.
    """
    template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Convert this natural language query into a boolean search query by following these rules:

1. FIRST: Remove all meta-terms from this list (they should NEVER appear in output):
   - articles, papers, research, studies
   - examining, investigating, analyzing
   - findings, documents, literature
   - publications, journals, reviews
   Example: "Research examining X" β†’ just "X"

2. SECOND: Remove generic implied terms that don't add search value:
   - Remove words like "practices," "techniques," "methods," "approaches," "strategies"
   - Remove words like "impacts," "effects," "influences," "role," "applications"
   - For example: "sustainable agriculture practices" β†’ "sustainable agriculture"
   - For example: "teaching methodologies" β†’ "teaching"
   - For example: "leadership styles" β†’ "leadership"

3. THEN: Format the remaining terms:
   CRITICAL QUOTING RULES:
   - Multi-word phrases MUST ALWAYS be in quotes - NO EXCEPTIONS
   - Examples of correct quoting:
     - Wrong: machine learning AND deep learning
     - Right: "machine learning" AND "deep learning"
     - Wrong: natural language processing
     - Right: "natural language processing"
   - Single words must NEVER have quotes (e.g., science, research, learning)
   - Use AND to connect required concepts
   - Use OR with parentheses for alternatives (e.g., ("soil health" OR biodiversity))

Example conversions showing proper quoting:
"Research on machine learning for natural language processing"
β†’ "machine learning" AND "natural language processing"

"Studies examining anxiety depression stress in workplace"
β†’ (anxiety OR depression OR stress) AND workplace

"Articles about deep learning impact on computer vision"
β†’ "deep learning" AND "computer vision"

"Research on sustainable agriculture practices and their impact on soil health or biodiversity"
β†’ "sustainable agriculture" AND ("soil health" OR biodiversity)

"Articles about effective teaching methods for second language acquisition"
β†’ teaching AND "second language acquisition"

### Input:
{query}

### Response:
"""
    # The template contains no stray braces, so str.format is safe here.
    return template.format(query=query)

def get_boolean_query(query):
    """Generate a boolean search expression from a natural-language query.

    Uses the module-level ``model`` and ``tokenizer``. Greedy decoding
    (``do_sample=False``) keeps the output deterministic for a given query.

    Args:
        query: Natural-language search request from the user.

    Returns:
        The model's boolean query string, with the prompt and the
        end-of-text marker stripped.
    """
    prompt = format_prompt(query)

    # BUG FIX: the original chose "cuda" whenever torch.cuda.is_available(),
    # but the model is loaded without device_map and stays on CPU — moving
    # only the inputs to the GPU would crash generate(). Always co-locate
    # the inputs with the model's actual device.
    device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=32,
        do_sample=False,
        use_cache=True,
        eos_token_id=tokenizer.eos_token_id,
        # Explicit pad token avoids the "pad_token_id not set" warning.
        pad_token_id=tokenizer.eos_token_id,
    )

    # Keep only the text after the final "### Response:" marker.
    full_response = tokenizer.decode(outputs[0])
    response = full_response.split("### Response:")[-1].strip()
    # Drop the end-of-text special token if decode left it in.
    return response.replace("<|end_of_text|>", "").strip()

# Load the model once at import time so every Gradio request reuses the
# same instance instead of reloading per call.
logger.info("Initializing model...")
model, tokenizer = load_model()
logger.info("Model loaded successfully")

# Example queries (one per row, as Gradio expects: a list of input lists).
# Each entry exercises a different rewrite rule from the prompt template.
examples: list[list[str]] = [
    # Testing removal of meta-terms
    ["Find research papers examining the long-term effects of meditation on brain structure"],
    
    # Testing removal of generic implied terms (practices, techniques, methods)
    ["Articles about deep learning techniques for natural language processing tasks"],
    
    # Testing removal of impact/effect terms
    ["Studies on the impact of early childhood nutrition on cognitive development"],
    
    # Testing handling of technology applications
    ["Information on virtual reality applications in architectural design and urban planning"],
    
    # Testing proper OR relationship with parentheses
    ["Research on electric vehicles adoption in urban environments or rural communities"],
    
    # Testing proper quoting of multi-word concepts only
    ["Articles on biodiversity loss in coral reefs and rainforest ecosystems"],
    
    # Testing removal of strategy/approach terms
    ["Studies about different teaching approaches for children with learning disabilities"],
    
    # Testing complex OR relationships
    ["Research examining social media influence on political polarization or public discourse"],
    
    # Testing implied terms in specific industries
    ["Articles about implementation strategies for blockchain in supply chain management or financial services"],
    
    # Testing qualifiers that don't add search value
    ["Research on effective leadership styles in multicultural organizations"],
    
    # Testing removal of multiple implied terms
    ["Studies on the effects of microplastic pollution techniques on marine ecosystem health"],
    
    # Testing domain-specific implied terms
    ["Articles about successful cybersecurity protection methods for critical infrastructure"],
    
    # Testing generalized vs specific concepts
    ["Research papers on quantum computing algorithms for cryptography or optimization problems"],
    
    # Testing implied terms in outcome descriptions
    ["Studies examining the relationship between sleep quality and academic performance outcomes"],
    
    # Testing complex nesting of concepts
    ["Articles about renewable energy integration challenges in developing countries or island nations"]
]


# Create Gradio interface with metadata for deployment
title = "Boolean Search Query Generator"
description = "Convert natural language queries into boolean search expressions. The model will remove search-related terms (like 'articles', 'research', etc.), handle generic implied terms (like 'practices', 'methods'), and format the core concepts using proper boolean syntax."
demo = gr.Interface(
    fn=get_boolean_query,
    inputs=[
        gr.Textbox(
            label="Enter your natural language query",
            placeholder="e.g., I'm looking for information about climate change and renewable energy"
        )
    ],
    outputs=gr.Textbox(label="Boolean Search Query"),
    title=title,
    description=description,
    examples=examples,
    theme=gr.themes.Soft()
)

if __name__ == "__main__":
    # Start the web server only when executed as a script.
    demo.launch()