"""Gradio app: convert natural-language queries into boolean search queries.

Loads a fine-tuned causal LM ("Zwounds/boolean-search-model") at import time
and exposes a single-textbox Gradio interface around it.
"""
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import logging

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# System prompt sent verbatim to the model on every request. Do not edit
# casually: the model was fine-tuned against this exact instruction text.
SYSTEM_INSTRUCTION = """Convert natural language queries into boolean search queries by following these rules:

1. FIRST: Remove all meta-terms from this list (they should NEVER appear in output):
   - articles, papers, research, studies
   - examining, investigating, analyzing
   - findings, documents, literature
   - publications, journals, reviews
   Example: "Research examining X" → just "X"

2. SECOND: Remove generic implied terms that don't add search value:
   - Remove words like "practices," "techniques," "methods," "approaches," "strategies"
   - Remove words like "impacts," "effects," "influences," "role," "applications"
   - For example: "sustainable agriculture practices" → "sustainable agriculture"
   - For example: "teaching methodologies" → "teaching"
   - For example: "leadership styles" → "leadership"

3. THEN: Format the remaining terms:
   CRITICAL QUOTING RULES:
   - Multi-word phrases MUST ALWAYS be in quotes - NO EXCEPTIONS
   - Examples of correct quoting:
     - Wrong: machine learning AND deep learning
     - Right: "machine learning" AND "deep learning"
     - Wrong: natural language processing
     - Right: "natural language processing"
   - Single words must NEVER have quotes (e.g., science, research, learning)
   - Use AND to connect required concepts
   - Use OR with parentheses for alternatives"""


def load_model():
    """Load the model and set up tokenizer.

    Returns:
        tuple: (model, tokenizer) ready for inference on CPU (float32).
    """
    logger.info("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        "Zwounds/boolean-search-model",
        torch_dtype=torch.float32
    )
    tokenizer = AutoTokenizer.from_pretrained("Zwounds/boolean-search-model")
    # The chat template already embeds SYSTEM_INSTRUCTION explicitly, so
    # suppress any default system prompt the tokenizer might inject.
    tokenizer.use_default_system_prompt = False
    logger.info("Model loaded successfully")
    return model, tokenizer


def extract_response(output: str) -> str:
    """Extract the assistant response part from the raw decoded output.

    Looks for the Llama-3-style assistant header and the following
    end-of-turn marker; falls back to the whole (stripped) output when the
    markers are absent.
    """
    start_marker = "<|start_header_id|>assistant<|end_header_id|>"
    end_marker = "<|eot_id|>"

    start_idx = output.find(start_marker)
    if start_idx != -1:
        start_idx += len(start_marker)
        end_idx = output.find(end_marker, start_idx)
        if end_idx != -1:
            return output[start_idx:end_idx].strip()
    return output.strip()


def get_boolean_query(query: str, model=None, tokenizer=None) -> str:
    """Generate a boolean search query from natural language.

    Args:
        query: The user's natural-language search request.
        model: A loaded causal LM (required).
        tokenizer: The matching tokenizer (required).

    Returns:
        The model's boolean query string.

    Raises:
        ValueError: If model or tokenizer is not provided. (Previously the
            None defaults were dereferenced directly, crashing with an
            opaque AttributeError.)
    """
    if model is None or tokenizer is None:
        raise ValueError("get_boolean_query requires both a model and a tokenizer")

    # Format the conversation
    conversation = [
        {"role": "system", "content": SYSTEM_INSTRUCTION},
        {"role": "user", "content": query}
    ]

    # add_generation_prompt=True appends the assistant header so the model
    # completes an assistant turn — this is the documented usage for
    # generation, and it is the marker extract_response() searches for.
    prompt = tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Llama-family tokenizers often define no pad token; fall back to EOS
    # rather than passing None to generate().
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    # Greedy decode; no_grad avoids building an autograd graph at inference.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=64,
            do_sample=False,
            use_cache=True,
            pad_token_id=pad_id,
            eos_token_id=tokenizer.eos_token_id
        )

    return extract_response(tokenizer.batch_decode(outputs)[0])


# Example queries demonstrating various cases
examples = [
    # Testing removal of meta-terms
    ["Find research papers examining the long-term effects of meditation on brain structure"],

    # Testing removal of generic implied terms (practices, techniques, methods)
    ["Articles about deep learning techniques for natural language processing tasks"],

    # Testing removal of impact/effect terms
    ["Studies on the impact of early childhood nutrition on cognitive development"],

    # Testing handling of technology applications
    ["Information on virtual reality applications in architectural design and urban planning"],

    # Testing proper OR relationship with parentheses
    ["Research on electric vehicles adoption in urban environments or rural communities"],

    # Testing proper quoting of multi-word concepts only
    ["Articles on biodiversity loss in coral reefs and rainforest ecosystems"],

    # Testing removal of strategy/approach terms
    ["Studies about different teaching approaches for children with learning disabilities"],

    # Testing complex OR relationships
    ["Research examining social media influence on political polarization or public discourse"],

    # Testing implied terms in specific industries
    ["Articles about implementation strategies for blockchain in supply chain management or financial services"],

    # Testing qualifiers that don't add search value
    ["Research on effective leadership styles in multicultural organizations"],

    # Testing removal of multiple implied terms
    ["Studies on the effects of microplastic pollution techniques on marine ecosystem health"],

    # Testing domain-specific implied terms
    ["Articles about successful cybersecurity protection methods for critical infrastructure"],

    # Testing generalized vs specific concepts
    ["Research papers on quantum computing algorithms for cryptography or optimization problems"],

    # Testing implied terms in outcome descriptions
    ["Studies examining the relationship between sleep quality and academic performance outcomes"],

    # Testing complex nesting of concepts
    ["Articles about renewable energy integration challenges in developing countries or island nations"]
]

# Load model globally (import-time side effect; downloads weights on first run)
logger.info("Initializing model...")
model, tokenizer = load_model()

# Create Gradio interface
title = "Natural Language to Boolean Search"
description = """Convert natural language queries into boolean search expressions. The model will:

1. Remove search-related terms (like 'articles', 'research', etc.)
2. Handle generic implied terms (like 'practices', 'methods')
3. Format concepts using proper boolean syntax:
   - Multi-word phrases in quotes
   - Single words without quotes
   - AND to connect required concepts
   - OR with parentheses for alternatives
"""

demo = gr.Interface(
    fn=lambda x: get_boolean_query(x, model, tokenizer),
    inputs=[
        gr.Textbox(
            label="Enter your natural language query",
            placeholder="e.g., I'm looking for information about climate change and renewable energy"
        )
    ],
    outputs=gr.Textbox(label="Boolean Search Query"),
    title=title,
    description=description,
    examples=examples,
    theme=gr.themes.Soft()
)

if __name__ == "__main__":
    demo.launch()