File size: 7,145 Bytes
39838a2
 
e635ed4
39838a2
 
 
 
 
 
 
74654a8
39838a2
e2c5a01
e635ed4
 
 
e2c5a01
74654a8
 
d8057b0
 
74654a8
e635ed4
39838a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging

# Configure root logging once at import time so the load/startup messages
# below are visible in the hosting console.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_model():
    """Fetch the fine-tuned boolean-search model and its tokenizer.

    Loads in plain float32 with no quantization so the model runs on
    CPU-only hosts (e.g. free Hugging Face Spaces hardware).

    Returns:
        tuple: (model, tokenizer) ready for generation.
    """
    logger.info("Loading model...")

    repo_id = "Zwounds/boolean-search-model"

    # AutoTokenizer resolves the right tokenizer class from the repo config.
    tokenizer = AutoTokenizer.from_pretrained(repo_id)

    # low_cpu_mem_usage streams weights in to keep peak RAM down;
    # float32 is the safe dtype for CPU inference.
    model = AutoModelForCausalLM.from_pretrained(
        repo_id,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float32,
    )

    return model, tokenizer

def format_prompt(query):
    """Embed *query* into the instruction prompt the model was trained on.

    Returns the full Alpaca-style prompt ending with an open
    '### Response:' section for the model to complete.
    """
    template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Convert this natural language query into a boolean search query by following these rules:

1. FIRST: Remove all meta-terms from this list (they should NEVER appear in output):
   - articles, papers, research, studies
   - examining, investigating, analyzing
   - findings, documents, literature
   - publications, journals, reviews
   Example: "Research examining X" β†’ just "X"

2. SECOND: Remove generic implied terms that don't add search value:
   - Remove words like "practices," "techniques," "methods," "approaches," "strategies"
   - Remove words like "impacts," "effects," "influences," "role," "applications"
   - For example: "sustainable agriculture practices" β†’ "sustainable agriculture"
   - For example: "teaching methodologies" β†’ "teaching"
   - For example: "leadership styles" β†’ "leadership"

3. THEN: Format the remaining terms:
   CRITICAL QUOTING RULES:
   - Multi-word phrases MUST ALWAYS be in quotes - NO EXCEPTIONS
   - Examples of correct quoting:
     - Wrong: machine learning AND deep learning
     - Right: "machine learning" AND "deep learning"
     - Wrong: natural language processing
     - Right: "natural language processing"
   - Single words must NEVER have quotes (e.g., science, research, learning)
   - Use AND to connect required concepts
   - Use OR with parentheses for alternatives (e.g., ("soil health" OR biodiversity))

Example conversions showing proper quoting:
"Research on machine learning for natural language processing"
β†’ "machine learning" AND "natural language processing"

"Studies examining anxiety depression stress in workplace"
β†’ (anxiety OR depression OR stress) AND workplace

"Articles about deep learning impact on computer vision"
β†’ "deep learning" AND "computer vision"

"Research on sustainable agriculture practices and their impact on soil health or biodiversity"
β†’ "sustainable agriculture" AND ("soil health" OR biodiversity)

"Articles about effective teaching methods for second language acquisition"
β†’ teaching AND "second language acquisition"

### Input:
{query}

### Response:
"""
    # The template contains no stray braces, so str.format is safe here.
    return template.format(query=query)

def get_boolean_query(query):
    """Generate a boolean search expression from a natural-language query.

    Uses the module-level ``model`` and ``tokenizer``. Greedy decoding
    (``do_sample=False``) keeps the output deterministic for a given query.

    Args:
        query: Natural-language search request from the user.

    Returns:
        The model's boolean query string, with the prompt and the
        end-of-text marker stripped.
    """
    prompt = format_prompt(query)

    # BUG FIX: the original chose "cuda" whenever torch.cuda.is_available(),
    # but the model is loaded without device_map and stays on CPU — moving
    # only the inputs to the GPU would crash generate(). Always co-locate
    # the inputs with the model's actual device.
    device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=32,
        do_sample=False,
        use_cache=True,
        eos_token_id=tokenizer.eos_token_id,
        # Explicit pad token avoids the "pad_token_id not set" warning.
        pad_token_id=tokenizer.eos_token_id,
    )

    # Keep only the text after the final "### Response:" marker.
    full_response = tokenizer.decode(outputs[0])
    response = full_response.split("### Response:")[-1].strip()
    # Drop the end-of-text special token if decode left it in.
    return response.replace("<|end_of_text|>", "").strip()

# Load the model once at import time so every Gradio request reuses the
# same instance instead of reloading per call.
logger.info("Initializing model...")
model, tokenizer = load_model()
logger.info("Model loaded successfully")

# Example queries (one per row, as Gradio expects: a list of input lists).
# Each entry exercises a different rewrite rule from the prompt template.
examples: list[list[str]] = [
    # Testing removal of meta-terms
    ["Find research papers examining the long-term effects of meditation on brain structure"],
    
    # Testing removal of generic implied terms (practices, techniques, methods)
    ["Articles about deep learning techniques for natural language processing tasks"],
    
    # Testing removal of impact/effect terms
    ["Studies on the impact of early childhood nutrition on cognitive development"],
    
    # Testing handling of technology applications
    ["Information on virtual reality applications in architectural design and urban planning"],
    
    # Testing proper OR relationship with parentheses
    ["Research on electric vehicles adoption in urban environments or rural communities"],
    
    # Testing proper quoting of multi-word concepts only
    ["Articles on biodiversity loss in coral reefs and rainforest ecosystems"],
    
    # Testing removal of strategy/approach terms
    ["Studies about different teaching approaches for children with learning disabilities"],
    
    # Testing complex OR relationships
    ["Research examining social media influence on political polarization or public discourse"],
    
    # Testing implied terms in specific industries
    ["Articles about implementation strategies for blockchain in supply chain management or financial services"],
    
    # Testing qualifiers that don't add search value
    ["Research on effective leadership styles in multicultural organizations"],
    
    # Testing removal of multiple implied terms
    ["Studies on the effects of microplastic pollution techniques on marine ecosystem health"],
    
    # Testing domain-specific implied terms
    ["Articles about successful cybersecurity protection methods for critical infrastructure"],
    
    # Testing generalized vs specific concepts
    ["Research papers on quantum computing algorithms for cryptography or optimization problems"],
    
    # Testing implied terms in outcome descriptions
    ["Studies examining the relationship between sleep quality and academic performance outcomes"],
    
    # Testing complex nesting of concepts
    ["Articles about renewable energy integration challenges in developing countries or island nations"]
]


# Create Gradio interface with metadata for deployment
title = "Boolean Search Query Generator"
description = "Convert natural language queries into boolean search expressions. The model will remove search-related terms (like 'articles', 'research', etc.), handle generic implied terms (like 'practices', 'methods'), and format the core concepts using proper boolean syntax."
demo = gr.Interface(
    fn=get_boolean_query,
    inputs=[
        gr.Textbox(
            label="Enter your natural language query",
            placeholder="e.g., I'm looking for information about climate change and renewable energy"
        )
    ],
    outputs=gr.Textbox(label="Boolean Search Query"),
    title=title,
    description=description,
    examples=examples,
    theme=gr.themes.Soft()
)

if __name__ == "__main__":
    # Start the web server only when executed as a script.
    demo.launch()