Zwounds's picture
Upload folder using huggingface_hub
dc70758 verified
raw
history blame
5.74 kB
import gradio as gr
from vllm import LLM
import logging
# Configure root logging once at import time; INFO level keeps the
# model-loading progress messages below visible on the console.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module, per stdlib convention.
logger = logging.getLogger(__name__)
SYSTEM_INSTRUCTION = """Convert natural language queries into boolean search queries by following these rules:
1. FIRST: Remove all meta-terms from this list (they should NEVER appear in output):
- articles, papers, research, studies
- examining, investigating, analyzing
- findings, documents, literature
- publications, journals, reviews
Example: "Research examining X" β†’ just "X"
2. SECOND: Remove generic implied terms that don't add search value:
- Remove words like "practices," "techniques," "methods," "approaches," "strategies"
- Remove words like "impacts," "effects," "influences," "role," "applications"
- For example: "sustainable agriculture practices" β†’ "sustainable agriculture"
- For example: "teaching methodologies" β†’ "teaching"
- For example: "leadership styles" β†’ "leadership"
3. THEN: Format the remaining terms:
CRITICAL QUOTING RULES:
- Multi-word phrases MUST ALWAYS be in quotes - NO EXCEPTIONS
- Examples of correct quoting:
- Wrong: machine learning AND deep learning
- Right: "machine learning" AND "deep learning"
- Wrong: natural language processing
- Right: "natural language processing"
- Single words must NEVER have quotes (e.g., science, research, learning)
- Use AND to connect required concepts
- Use OR with parentheses for alternatives"""
def load_model():
    """Instantiate and return the vLLM engine for the boolean-search model.

    Returns:
        LLM: a ready-to-use vLLM engine wrapping
        ``Zwounds/boolean-search-model``.
    """
    logger.info("Loading model...")
    # Single-device inference: tensor_parallel_size=1 disables tensor
    # parallelism. NOTE(review): the original comment said "For CPU", but
    # this flag controls GPU sharding, not CPU execution -- confirm the
    # intended deployment target.
    engine = LLM(model="Zwounds/boolean-search-model", tensor_parallel_size=1)
    logger.info("Model loaded successfully")
    return engine
def get_boolean_query(query: str, model=None) -> str:
    """Generate a boolean search query from a natural-language request.

    Args:
        query: Natural-language search request from the user.
        model: A loaded ``vllm.LLM`` instance (see ``load_model``).

    Returns:
        The model's boolean query string, stripped of surrounding whitespace.

    Raises:
        ValueError: If no model instance is supplied.
    """
    # Local import: vllm is already a file-level dependency; importing here
    # keeps the sampling-params dependency next to its single use site.
    from vllm import SamplingParams

    if model is None:
        # Fail loudly instead of the opaque AttributeError on None.generate.
        raise ValueError("A loaded vllm.LLM instance is required")

    # Llama-3-style header markers delimit the system/user/assistant turns.
    # NOTE(review): the template omits <|begin_of_text|> and per-turn
    # <|eot_id|> terminators -- confirm it matches the fine-tune's template.
    prompt = f"""<|start_header_id|>system<|end_header_id|>{SYSTEM_INSTRUCTION}<|start_header_id|>user<|end_header_id|>{query}<|start_header_id|>assistant<|end_header_id|>"""

    # Bug fix: LLM.generate() does not accept max_tokens/temperature/
    # stop_tokens keyword arguments; sampling options must be wrapped in a
    # SamplingParams object, and the stop-sequence field is named ``stop``.
    params = SamplingParams(
        temperature=0.0,  # deterministic decoding
        max_tokens=64,
        stop=["<|eot_id|>"],
    )
    outputs = model.generate(prompt, params)

    # generate() returns one RequestOutput per prompt; take the first
    # completion of the first (only) request.
    return outputs[0].outputs[0].text.strip()
# Example queries demonstrating various cases. Each inner single-element
# list is one Gradio example row (the Interface has a single Textbox input).
examples = [
    # Testing removal of meta-terms
    ["Find research papers examining the long-term effects of meditation on brain structure"],
    # Testing removal of generic implied terms (practices, techniques, methods)
    ["Articles about deep learning techniques for natural language processing tasks"],
    # Testing removal of impact/effect terms
    ["Studies on the impact of early childhood nutrition on cognitive development"],
    # Testing handling of technology applications
    ["Information on virtual reality applications in architectural design and urban planning"],
    # Testing proper OR relationship with parentheses
    ["Research on electric vehicles adoption in urban environments or rural communities"],
    # Testing proper quoting of multi-word concepts only
    ["Articles on biodiversity loss in coral reefs and rainforest ecosystems"],
    # Testing removal of strategy/approach terms
    ["Studies about different teaching approaches for children with learning disabilities"],
    # Testing complex OR relationships
    ["Research examining social media influence on political polarization or public discourse"],
    # Testing implied terms in specific industries
    ["Articles about implementation strategies for blockchain in supply chain management or financial services"],
    # Testing qualifiers that don't add search value
    ["Research on effective leadership styles in multicultural organizations"],
    # Testing removal of multiple implied terms
    ["Studies on the effects of microplastic pollution techniques on marine ecosystem health"],
    # Testing domain-specific implied terms
    ["Articles about successful cybersecurity protection methods for critical infrastructure"],
    # Testing generalized vs specific concepts
    ["Research papers on quantum computing algorithms for cryptography or optimization problems"],
    # Testing implied terms in outcome descriptions
    ["Studies examining the relationship between sleep quality and academic performance outcomes"],
    # Testing complex nesting of concepts
    ["Articles about renewable energy integration challenges in developing countries or island nations"]
]
# Load model globally at import time so a single engine instance is shared
# by all Gradio requests (model loading is expensive; do it once).
logger.info("Initializing model...")
model = load_model()
# Create Gradio interface
title = "Natural Language to Boolean Search"
# User-facing description rendered above the input box; runtime data,
# kept verbatim.
description = """Convert natural language queries into boolean search expressions. The model will:
1. Remove search-related terms (like 'articles', 'research', etc.)
2. Handle generic implied terms (like 'practices', 'methods')
3. Format concepts using proper boolean syntax:
- Multi-word phrases in quotes
- Single words without quotes
- AND to connect required concepts
- OR with parentheses for alternatives
"""
demo = gr.Interface(
    # Bind the shared global model into the per-request handler.
    fn=lambda x: get_boolean_query(x, model),
    inputs=[
        gr.Textbox(
            label="Enter your natural language query",
            placeholder="e.g., I'm looking for information about climate change and renewable energy"
        )
    ],
    outputs=gr.Textbox(label="Boolean Search Query"),
    title=title,
    description=description,
    examples=examples,
    theme=gr.themes.Soft()
)
# Launch the web server only when run as a script (not when imported,
# e.g. by a Hugging Face Spaces runner that calls demo itself).
if __name__ == "__main__":
    demo.launch()