Spaces:

Zwounds
/

Boolean_Search_Query_Model

Runtime error

App Files Community

Zwounds committed on Mar 19

Commit

6d79ec9

verified ·

1 Parent(s): e34d0c9

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

demo.py +66 -75

demo.py CHANGED Viewed

@@ -1,39 +1,13 @@
 import gradio as gr
-from llama_cpp import Llama
-from huggingface_hub import hf_hub_download
 import logging
-import os
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-def load_model():
-    """Load the GGUF model from Hugging Face."""
-    logger.info("Loading GGUF model...")
-    # Download the model from HF Hub
-    model_path = hf_hub_download(
-        repo_id="Zwounds/boolean-search-model",
-        filename="boolean-model.gguf",
-        repo_type="model"
-    )
-    # Load the model with llama-cpp-python
-    model = Llama(
-        model_path=model_path,
-        n_ctx=2048,  # Context window
-        n_gpu_layers=0  # Use CPU only for HF Spaces compatibility
-    )
-    return model
-def format_prompt(query):
-    """Format query with instruction prompt."""
-    return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
-### Instruction:
-Convert this natural language query into a boolean search query by following these rules:
 1. FIRST: Remove all meta-terms from this list (they should NEVER appear in output):
    - articles, papers, research, studies
@@ -59,57 +33,61 @@ Convert this natural language query into a boolean search query by following the
      - Right: "natural language processing"
    - Single words must NEVER have quotes (e.g., science, research, learning)
    - Use AND to connect required concepts
-   - Use OR with parentheses for alternatives (e.g., ("soil health" OR biodiversity))
-Example conversions showing proper quoting:
-"Research on machine learning for natural language processing"
-→ "machine learning" AND "natural language processing"
-"Studies examining anxiety depression stress in workplace"
-→ (anxiety OR depression OR stress) AND workplace
-"Articles about deep learning impact on computer vision"
-→ "deep learning" AND "computer vision"
-"Research on sustainable agriculture practices and their impact on soil health or biodiversity"
-→ "sustainable agriculture" AND ("soil health" OR biodiversity)
-"Articles about effective teaching methods for second language acquisition"
-→ teaching AND "second language acquisition"
-### Input:
-{query}
-### Response:
-"""
-def get_boolean_query(query):
     """Generate boolean query from natural language."""
-    prompt = format_prompt(query)
     # Generate response
-    response = model(
-        prompt,
-        max_tokens=64,
-        temperature=0,
-        stop=["<|end_of_text|>", "###"]  # Stop at these tokens
     )
-    # Extract generated text
-    text = response["choices"][0]["text"].strip()
-    # Extract response section if present
-    if "### Response:" in text:
-        text = text.split("### Response:")[-1].strip()
-    return text
-# Load model globally
-logger.info("Initializing model...")
-model = load_model()
-logger.info("Model loaded successfully")
-# Example queries using more natural language
 examples = [
     # Testing removal of meta-terms
     ["Find research papers examining the long-term effects of meditation on brain structure"],
@@ -157,12 +135,25 @@ examples = [
     ["Articles about renewable energy integration challenges in developing countries or island nations"]
 ]
-# Create Gradio interface with metadata for deployment
-title = "Boolean Search Query Generator"
-description = "Convert natural language queries into boolean search expressions. The model will remove search-related terms (like 'articles', 'research', etc.), handle generic implied terms (like 'practices', 'methods'), and format the core concepts using proper boolean syntax."
 demo = gr.Interface(
-    fn=get_boolean_query,
     inputs=[
         gr.Textbox(
             label="Enter your natural language query",

 import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
 import logging
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+SYSTEM_INSTRUCTION = """Convert natural language queries into boolean search queries by following these rules:
 1. FIRST: Remove all meta-terms from this list (they should NEVER appear in output):
    - articles, papers, research, studies
      - Right: "natural language processing"
    - Single words must NEVER have quotes (e.g., science, research, learning)
    - Use AND to connect required concepts
+   - Use OR with parentheses for alternatives"""
def load_model(model_path: str = "boolean_model_merged", torch_dtype=None):
    """Load the causal language model and its tokenizer.

    Args:
        model_path: Directory or hub id handed to both ``from_pretrained``
            calls. Defaults to the merged checkpoint this app ships with, so
            existing ``load_model()`` callers are unaffected.
        torch_dtype: Weight dtype for the model. ``None`` keeps the
            historical default of ``torch.float16``.
            NOTE(review): half precision can be slow or partially unsupported
            on CPU-only hosts -- pass ``torch.float32`` there if needed.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    logger.info("Loading model...")
    dtype = torch.float16 if torch_dtype is None else torch_dtype

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        torch_dtype=dtype,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer.use_default_system_prompt = False

    # generate() is later called with pad_token_id=tokenizer.pad_token_id;
    # fall back to EOS so that value is never None when the checkpoint's
    # tokenizer does not define a dedicated padding token.
    if tokenizer.pad_token_id is None and tokenizer.eos_token is not None:
        tokenizer.pad_token = tokenizer.eos_token

    logger.info("Model loaded successfully")
    return model, tokenizer
def extract_response(output: str) -> str:
    """Extract the assistant's reply from a decoded chat transcript.

    The reply is the text between the first assistant header and the
    ``<|eot_id|>`` that follows it.

    Args:
        output: Decoded model output, possibly still containing the prompt
            and special-token markers.

    Returns:
        The stripped assistant reply. If no assistant header is present the
        whole ``output`` is returned stripped. If the header is present but
        the end marker is missing (for example, generation stopped at the
        token limit), everything after the header is returned rather than
        the full transcript.
    """
    start_marker = "<|start_header_id|>assistant<|end_header_id|>"
    end_marker = "<|eot_id|>"

    _, header, after_header = output.partition(start_marker)
    if not header:
        # No assistant turn at all: nothing to slice out.
        return output.strip()

    # partition() yields the whole remainder when end_marker is absent, so a
    # truncated generation still returns just the (partial) reply instead of
    # leaking the system/user prompt back to the caller.
    reply, _, _ = after_header.partition(end_marker)
    return reply.strip()
def get_boolean_query(query: str, model=None, tokenizer=None) -> str:
    """Generate boolean query from natural language.

    Args:
        query: Natural-language search request typed by the user.
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Matching tokenizer exposing ``apply_chat_template``.

    Returns:
        The boolean search expression produced by the model, or an empty
        string when ``query`` is empty or whitespace-only.

    Raises:
        ValueError: If a non-blank query is given without a model or tokenizer.
    """
    # Nothing to convert: skip the (expensive) generation call entirely.
    if not query or not query.strip():
        return ""
    if model is None or tokenizer is None:
        raise ValueError("get_boolean_query requires both a model and a tokenizer")

    # Format the conversation
    conversation = [
        {"role": "system", "content": SYSTEM_INSTRUCTION},
        {"role": "user", "content": query},
    ]

    # Render the chat template. add_generation_prompt=True appends the
    # assistant header so the model starts directly on its reply instead of
    # spending new tokens on (or skipping) the header.
    prompt = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    # The rendered template is expected to already carry its special tokens;
    # tokenizing with add_special_tokens=False avoids inserting them twice.
    # NOTE(review): assumes the checkpoint's template emits BOS itself -- confirm.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=False,
    ).to(model.device)

    # Some tokenizers define no pad token; fall back to EOS so generate()
    # never receives pad_token_id=None.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Generate response (greedy decoding, capped at 64 new tokens)
    outputs = model.generate(
        **inputs,
        max_new_tokens=64,
        do_sample=False,
        use_cache=True,
        pad_token_id=pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    return extract_response(tokenizer.batch_decode(outputs)[0])
+# Example queries demonstrating various cases
 examples = [
     # Testing removal of meta-terms
     ["Find research papers examining the long-term effects of meditation on brain structure"],
     ["Articles about renewable energy integration challenges in developing countries or island nations"]
 ]
+# Load model globally
+logger.info("Initializing model...")
+model, tokenizer = load_model()
+# Create Gradio interface
+title = "Natural Language to Boolean Search"
+description = """Convert natural language queries into boolean search expressions. The model will:
+1. Remove search-related terms (like 'articles', 'research', etc.)
+2. Handle generic implied terms (like 'practices', 'methods')
+3. Format concepts using proper boolean syntax:
+   - Multi-word phrases in quotes
+   - Single words without quotes
+   - AND to connect required concepts
+   - OR with parentheses for alternatives
+"""
 demo = gr.Interface(
+    fn=lambda x: get_boolean_query(x, model, tokenizer),
     inputs=[
         gr.Textbox(
             label="Enter your natural language query",