Zwounds committed on
Commit
6d79ec9
·
verified ·
1 Parent(s): e34d0c9

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. demo.py +66 -75
demo.py CHANGED
@@ -1,39 +1,13 @@
1
  import gradio as gr
2
- from llama_cpp import Llama
3
- from huggingface_hub import hf_hub_download
4
  import logging
5
- import os
6
 
7
  # Setup logging
8
  logging.basicConfig(level=logging.INFO)
9
  logger = logging.getLogger(__name__)
10
 
11
- def load_model():
12
- """Load the GGUF model from Hugging Face."""
13
- logger.info("Loading GGUF model...")
14
-
15
- # Download the model from HF Hub
16
- model_path = hf_hub_download(
17
- repo_id="Zwounds/boolean-search-model",
18
- filename="boolean-model.gguf",
19
- repo_type="model"
20
- )
21
-
22
- # Load the model with llama-cpp-python
23
- model = Llama(
24
- model_path=model_path,
25
- n_ctx=2048, # Context window
26
- n_gpu_layers=0 # Use CPU only for HF Spaces compatibility
27
- )
28
-
29
- return model
30
-
31
- def format_prompt(query):
32
- """Format query with instruction prompt."""
33
- return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
34
-
35
- ### Instruction:
36
- Convert this natural language query into a boolean search query by following these rules:
37
 
38
  1. FIRST: Remove all meta-terms from this list (they should NEVER appear in output):
39
  - articles, papers, research, studies
@@ -59,57 +33,61 @@ Convert this natural language query into a boolean search query by following the
59
  - Right: "natural language processing"
60
  - Single words must NEVER have quotes (e.g., science, research, learning)
61
  - Use AND to connect required concepts
62
- - Use OR with parentheses for alternatives (e.g., ("soil health" OR biodiversity))
63
 
64
- Example conversions showing proper quoting:
65
- "Research on machine learning for natural language processing"
66
- "machine learning" AND "natural language processing"
67
-
68
- "Studies examining anxiety depression stress in workplace"
69
- → (anxiety OR depression OR stress) AND workplace
70
-
71
- "Articles about deep learning impact on computer vision"
72
- "deep learning" AND "computer vision"
73
-
74
- "Research on sustainable agriculture practices and their impact on soil health or biodiversity"
75
- → "sustainable agriculture" AND ("soil health" OR biodiversity)
76
-
77
- "Articles about effective teaching methods for second language acquisition"
78
- → teaching AND "second language acquisition"
79
-
80
- ### Input:
81
- {query}
82
 
83
- ### Response:
84
- """
 
 
 
 
 
 
 
 
 
 
 
85
 
86
- def get_boolean_query(query):
87
  """Generate boolean query from natural language."""
88
- prompt = format_prompt(query)
 
 
 
 
 
 
 
 
89
 
90
  # Generate response
91
- response = model(
92
- prompt,
93
- max_tokens=64,
94
- temperature=0,
95
- stop=["<|end_of_text|>", "###"] # Stop at these tokens
 
 
96
  )
97
 
98
- # Extract generated text
99
- text = response["choices"][0]["text"].strip()
100
-
101
- # Extract response section if present
102
- if "### Response:" in text:
103
- text = text.split("### Response:")[-1].strip()
104
-
105
- return text
106
 
107
- # Load model globally
108
- logger.info("Initializing model...")
109
- model = load_model()
110
- logger.info("Model loaded successfully")
111
-
112
- # Example queries using more natural language
113
  examples = [
114
  # Testing removal of meta-terms
115
  ["Find research papers examining the long-term effects of meditation on brain structure"],
@@ -157,12 +135,25 @@ examples = [
157
  ["Articles about renewable energy integration challenges in developing countries or island nations"]
158
  ]
159
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
161
- # Create Gradio interface with metadata for deployment
162
- title = "Boolean Search Query Generator"
163
- description = "Convert natural language queries into boolean search expressions. The model will remove search-related terms (like 'articles', 'research', etc.), handle generic implied terms (like 'practices', 'methods'), and format the core concepts using proper boolean syntax."
164
  demo = gr.Interface(
165
- fn=get_boolean_query,
166
  inputs=[
167
  gr.Textbox(
168
  label="Enter your natural language query",
 
1
  import gradio as gr
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer
3
+ import torch
4
  import logging
 
5
 
6
  # Setup logging
7
  logging.basicConfig(level=logging.INFO)
8
  logger = logging.getLogger(__name__)
9
 
10
+ SYSTEM_INSTRUCTION = """Convert natural language queries into boolean search queries by following these rules:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  1. FIRST: Remove all meta-terms from this list (they should NEVER appear in output):
13
  - articles, papers, research, studies
 
33
  - Right: "natural language processing"
34
  - Single words must NEVER have quotes (e.g., science, research, learning)
35
  - Use AND to connect required concepts
36
+ - Use OR with parentheses for alternatives"""
37
 
38
+ def load_model():
39
+ """Load the model and set up tokenizer."""
40
+ logger.info("Loading model...")
41
+ model = AutoModelForCausalLM.from_pretrained(
42
+ "boolean_model_merged",
43
+ device_map="auto",
44
+ torch_dtype=torch.float16
45
+ )
46
+ tokenizer = AutoTokenizer.from_pretrained("boolean_model_merged")
47
+ tokenizer.use_default_system_prompt = False
48
+ logger.info("Model loaded successfully")
49
+
50
+ return model, tokenizer
 
 
 
 
 
51
 
52
+ def extract_response(output: str) -> str:
53
+ """Extract the response part from the output."""
54
+ start_marker = "<|start_header_id|>assistant<|end_header_id|>"
55
+ end_marker = "<|eot_id|>"
56
+
57
+ start_idx = output.find(start_marker)
58
+ if start_idx != -1:
59
+ start_idx += len(start_marker)
60
+ end_idx = output.find(end_marker, start_idx)
61
+ if end_idx != -1:
62
+ return output[start_idx:end_idx].strip()
63
+
64
+ return output.strip()
65
 
66
+ def get_boolean_query(query: str, model=None, tokenizer=None) -> str:
67
  """Generate boolean query from natural language."""
68
+ # Format the conversation
69
+ conversation = [
70
+ {"role": "system", "content": SYSTEM_INSTRUCTION},
71
+ {"role": "user", "content": query}
72
+ ]
73
+
74
+ # Format into chat template
75
+ prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
76
+ inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
77
 
78
  # Generate response
79
+ outputs = model.generate(
80
+ **inputs,
81
+ max_new_tokens=64,
82
+ do_sample=False,
83
+ use_cache=True,
84
+ pad_token_id=tokenizer.pad_token_id,
85
+ eos_token_id=tokenizer.eos_token_id
86
  )
87
 
88
+ return extract_response(tokenizer.batch_decode(outputs)[0])
 
 
 
 
 
 
 
89
 
90
+ # Example queries demonstrating various cases
 
 
 
 
 
91
  examples = [
92
  # Testing removal of meta-terms
93
  ["Find research papers examining the long-term effects of meditation on brain structure"],
 
135
  ["Articles about renewable energy integration challenges in developing countries or island nations"]
136
  ]
137
 
138
+ # Load model globally
139
+ logger.info("Initializing model...")
140
+ model, tokenizer = load_model()
141
+
142
+ # Create Gradio interface
143
+ title = "Natural Language to Boolean Search"
144
+ description = """Convert natural language queries into boolean search expressions. The model will:
145
+
146
+ 1. Remove search-related terms (like 'articles', 'research', etc.)
147
+ 2. Handle generic implied terms (like 'practices', 'methods')
148
+ 3. Format concepts using proper boolean syntax:
149
+ - Multi-word phrases in quotes
150
+ - Single words without quotes
151
+ - AND to connect required concepts
152
+ - OR with parentheses for alternatives
153
+ """
154
 
 
 
 
155
  demo = gr.Interface(
156
+ fn=lambda x: get_boolean_query(x, model, tokenizer),
157
  inputs=[
158
  gr.Textbox(
159
  label="Enter your natural language query",