om4r932 committed on
Commit
8555b30
·
1 Parent(s): 3e58bed

Add experimental keyword search

Browse files
Files changed (4) hide show
  1. app.py +102 -2
  2. static/script.js +100 -12
  3. static/style.css +11 -5
  4. templates/index.html +44 -0
app.py CHANGED
@@ -1,4 +1,5 @@
1
  from io import StringIO
 
2
  import numpy as np
3
  import pandas as pd
4
  import requests
@@ -18,11 +19,15 @@ from dotenv import load_dotenv
18
  import warnings
19
  from fastapi import FastAPI, HTTPException
20
  from fastapi.middleware.cors import CORSMiddleware
 
 
21
  from fastapi.responses import FileResponse
22
  from fastapi.staticfiles import StaticFiles
23
  from pydantic import BaseModel
24
  from typing import Any, Dict, List, Literal, Optional
25
 
 
 
26
  load_dotenv()
27
 
28
  warnings.filterwarnings("ignore")
@@ -221,6 +226,13 @@ class BatchDocResponse(BaseModel):
221
  missing: List[str]
222
  search_time: float
223
 
 
 
 
 
 
 
 
224
  class KeywordRequest(BaseModel):
225
  keywords: Optional[str] = ""
226
  search_mode: Literal["quick", "deep"]
@@ -441,11 +453,99 @@ class SpecDocFinder:
441
 
442
  finder_tsg = TsgDocFinder()
443
  finder_spec = SpecDocFinder()
 
 
 
 
 
 
444
 
445
  @app.get("/")
446
  async def main_menu():
447
  return FileResponse(os.path.join("templates", "index.html"))
448
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
  @app.post("/search-spec", response_model=KeywordResponse)
450
  def search_spec(request: KeywordRequest):
451
  start_time = time.time()
@@ -486,7 +586,7 @@ def search_spec(request: KeywordRequest):
486
  if search_mode == "deep":
487
  if docValid:
488
  for chapter in list(doc.keys())[1:]:
489
- if "references" not in chapter.lower() and "void" not in chapter.lower() and "annexe" not in doc[chapter].lower():
490
  if all(kw in caseSensitive(doc[chapter], booleanLowered) for kw in kws):
491
  put = True
492
  contents.append(chapter)
@@ -496,7 +596,7 @@ def search_spec(request: KeywordRequest):
496
  if search_mode == "deep":
497
  if docValid:
498
  for chapter in list(doc.keys())[1:]:
499
- if "references" not in chapter.lower() and "void" not in chapter.lower() and "annexe" not in doc[chapter].lower():
500
  if any(kw in caseSensitive(doc[chapter], booleanLowered) for kw in kws):
501
  put = True
502
  contents.append(chapter)
 
1
  from io import StringIO
2
+ import bm25s
3
  import numpy as np
4
  import pandas as pd
5
  import requests
 
19
  import warnings
20
  from fastapi import FastAPI, HTTPException
21
  from fastapi.middleware.cors import CORSMiddleware
22
+ import nltk
23
+ from nltk.stem import WordNetLemmatizer
24
  from fastapi.responses import FileResponse
25
  from fastapi.staticfiles import StaticFiles
26
  from pydantic import BaseModel
27
  from typing import Any, Dict, List, Literal, Optional
28
 
29
+ from sklearn.preprocessing import MinMaxScaler
30
+ nltk.download("wordnet")
31
  load_dotenv()
32
 
33
  warnings.filterwarnings("ignore")
 
226
  missing: List[str]
227
  search_time: float
228
 
229
class KeywordRequest2(BaseModel):
    """Request body accepted by the experimental BM25 search endpoint."""

    # Free-text query; defaults to an empty string when omitted.
    keywords: Optional[str] = ""
    # Minimum normalized score, expressed as a percentage (the endpoint
    # compares the normalized score against ``threshold / 100``).
    threshold: Optional[int] = 60
    # Optional filter compared against the first dot-separated component of
    # the specification's ``version`` metadata.
    release: Optional[str] = None
    # Optional filter compared against the ``working_group`` metadata.
    working_group: Optional[str] = None
    # Optional filter compared against the ``type`` metadata.
    spec_type: Optional[Literal["TS", "TR"]] = None
235
+
236
  class KeywordRequest(BaseModel):
237
  keywords: Optional[str] = ""
238
  search_mode: Literal["quick", "deep"]
 
453
 
454
  finder_tsg = TsgDocFinder()
455
  finder_spec = SpecDocFinder()
456
+ lemmatizer = WordNetLemmatizer()
457
+
458
+ if os.path.exists("bm25s.zip"):
459
+ with zipfile.ZipFile("bm25s.zip", 'r') as zip_ref:
460
+ zip_ref.extractall(".")
461
+ bm25_engine = bm25s.BM25.load("3gpp_bm25_docs", load_corpus=True)
462
 
463
  @app.get("/")
464
  async def main_menu():
465
  return FileResponse(os.path.join("templates", "index.html"))
466
 
467
def _lemmatized_words(text):
    """Return the set of whitespace-separated words of *text*, lemmatized then lowercased."""
    return {lemmatizer.lemmatize(word).lower() for word in text.split()}


def _boosted_score(metadata, score, query_words):
    """Add title/spec-id bonuses to a raw BM25 score.

    Args:
        metadata: Metadata dict of the retrieved document (reads ``id`` and ``title``).
        score: Raw BM25 score of the document.
        query_words: Lowercased, lemmatized words of the user query.

    Returns:
        ``score`` plus 0.5 if the spec id appears among the query words, plus
        0.5 for every query word that also appears in the title.
    """
    title_words = _lemmatized_words(str(metadata.get("title") or ""))
    id_bonus = 0.5 if str(metadata["id"]).lower() in query_words else 0
    return score + id_bonus + 0.5 * len(query_words & title_words)


def _normalize_scores(scores_by_spec):
    """Min-max scale the values of *scores_by_spec*, keeping the keys."""
    if not scores_by_spec:
        return {}
    column = np.array(list(scores_by_spec.values())).reshape(-1, 1)
    scaled = MinMaxScaler().fit_transform(column).flatten()
    return dict(zip(scores_by_spec, scaled))


def _passes_filters(metadata, release, spec_type, working_group):
    """Tell whether *metadata* has version/type/working group and matches the given filters.

    A filter set to ``None`` is not applied, but the corresponding metadata
    field must still be present for the document to be kept.
    """
    version = metadata.get("version")
    if version is None:
        return False
    if release is not None and version.split(".")[0] != str(release):
        return False

    doc_type = metadata.get("type")
    if doc_type is None or (spec_type is not None and doc_type != spec_type):
        return False

    group = metadata.get("working_group")
    if group is None or (working_group is not None and group != working_group):
        return False
    return True


@app.post("/search-spec/experimental", response_model=KeywordResponse)
def search_spec_bm25(request: KeywordRequest2):
    """Search specifications with the BM25 index and return the best match per spec.

    Every document of the corpus is scored, the score is boosted when the
    query mentions the spec id or words of the title, only the best-scoring
    document of each spec is kept, scores are min-max normalized, and specs
    whose normalized score is below ``threshold`` percent are dropped.

    Raises:
        HTTPException: 400 when no keyword is supplied, 404 when nothing
            passes the filters and the threshold.
    """
    start_time = time.time()

    keywords = (request.keywords or "").strip()
    if not keywords:
        raise HTTPException(status_code=400, detail="Please enter at least one keyword")

    # An explicit null threshold falls back to the model's default of 60.
    threshold = 60 if request.threshold is None else request.threshold
    min_score = threshold / 100

    # Lemmatize word by word: the lemmatizer looks up a single word, so
    # handing it the whole phrase leaves multi-word queries untouched.
    lemmas = [lemmatizer.lemmatize(word) for word in keywords.split()]
    query = " ".join(lemmas)
    query_words = {lemma.lower() for lemma in lemmas}

    query_tokens = bm25s.tokenize(query)
    results, scores = bm25_engine.retrieve(query_tokens, k=len(bm25_engine.corpus))

    # Keep, for every spec id, the metadata of its best boosted document.
    best_scores = {}
    best_metadata = {}
    for doc, score in zip(results[0], scores[0]):
        metadata = doc["metadata"]
        spec = metadata["id"]
        boosted = _boosted_score(metadata, score, query_words)
        if spec not in best_scores or boosted > best_scores[spec]:
            best_scores[spec] = boosted
            best_metadata[spec] = metadata

    normalized = _normalize_scores(best_scores)
    ranked_specs = sorted(normalized, key=normalized.get, reverse=True)

    results_out = []
    for spec in ranked_specs:
        # Specs are sorted by decreasing score, so nothing after the first
        # one below the threshold can qualify.
        if normalized[spec] < min_score:
            break
        metadata = best_metadata[spec]
        if _passes_filters(metadata, request.release, request.spec_type, request.working_group):
            results_out.append(metadata)

    if not results_out:
        raise HTTPException(status_code=404, detail="Specifications not found")

    return KeywordResponse(
        results=results_out,
        search_time=time.time() - start_time
    )
548
+
549
  @app.post("/search-spec", response_model=KeywordResponse)
550
  def search_spec(request: KeywordRequest):
551
  start_time = time.time()
 
586
  if search_mode == "deep":
587
  if docValid:
588
  for chapter in list(doc.keys())[1:]:
589
+ if "references" not in chapter.lower() and "void" not in chapter.lower() and "annex" not in doc[chapter].lower():
590
  if all(kw in caseSensitive(doc[chapter], booleanLowered) for kw in kws):
591
  put = True
592
  contents.append(chapter)
 
596
  if search_mode == "deep":
597
  if docValid:
598
  for chapter in list(doc.keys())[1:]:
599
+ if "references" not in chapter.lower() and "void" not in chapter.lower() and "annex" not in doc[chapter].lower():
600
  if any(kw in caseSensitive(doc[chapter], booleanLowered) for kw in kws):
601
  put = True
602
  contents.append(chapter)
static/script.js CHANGED
@@ -5,14 +5,18 @@ const dynamicTitle = document.getElementById("dynamicTitle");
5
  const singleModeBtn = document.getElementById('single-mode-btn');
6
  const batchModeBtn = document.getElementById('batch-mode-btn');
7
  const keywordModeBtn = document.getElementById("keyword-mode-btn");
 
8
 
9
- const singleInput = document.querySelector('.single-input');
10
- const batchInput = document.querySelector('.batch-input');
11
- const keywordSearchInput = document.querySelector(".keyword-input");
 
12
 
13
  const docIdInput = document.getElementById('doc-id');
14
  const batchIdsInput = document.getElementById('batch-ids');
15
  const keywordInput = document.getElementById("keywords");
 
 
16
 
17
  const releaseFilter = document.querySelector("input[name=release]")
18
  const modeFilter = document.querySelector("select[name=mode]")
@@ -21,9 +25,14 @@ const workingGroupFilter = document.querySelector("select[name=working_group]")
21
  const caseSensitiveFilter = document.querySelector("input[name=case_sensitive]")
22
  const searchMode = document.querySelector("select[name=search_mode]")
23
 
 
 
 
 
24
  const searchBtn = document.getElementById('search-btn');
25
  const batchSearchBtn = document.getElementById('batch-search-btn');
26
  const keywordSearchBtn = document.getElementById("keyword-search-btn");
 
27
 
28
  const loader = document.getElementById('loader');
29
  const resultsContainer = document.getElementById('results-container');
@@ -45,10 +54,12 @@ singleModeBtn.addEventListener('click', () => {
45
  singleModeBtn.classList.add('active');
46
  keywordModeBtn.classList.remove("active");
47
  batchModeBtn.classList.remove('active');
 
48
 
49
- singleInput.style.display = 'block';
50
- batchInput.style.display = 'none';
51
- keywordSearchInput.style.display = "none";
 
52
  });
53
 
54
  batchModeBtn.addEventListener('click', () => {
@@ -58,10 +69,12 @@ batchModeBtn.addEventListener('click', () => {
58
  batchModeBtn.classList.add('active');
59
  keywordModeBtn.classList.remove("active");
60
  singleModeBtn.classList.remove('active');
 
61
 
62
- batchInput.style.display = 'block';
63
- keywordSearchInput.style.display = "none";
64
- singleInput.style.display = 'none';
 
65
  });
66
 
67
  keywordModeBtn.addEventListener('click', () => {
@@ -71,10 +84,27 @@ keywordModeBtn.addEventListener('click', () => {
71
  keywordModeBtn.classList.add("active");
72
  singleModeBtn.classList.remove('active');
73
  batchModeBtn.classList.remove("active");
 
74
 
75
- singleInput.style.display = "none";
76
- batchInput.style.display = "none";
77
- keywordSearchInput.style.display = "block";
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  })
79
 
80
  document.getElementById('toggleFilters').onclick = function() {
@@ -82,6 +112,58 @@ document.getElementById('toggleFilters').onclick = function() {
82
  target.style.display = (target.style.display === 'none' || target.style.display === '') ? 'flex' : 'none';
83
  };
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  keywordSearchBtn.addEventListener("click", async ()=>{
86
  let keywords = keywordInput.value.trim();
87
  let release = releaseFilter.value;
@@ -448,4 +530,10 @@ keywordInput.addEventListener('keypress', (event)=>{
448
  if (event.key === "Enter"){
449
  keywordSearchBtn.click();
450
  }
 
 
 
 
 
 
451
  })
 
5
  const singleModeBtn = document.getElementById('single-mode-btn');
6
  const batchModeBtn = document.getElementById('batch-mode-btn');
7
  const keywordModeBtn = document.getElementById("keyword-mode-btn");
8
+ const expModeBtn = document.getElementById("exp-mode-btn");
9
 
10
+ const singleInputField = document.querySelector('.single-input');
11
+ const batchInputField = document.querySelector('.batch-input');
12
+ const keywordInputField = document.querySelector(".keyword-input");
13
+ const expKeywordInputField = document.querySelector(".experimental-input");
14
 
15
  const docIdInput = document.getElementById('doc-id');
16
  const batchIdsInput = document.getElementById('batch-ids');
17
  const keywordInput = document.getElementById("keywords");
18
+ const expKeywordInput = document.getElementById("exp-keywords")
19
+ const thresholdInput = document.getElementById("threshold");
20
 
21
  const releaseFilter = document.querySelector("input[name=release]")
22
  const modeFilter = document.querySelector("select[name=mode]")
 
25
  const caseSensitiveFilter = document.querySelector("input[name=case_sensitive]")
26
  const searchMode = document.querySelector("select[name=search_mode]")
27
 
28
+ const releaseFilter2 = document.querySelector("input[name=release2]")
29
+ const specTypeFilter2 = document.querySelector("select[name=spec_type2]")
30
+ const workingGroupFilter2 = document.querySelector("select[name=working_group2]")
31
+
32
  const searchBtn = document.getElementById('search-btn');
33
  const batchSearchBtn = document.getElementById('batch-search-btn');
34
  const keywordSearchBtn = document.getElementById("keyword-search-btn");
35
+ const expKeywordSearchBtn = document.getElementById("exp-search-btn");
36
 
37
  const loader = document.getElementById('loader');
38
  const resultsContainer = document.getElementById('results-container');
 
54
  singleModeBtn.classList.add('active');
55
  keywordModeBtn.classList.remove("active");
56
  batchModeBtn.classList.remove('active');
57
+ expModeBtn.classList.remove('active');
58
 
59
+ singleInputField.style.display = 'block';
60
+ batchInputField.style.display = 'none';
61
+ keywordInputField.style.display = "none";
62
+ expKeywordInputField.style.display = "none";
63
  });
64
 
65
  batchModeBtn.addEventListener('click', () => {
 
69
  batchModeBtn.classList.add('active');
70
  keywordModeBtn.classList.remove("active");
71
  singleModeBtn.classList.remove('active');
72
+ expModeBtn.classList.remove('active');
73
 
74
+ batchInputField.style.display = 'block';
75
+ keywordInputField.style.display = "none";
76
+ singleInputField.style.display = 'none';
77
+ expKeywordInputField.style.display = "none";
78
  });
79
 
80
  keywordModeBtn.addEventListener('click', () => {
 
84
  keywordModeBtn.classList.add("active");
85
  singleModeBtn.classList.remove('active');
86
  batchModeBtn.classList.remove("active");
87
+ expModeBtn.classList.remove('active');
88
 
89
+ singleInputField.style.display = "none";
90
+ batchInputField.style.display = "none";
91
+ expKeywordInputField.style.display = "none";
92
+ keywordInputField.style.display = "block";
93
+ })
94
+
95
// Switch the page to the experimental (BM25) search mode: update the header
// texts, highlight the experimental tab and show only its input group.
expModeBtn.addEventListener('click', () => {
    dynamicTitle.textContent = "[EXPERIMENTAL] Search 3GPP specifications";
    dynamicDesc.textContent = "With keywords and filters, find all of 3GPP's specifications that matches your needs (with keywords, specification number, release or even working group (C1, S5, SP, CP: always the first letter of the group followed by the workgroup number)";

    const otherTabs = [keywordModeBtn, singleModeBtn, batchModeBtn];
    otherTabs.forEach((tab) => {
        tab.classList.remove("active");
    });
    expModeBtn.classList.add('active');

    const hiddenFields = [singleInputField, batchInputField, keywordInputField];
    hiddenFields.forEach((field) => {
        field.style.display = "none";
    });
    expKeywordInputField.style.display = "block";
});
109
 
110
  document.getElementById('toggleFilters').onclick = function() {
 
112
  target.style.display = (target.style.display === 'none' || target.style.display === '') ? 'flex' : 'none';
113
  };
114
 
115
// Show or hide the filter row of the experimental search form.
document.getElementById('toggleFilters2').onclick = function() {
    const filtersForm = document.getElementById('filtersForm2');
    const current = filtersForm.style.display;
    // An empty inline style counts as hidden.
    if (current === 'none' || current === '') {
        filtersForm.style.display = 'flex';
    } else {
        filtersForm.style.display = 'none';
    }
};
119
+
120
// Run the experimental BM25 search: read keywords, threshold and optional
// filters, POST them to /search-spec/experimental and render the outcome.
expKeywordSearchBtn.addEventListener("click", async () => {
    const keywords = expKeywordInput.value.trim();
    const release = releaseFilter2.value.trim();
    const wg = workingGroupFilter2.value.trim();
    const specType = specTypeFilter2.value.trim();
    const thresholdRaw = thresholdInput.value.trim();

    if (!keywords) {
        showError("Please enter at least one keyword");
        return;
    }

    showLoader();
    hideError();

    try {
        const body = { keywords };

        // The API expects an integer percentage. Send it only when the field
        // holds a number, so a blank field falls back to the server default
        // instead of failing validation with an empty string.
        const threshold = Number.parseInt(thresholdRaw, 10);
        if (!Number.isNaN(threshold)) {
            body.threshold = Math.min(100, Math.max(0, threshold));
        }
        if (release !== "") { body.release = release; }
        if (wg !== "") { body.working_group = wg; }
        if (specType !== "") { body.spec_type = specType; }

        const response = await fetch("/search-spec/experimental", {
            method: "POST",
            headers: {
                "Content-Type": "application/json"
            },
            body: JSON.stringify(body)
        });

        const data = await response.json();
        if (response.ok) {
            displayKeywordResults(data, "");
        } else if (response.status == 404) {
            showError('No specification has been found');
        } else {
            showError(`Error processing keyword request: ${data.detail}`);
        }
    } catch (error) {
        showError('Error connecting to the server. Please check if the API is running.');
        console.error('Error:', error);
    } finally {
        hideLoader();
    }
});
166
+
167
  keywordSearchBtn.addEventListener("click", async ()=>{
168
  let keywords = keywordInput.value.trim();
169
  let release = releaseFilter.value;
 
530
  if (event.key === "Enter"){
531
  keywordSearchBtn.click();
532
  }
533
+ })
534
+
535
// Pressing Enter in the experimental keyword field starts the experimental
// search (not the classic keyword search).
expKeywordInput.addEventListener('keypress', (event) => {
    if (event.key === "Enter") {
        expKeywordSearchBtn.click();
    }
});
static/style.css CHANGED
@@ -217,6 +217,16 @@ header {
217
  z-index: 100;
218
  }
219
 
 
 
 
 
 
 
 
 
 
 
220
  .header-content {
221
  display: flex;
222
  align-items: center;
@@ -340,11 +350,7 @@ header {
340
  border-bottom: 2px solid var(--primary-color);
341
  }
342
 
343
- .batch-input {
344
- display: none;
345
- }
346
-
347
- .keyword-input {
348
  display: none;
349
  }
350
 
 
217
  z-index: 100;
218
  }
219
 
220
+ .input-field input.filter-input{
221
+ flex: none;
222
+ padding: 7px 10px;
223
+ border: 1px solid #ddd;
224
+ border-radius: 4px;
225
+ background: #fff;
226
+ color: #333;
227
+ font-size: 15px;
228
+ }
229
+
230
  .header-content {
231
  display: flex;
232
  align-items: center;
 
350
  border-bottom: 2px solid var(--primary-color);
351
  }
352
 
353
+ .batch-input, .keyword-input, .experimental-input {
 
 
 
 
354
  display: none;
355
  }
356
 
templates/index.html CHANGED
@@ -30,6 +30,7 @@
30
  <button id="single-mode-btn" class="active">Single Document</button>
31
  <button id="batch-mode-btn">Batch Search</button>
32
  <button id="keyword-mode-btn">Keyword Search</button>
 
33
  </div>
34
 
35
  <div class="search-form">
@@ -106,6 +107,49 @@
106
  </form>
107
  </div>
108
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  </div>
110
 
111
  <div class="error-message" id="error-message"></div>
 
30
  <button id="single-mode-btn" class="active">Single Document</button>
31
  <button id="batch-mode-btn">Batch Search</button>
32
  <button id="keyword-mode-btn">Keyword Search</button>
33
+ <button id="exp-mode-btn">Experimental Search (Using BM25)</button>
34
  </div>
35
 
36
  <div class="search-form">
 
107
  </form>
108
  </div>
109
  </div>
110
+ <div class="input-group experimental-input">
111
+ <div class="input-field">
112
+ <input type="number" name="threshold" id="threshold" class="filter-input" min="30" max="100" placeholder="Min %">
113
+ <input type="text" id="exp-keywords" placeholder="Enter keywords separated by spaces">
114
+ <button id="exp-search-btn" class="btn">Search</button>
115
+ </div>
116
+ <div class="filter-tab-container">
117
+ <button type="button" id="toggleFilters2" class="filter-toggle-btn">
118
+ <span>Filters</span>
119
+ <svg width="16" height="16" style="vertical-align:middle" fill="currentColor">
120
+ <path d="M4 7l4 4 4-4"></path>
121
+ </svg>
122
+ </button>
123
+ <form id="filtersForm2" class="filters-row" style="display:none;">
124
+ <input type="number" min="0" max="21" name="release2" placeholder="Release"
125
+ class="filter-input">
126
+
127
+ <select name="spec_type2" class="filter-select">
128
+ <option value="">All types</option>
129
+ <option value="TR">Technical Report (TR)</option>
130
+ <option value="TS">Technical Specification (TS)</option>
131
+ </select>
132
+
133
+ <select name="working_group2" class="filter-select">
134
+ <option value="">Working Group</option>
135
+ <option value="CP">CP</option>
136
+ <option value="C1">C1</option>
137
+ <option value="C2">C2</option>
138
+ <option value="C3">C3</option>
139
+ <option value="C4">C4</option>
140
+ <option value="C5">C5</option>
141
+ <option value="C6">C6</option>
142
+ <option value="SP">SP</option>
143
+ <option value="S1">S1</option>
144
+ <option value="S2">S2</option>
145
+ <option value="S3">S3</option>
146
+ <option value="S4">S4</option>
147
+ <option value="S5">S5</option>
148
+ <option value="S6">S6</option>
149
+ </select>
150
+ </form>
151
+ </div>
152
+ </div>
153
  </div>
154
 
155
  <div class="error-message" id="error-message"></div>