Shakshi3104 committed on
Commit
9c0c936
·
1 Parent(s): 608c214

[delete] delete to make vector search only

Browse files
Files changed (4) hide show
  1. app.py +9 -9
  2. example.py +0 -14
  3. model/search/hybrid.py +0 -146
  4. model/search/surface.py +0 -147
app.py CHANGED
@@ -1,17 +1,17 @@
1
  import gradio as gr
2
  import pandas as pd
3
 
4
- from model.search.hybrid import HybridSearchClient
5
  from model.data.notion_db import fetch_sakurap_corpus
6
 
7
 
8
- def search(search_client: HybridSearchClient):
9
  def _search(query: str) -> pd.DataFrame:
10
  results = search_client.search_top_n(query)
11
  result = results[0]
12
  result["rank"] = result["rank"] + 1
13
- result = result[["rank", "title", "content", "rank_sparse", "rank_dense"]]
14
- result.columns = ["rank", "title", "rap lyric", "rank: surface", "rank: vector"]
15
  return result
16
 
17
  return _search
@@ -21,13 +21,13 @@ if __name__ == "__main__":
21
  # Load dataset
22
  sakurap_df = fetch_sakurap_corpus("./data/sakurap_corpus.csv")
23
  # Initialize search client
24
- search_client = HybridSearchClient.from_dataframe(sakurap_df, "content")
25
 
26
  with gr.Blocks() as search_interface:
27
  gr.Markdown("""
28
- # 💎 Cobalt
29
- Demo app for hybrid search with vector and surface search using [Ruri](https://huggingface.co/cl-nagoya/ruri-large), [BM25](https://github.com/dorianbrown/rank_bm25) and [Voyager](https://spotify.github.io/voyager/).
30
-
31
  You can search ARASHI's songs with rap lyrics by Sho Sakurai.
32
  """)
33
  # Input query
@@ -49,4 +49,4 @@ if __name__ == "__main__":
49
 
50
  # App launch
51
  search_interface.queue()
52
- search_interface.launch(server_name="0.0.0.0")
 
1
  import gradio as gr
2
  import pandas as pd
3
 
4
+ from model.search.vector import RuriVoyagerSearchClient
5
  from model.data.notion_db import fetch_sakurap_corpus
6
 
7
 
8
+ def search(search_client: RuriVoyagerSearchClient):
9
  def _search(query: str) -> pd.DataFrame:
10
  results = search_client.search_top_n(query)
11
  result = results[0]
12
  result["rank"] = result["rank"] + 1
13
+ result = result[["rank", "title", "content", "score"]]
14
+ result.columns = ["rank", "title", "rap lyric", "distance"]
15
  return result
16
 
17
  return _search
 
21
  # Load dataset
22
  sakurap_df = fetch_sakurap_corpus("./data/sakurap_corpus.csv")
23
  # Initialize search client
24
+ search_client = RuriVoyagerSearchClient.from_dataframe(sakurap_df, "content")
25
 
26
  with gr.Blocks() as search_interface:
27
  gr.Markdown("""
28
+ # 💎 Cobalt DuckDB 🦆
29
+ Demo app for vector search using [Ruri](https://huggingface.co/cl-nagoya/ruri-large) and DuckDB.
30
+
31
  You can search ARASHI's songs with rap lyrics by Sho Sakurai.
32
  """)
33
  # Input query
 
49
 
50
  # App launch
51
  search_interface.queue()
52
+ search_interface.launch(server_name="0.0.0.0")
example.py DELETED
@@ -1,14 +0,0 @@
1
- import pandas as pd
2
-
3
- from model.search.hybrid import HybridSearchClient
4
- from model.data.notion_db import fetch_sakurap_corpus
5
-
6
-
7
- if __name__ == "__main__":
8
- # Load dataset
9
- sakurap_df = fetch_sakurap_corpus("./data/sakurap_corpus.csv")
10
- # sakurap_df = pd.read_csv("./data/sakurap_corpus.csv")
11
-
12
- # hybrid search
13
- search_client = HybridSearchClient.from_dataframe(sakurap_df, "content")
14
- results = search_client.search_top_n("嵐 5人の歴史")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
model/search/hybrid.py DELETED
@@ -1,146 +0,0 @@
1
- from typing import Union, List
2
-
3
- import pandas as pd
4
- from copy import deepcopy
5
-
6
- from dotenv import load_dotenv
7
- from loguru import logger
8
- from tqdm import tqdm
9
-
10
- from model.search.base import BaseSearchClient
11
- from model.search.surface import BM25SearchClient
12
- from model.search.vector import RuriVoyagerSearchClient
13
-
14
- from model.utils.timer import stop_watch
15
-
16
-
17
- def reciprocal_rank_fusion(sparse: pd.DataFrame, dense: pd.DataFrame, k=60) -> pd.DataFrame:
18
- """
19
- Reciprocal Rank Fusionを計算する
20
-
21
- Notes
22
- ----------
23
- RRFの計算は以下の式
24
-
25
- .. math:: RRF = \sum_{i=1}^n \frac{1}{k+r_i}
26
-
27
- Parameters
28
- ----------
29
- sparse:
30
- pd.DataFrame, 表層検索の検索結果
31
- dense:
32
- pd.DataFrame, ベクトル検索の結果
33
- k:
34
- int,
35
-
36
- Returns
37
- -------
38
- rank_results:
39
- pd.DataFrame, RRFによるリランク結果
40
-
41
- """
42
- # カラム名を変更
43
- sparse = sparse.rename(columns={"rank": "rank_sparse"})
44
- dense = dense.rename(columns={"rank": "rank_dense"})
45
- # denseはランク以外を落として結合する
46
- dense_ = dense["rank_dense"]
47
-
48
- # 順位を1からスタートするようにする
49
- sparse["rank_sparse"] += 1
50
- dense_ += 1
51
-
52
- # 文書のインデックスをキーに結合する
53
- rank_results = pd.merge(sparse, dense_, how="left", left_index=True, right_index=True)
54
-
55
- # RRFスコアの計算
56
- rank_results["rrf_score"] = 1 / (rank_results["rank_dense"] + k) + 1 / (rank_results["rank_sparse"] + k)
57
-
58
- # RRFスコアのスコアが大きい順にソート
59
- rank_results = rank_results.sort_values(["rrf_score"], ascending=False)
60
- rank_results["rank"] = deepcopy(rank_results.reset_index()).index
61
-
62
- return rank_results
63
-
64
-
65
- class HybridSearchClient(BaseSearchClient):
66
- def __init__(self, dense_model: BaseSearchClient, sparse_model: BaseSearchClient):
67
- self.dense_model = dense_model
68
- self.sparse_model = sparse_model
69
-
70
- @classmethod
71
- @stop_watch
72
- def from_dataframe(cls, _data: pd.DataFrame, _target: str):
73
- """
74
- 検索ドキュメントのpd.DataFrameから初期化する
75
-
76
- Parameters
77
- ----------
78
- _data:
79
- pd.DataFrame, 検索対象のDataFrame
80
-
81
- _target:
82
- str, 検索対象のカラム名
83
-
84
- Returns
85
- -------
86
-
87
- """
88
- # 表層検索の初期化
89
- dense_model = BM25SearchClient.from_dataframe(_data, _target)
90
- # ベクトル検索の初期化
91
- sparse_model = RuriVoyagerSearchClient.from_dataframe(_data, _target)
92
-
93
- return cls(dense_model, sparse_model)
94
-
95
- @stop_watch
96
- def search_top_n(self, _query: Union[List[str], str], n: int = 10) -> List[pd.DataFrame]:
97
- """
98
- クエリに対する検索結果をtop-n個取得する
99
-
100
- Parameters
101
- ----------
102
- _query:
103
- Union[List[str], str], 検索クエリ
104
- n:
105
- int, top-nの個数. デフォルト 10.
106
-
107
- Returns
108
- -------
109
- results:
110
- List[pd.DataFrame], ランキング結果
111
- """
112
-
113
- logger.info(f"🚦 [HybridSearchClient] Search top {n} | {_query}")
114
-
115
- # 型チェック
116
- if isinstance(_query, str):
117
- _query = [_query]
118
-
119
- # ランキングtop-nをクエリ毎に取得
120
- result = []
121
- for query in tqdm(_query):
122
- assert len(self.sparse_model.corpus) == len(
123
- self.dense_model.corpus), "The document counts do not match between sparse and dense!"
124
-
125
- # ドキュメント数
126
- doc_num = len(self.sparse_model.corpus)
127
-
128
- # 表層検索
129
- logger.info(f"🚦 [HybridSearchClient] run surface search ...")
130
- sparse_res = self.sparse_model.search_top_n(query, n=doc_num)
131
- # ベクトル検索
132
- logger.info(f"🚦 [HybridSearchClient] run vector search ...")
133
- dense_res = self.dense_model.search_top_n(query, n=doc_num)
134
-
135
- # RRFスコアの計算
136
- logger.info(f"🚦 [HybridSearchClient] compute RRF scores ...")
137
- rrf_res = reciprocal_rank_fusion(sparse_res[0], dense_res[0])
138
-
139
- # 結果をtop Nに絞る
140
- top_num = 10
141
- rrf_res = rrf_res.head(top_num)
142
- logger.info(f"🚦 [HybridSearchClient] return {top_num} results")
143
-
144
- result.append(rrf_res)
145
-
146
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
model/search/surface.py DELETED
@@ -1,147 +0,0 @@
1
- from copy import deepcopy
2
- from typing import List, Union
3
-
4
- import pandas as pd
5
- import numpy as np
6
-
7
- from loguru import logger
8
- from tqdm import tqdm
9
-
10
- from rank_bm25 import BM25Okapi
11
-
12
- from model.search.base import BaseSearchClient
13
- from model.utils.tokenizer import MeCabTokenizer
14
- from model.utils.timer import stop_watch
15
-
16
-
17
- class BM25Wrapper(BM25Okapi):
18
- def __init__(self, dataset: pd.DataFrame, target, tokenizer=None, k1=1.5, b=0.75, epsilon=0.25):
19
- self.k1 = k1
20
- self.b = b
21
- self.epsilon = epsilon
22
- self.dataset = dataset
23
- corpus = dataset[target].values.tolist()
24
- super().__init__(corpus, tokenizer)
25
-
26
- def get_top_n(self, query, documents, n=5):
27
- assert self.corpus_size == len(documents), "The documents given don't match the index corpus!"
28
-
29
- scores = self.get_scores(query)
30
- top_n = np.argsort(scores)[::-1][:n]
31
-
32
- result = deepcopy(self.dataset.iloc[top_n])
33
- result["score"] = scores[top_n]
34
- return result
35
-
36
-
37
- class BM25SearchClient(BaseSearchClient):
38
- def __init__(self, _model: BM25Okapi, _corpus: List[List[str]]):
39
- """
40
-
41
- Parameters
42
- ----------
43
- _model:
44
- BM25Okapi
45
- _corpus:
46
- List[List[str]], 検索対象の分かち書き後のフィールド
47
- """
48
- self.model = _model
49
- self.corpus = _corpus
50
-
51
- @staticmethod
52
- def tokenize_ja(_text: List[str]):
53
- """MeCab日本語分かち書きによるコーパス作成
54
-
55
- Args:
56
- _text (List[str]): コーパス文のリスト
57
-
58
- Returns:
59
- List[List[str]]: 分かち書きされたテキストのリスト
60
- """
61
-
62
- # MeCabで分かち書き
63
- parser = MeCabTokenizer.from_tagger("-Owakati")
64
-
65
- corpus = []
66
- with tqdm(_text) as pbar:
67
- for i, t in enumerate(pbar):
68
- try:
69
- # 分かち書きをする
70
- corpus.append(parser.parse(t).split())
71
- except TypeError as e:
72
- if not isinstance(t, str):
73
- logger.info(f"🚦 [BM25SearchClient] Corpus index of {i} is not instance of String.")
74
- corpus.append(["[UNKNOWN]"])
75
- else:
76
- raise e
77
- return corpus
78
-
79
- @classmethod
80
- def from_dataframe(cls, _data: pd.DataFrame, _target: str):
81
- """
82
- 検索ドキュメントのpd.DataFrameから初期化する
83
-
84
- Parameters
85
- ----------
86
- _data:
87
- pd.DataFrame, 検索対象のDataFrame
88
-
89
- _target:
90
- str, 検索対象のカラム名
91
-
92
- Returns
93
- -------
94
-
95
- """
96
-
97
- logger.info("🚦 [BM25SearchClient] Initialize from DataFrame")
98
-
99
- search_field = _data[_target]
100
- corpus = search_field.values.tolist()
101
-
102
- # 分かち書きをする
103
- corpus_tokenized = cls.tokenize_ja(corpus)
104
- _data["tokenized"] = corpus_tokenized
105
-
106
- bm25 = BM25Wrapper(_data, "tokenized")
107
- return cls(bm25, corpus_tokenized)
108
-
109
- @stop_watch
110
- def search_top_n(self, _query: Union[List[str], str], n: int = 10) -> List[pd.DataFrame]:
111
- """
112
- クエリに対する検索結果をtop-n個取得する
113
-
114
- Parameters
115
- ----------
116
- _query:
117
- Union[List[str], str], 検索クエリ
118
- n:
119
- int, top-nの個数. デフォルト 10.
120
-
121
- Returns
122
- -------
123
- results:
124
- List[pd.DataFrame], ランキング結果
125
- """
126
-
127
- logger.info(f"🚦 [BM25SearchClient] Search top {n} | {_query}")
128
-
129
- # 型チェック
130
- if isinstance(_query, str):
131
- _query = [_query]
132
-
133
- # クエリを分かち書き
134
- query_tokens = self.tokenize_ja(_query)
135
-
136
- # ランキングtop-nをクエリ毎に取得
137
- result = []
138
- for query in tqdm(query_tokens):
139
- df_res = self.model.get_top_n(query, self.corpus, n)
140
- # ランク
141
- df_res["rank"] = deepcopy(df_res.reset_index()).index
142
- df_res = df_res.drop(columns=["tokenized"])
143
- result.append(df_res)
144
-
145
- logger.success(f"🚦 [BM25SearchClient] Executed")
146
-
147
- return result