zstanjj committed
Commit 6f14a6c · verified · 1 Parent(s): cae2f4c

Update README.md

Files changed (1):
  1. README.md +52 -19
README.md CHANGED
@@ -80,12 +80,32 @@ print(simplified_html)
 ```
 
 
+### 🔧 Configure Pruning Parameters
+
+The example HTML document is rather short. Real-world HTML documents can be much longer and more complex. To handle such cases, we can configure the following parameters:
+```python
+# Maximum number of words in a node when constructing the block tree for pruning with the embedding model
+MAX_NODE_WORDS_EMBED = 10
+# MAX_NODE_WORDS_EMBED = 256  # a recommended setting for real-world HTML documents
+# Maximum number of tokens in the output HTML document pruned with the embedding model
+MAX_CONTEXT_WINDOW_EMBED = 60
+# MAX_CONTEXT_WINDOW_EMBED = 6144  # a recommended setting for real-world HTML documents
+# Maximum number of words in a node when constructing the block tree for pruning with the generative model
+MAX_NODE_WORDS_GEN = 5
+# MAX_NODE_WORDS_GEN = 128  # a recommended setting for real-world HTML documents
+# Maximum number of tokens in the output HTML document pruned with the generative model
+MAX_CONTEXT_WINDOW_GEN = 32
+# MAX_CONTEXT_WINDOW_GEN = 4096  # a recommended setting for real-world HTML documents
+```
+
+
+
 ### 🌲 Build Block Tree
 
 ```python
 from htmlrag import build_block_tree
 
-block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=10)
+block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_EMBED)
 for block in block_tree:
     print("Block Content: ", block[0])
     print("Block Path: ", block[1])
@@ -113,8 +133,21 @@ for block in block_tree:
 ```python
 from htmlrag import EmbedHTMLPruner
 
-embed_html_pruner = EmbedHTMLPruner(embed_model="bm25")
-block_rankings = embed_html_pruner.calculate_block_rankings(question, simplified_html, block_tree)
+embed_model = "/train_data_load/huggingface/tjj_hf/bge-large-en/"
+query_instruction_for_retrieval = "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: "
+embed_html_pruner = EmbedHTMLPruner(embed_model=embed_model, local_inference=True, query_instruction_for_retrieval=query_instruction_for_retrieval)
+# alternatively, you can use a remote TEI model; refer to https://github.com/huggingface/text-embeddings-inference
+# tei_endpoint = "http://YOUR_TEI_ENDPOINT"
+# embed_html_pruner = EmbedHTMLPruner(embed_model=embed_model, local_inference=False, query_instruction_for_retrieval=query_instruction_for_retrieval, endpoint=tei_endpoint)
+block_rankings = embed_html_pruner.calculate_block_rankings(question, simplified_html, block_tree)
+print(block_rankings)
+
+# [0, 2, 1]
+
+# alternatively, you can use BM25 to rank the blocks
+from htmlrag import BM25HTMLPruner
+bm25_html_pruner = BM25HTMLPruner()
+block_rankings = bm25_html_pruner.calculate_block_rankings(question, simplified_html, block_tree)
 print(block_rankings)
 
 # [0, 2, 1]
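Note: the embedding scorer and the BM25 scorer in this hunk are interchangeable at the ranking step. A small comparison harness follows; it is a sketch, not part of the diff, reuses `question`, `simplified_html`, and `block_tree` from the steps above, and hypothetically substitutes the hub id `BAAI/bge-large-en` for the local model path used in the diff.

```python
from htmlrag import BM25HTMLPruner, EmbedHTMLPruner

# Hypothetical substitution: a public hub id stands in for the local path.
instruction = "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: "
embed_pruner = EmbedHTMLPruner(embed_model="BAAI/bge-large-en", local_inference=True,
                               query_instruction_for_retrieval=instruction)
bm25_pruner = BM25HTMLPruner()

# Both rankers return a list of block indices ordered by relevance, e.g. [0, 2, 1],
# so their outputs can be compared directly on the same block tree.
print("embedding:", embed_pruner.calculate_block_rankings(question, simplified_html, block_tree))
print("bm25:     ", bm25_pruner.calculate_block_rankings(question, simplified_html, block_tree))
```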
@@ -123,8 +156,7 @@ from transformers import AutoTokenizer
 
 chat_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-70B-Instruct")
 
-max_context_window = 60
-pruned_html = embed_html_pruner.prune_HTML(simplified_html, block_tree, block_rankings, chat_tokenizer, max_context_window)
+pruned_html = embed_html_pruner.prune_HTML(simplified_html, block_tree, block_rankings, chat_tokenizer, MAX_CONTEXT_WINDOW_EMBED)
 print(pruned_html)
 
 # <html>
@@ -140,18 +172,8 @@ print(pruned_html)
 from htmlrag import GenHTMLPruner
 import torch
 
-ckpt_path = "zstanjj/HTML-Pruner-Llama-1B"
-if torch.cuda.is_available():
-    device = "cuda"
-else:
-    device = "cpu"
-gen_embed_pruner = GenHTMLPruner(gen_model=ckpt_path, max_node_words=5, device=device)
-block_rankings = gen_embed_pruner.calculate_block_rankings(question, pruned_html)
-print(block_rankings)
-
-# [1, 0]
-
-block_tree, pruned_html = build_block_tree(pruned_html, max_node_words=10)
+# construct a finer block tree
+block_tree, pruned_html = build_block_tree(pruned_html, max_node_words=MAX_NODE_WORDS_GEN)
 for block in block_tree:
     print("Block Content: ", block[0])
     print("Block Path: ", block[1])
@@ -166,13 +188,24 @@ for block in block_tree:
 # Block Path: ['html', 'p']
 # Is Leaf: True
 
-max_context_window = 32
-pruned_html = gen_embed_pruner.prune_HTML(pruned_html, block_tree, block_rankings, chat_tokenizer, max_context_window)
+ckpt_path = "zstanjj/HTML-Pruner-Llama-1B"
+if torch.cuda.is_available():
+    device = "cuda"
+else:
+    device = "cpu"
+gen_embed_pruner = GenHTMLPruner(gen_model=ckpt_path, max_node_words=MAX_NODE_WORDS_GEN, device=device)
+block_rankings = gen_embed_pruner.calculate_block_rankings(question, pruned_html)
+print(block_rankings)
+
+# [1, 0]
+
+pruned_html = gen_embed_pruner.prune_HTML(pruned_html, block_tree, block_rankings, chat_tokenizer, MAX_CONTEXT_WINDOW_GEN)
 print(pruned_html)
 
 # <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
 ```
 
+
 ## Results
 
 - **Results for [HTML-Pruner-Phi-3.8B](https://huggingface.co/zstanjj/HTML-Pruner-Phi-3.8B) and [HTML-Pruner-Llama-1B](https://huggingface.co/zstanjj/HTML-Pruner-Llama-1B) with Llama-3.1-70B-Instruct as chat model**.
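Taken together, the updated README walks a coarse-to-fine pipeline: embedding-based (or BM25) pruning under a generous token budget, then generative pruning over a finer block tree under a tight budget. A hedged end-to-end sketch using only calls shown above; it assumes `BM25HTMLPruner` exposes the same `prune_HTML` interface as `EmbedHTMLPruner`, and the budgets are the recommended real-world settings from the config hunk.

```python
from htmlrag import BM25HTMLPruner, GenHTMLPruner, build_block_tree

def two_stage_prune(question, simplified_html, chat_tokenizer, device="cpu"):
    """Illustrative wrapper around the two pruning stages shown in the diff."""
    # Stage 1: coarse block tree, lexical ranking, generous token budget.
    block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=256)
    bm25 = BM25HTMLPruner()  # assumed to share prune_HTML with EmbedHTMLPruner
    rankings = bm25.calculate_block_rankings(question, simplified_html, block_tree)
    pruned_html = bm25.prune_HTML(simplified_html, block_tree, rankings, chat_tokenizer, 6144)
    # Stage 2: finer block tree, generative ranking, tight token budget.
    block_tree, pruned_html = build_block_tree(pruned_html, max_node_words=128)
    gen = GenHTMLPruner(gen_model="zstanjj/HTML-Pruner-Llama-1B", max_node_words=128, device=device)
    rankings = gen.calculate_block_rankings(question, pruned_html)
    return gen.prune_HTML(pruned_html, block_tree, rankings, chat_tokenizer, 4096)
```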
 