Update README.md
Browse files
README.md
CHANGED
@@ -36,6 +36,7 @@ pip install -e .
|
|
36 |
|
37 |
---
|
38 |
|
|
|
39 |
## 📖 User Guide
|
40 |
|
41 |
### 🧹 HTML Cleaning
|
@@ -66,6 +67,11 @@ document.write("Hello World!");
|
|
66 |
</html>
|
67 |
"""
|
68 |
|
|
|
|
|
|
|
|
|
|
|
69 |
simplified_html = clean_html(html)
|
70 |
print(simplified_html)
|
71 |
|
@@ -79,7 +85,6 @@ print(simplified_html)
|
|
79 |
# </html>
|
80 |
```
|
81 |
|
82 |
-
|
83 |
### 🔧 Configure Pruning Parameters
|
84 |
|
85 |
The example HTML document is rather short. Real-world HTML documents can be much longer and more complex. To handle such cases, we can configure the following parameters:
|
@@ -106,6 +111,7 @@ MAX_CONTEXT_WINDOW_GEN = 32
|
|
106 |
from htmlrag import build_block_tree
|
107 |
|
108 |
block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_EMBED)
|
|
|
109 |
for block in block_tree:
|
110 |
print("Block Content: ", block[0])
|
111 |
print("Block Path: ", block[1])
|
@@ -174,6 +180,7 @@ import torch
|
|
174 |
|
175 |
# construct a finer block tree
|
176 |
block_tree, pruned_html=build_block_tree(pruned_html, max_node_words=MAX_NODE_WORDS_GEN)
|
|
|
177 |
for block in block_tree:
|
178 |
print("Block Content: ", block[0])
|
179 |
print("Block Path: ", block[1])
|
@@ -205,6 +212,8 @@ print(pruned_html)
|
|
205 |
# <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
|
206 |
```
|
207 |
|
|
|
|
|
208 |
|
209 |
## Results
|
210 |
|
|
|
36 |
|
37 |
---
|
38 |
|
39 |
+
|
40 |
## 📖 User Guide
|
41 |
|
42 |
### 🧹 HTML Cleaning
|
|
|
67 |
</html>
|
68 |
"""
|
69 |
|
70 |
+
# Alternatively, you can read HTML files and merge them
|
71 |
+
# html_files=["/path/to/html/file1.html", "/path/to/html/file2.html"]
|
72 |
+
# htmls=[open(file).read() for file in html_files]
|
73 |
+
# html = "\n".join(htmls)
|
74 |
+
|
75 |
simplified_html = clean_html(html)
|
76 |
print(simplified_html)
|
77 |
|
|
|
85 |
# </html>
|
86 |
```
|
87 |
|
|
|
88 |
### 🔧 Configure Pruning Parameters
|
89 |
|
90 |
The example HTML document is rather short. Real-world HTML documents can be much longer and more complex. To handle such cases, we can configure the following parameters:
|
|
|
111 |
from htmlrag import build_block_tree
|
112 |
|
113 |
block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_EMBED)
|
114 |
+
# block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_EMBED, zh_char=True)  # for Chinese text
|
115 |
for block in block_tree:
|
116 |
print("Block Content: ", block[0])
|
117 |
print("Block Path: ", block[1])
|
|
|
180 |
|
181 |
# construct a finer block tree
|
182 |
block_tree, pruned_html=build_block_tree(pruned_html, max_node_words=MAX_NODE_WORDS_GEN)
|
183 |
+
# block_tree, pruned_html=build_block_tree(pruned_html, max_node_words=MAX_NODE_WORDS_GEN, zh_char=True) # for Chinese text
|
184 |
for block in block_tree:
|
185 |
print("Block Content: ", block[0])
|
186 |
print("Block Path: ", block[1])
|
|
|
212 |
# <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
|
213 |
```
|
214 |
|
215 |
+
---
|
216 |
+
|
217 |
|
218 |
## Results
|
219 |
|