update
Browse files- pages_helpers.py +5 -5
pages_helpers.py
CHANGED
|
@@ -17,15 +17,14 @@ from text_utils import *
|
|
| 17 |
|
| 18 |
from llm import *
|
| 19 |
|
| 20 |
-
from mode_llm import llm_html_to_md, md_to_text, get_html_body_with_soup
|
| 21 |
|
| 22 |
-
from crawl4ai import WebCrawler # pip install "crawl4ai @ git+https://github.com/unclecode/crawl4ai.git"
|
| 23 |
|
| 24 |
# Create an instance of WebCrawler
|
| 25 |
-
crawler = WebCrawler()
|
| 26 |
|
| 27 |
# Warm up the crawler (load necessary models)
|
| 28 |
-
crawler.warmup()
|
| 29 |
|
| 30 |
## Cách lấy cookies và headers sử dụng https://curlconverter.com
|
| 31 |
cookies = {
|
|
@@ -193,6 +192,7 @@ Your connection is not private
|
|
| 193 |
|
| 194 |
|
| 195 |
meta = None
|
|
|
|
| 196 |
if html is None or len(html) < 500:
|
| 197 |
# Thử lần 2 bằng CRAWL4AI
|
| 198 |
print("GET HTML CRAWL4AI", filename, flush=True)
|
|
@@ -223,7 +223,7 @@ Your connection is not private
|
|
| 223 |
html = None
|
| 224 |
meta = {}
|
| 225 |
break
|
| 226 |
-
|
| 227 |
|
| 228 |
if html is None or len(html) < 500:
|
| 229 |
# Thử lần 3 bằng reader api
|
|
|
|
| 17 |
|
| 18 |
from llm import *
|
| 19 |
|
|
|
|
| 20 |
|
| 21 |
+
# from crawl4ai import WebCrawler # pip install "crawl4ai @ git+https://github.com/unclecode/crawl4ai.git"
|
| 22 |
|
| 23 |
# Create an instance of WebCrawler
|
| 24 |
+
# crawler = WebCrawler()
|
| 25 |
|
| 26 |
# Warm up the crawler (load necessary models)
|
| 27 |
+
# crawler.warmup()
|
| 28 |
|
| 29 |
## Cách lấy cookies và headers sử dụng https://curlconverter.com
|
| 30 |
cookies = {
|
|
|
|
| 192 |
|
| 193 |
|
| 194 |
meta = None
|
| 195 |
+
'''
|
| 196 |
if html is None or len(html) < 500:
|
| 197 |
# Thử lần 2 bằng CRAWL4AI
|
| 198 |
print("GET HTML CRAWL4AI", filename, flush=True)
|
|
|
|
| 223 |
html = None
|
| 224 |
meta = {}
|
| 225 |
break
|
| 226 |
+
'''
|
| 227 |
|
| 228 |
if html is None or len(html) < 500:
|
| 229 |
# Thử lần 3 bằng reader api
|