Spaces:

Symato
/

tomtat

Sleeping

tiendung committed on Sep 29, 2024

Commit

8d68b9b

1 Parent(s): db91fa3

update

Files changed (1) hide show

pages_helpers.py CHANGED Viewed

@@ -17,15 +17,14 @@ from text_utils import *
 from llm import *
-from mode_llm import llm_html_to_md, md_to_text, get_html_body_with_soup
-from crawl4ai import WebCrawler # pip install "crawl4ai @ git+https://github.com/unclecode/crawl4ai.git"
 # Create an instance of WebCrawler
-crawler = WebCrawler()
 # Warm up the crawler (load necessary models)
-crawler.warmup()
 ## Cách lấy cookies và headers sử dụng https://curlconverter.com
 cookies = {
@@ -193,6 +192,7 @@ Your connection is not private
     meta = None
     if html is None or len(html) < 500:
         # Thử lần 2 bằng CRAWL4AI
         print("GET HTML CRAWL4AI", filename, flush=True)
@@ -223,7 +223,7 @@ Your connection is not private
                     html = None
                     meta = {}
                     break
     if html is None or len(html) < 500:
         # Thử lần 3 bằng reader api

 from llm import *
+# from crawl4ai import WebCrawler # pip install "crawl4ai @ git+https://github.com/unclecode/crawl4ai.git"
 # Create an instance of WebCrawler
+# crawler = WebCrawler()
 # Warm up the crawler (load necessary models)
+# crawler.warmup()
 ## Cách lấy cookies và headers sử dụng https://curlconverter.com
 cookies = {
     meta = None
+    '''
     if html is None or len(html) < 500:
         # Thử lần 2 bằng CRAWL4AI
         print("GET HTML CRAWL4AI", filename, flush=True)
                     html = None
                     meta = {}
                     break
+    '''
     if html is None or len(html) < 500:
         # Thử lần 3 bằng reader api