Spaces:

retopara
/

ragflow

Build error

App Files Community

Kevin Hu committed on Nov 15, 2024

Commit

fd3ed7b

1 Parent(s): 02e5242

Enlarge the term weight difference (#3435)

Browse files

### What problem does this PR solve?

### Type of change

- [x] Performance Improvement

Files changed (3) hide show

api/apps/document_app.py +12 -1
rag/nlp/query.py +1 -1
rag/nlp/term_weight.py +2 -0

api/apps/document_app.py CHANGED Viewed

@@ -13,6 +13,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License
 #
 import pathlib
 import re
@@ -36,7 +37,7 @@ from api.db.services.document_service import DocumentService, doc_upload_and_par
 from api.settings import RetCode, docStoreConn
 from api.utils.api_utils import get_json_result
 from rag.utils.storage_factory import STORAGE_IMPL
-from api.utils.file_utils import filename_type, thumbnail
 from api.utils.web_utils import html2pdf, is_valid_url
 from api.constants import IMG_BASE64_PREFIX
@@ -529,15 +530,25 @@ def parse():
         if not is_valid_url(url):
             return get_json_result(
                 data=False, message='The URL format is invalid', code=RetCode.ARGUMENT_ERROR)
         from selenium.webdriver import Chrome, ChromeOptions
         options = ChromeOptions()
         options.add_argument('--headless')
         options.add_argument('--disable-gpu')
         options.add_argument('--no-sandbox')
         options.add_argument('--disable-dev-shm-usage')
         driver = Chrome(options=options)
         driver.get(url)
         sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
         return get_json_result(data="\n".join(sections))
     if 'file' not in request.files:

 #  See the License for the specific language governing permissions and
 #  limitations under the License
 #
+import os.path
 import pathlib
 import re
 from api.settings import RetCode, docStoreConn
 from api.utils.api_utils import get_json_result
 from rag.utils.storage_factory import STORAGE_IMPL
+from api.utils.file_utils import filename_type, thumbnail, get_project_base_directory
 from api.utils.web_utils import html2pdf, is_valid_url
 from api.constants import IMG_BASE64_PREFIX
         if not is_valid_url(url):
             return get_json_result(
                 data=False, message='The URL format is invalid', code=RetCode.ARGUMENT_ERROR)
+        download_path = os.path.join(get_project_base_directory(), "logs/downloads")
+        os.makedirs(download_path, exist_ok=True)
         from selenium.webdriver import Chrome, ChromeOptions
         options = ChromeOptions()
         options.add_argument('--headless')
         options.add_argument('--disable-gpu')
         options.add_argument('--no-sandbox')
         options.add_argument('--disable-dev-shm-usage')
+        options.add_experimental_option('prefs', {
+            'download.default_directory': download_path,
+            'download.prompt_for_download': False,
+            'download.directory_upgrade': True,
+            'safebrowsing.enabled': True
+        })
         driver = Chrome(options=options)
         driver.get(url)
+        print(driver.get_downloadable_files())
         sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
+        driver.close()
         return get_json_result(data="\n".join(sections))
     if 'file' not in request.files:

rag/nlp/query.py CHANGED Viewed

@@ -66,7 +66,7 @@ class FulltextQueryer:
     def question(self, txt, tbl="qa", min_match:float=0.6):
         txt = re.sub(
-            r"[ :\r\n\t,，。？?/`!！&\^%%()^]+",
             " ",
             rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(txt.lower())),
         ).strip()

     def question(self, txt, tbl="qa", min_match:float=0.6):
         txt = re.sub(
+            r"[ :\r\n\t,，。？?/`!！&\^%%()^\[\]]+",
             " ",
             rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(txt.lower())),
         ).strip()

rag/nlp/term_weight.py CHANGED Viewed

@@ -228,6 +228,7 @@ class Dealer:
             idf2 = np.array([idf(df(t), 1000000000) for t in tks])
             wts = (0.3 * idf1 + 0.7 * idf2) * \
                 np.array([ner(t) * postag(t) for t in tks])
             tw = list(zip(tks, wts))
         else:
             for tk in tks:
@@ -236,6 +237,7 @@ class Dealer:
                 idf2 = np.array([idf(df(t), 1000000000) for t in tt])
                 wts = (0.3 * idf1 + 0.7 * idf2) * \
                     np.array([ner(t) * postag(t) for t in tt])
                 tw.extend(zip(tt, wts))
         S = np.sum([s for _, s in tw])

             idf2 = np.array([idf(df(t), 1000000000) for t in tks])
             wts = (0.3 * idf1 + 0.7 * idf2) * \
                 np.array([ner(t) * postag(t) for t in tks])
+            wts = [math.exp(s) for s in wts]
             tw = list(zip(tks, wts))
         else:
             for tk in tks:
                 idf2 = np.array([idf(df(t), 1000000000) for t in tt])
                 wts = (0.3 * idf1 + 0.7 * idf2) * \
                     np.array([ner(t) * postag(t) for t in tt])
+                wts = [math.exp(s) for s in wts]
                 tw.extend(zip(tt, wts))
         S = np.sum([s for _, s in tw])