Kevin Hu
committed on
Commit
·
fd3ed7b
1
Parent(s):
02e5242
Enlarge the term weight difference (#3435)
Browse files
### What problem does this PR solve?
### Type of change
- [x] Performance Improvement
- api/apps/document_app.py +12 -1
- rag/nlp/query.py +1 -1
- rag/nlp/term_weight.py +2 -0
api/apps/document_app.py
CHANGED
@@ -13,6 +13,7 @@
|
|
13 |
# See the License for the specific language governing permissions and
|
14 |
# limitations under the License
|
15 |
#
|
|
|
16 |
import pathlib
|
17 |
import re
|
18 |
|
@@ -36,7 +37,7 @@ from api.db.services.document_service import DocumentService, doc_upload_and_par
|
|
36 |
from api.settings import RetCode, docStoreConn
|
37 |
from api.utils.api_utils import get_json_result
|
38 |
from rag.utils.storage_factory import STORAGE_IMPL
|
39 |
-
from api.utils.file_utils import filename_type, thumbnail
|
40 |
from api.utils.web_utils import html2pdf, is_valid_url
|
41 |
from api.constants import IMG_BASE64_PREFIX
|
42 |
|
@@ -529,15 +530,25 @@ def parse():
|
|
529 |
if not is_valid_url(url):
|
530 |
return get_json_result(
|
531 |
data=False, message='The URL format is invalid', code=RetCode.ARGUMENT_ERROR)
|
|
|
|
|
532 |
from selenium.webdriver import Chrome, ChromeOptions
|
533 |
options = ChromeOptions()
|
534 |
options.add_argument('--headless')
|
535 |
options.add_argument('--disable-gpu')
|
536 |
options.add_argument('--no-sandbox')
|
537 |
options.add_argument('--disable-dev-shm-usage')
|
|
|
|
|
|
|
|
|
|
|
|
|
538 |
driver = Chrome(options=options)
|
539 |
driver.get(url)
|
|
|
540 |
sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
|
|
|
541 |
return get_json_result(data="\n".join(sections))
|
542 |
|
543 |
if 'file' not in request.files:
|
|
|
13 |
# See the License for the specific language governing permissions and
|
14 |
# limitations under the License
|
15 |
#
|
16 |
+
import os.path
|
17 |
import pathlib
|
18 |
import re
|
19 |
|
|
|
37 |
from api.settings import RetCode, docStoreConn
|
38 |
from api.utils.api_utils import get_json_result
|
39 |
from rag.utils.storage_factory import STORAGE_IMPL
|
40 |
+
from api.utils.file_utils import filename_type, thumbnail, get_project_base_directory
|
41 |
from api.utils.web_utils import html2pdf, is_valid_url
|
42 |
from api.constants import IMG_BASE64_PREFIX
|
43 |
|
|
|
530 |
if not is_valid_url(url):
|
531 |
return get_json_result(
|
532 |
data=False, message='The URL format is invalid', code=RetCode.ARGUMENT_ERROR)
|
533 |
+
download_path = os.path.join(get_project_base_directory(), "logs/downloads")
|
534 |
+
os.makedirs(download_path, exist_ok=True)
|
535 |
from selenium.webdriver import Chrome, ChromeOptions
|
536 |
options = ChromeOptions()
|
537 |
options.add_argument('--headless')
|
538 |
options.add_argument('--disable-gpu')
|
539 |
options.add_argument('--no-sandbox')
|
540 |
options.add_argument('--disable-dev-shm-usage')
|
541 |
+
options.add_experimental_option('prefs', {
|
542 |
+
'download.default_directory': download_path,
|
543 |
+
'download.prompt_for_download': False,
|
544 |
+
'download.directory_upgrade': True,
|
545 |
+
'safebrowsing.enabled': True
|
546 |
+
})
|
547 |
driver = Chrome(options=options)
|
548 |
driver.get(url)
|
549 |
+
print(driver.get_downloadable_files())
|
550 |
sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
|
551 |
+
driver.close()
|
552 |
return get_json_result(data="\n".join(sections))
|
553 |
|
554 |
if 'file' not in request.files:
|
rag/nlp/query.py
CHANGED
@@ -66,7 +66,7 @@ class FulltextQueryer:
|
|
66 |
|
67 |
def question(self, txt, tbl="qa", min_match:float=0.6):
|
68 |
txt = re.sub(
|
69 |
-
r"[ :\r\n\t,,。??/`!!&\^%%()
|
70 |
" ",
|
71 |
rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(txt.lower())),
|
72 |
).strip()
|
|
|
66 |
|
67 |
def question(self, txt, tbl="qa", min_match:float=0.6):
|
68 |
txt = re.sub(
|
69 |
+
r"[ :\r\n\t,,。??/`!!&\^%%()^\[\]]+",
|
70 |
" ",
|
71 |
rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(txt.lower())),
|
72 |
).strip()
|
rag/nlp/term_weight.py
CHANGED
@@ -228,6 +228,7 @@ class Dealer:
|
|
228 |
idf2 = np.array([idf(df(t), 1000000000) for t in tks])
|
229 |
wts = (0.3 * idf1 + 0.7 * idf2) * \
|
230 |
np.array([ner(t) * postag(t) for t in tks])
|
|
|
231 |
tw = list(zip(tks, wts))
|
232 |
else:
|
233 |
for tk in tks:
|
@@ -236,6 +237,7 @@ class Dealer:
|
|
236 |
idf2 = np.array([idf(df(t), 1000000000) for t in tt])
|
237 |
wts = (0.3 * idf1 + 0.7 * idf2) * \
|
238 |
np.array([ner(t) * postag(t) for t in tt])
|
|
|
239 |
tw.extend(zip(tt, wts))
|
240 |
|
241 |
S = np.sum([s for _, s in tw])
|
|
|
228 |
idf2 = np.array([idf(df(t), 1000000000) for t in tks])
|
229 |
wts = (0.3 * idf1 + 0.7 * idf2) * \
|
230 |
np.array([ner(t) * postag(t) for t in tks])
|
231 |
+
wts = [math.exp(s) for s in wts]
|
232 |
tw = list(zip(tks, wts))
|
233 |
else:
|
234 |
for tk in tks:
|
|
|
237 |
idf2 = np.array([idf(df(t), 1000000000) for t in tt])
|
238 |
wts = (0.3 * idf1 + 0.7 * idf2) * \
|
239 |
np.array([ner(t) * postag(t) for t in tt])
|
240 |
+
wts = [math.exp(s) for s in wts]
|
241 |
tw.extend(zip(tt, wts))
|
242 |
|
243 |
S = np.sum([s for _, s in tw])
|