Kevin Hu committed on
Commit
fd3ed7b
·
1 Parent(s): 02e5242

Enlarge the term weight difference (#3435)

Browse files

### What problem does this PR solve?


### Type of change

- [x] Performance Improvement

api/apps/document_app.py CHANGED
@@ -13,6 +13,7 @@
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License
15
  #
 
16
  import pathlib
17
  import re
18
 
@@ -36,7 +37,7 @@ from api.db.services.document_service import DocumentService, doc_upload_and_par
36
  from api.settings import RetCode, docStoreConn
37
  from api.utils.api_utils import get_json_result
38
  from rag.utils.storage_factory import STORAGE_IMPL
39
- from api.utils.file_utils import filename_type, thumbnail
40
  from api.utils.web_utils import html2pdf, is_valid_url
41
  from api.constants import IMG_BASE64_PREFIX
42
 
@@ -529,15 +530,25 @@ def parse():
529
  if not is_valid_url(url):
530
  return get_json_result(
531
  data=False, message='The URL format is invalid', code=RetCode.ARGUMENT_ERROR)
 
 
532
  from selenium.webdriver import Chrome, ChromeOptions
533
  options = ChromeOptions()
534
  options.add_argument('--headless')
535
  options.add_argument('--disable-gpu')
536
  options.add_argument('--no-sandbox')
537
  options.add_argument('--disable-dev-shm-usage')
 
 
 
 
 
 
538
  driver = Chrome(options=options)
539
  driver.get(url)
 
540
  sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
 
541
  return get_json_result(data="\n".join(sections))
542
 
543
  if 'file' not in request.files:
 
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License
15
  #
16
+ import os.path
17
  import pathlib
18
  import re
19
 
 
37
  from api.settings import RetCode, docStoreConn
38
  from api.utils.api_utils import get_json_result
39
  from rag.utils.storage_factory import STORAGE_IMPL
40
+ from api.utils.file_utils import filename_type, thumbnail, get_project_base_directory
41
  from api.utils.web_utils import html2pdf, is_valid_url
42
  from api.constants import IMG_BASE64_PREFIX
43
 
 
530
  if not is_valid_url(url):
531
  return get_json_result(
532
  data=False, message='The URL format is invalid', code=RetCode.ARGUMENT_ERROR)
533
+ download_path = os.path.join(get_project_base_directory(), "logs/downloads")
534
+ os.makedirs(download_path, exist_ok=True)
535
  from selenium.webdriver import Chrome, ChromeOptions
536
  options = ChromeOptions()
537
  options.add_argument('--headless')
538
  options.add_argument('--disable-gpu')
539
  options.add_argument('--no-sandbox')
540
  options.add_argument('--disable-dev-shm-usage')
541
+ options.add_experimental_option('prefs', {
542
+ 'download.default_directory': download_path,
543
+ 'download.prompt_for_download': False,
544
+ 'download.directory_upgrade': True,
545
+ 'safebrowsing.enabled': True
546
+ })
547
  driver = Chrome(options=options)
548
  driver.get(url)
549
+ print(driver.get_downloadable_files())
550
  sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
551
+ driver.close()
552
  return get_json_result(data="\n".join(sections))
553
 
554
  if 'file' not in request.files:
rag/nlp/query.py CHANGED
@@ -66,7 +66,7 @@ class FulltextQueryer:
66
 
67
  def question(self, txt, tbl="qa", min_match:float=0.6):
68
  txt = re.sub(
69
- r"[ :\r\n\t,,。??/`!!&\^%%()^]+",
70
  " ",
71
  rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(txt.lower())),
72
  ).strip()
 
66
 
67
  def question(self, txt, tbl="qa", min_match:float=0.6):
68
  txt = re.sub(
69
+ r"[ :\r\n\t,,。??/`!!&\^%%()^\[\]]+",
70
  " ",
71
  rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(txt.lower())),
72
  ).strip()
rag/nlp/term_weight.py CHANGED
@@ -228,6 +228,7 @@ class Dealer:
228
  idf2 = np.array([idf(df(t), 1000000000) for t in tks])
229
  wts = (0.3 * idf1 + 0.7 * idf2) * \
230
  np.array([ner(t) * postag(t) for t in tks])
 
231
  tw = list(zip(tks, wts))
232
  else:
233
  for tk in tks:
@@ -236,6 +237,7 @@ class Dealer:
236
  idf2 = np.array([idf(df(t), 1000000000) for t in tt])
237
  wts = (0.3 * idf1 + 0.7 * idf2) * \
238
  np.array([ner(t) * postag(t) for t in tt])
 
239
  tw.extend(zip(tt, wts))
240
 
241
  S = np.sum([s for _, s in tw])
 
228
  idf2 = np.array([idf(df(t), 1000000000) for t in tks])
229
  wts = (0.3 * idf1 + 0.7 * idf2) * \
230
  np.array([ner(t) * postag(t) for t in tks])
231
+ wts = [math.exp(s) for s in wts]
232
  tw = list(zip(tks, wts))
233
  else:
234
  for tk in tks:
 
237
  idf2 = np.array([idf(df(t), 1000000000) for t in tt])
238
  wts = (0.3 * idf1 + 0.7 * idf2) * \
239
  np.array([ner(t) * postag(t) for t in tt])
240
+ wts = [math.exp(s) for s in wts]
241
  tw.extend(zip(tt, wts))
242
 
243
  S = np.sum([s for _, s in tw])