ldhldh committed on
Commit
7e589bc
·
verified ·
1 Parent(s): 7178c35

Update util/preprocessing.py

Browse files
Files changed (1) hide show
  1. util/preprocessing.py +48 -0
util/preprocessing.py CHANGED
@@ -1,5 +1,7 @@
1
  import difflib
2
  import pandas as pd
 
 
3
 
4
  def word_to_market_name(word):
5
  markets_df = pd.read_csv('data/market_name_utf8.csv')
@@ -28,6 +30,37 @@ def word_to_market_name(word):
28
  return output
29
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  def check_word(word):
33
  markets_df = pd.read_csv('data/market_name_utf8.csv')
@@ -38,4 +71,19 @@ def check_word(word):
38
  print(f"check_word, {word}")
39
  return True
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  return False
 
1
  import difflib
2
  import pandas as pd
3
+ from util.search_data import *
4
+
5
 
6
  def word_to_market_name(word):
7
  markets_df = pd.read_csv('data/market_name_utf8.csv')
 
30
  return output
31
 
32
 
33
def word_to_product_name(word: str) -> list:
    """Suggest product names for ``word``.

    Product names are read from ``data/products.txt`` (UTF-8, one name per
    line) when that file exists; otherwise they come from
    ``get_all_product_names()``.

    The result lists, in product order, every name that agrees with ``word``
    over their common leading characters (i.e. one is a prefix of the other),
    followed by up to three of the remaining names with the highest
    ``difflib.SequenceMatcher`` similarity ratio to ``word``.

    Args:
        word: The text to match against the known product names.

    Returns:
        A list of product names: prefix matches first, then the closest
        fuzzy matches (at most three).
    """
    # Local import: this module never imports ``os`` itself, so the name was
    # only available if the star import from util.search_data leaked it.
    import os

    products_path = "data/products.txt"
    if os.path.exists(products_path):
        with open(products_path, "r", encoding="utf-8") as f:
            # The last two newline-separated fields are discarded.
            # NOTE(review): presumably the file ends with blank lines;
            # confirm against whatever writes data/products.txt.
            products = f.read().split("\n")[:-2]
    else:
        # NOTE(review): assumed to return a sequence of str - confirm.
        products = get_all_product_names()

    output = []
    scores = {}
    for p in products:
        # Same test as comparing the first min(len(word), len(p)) characters:
        # a product shorter than ``word`` still matches if it is a prefix of it.
        if p.startswith(word) or word.startswith(p):
            output.append(p)
        else:
            scores[p] = difflib.SequenceMatcher(None, word, p).ratio()

    # sorted() is stable, so equally scored names keep their product order.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    output.extend(name for name, _ in ranked[:3])

    return output
62
+
63
+
64
 
65
  def check_word(word):
66
  markets_df = pd.read_csv('data/market_name_utf8.csv')
 
71
  print(f"check_word, {word}")
72
  return True
73
 
74
+ return False
75
+
76
def check_product(word: str) -> bool:
    """Return True when ``word`` exactly equals a known product name.

    Product names are read from ``data/products.txt`` (UTF-8, one name per
    line) when that file exists; otherwise they come from
    ``get_all_product_names()``. A successful match is also printed to
    stdout.

    Args:
        word: The candidate product name.

    Returns:
        True if ``word`` is one of the product names, False otherwise.
    """
    # Local import: this module never imports ``os`` itself, so the name was
    # only available if the star import from util.search_data leaked it.
    import os

    products_path = "data/products.txt"
    if os.path.exists(products_path):
        with open(products_path, "r", encoding="utf-8") as f:
            # The last two newline-separated fields are discarded.
            # NOTE(review): presumably the file ends with blank lines;
            # confirm against whatever writes data/products.txt.
            products = f.read().split("\n")[:-2]
    else:
        products = get_all_product_names()

    if word in products:
        # The message previously said "check_word", copied from the sibling
        # function, which made the two log lines indistinguishable.
        print(f"check_product, {word}")
        return True

    return False