Spaces:
Runtime error
Runtime error
Commit
·
7c9c311
1
Parent(s):
a5003e3
add model param, -inurl to exclude unhelpful websites
Browse files
sheet.py
CHANGED
@@ -90,11 +90,12 @@ def test_get_condensed_result():
|
|
90 |
res = get_serp(query)
|
91 |
cond_res = get_condensed_result(res)
|
92 |
|
93 |
-
def compose_analysis( client, query, search_results):
|
94 |
"""
|
95 |
Argument
|
96 |
query: str
|
97 |
search_results: str
|
|
|
98 |
Return
|
99 |
response: str
|
100 |
"""
|
@@ -119,7 +120,7 @@ def compose_analysis( client, query, search_results):
|
|
119 |
''',
|
120 |
}
|
121 |
],
|
122 |
-
model =
|
123 |
response_format = {"type": "json_object"},
|
124 |
temperature = 0,
|
125 |
# stream = True
|
@@ -151,12 +152,14 @@ def compose_classication(
|
|
151 |
evidence,
|
152 |
classes: list = ['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)'],
|
153 |
backup_classes: list = [ '中式', '西式'],
|
|
|
154 |
) -> str:
|
155 |
"""
|
156 |
Argument
|
157 |
client:
|
158 |
evidence: str
|
159 |
classes: list
|
|
|
160 |
Return
|
161 |
response: str
|
162 |
"""
|
@@ -187,7 +190,7 @@ def compose_classication(
|
|
187 |
''',
|
188 |
}
|
189 |
],
|
190 |
-
model =
|
191 |
response_format = {"type": "json_object"},
|
192 |
temperature = 0,
|
193 |
# stream = True
|
@@ -278,7 +281,7 @@ def test_get_evidence_classification():
|
|
278 |
analysis_results = classify_results( analysis_results)
|
279 |
patch_analysis_results = classify_results( patch_analysis_results)
|
280 |
|
281 |
-
def compose_query( address, name, with_index: bool = True):
|
282 |
"""
|
283 |
Argument
|
284 |
# d: series with d[1]: 地址, d[4]: 營業人名稱 #
|
@@ -292,7 +295,7 @@ def compose_query( address, name, with_index: bool = True):
|
|
292 |
# query = f"{d[1][:3]} {d[4]}"
|
293 |
# else:
|
294 |
# query = f"{d[0][:3]} {d[3]}"
|
295 |
-
query = f"{address[:3]} {name}"
|
296 |
return query
|
297 |
|
298 |
def crawl_results( data: pd.DataFrame, google_domain: str = 'google.com.tw', gl: str = 'tw', lr: str = 'lang_zh-TW'):
|
@@ -563,7 +566,7 @@ def main(args):
|
|
563 |
"""
|
564 |
|
565 |
## 讀取資料名單 ##
|
566 |
-
data = get_leads(args.data_path)
|
567 |
|
568 |
## 進行爬蟲與分析 ##
|
569 |
# crawled_results = crawl_results(data)
|
@@ -611,7 +614,7 @@ def main(args):
|
|
611 |
)
|
612 |
|
613 |
formatted_results = format_output( postprossed_results, input_column = 'evidence', output_column = 'formatted_evidence', format_func = format_evidence)
|
614 |
-
formatted_results.to_csv(
|
615 |
|
616 |
|
617 |
category2supercategory = {
|
@@ -658,13 +661,15 @@ if __name__=='__main__':
|
|
658 |
|
659 |
parser = argparse.ArgumentParser()
|
660 |
parser.add_argument("--data_path", type=str, default="data/餐廳類型分類.xlsx - 測試清單.csv")
|
661 |
-
parser.add_argument("--classified_file_path", type=str, default="data/classified_results.joblib")
|
662 |
-
parser.add_argument("--extracted_file_path", type=str, default="data/extracted_results.joblib")
|
663 |
-
parser.add_argument("--crawled_file_path", type=str, default="data/crawled_results.joblib")
|
664 |
-
parser.add_argument("--combined_file_path", type=str, default="data/combined_results.joblib")
|
665 |
-
parser.add_argument("--postprocessed_results", type=str, default="data/postprocessed_results.joblib")
|
|
|
666 |
parser.add_argument("--classes", type=list, default=['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)'])
|
667 |
parser.add_argument("--backup_classes", type=list, default=['中式', '西式'])
|
|
|
668 |
parser.add_argument("--n_processes", type=int, default=4)
|
669 |
args = parser.parse_args()
|
670 |
|
|
|
90 |
res = get_serp(query)
|
91 |
cond_res = get_condensed_result(res)
|
92 |
|
93 |
+
def compose_analysis( client, query, search_results, model: str = 'gpt-3.5-turbo-0125'):
|
94 |
"""
|
95 |
Argument
|
96 |
query: str
|
97 |
search_results: str
|
98 |
+
model: "gpt-4-0125-preview" or 'gpt-3.5-turbo-0125'
|
99 |
Return
|
100 |
response: str
|
101 |
"""
|
|
|
120 |
''',
|
121 |
}
|
122 |
],
|
123 |
+
model = model,
|
124 |
response_format = {"type": "json_object"},
|
125 |
temperature = 0,
|
126 |
# stream = True
|
|
|
152 |
evidence,
|
153 |
classes: list = ['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)'],
|
154 |
backup_classes: list = [ '中式', '西式'],
|
155 |
+
model: str = 'gpt-3.5-turbo-0125'
|
156 |
) -> str:
|
157 |
"""
|
158 |
Argument
|
159 |
client:
|
160 |
evidence: str
|
161 |
classes: list
|
162 |
+
model: 'gpt-3.5-turbo-0125', 'gpt-4-0125-preview'
|
163 |
Return
|
164 |
response: str
|
165 |
"""
|
|
|
190 |
''',
|
191 |
}
|
192 |
],
|
193 |
+
model = model,
|
194 |
response_format = {"type": "json_object"},
|
195 |
temperature = 0,
|
196 |
# stream = True
|
|
|
281 |
analysis_results = classify_results( analysis_results)
|
282 |
patch_analysis_results = classify_results( patch_analysis_results)
|
283 |
|
284 |
+
def compose_query( address, name, with_index: bool = True, exclude: str = "-inurl:twincn.com -inurl:findcompany.com.tw -inurl:iyp.com.tw -inurl:twypage.com -inurl:alltwcompany.com -inurl:zhupiter.com -inurl:twinc.com.tw"):
|
285 |
"""
|
286 |
Argument
|
287 |
# d: series with d[1]: 地址, d[4]: 營業人名稱 #
|
|
|
295 |
# query = f"{d[1][:3]} {d[4]}"
|
296 |
# else:
|
297 |
# query = f"{d[0][:3]} {d[3]}"
|
298 |
+
query = f"{address[:3]} {name} {exclude}"
|
299 |
return query
|
300 |
|
301 |
def crawl_results( data: pd.DataFrame, google_domain: str = 'google.com.tw', gl: str = 'tw', lr: str = 'lang_zh-TW'):
|
|
|
566 |
"""
|
567 |
|
568 |
## 讀取資料名單 ##
|
569 |
+
data = get_leads(args.data_path).head(20)
|
570 |
|
571 |
## 進行爬蟲與分析 ##
|
572 |
# crawled_results = crawl_results(data)
|
|
|
614 |
)
|
615 |
|
616 |
formatted_results = format_output( postprossed_results, input_column = 'evidence', output_column = 'formatted_evidence', format_func = format_evidence)
|
617 |
+
formatted_results.to_csv( args.formatted_results, index=False)
|
618 |
|
619 |
|
620 |
category2supercategory = {
|
|
|
661 |
|
662 |
parser = argparse.ArgumentParser()
|
663 |
parser.add_argument("--data_path", type=str, default="data/餐廳類型分類.xlsx - 測試清單.csv")
|
664 |
+
parser.add_argument("--classified_file_path", type=str, default="data/gpt3.5/classified_results.joblib")
|
665 |
+
parser.add_argument("--extracted_file_path", type=str, default="data/gpt3.5/extracted_results.joblib")
|
666 |
+
parser.add_argument("--crawled_file_path", type=str, default="data/gpt3.5/crawled_results.joblib")
|
667 |
+
parser.add_argument("--combined_file_path", type=str, default="data/gpt3.5/combined_results.joblib")
|
668 |
+
parser.add_argument("--postprocessed_results", type=str, default="data/gpt3.5/postprocessed_results.joblib")
|
669 |
+
parser.add_argument("--formatted_results", type=str, default="data/gpt3.5/formatted_results.csv")
|
670 |
parser.add_argument("--classes", type=list, default=['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)'])
|
671 |
parser.add_argument("--backup_classes", type=list, default=['中式', '西式'])
|
672 |
+
parser.add_argument("--strategy", type=str, default='replace', choices=['replace', 'patch'])
|
673 |
parser.add_argument("--n_processes", type=int, default=4)
|
674 |
args = parser.parse_args()
|
675 |
|