linpershey commited on
Commit
07d2942
·
1 Parent(s): 2547429

major release - add pipeline & batch for difference use cases

Browse files
.gitignore CHANGED
@@ -161,7 +161,9 @@ data/*
161
  .env
162
  *.env
163
 
164
- clients/
165
  !clients/.gitkeep
166
 
167
- creds/
 
 
 
161
  .env
162
  *.env
163
 
164
+ clients/*
165
  !clients/.gitkeep
166
 
167
+ creds/*
168
+ logs/*
169
+ !logs/.gitkeep
app.py CHANGED
@@ -1,5 +1,4 @@
1
  import os
2
- import logging
3
 
4
  import gradio as gr
5
  import pandas as pd
@@ -9,13 +8,14 @@ jieba.cut('你好')
9
  from wordcloud import WordCloud
10
  from PIL import Image
11
  import matplotlib.pyplot as plt
 
12
 
13
  from sheet import compose_query, get_serp, get_condensed_result, extract_results, postprocess_result, format_output, category2supercategory
14
 
15
  load_dotenv()
16
 
17
- logger = logging.getLogger(__name__)
18
- logger.setLevel(logging.DEBUG)
19
 
20
  classes = list([ x for x in category2supercategory.keys() if len(x)>0])
21
 
@@ -38,6 +38,7 @@ def format_category( formatted_results):
38
  return "\n\n".join([
39
  f"> 大類別:{formatted_results['supercategory'].values[0]}",
40
  f"> 小類別:{formatted_results['category'].values[0]}",
 
41
  f"> 商家名稱:{formatted_results['store_name'].values[0]}",
42
  f"> 電話:{formatted_results['phone_number'].values[0]}",
43
  f"> 描述:{formatted_results['description'].values[0]}"
@@ -48,8 +49,8 @@ def do( business_name: str, address: str):
48
  """
49
 
50
  crawled_results = []
51
- provider = 'openai'
52
- model = 'gpt-4-0125-preview'
53
 
54
  google_domain = "google.com.tw"
55
  gl = 'tw'
@@ -77,19 +78,30 @@ def do( business_name: str, address: str):
77
  # logger.debug(crawled_results)
78
  extracted_results = extract_results( crawled_results, classes=classes, provider = provider, model = model)
79
  # logger.error(extracted_results['extracted_results'].columns)
80
- extracted_results = extracted_results['extracted_results'][ [ 'business_id', 'business_name', 'address', 'category', 'evidence', 'phone_number', 'description', 'store_name'] ]
81
  logger.debug( extracted_results['category'])
82
- print(extracted_results['category'])
83
  postprocessed_results = postprocess_result( extracted_results, postprocessed_results_path="/tmp/postprocessed_results.joblib", category_hierarchy=category2supercategory)
84
  os.remove("/tmp/postprocessed_results.joblib")
85
 
86
  formatted_results = format_output( postprocessed_results)
87
  logger.debug( formatted_results)
88
- print(formatted_results)
89
  formatted_output = format_category( formatted_results)
90
 
91
  img = plot_wordcloud(formatted_results['formatted_evidence'].values[0])
92
- return f"【搜尋結果】\n{formatted_results['formatted_evidence'].values[0]}", img, f"【判斷結果】\n{formatted_output}"
 
 
 
 
 
 
 
 
 
 
 
93
 
94
  ## --- interface --- ##
95
  # outputs = [gr.Dataframe(row_count = (1, "dynamic"), col_count=(6,"dynamic"), label="output data", interactive=1)]
@@ -99,20 +111,43 @@ def do( business_name: str, address: str):
99
  # outputs=outputs,
100
  # )
101
 
 
 
 
 
 
 
 
 
102
  ## --- block --- ##
103
- with gr.Blocks() as demo:
104
- gr.Markdown("🌟 自動分類餐廳型態 🌟")
105
- with gr.Row():
106
- # gr.Textbox( label="統一編號", placeholder="輸入八碼數字(optional)"),
107
- inputs = [ gr.Textbox( label="商家名稱", placeholder="輸入商家或公司名稱"), gr.Textbox(label="地址", placeholder="至少輸入縣市,完整地址更好")]
108
- with gr.Row():
109
- # outputs = [gr.Dataframe(row_count = (1, "dynamic"), col_count=(6,"dynamic"), label="output data", interactive=1)]
110
- outputs = [ gr.Markdown( label="參考資料(google search)"), gr.Image( label="文字雲"), gr.Markdown( label="類別", )]
111
- btn = gr.Button("Submit")
112
- btn.click(fn=do, inputs=inputs, outputs=outputs)
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
 
115
  if __name__ == "__main__":
116
 
117
- demo.launch(share=True, auth=("kota", "kota"))
 
 
 
118
 
 
1
  import os
 
2
 
3
  import gradio as gr
4
  import pandas as pd
 
8
  from wordcloud import WordCloud
9
  from PIL import Image
10
  import matplotlib.pyplot as plt
11
+ from loguru import logger
12
 
13
  from sheet import compose_query, get_serp, get_condensed_result, extract_results, postprocess_result, format_output, category2supercategory
14
 
15
  load_dotenv()
16
 
17
+ # logger = logging.getLogger(__name__)
18
+ # logger.setLevel(logging.DEBUG)
19
 
20
  classes = list([ x for x in category2supercategory.keys() if len(x)>0])
21
 
 
38
  return "\n\n".join([
39
  f"> 大類別:{formatted_results['supercategory'].values[0]}",
40
  f"> 小類別:{formatted_results['category'].values[0]}",
41
+ f"> 推測提供酒品:{ '是' if formatted_results['provide_alcohol'].values[0] else '否' }",
42
  f"> 商家名稱:{formatted_results['store_name'].values[0]}",
43
  f"> 電話:{formatted_results['phone_number'].values[0]}",
44
  f"> 描述:{formatted_results['description'].values[0]}"
 
49
  """
50
 
51
  crawled_results = []
52
+ provider = os.environ.get("DEFAULT_PROVIDER", "openai")
53
+ model = os.environ.get("DEFAULT_MODEL", "'gpt-4o'")
54
 
55
  google_domain = "google.com.tw"
56
  gl = 'tw'
 
78
  # logger.debug(crawled_results)
79
  extracted_results = extract_results( crawled_results, classes=classes, provider = provider, model = model)
80
  # logger.error(extracted_results['extracted_results'].columns)
81
+ extracted_results = extracted_results['extracted_results'][ [ 'business_id', 'business_name', 'address', 'category', 'evidence', 'phone_number', 'description', 'store_name', 'provide_alcohol'] ]
82
  logger.debug( extracted_results['category'])
83
+
84
  postprocessed_results = postprocess_result( extracted_results, postprocessed_results_path="/tmp/postprocessed_results.joblib", category_hierarchy=category2supercategory)
85
  os.remove("/tmp/postprocessed_results.joblib")
86
 
87
  formatted_results = format_output( postprocessed_results)
88
  logger.debug( formatted_results)
89
+
90
  formatted_output = format_category( formatted_results)
91
 
92
  img = plot_wordcloud(formatted_results['formatted_evidence'].values[0])
93
+ return f"【搜尋結果】\n{formatted_results['formatted_evidence'].values[0][6:]}", img, f"【判斷結果】\n{formatted_output}"
94
+
95
+ def load( blob, progress=gr.Progress()):
96
+ """
97
+ """
98
+ if isinstance(blob, str):
99
+ # df = pd.read_csv(StringIO(temp_file), parse_dates=[ "Start", "Finish"])
100
+ df = pd.read_csv(blob, names=COLUMNS, header=None) # parse_dates=[ "Start", "Finish"]
101
+ else:
102
+ df = pd.read_csv(blob.name, names=COLUMNS, header=None) # parse_dates=[ "Start", "Finish"]
103
+ print( df.head() )
104
+ return df
105
 
106
  ## --- interface --- ##
107
  # outputs = [gr.Dataframe(row_count = (1, "dynamic"), col_count=(6,"dynamic"), label="output data", interactive=1)]
 
111
  # outputs=outputs,
112
  # )
113
 
114
+ COLUMNS = ['營業地址', '統一編號', '總機構統一編號', '營業人名稱', '資本額', '設立日期', '組織別名稱', '使用統一發票',
115
+ '行業代號', '名稱', '行業代號1', '名稱1', '行業代號2', '名稱2', '行業代號3', '名稱3']
116
+ CSS = """
117
+ h1 {
118
+ text-align: center;
119
+ display:block;
120
+ }
121
+ """
122
  ## --- block --- ##
123
+ with gr.Blocks(css=CSS) as demo:
124
+ gr.Markdown("# 🌟 自動分類餐廳型態 🌟")
125
+ with gr.Tab('單筆'):
126
+ with gr.Row():
127
+ inputs = [ gr.Textbox( label="商家名稱", placeholder="輸入商家或公司名稱"), gr.Textbox(label="地址", placeholder="至少輸入縣���,完整地址更好")]
128
+ with gr.Row():
129
+ btn = gr.Button("Submit")
130
+ with gr.Row():
131
+ outputs = [ gr.Markdown( label="參考資料(google search)"), gr.Image( label="文字雲"), gr.Markdown( label="類別", )]
132
+ btn.click(fn=do, inputs=inputs, outputs=outputs)
133
+ with gr.Tab('批次'):
134
+ with gr.Row():
135
+ batch_inputs = [ gr.UploadButton("上傳檔案", file_count="single")]
136
+ with gr.Row():
137
+ batch_btn = gr.Button("批量處理")
138
+ with gr.Row():
139
+ batch_outputs = [ gr.Dataframe(
140
+ headers=COLUMNS,
141
+ datatype=["str"] * 16
142
+ )]
143
+ batch_btn.click(fn=load, inputs=batch_inputs, outputs=batch_outputs)
144
+
145
 
146
 
147
  if __name__ == "__main__":
148
 
149
+ demo.launch(
150
+ # share=True,
151
+ server_name = '0.0.0.0', auth=( os.environ.get('USERNAME'), os.environ.get('PASSWORD'))
152
+ )
153
 
batch.py ADDED
@@ -0,0 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import json
4
+ import glob
5
+ import time
6
+ import yaml
7
+ import joblib
8
+ import argparse
9
+
10
+ import jinja2
11
+ import anthropic
12
+ import pandas as pd
13
+ from tqdm import tqdm
14
+ from pathlib import Path
15
+ from loguru import logger
16
+ from openai import OpenAI
17
+ from dotenv import load_dotenv
18
+ import google.generativeai as genai
19
+ from google.generativeai.types import HarmCategory, HarmBlockThreshold
20
+
21
+ from data import get_leads
22
+ from utils import parse_json_garbage, compose_query
23
+
24
+ tqdm.pandas()
25
+
26
+ try:
27
+ logger.remove(0)
28
+ logger.add(sys.stderr, level="INFO")
29
+ except ValueError:
30
+ pass
31
+
32
+ load_dotenv()
33
+
34
+
35
+ def prepare_batch( crawled_result_path: str, config: dict, output_path: str, topn: int = None):
36
+ """
37
+ Argument
38
+ --------
39
+ crawled_result_path: str
40
+ Path to the crawled result file (result from the crawl task)
41
+ config: dict
42
+ Configuration for the batch job
43
+ output_path: str
44
+ Path to the output file
45
+ Return
46
+ ------
47
+ items: list
48
+
49
+ Example
50
+ {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
51
+ {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
52
+ model = model,
53
+ response_format = {"type": "json_object"},
54
+ temperature = 0,
55
+ max_tokens = 4096,
56
+ """
57
+ assert os.path.exists(crawled_result_path), f"File not found: {crawled_result_path}"
58
+ crawled_results = joblib.load(open(crawled_result_path, "rb"))['crawled_results']
59
+ if topn:
60
+ crawled_results = crawled_results.head(topn)
61
+
62
+ jenv = jinja2.Environment()
63
+
64
+ template = jenv.from_string(config['extraction_prompt'])
65
+ system_prompt = template.render( classes = config['classes'], traits = config['traits'])
66
+
67
+ template = jenv.from_string(config['user_content'])
68
+
69
+ items = []
70
+ for i, d in tqdm(enumerate(crawled_results.itertuples())):
71
+ idx = d.index # d[1]
72
+ evidence = d.googlemap_results +"\n" + d.search_results
73
+ business_id = d.business_id # d[2]
74
+ business_name = d.business_name # d[3]
75
+ address = d.address # d[7]
76
+ ana_res = None
77
+ query = compose_query( address, business_name, use_exclude=False)
78
+
79
+ user_content = template.render( query = query, search_results = evidence)
80
+ item = {
81
+ "custom_id": str(business_id),
82
+ "method": "POST",
83
+ "url": "/v1/chat/completions",
84
+ "body": {
85
+ "model": config['model'],
86
+ "messages": [
87
+ {"role": "system", "content": system_prompt},
88
+ {"role": "user", "content": user_content}
89
+ ],
90
+ "max_tokens": config['max_tokens'],
91
+ "temperature": config['temperature'],
92
+ "response_format": {"type": "json_object"},
93
+ }
94
+ }
95
+ items.append( json.dumps(item, ensure_ascii=False))
96
+
97
+ with open(output_path, "w") as f:
98
+ for item in items:
99
+ f.write(item + "\n")
100
+
101
+ def prepare_regularization( extracted_result_path: str, config: dict, output_path: str, topn: int = None):
102
+ """
103
+ Argument
104
+ --------
105
+ extracted_file_path: str
106
+ Path to the extracted result file (result from the extraction task)
107
+ config: dict
108
+ Configuration for the batch job
109
+ output_path: str
110
+ Path to the output file
111
+ topn: int
112
+ Number of records to be processed
113
+ Return
114
+ ------
115
+ items: list
116
+
117
+ Example
118
+ {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
119
+ {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
120
+ model = model,
121
+ response_format = {"type": "json_object"},
122
+ temperature = 0,
123
+ max_tokens = 4096,
124
+ """
125
+ assert os.path.exists(extracted_result_path), f"File not found: {extracted_result_path}"
126
+ extracted_results = joblib.load(open(extracted_result_path, "rb"))['extracted_results']
127
+ if topn:
128
+ extracted_results = extracted_results.head(topn)
129
+
130
+ jenv = jinja2.Environment()
131
+
132
+ template = jenv.from_string(config['regularization_prompt'])
133
+ system_prompt = template.render()
134
+
135
+ template = jenv.from_string(config['regularization_user_content'])
136
+
137
+ items = []
138
+ for i, d in tqdm(enumerate(extracted_results.itertuples())):
139
+ idx = d.index # d[1]
140
+ category = d.category
141
+ business_id = d.business_id
142
+ if pd.isna(category) or len(category)==0:
143
+ category = ""
144
+ user_content = template.render( category = category)
145
+ item = {
146
+ "custom_id": str(business_id),
147
+ "method": "POST",
148
+ "url": "/v1/chat/completions",
149
+ "body": {
150
+ "model": config['model'],
151
+ "messages": [
152
+ {"role": "system", "content": system_prompt},
153
+ {"role": "user", "content": user_content}
154
+ ],
155
+ "max_tokens": config['max_tokens'],
156
+ "temperature": config['temperature'],
157
+ "response_format": {"type": "json_object"},
158
+ }
159
+ }
160
+ items.append( json.dumps(item, ensure_ascii=False))
161
+ with open(output_path, "w") as f:
162
+ for item in items:
163
+ f.write(item + "\n")
164
+
165
+ def run_batch( input_path: str, job_path: str, jsonl_path: str):
166
+ """
167
+ Argument
168
+ --------
169
+ input_path: str
170
+ Path to the prepared batch input file (result from prepare_batch)
171
+ job_path: str
172
+ Path to the job file (response from creating a batch job)
173
+ jsonl_path: str
174
+ Path to the output file
175
+ extracted_result_path: str
176
+ Path to the extracted result file
177
+
178
+ """
179
+ assert os.path.exists(input_path), f"File not found: {input_path}"
180
+ st = time.time()
181
+
182
+ client = OpenAI( organization = os.getenv('ORGANIZATION_ID'))
183
+ batch_input_file = client.files.create(
184
+ file=open( input_path, "rb"),
185
+ purpose="batch"
186
+ )
187
+ batch_input_file_id = batch_input_file.id
188
+ logger.info(f"batch_input_file_id -> {batch_input_file_id}")
189
+ batch_resp = client.batches.create(
190
+ input_file_id=batch_input_file_id,
191
+ endpoint="/v1/chat/completions",
192
+ completion_window="24h",
193
+ metadata={
194
+ "description": "batch job"
195
+ }
196
+ )
197
+ logger.info(f"batch resp -> {batch_resp}")
198
+ try:
199
+ with open( job_path, "wb") as f:
200
+ joblib.dump(batch_resp, f)
201
+ except Exception as e:
202
+ logger.error(f"Error -> {e}")
203
+ with open("./job.joblib", "wb") as f:
204
+ joblib.dump(batch_resp, f)
205
+
206
+ is_ready = False
207
+ while 1:
208
+ batch_resp = client.batches.retrieve(batch_resp.id)
209
+
210
+ if batch_resp.status == 'validating':
211
+ logger.info("the input file is being validated before the batch can begin")
212
+
213
+ elif batch_resp.status == 'failed':
214
+ logger.info("the input file has failed the validation process")
215
+ break
216
+ elif batch_resp.status == 'in_progress':
217
+ logger.info("the input file was successfully validated and the batch is currently being ru")
218
+
219
+ elif batch_resp.status == 'finalizing':
220
+ logger.info("the batch has completed and the results are being prepared")
221
+
222
+ elif batch_resp.status == 'completed':
223
+ logger.info("the batch has been completed and the results are ready")
224
+ is_ready = True
225
+ break
226
+ elif batch_resp.status == 'expired':
227
+ logger.info("the batch was not able to be completed within the 24-hour time window")
228
+ break
229
+ elif batch_resp.status == 'cancelling':
230
+ logger.info("the batch is being cancelled (may take up to 10 minutes)")
231
+
232
+ elif batch_resp.status == 'cancelled':
233
+ logger.info("the batch was cancelled")
234
+ break
235
+ else:
236
+ raise logger.error("Invalid status")
237
+
238
+ time.sleep(10)
239
+
240
+ if is_ready:
241
+ output_resp = client.files.content(batch_resp.output_file_id)
242
+ llm_results = []
243
+ try:
244
+ with open(jsonl_path, "w") as f:
245
+ for line in output_resp.content.decode('utf-8').split("\n"):
246
+ line = line.strip()
247
+ if len(line)==0:
248
+ break
249
+ llm_results.append(line)
250
+ f.write(f"{line}\n")
251
+
252
+ except Exception as e:
253
+ logger.error(f"Error -> {e}")
254
+ with open("./output.jsonl", "w") as f:
255
+ for line in output_resp.content.decode('utf-8').split("\n"):
256
+ line = line.strip()
257
+ if len(line)==0:
258
+ break
259
+ llm_results.append(line)
260
+ f.write(f"{line}\n")
261
+ print( f"Time elapsed: {time.time()-st:.2f} seconds")
262
+
263
+ def batch2extract( jsonl_path: str, crawled_result_path: str, extracted_result_path: str):
264
+ """
265
+ Argument
266
+ --------
267
+ jsonl_path: str
268
+ Path to the batch output file
269
+ crawled_result_path: str
270
+ Path to the crawled result file (result from the crawl task)
271
+ extracted_result_path: str
272
+ Path to the extracted result file
273
+
274
+ """
275
+ assert os.path.exists(jsonl_path), f"File not found: {jsonl_path}"
276
+ assert os.path.exists(crawled_result_path), f"File not found: {crawled_result_path}"
277
+ crawled_results = joblib.load(open(crawled_result_path, "rb"))
278
+ extracted_results = []
279
+ empty_indices = []
280
+ llm_results = []
281
+
282
+ for line in open(jsonl_path, "r"):
283
+ line = line.strip()
284
+ if len(line)==0:
285
+ break
286
+ llm_results.append(line)
287
+
288
+ for i,llm_result in enumerate(llm_results):
289
+ try:
290
+ llm_result = json.loads(llm_result)
291
+ business_id = llm_result['custom_id']
292
+ llm_result = llm_result['response']['body']['choices'][0]['message']['content']
293
+ llm_result = parse_json_garbage(llm_result)
294
+ llm_result['business_id'] = business_id
295
+ extracted_results.append(llm_result)
296
+ except Exception as e:
297
+ logger.error(f"Error -> {e}, llm_result -> {llm_result}")
298
+ empty_indices.append(i)
299
+ extracted_results = pd.DataFrame(extracted_results)
300
+
301
+ basic_info = []
302
+ for i, d in tqdm(enumerate(crawled_results['crawled_results'].itertuples())):
303
+ idx = d.index # d[1]
304
+ evidence = d.googlemap_results +"\n" + d.search_results
305
+ business_id = d.business_id # d[2]
306
+ business_name = d.business_name # d[3]
307
+ address = d.address # d[7]
308
+ # ana_res = None
309
+ # query = compose_query( address, business_name, use_exclude=False)
310
+ basic_info.append( {
311
+ "index": idx,
312
+ "business_id": business_id,
313
+ "business_name": business_name,
314
+ "evidence": evidence,
315
+ # ** ext_res
316
+ } )
317
+ basic_info = pd.DataFrame(basic_info)
318
+
319
+ extracted_results = basic_info.astype({"business_id": str}).merge(extracted_results, on="business_id", how="inner")
320
+ print( f"{ extracted_results.shape[0]} records merged.")
321
+ extracted_results = {"extracted_results": extracted_results, "empty_indices": empty_indices}
322
+ with open(extracted_result_path, "wb") as f:
323
+ joblib.dump(extracted_results, f)
324
+
325
+ def batch2reg( jsonl_path: str, extracted_result_path: str, regularized_result_path: str):
326
+ """
327
+ Argument
328
+ --------
329
+ jsonl_path: str
330
+ Path to the batch output file
331
+ extracted_result_path: str
332
+ Path to the extracted result file
333
+ regularized_result_path: str
334
+ Path to the regularization result file
335
+
336
+ """
337
+ assert os.path.exists(jsonl_path), f"File not found: {jsonl_path}"
338
+ assert os.path.exists(extracted_result_path), f"File not found: {extracted_result_path}"
339
+ extracted_results = joblib.load(open(extracted_result_path, "rb"))['extracted_results']
340
+
341
+ llm_results, regularized_results, empty_indices = [], [], []
342
+ for line in open(jsonl_path, "r"):
343
+ line = line.strip()
344
+ if len(line)==0:
345
+ break
346
+ llm_results.append(line)
347
+
348
+ for i,llm_result in enumerate(llm_results):
349
+ try:
350
+ llm_result = json.loads(llm_result)
351
+ business_id = llm_result['custom_id']
352
+ llm_result = llm_result['response']['body']['choices'][0]['message']['content']
353
+ llm_result = parse_json_garbage(llm_result)
354
+ llm_result['business_id'] = business_id
355
+ regularized_results.append(llm_result)
356
+ except Exception as e:
357
+ logger.error(f"Error -> {e}, llm_result -> {llm_result}")
358
+ empty_indices.append(i)
359
+ regularized_results = pd.DataFrame(regularized_results)
360
+
361
+ basic_info = []
362
+ for i, d in tqdm(enumerate(extracted_results.itertuples())):
363
+ idx = d.index # d[1]
364
+ # evidence = d.googlemap_results +"\n" + d.search_results
365
+ evidence = d.evidence
366
+ business_id = d.business_id # d[2]
367
+ business_name = d.business_name # d[3]
368
+ # address = d.address # d[7]
369
+ # ana_res = None
370
+ # query = compose_query( address, business_name, use_exclude=False)
371
+ basic_info.append( {
372
+ "index": idx,
373
+ "business_id": business_id,
374
+ "business_name": business_name,
375
+ "evidence": evidence,
376
+ # ** ext_res
377
+ } )
378
+ basic_info = pd.DataFrame(basic_info)
379
+
380
+ regularized_results = basic_info.astype({"business_id": str}).merge(regularized_results, on="business_id", how="inner")
381
+ print( f"{ regularized_results.shape[0]} records merged.")
382
+ regularized_results = {"regularized_results": regularized_results, "empty_indices": empty_indices}
383
+ with open(regularized_result_path, "wb") as f:
384
+ joblib.dump(regularized_results, f)
385
+
386
+
387
+ def postprocess_result( config: dict, regularized_result_path: str, postprocessed_result_path, category_hierarchy: dict, column_name: str = 'category') -> pd.DataFrame:
388
+ """
389
+ Argument
390
+ config: dict
391
+ regularized_results_path: str
392
+ analysis_result: `evidence`, `result`
393
+ postprocessed_results_path
394
+ Return
395
+ """
396
+ assert os.path.exists(regularized_result_path), f"File not found: {regularized_result_path}"
397
+ regularized_results = joblib.load(open(regularized_result_path, "rb"))['regularized_results']
398
+
399
+ if True:
400
+ # if not os.path.exists(postprocessed_result_path):
401
+ postprocessed_results = regularized_results.copy()
402
+ postprocessed_results.loc[ :, "category"] = postprocessed_results[column_name].progress_apply(lambda x: "" if x not in category_hierarchy else x)
403
+ postprocessed_results['supercategory'] = postprocessed_results[column_name].progress_apply(lambda x: category_hierarchy.get(x, ''))
404
+ # with open( postprocessed_results_path, "wb") as f:
405
+ # joblib.dump( postprocessed_results, f)
406
+ postprocessed_results.to_csv( postprocessed_result_path, index=False)
407
+ else:
408
+ # with open( postprocessed_results_path, "rb") as f:
409
+ # postprocessed_results = joblib.load(f)
410
+ postprocessed_results = pd.read_csv( postprocessed_result_path)
411
+ return postprocessed_results
412
+
413
+
414
+ def combine_postprocessed_results( config: dict, input_path: str, postprocessed_result_path: str, reference_path: str, output_path: str):
415
+ """
416
+ Argument
417
+ config: dict
418
+ input_path: str
419
+ postprocessed_result_path: str
420
+ reference_path: str
421
+ output_path: str
422
+ """
423
+ file_pattern = str(Path(input_path).joinpath( postprocessed_result_path, "postprocessed_results.csv"))
424
+ logger.info(f"file_pattern -> {file_pattern}")
425
+ file_paths = list(glob.glob(file_pattern))
426
+ assert len(file_paths)>0, f"File not found: {postprocessed_result_path}"
427
+ postprocessed_results = pd.concat([pd.read_csv(file_path, dtype={"business_id": str}) for file_path in file_paths], axis=0)
428
+ reference_results = get_leads( reference_path)
429
+ # reference_results = reference_results.rename(config['column_mapping'], axis=1)
430
+ postprocessed_results = reference_results.merge( postprocessed_results, left_on = "統一編號", right_on="business_id", how="left")
431
+ postprocessed_results.to_csv( output_path, index=False)
432
+
433
+
434
+ if __name__ == "__main__":
435
+ parser = argparse.ArgumentParser()
436
+ parser.add_argument( "-c", "--config", type=str, default='config/config.yml', help="Path to the configuration file")
437
+ parser.add_argument( "-t", "--task", type=str, default='prepare_batch', choices=['prepare_batch', 'prepare_regularization', 'run_batch', 'batch2extract', 'batch2reg', 'postprocess', 'combine'])
438
+ parser.add_argument( "-i", "--input_path", type=str, default='', )
439
+ parser.add_argument( "-o", "--output_path", type=str, default='', )
440
+ parser.add_argument( "-b", "--batch_path", type=str, default='', )
441
+ parser.add_argument( "-j", "--job_path", type=str, default='', )
442
+ parser.add_argument( "-jp", "--jsonl_path", type=str, default='', )
443
+ parser.add_argument( "-crp", "--crawled_result_path", type=str, default='', )
444
+ parser.add_argument( "-erp", "--extracted_result_path", type=str, default='', )
445
+ parser.add_argument( "-rrp", "--regularized_result_path", type=str, default='', )
446
+ parser.add_argument( "-prp", "--postprocessed_result_path", type=str, default='', )
447
+ parser.add_argument( "-rp", "--reference_path", type=str, default='', )
448
+ parser.add_argument( "-topn", "--topn", type=int, default=None )
449
+ args = parser.parse_args()
450
+ # classes = ['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)', ]
451
+ # backup_classes = [ '中式', '西式']
452
+
453
+ assert os.path.exists(args.config), f"File not found: {args.config}"
454
+ config = yaml.safe_load(open(args.config, "r").read())
455
+
456
+ if args.task == 'prepare_batch':
457
+ prepare_batch( crawled_result_path = args.crawled_result_path, config = config, output_path = args.output_path, topn = args.topn)
458
+
459
+ elif args.task == 'run_batch':
460
+ run_batch( input_path = args.input_path, job_path = args.job_path, jsonl_path = args.jsonl_path)
461
+
462
+ elif args.task == 'prepare_regularization':
463
+ prepare_regularization( extracted_result_path = args.extracted_result_path, config = config, output_path = args.output_path, topn = args.topn)
464
+
465
+ elif args.task == 'batch2extract':
466
+ batch2extract(
467
+ jsonl_path = args.jsonl_path,
468
+ crawled_result_path = args.crawled_result_path,
469
+ extracted_result_path = args.extracted_result_path
470
+ )
471
+
472
+ elif args.task == 'batch2reg':
473
+ batch2reg(
474
+ jsonl_path = args.jsonl_path,
475
+ extracted_result_path = args.extracted_result_path,
476
+ regularized_result_path = args.regularized_result_path
477
+ )
478
+ elif args.task == 'postprocess':
479
+ postprocess_result(
480
+ config = config,
481
+ regularized_result_path = args.regularized_result_path,
482
+ postprocessed_result_path = args.postprocessed_result_path,
483
+ category_hierarchy = config['category2supercategory'],
484
+ column_name = 'category'
485
+ )
486
+ elif args.task == 'combine':
487
+ combine_postprocessed_results(
488
+ config,
489
+ args.input_path,
490
+ args.postprocessed_result_path,
491
+ args.reference_path,
492
+ args.output_path
493
+ )
494
+
495
+ else:
496
+ raise Exception("Invalid task")
497
+
498
+
config/config.yml ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model: "gpt-4o-mini"
2
+ provider: openai
3
+ column_mapping:
4
+ "營業地址": "address"
5
+ "統一編號": "business_id"
6
+ "總機構統一編號": "main_business_id"
7
+ "營業人名稱": "store_name"
8
+ "資本額": "capital"
9
+ "設立日期": "date"
10
+ "組織別名稱": "business_name"
11
+ "使用統一發票": "use_receipt"
12
+ "行業代號": "business_code"
13
+ "名稱": "business_name"
14
+ "行業代號1": "business_code_1"
15
+ "名稱1": "business_code_name_1"
16
+ "行業代號2": "business_code_2"
17
+ "名稱2": "business_code_name_2"
18
+ "行業代號3": "business_code_3"
19
+ "名稱3": "business_code_name_3"
20
+ classes:
21
+ - 小吃店
22
+ - 日式料理(含居酒屋,串燒)
23
+ - 火(鍋/爐)
24
+ - 東南亞料理(不含日韓)
25
+ - 海鮮熱炒
26
+ - 特色餐廳(含雞、鵝、牛、羊肉)
27
+ - 釣蝦場
28
+ - 傳統餐廳
29
+ - 燒烤
30
+ - 韓式料理(含火鍋,烤肉)
31
+ - PUB(Live Band)
32
+ - PUB(一般,含Lounge)
33
+ - PUB(電音\舞場)
34
+ - 五星級飯店
35
+ - 自助KTV(含連鎖,庭園自助)
36
+ - 西餐廳(含美式,義式,墨式)
37
+ - 咖啡廳(泡沫紅茶)
38
+ - 飯店(星級/旅館,不含五星級)
39
+ - 運動休閒館(含球類練習場,飛鏢等)
40
+ - 西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)
41
+ - 西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)
42
+ - 早餐
43
+ category2supercategory:
44
+ "小吃店": "中式"
45
+ "日式料理(含居酒屋,串燒)": "中式"
46
+ "火(鍋/爐)": "中式"
47
+ "東南亞料理(不含日韓)": "中式"
48
+ "海鮮熱炒": "中式"
49
+ "特色餐廳(含雞、鵝、牛、羊肉)": "中式"
50
+ "釣蝦場": "中式"
51
+ "傳統餐廳": "中式"
52
+ "燒烤": "中式"
53
+ "韓式料理(含火鍋,烤肉)": "中式"
54
+ 'PUB(Live Band)': "西式"
55
+ 'PUB(一般,含Lounge)': "西式"
56
+ 'PUB(電音\舞場)': "西式"
57
+ "五星級飯店": "西式"
58
+ '自助KTV(含連鎖,庭園自助)': "西式"
59
+ "西餐廳(含美式,義式,墨式)": "西式"
60
+ '咖啡廳(泡沫紅茶)': "西式"
61
+ '飯店(星級/旅館,不含五星級)': "西式"
62
+ '運動休閒館(含球類練習場,飛鏢等)': "西式"
63
+ "中式": "中式"
64
+ "西式": "西式"
65
+ "西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)": "西式"
66
+ "西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)": "西式"
67
+ "早餐": ""
68
+ traits: "Gathering, Chill, Enjoying Together, Drinking Freely, Winery, Wine Cellar, Wine Storage, Relaxing, Unwinding, Lyrical, Romantic, Pleasant, Stress Relief, Wine and Dine, Light Drinking Gatherings, Birthday Celebrations, Socializing, Parties, Networking, After Work Relaxation with a Drink, Relaxing Places Suitable for Drinking, Every Dish Goes Well with Beer, Shared Dishes, Dining Together, Atmosphere Suitable for Celebratory Drinking, Places Suitable for Light Drinking Gatherings with Friends, Small Shops Suitable for Relaxing and Light Drinking"
69
+ extraction_prompt: |
70
+ As a helpful and rigorous retail analyst, given the provided query and a list of search results for the query, your task is to first use store name and address to identify relevant information.
71
+ After that, from the relevant information, extract `store_name`, `address`, `description`, `category`, `provide_alcohol` and `phone_number` from the found relevant information.
72
+ Note that `category` can only be {{classes}}.
73
+ According to our experience,`provide_alcohol` can be inferred based on whether a store is suitable for scenarios such as {{traits}}.
74
+ `description` is a summary of key piece of evidence and reasons that lead you decide `category` and `provide_alcohol` .
75
+
76
+ It's very important to omit unrelated results. Do not make up any assumption.
77
+ Please think step by step, and output a single json that starts with `{` and ends with `}`. An example output json is like {"store_name": "...", "address": "...", "description": "... products, service or highlights ...", "category": "...", "phone_number": "...", "provide_alcohol": true or false}
78
+ If no relevant information has been found, simply output json with empty values.
79
+ user_content: "`query`: `{{query}}`\n`search_results`: {{search_results}}"
80
+ max_tokens: 4096
81
+ temperature: 0.0
82
+ classification_prompt: |
83
+ As a helpful and rigorous retail analyst, given the provided information about a store,
84
+ your task is two-fold. First, classify provided evidence below into the mostly relevant category from the following: {classes}.
85
+ Second, if no relevant information has been found, classify the evidence into the mostly relevant supercategory from the following: {backup_classes}.
86
+ It's very important to omit unrelated piece of evidence and don't make up any assumption.
87
+ Please think step by step, and must output in json format. An example output json is like {"category": "..."}
88
+ If no relevant piece of information can ever be found at all, simply output json with empty string "".
89
+ I'll tip you and guarantee a place in heaven you do a great job completely according to my instruction.
90
+
91
+ regularization_prompt: |
92
+ As a helpful and factual assistant, your task is to classify the provided raw cuisine category into a conformed category. The definition of each conformed category is show below (in the format of `category`: `... definition ...`):
93
+ - `小吃店`:小吃、擔仔麵、小吃攤、街邊小店、傳統小吃、麵食、麵攤、炒飯、餃子館、鯊魚煙、黑白切、牛肉麵、銅板美食、小點心、簡餐、色小菜、開放空間攤販
94
+ - `日式料理(含居酒屋,串燒)`:居酒屋、酒場、水產、清酒、生魚片、壽司、日式啤酒、日式料理、代烤服務、日本餐飲場所、日本傳統食物、日式定食
95
+ - `火(鍋/爐)`:麻辣鍋、薑母鴨、鴨味仔、鍋物、湯底、滋補、冬令補、涮涮鍋、個人鍋、冬天圍爐、羊肉爐、鴛鴦鍋、炭火爐、氣火爐、燒酒雞、蒸氣海鮮鍋
96
+ - `東南亞料理(不含日韓)`:印尼、越式、泰式、沙嗲、海南雞、河粉、馬來西亞料理、新加坡料理、寮國料理、緬甸料理、南洋風味、印度料理、越南春捲、泰式綠咖哩、異國風情裝潢、滇緬料理
97
+ - `海鮮熱炒`:海鮮、現撈、活海鮮、生猛、大排檔、活魚活蝦、生猛海鮮、快炒、海產、台式海鮮、下酒菜
98
+ - `特色餐廳(含雞、鵝、牛、羊肉)`:烤鴨、燒鵝、甕仔雞、甕缸雞、桶仔雞、牛雜、蒙古烤肉、鵝肉城、金山鴨肉、生牛肉、全羊宴、活鱉、烤雞店、鵝肉餐廳、溫體牛、現宰羊肉、鹹水鵝、土羊肉
99
+ - `傳統餐廳`:江浙、台菜、合菜、桌菜、粵菜、中式、川菜、港式、上海菜、砂鍋魚頭、東北菜、北京烤鴨、一鴨三吃、婚宴、辦桌、老字號、宴會廳、台灣料理
100
+ - `燒烤`:燒烤、串燒、串串、烤魚、鮮蚵、炭烤、直火、碳火、和牛、戶外生火、烤肉、路邊燒烤
101
+ - `韓式料理(含火鍋,烤肉)`:韓國泡菜、韓式年糕、首爾、燒酒、韓式炸雞、春川辣炒雞、韓式炸醬麵、海鮮煎餅、烤三層肉、烤五花、烤韓牛、醬料和飯、石鍋拌飯、韓式風格、韓式清酒、啤酒、銅盤烤肉、韓流
102
+ - `PUB(Live Band)`:音樂餐廳、樂團表演、現場表演、LIVE表演、樂團駐唱、定期表演、有舞台場地、樂隊、專人駐唱
103
+ - `PUB(一般,含Lounge)`:酒吧、bar、lounge、飛鏢、調酒、運動酒吧、音樂酒吧、沙發聊天、女公關、互動調酒師、公關服務
104
+ - `PUB(電音\舞場)`:夜店、舞池電音、藝人、包廂低消制、電子音樂表演、DJ、派對狂歡
105
+ - `五星級飯店`:高級飯店、奢華酒店、連鎖五星級飯店、國際集團飯店、米其林飯店、高檔住宿
106
+ - `自助KTV(含連鎖,庭園自助)`:卡拉OK、唱歌、歌坊、歡唱吧、自行點歌、自助唱歌、唱歌包廂、慶生聯誼包廂
107
+ - `西餐廳(含美式,義式,墨式)`:牛排、餐酒、歐式、義式、西餐、義大利麵、凱薩沙拉、紅酒、白酒、調酒、墨西哥式料理、阿根廷式料理、漢堡、比薩
108
+ - `咖啡廳(泡沫紅茶)`:泡沫紅茶店、咖啡店、café、coffee、輕食、軟性飲料、簡餐、茶街
109
+ - `飯店(星級/旅館,不含五星級)`:飯店、酒店、商務旅館、平價住宿
110
+ - `運動休閒館(含球類練習場,飛鏢等)`:撞球、高爾夫、運動、保齡球、娛樂、高爾夫練習場、大魯閣棒球場、籃球、羽毛球、PHOENIX鳳凰、羽球館、看球賽
111
+ - `釣蝦場`:釣蝦、蝦寶、投幣卡拉OK、釣竿和餌料、蝦子現場烹煮食用、泰國蝦、現烤蝦子、包廂唱歌、現釣現烤、自備或租用釣竿。
112
+ Note that you must choose from the above categories. Other ones are strongly prohibited.
113
+ Output in json format such as `{"category": "..."}`.
114
+
115
+ regularization_user_content: "{{ category }}"
data.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import math
4
+
5
+ import pandas as pd
6
+ from loguru import logger
7
+
8
+
9
+ def get_leads( file_path: str, names: list = ['營業地址', '統一編號', '總機構統一編號', '營業人名稱', '資本額', '設立日期', '組織別名稱', '使用統一發票',
10
+ '行業代號', '名稱', '行業代號1', '名稱1', '行業代號2', '名稱2', '行業代號3', '名稱3']):
11
+ """
12
+ """
13
+ assert os.path.exists(file_path), f"File not found: {file_path}"
14
+ data = pd.read_csv( file_path, names=names, dtype={"統一編號": str})
15
+ return data
16
+
17
+ def format_search_results(evidence):
18
+ """Format evidence (serp_results)
19
+ Argument
20
+ evidence: str. The format is a list of dictionaries
21
+ Return
22
+ formatted_evidence: str
23
+ """
24
+ search_results = []
25
+ gmap_store_mentions = []
26
+ evidence = json.loads(evidence)
27
+ # print( len(evidence) )
28
+ for i in range(len(evidence)):
29
+ if 'title' in evidence[i] and '顧客評價' in evidence[i]:
30
+ f = f"\n> 顧客評價: {evidence[i]['顧客評價']}"
31
+ gmap_store_mentions.append(f)
32
+ elif 'title' in evidence[i] and evidence[i]['title']=='類似的店':
33
+ f = f"\n> 類似的店: {evidence[i]['snippet']}"
34
+ gmap_store_mentions.append(f)
35
+ elif 'status' in evidence[i]:
36
+ f = f"\n> 經營狀態: {evidence[i]['status']}"
37
+ gmap_store_mentions.append(f)
38
+ elif 'telephone_number' in evidence[i]:
39
+ f = f"\n> 電話號碼: {evidence[i]['telephone_number']}"
40
+ gmap_store_mentions.append(f)
41
+ else:
42
+ try:
43
+ f = f"{i+1}. {evidence[i]['title']} ({evidence[i].get('snippet','')})"
44
+ except KeyError:
45
+ logger.error( evidence[i] )
46
+ raise KeyError
47
+ search_results.append(f)
48
+ return "## 店面資訊: " + "\n".join(gmap_store_mentions) + "\n" + "\n## 網路搜尋結果: " + "\n".join(search_results)
49
+
50
+ def split_data( data: pd.DataFrame, samples: int = 4000):
51
+ """
52
+ """
53
+ data_len = len(data)
54
+ n = math.ceil(data_len/samples)
55
+ data_list = [data[ i*samples: (i+1)*samples] for i in range(n)]
56
+ return data_list
creds → logs/.gitkeep RENAMED
File without changes
model.py CHANGED
@@ -1,12 +1,28 @@
1
  import os
2
- import argparse
 
3
  import time
 
 
 
4
 
5
- from dotenv import load_dotenv
6
  import anthropic
 
 
 
7
  from openai import OpenAI
 
 
 
8
 
9
- from utils import parse_json_garbage
 
 
 
 
 
 
10
 
11
  load_dotenv()
12
 
@@ -45,92 +61,105 @@ def llm( provider, model, system_prompt, user_content, delay:int = 0):
45
  model = model,
46
  response_format = {"type": "json_object"},
47
  temperature = 0,
 
48
  # stream = True
49
  )
50
  response = chat_completion.choices[0].message.content
51
 
52
  elif provider=='anthropic':
53
- client = anthropic.Client(api_key=os.getenv('ANTHROPIC_APIKEY'))
54
  response = client.messages.create(
55
  model= model,
56
  system= system_prompt,
57
  messages=[
58
  {"role": "user", "content": user_content} # <-- user prompt
59
  ],
60
- max_tokens = 1024
61
  )
62
  response = response.content[0].text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  else:
64
  raise Exception("Invalid provider")
65
 
66
  return response
67
 
68
-
69
  if __name__ == "__main__":
70
  parser = argparse.ArgumentParser()
71
- parser.add_argument("--provider", type=str, default='anthropic', help="openai or anthropic")
72
- parser.add_argument("--model", type=str, default='gpt-4o', help="Model name for the API",
73
- choices = ["claude-3-sonnet-20240229", "claude-3-haiku-20240307", "gpt-3.5-turbo-0125", "gpt-4-0125-preview"])
74
- parser.add_argument("--classes", type=list, default=['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)', '西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)', '西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)', '早餐'])
75
- parser.add_argument("--task", type=list, default='extract', choices=['extract', 'classify'])
76
  args = parser.parse_args()
 
 
77
 
 
 
78
 
79
-
80
- classes = ['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)', ]
81
- backup_classes = [ '中式', '西式']
82
-
83
- extraction_prompt = '''
84
- As a helpful and rigorous retail analyst, given the provided query and a list of search results for the query,
85
- your task is to first identify relevant information of the identical store based on store name and proxmity of address if known. After that, extract `store_name`, `address`, `description`, `category` and `phone_number` from the found relevant information, where `category` can only be `小吃店`, `日式料理(含居酒屋,串燒)`, `火(鍋/爐)`, `東南亞料理(不含日韓)`, `海鮮熱炒`, `特色餐廳(含雞、鵝、牛、羊肉)`, `傳統餐廳`, `燒烤`, `韓式料理(含火鍋,烤肉)`, `西餐廳(含美式,義式,墨式)`, `西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)`, `西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)` or `早餐`.
86
- It's very important to omit unrelated results. Do not make up any assumption.
87
- Please think step by step, and output in json format. An example output json is like {"store_name": "...", "address": "...", "description": "... products, service or highlights ...", "category": "...", "phone_number": "..."}
88
- If no relevant information has been found, simply output json with empty values.
89
- I'll tip you and guarantee a place in heaven you do a great job completely according to my instruction.
90
- '''
91
- classification_prompt = f"""
92
- As a helpful and rigorous retail analyst, given the provided information about a store,
93
- your task is two-fold. First, classify provided evidence below into the mostly relevant category from the following: {classes}.
94
- Second, if no relevant information has been found, classify the evidence into the mostly relevant supercategory from the following: {backup_classes}.
95
- It's very important to omit unrelated piece of evidence and don't make up any assumption.
96
- Please think step by step, and must output in json format. An example output json is like {{"category": "..."}}
97
- If no relevant piece of information can ever be found at all, simply output json with empty string "".
98
- I'll tip you and guarantee a place in heaven you do a great job completely according to my instruction.
99
- """
100
-
101
  if args.task == 'extract':
102
- system_prompt = extraction_prompt
103
- elif args.task == 'classify':
104
- system_prompt = classification_prompt
105
- else:
106
- raise Exception("Invalid task")
107
-
108
- query = "山の迴饗"
109
- search_results = str([{"title": "山の迴饗", "snippet": "謝謝大家這麼支持山の迴饗 我們會繼續努力用心做出美味的料理 ————————— ⛰️ 山の迴饗地址:台東縣關山鎮中華路56號訂位專線:0975-957-056 · #山的迴饗 · #夢想起飛"}, {"title": "山的迴饗餐館- 店家介紹", "snippet": "營業登記資料 · 統一編號. 92433454 · 公司狀況. 營業中 · 公司名稱. 山的迴饗餐館 · 公司類型. 獨資 · 資本總額. 30000 · 所在地. 臺東縣關山鎮中福里中華路56號 · 使用發票."}, {"title": "關山漫遊| 💥山の迴饗x night bar", "snippet": "山の迴饗x night bar 即將在12/1號台東關山開幕! 別再煩惱池上、鹿野找不到宵夜餐酒館 各位敬請期待並關注我們✨ night bar❌山的迴饗 12/1 ..."}, {"title": "山的迴饗| 中西複合式餐廳|焗烤飯|義大利麵 - 台灣美食網", "snippet": "山的迴饗| 中西複合式餐廳|焗烤飯|義大利麵|台式三杯雞|滷肉飯|便當|CP美食營業時間 ; 星期一, 休息 ; 星期二, 10:00–14:00 16:00–21:00 ; 星期三, 10:00–14:00 16:00– ..."}, {"title": "便當|CP美食- 山的迴饗| 中西複合式餐廳|焗烤飯|義大利麵", "snippet": "餐廳山的迴饗| 中西複合式餐廳|焗烤飯|義大利麵|台式三杯雞|滷肉飯|便當|CP美食google map 導航. 臺東縣關山鎮中華路56號 +886 975 957 056 ..."}, {"title": "山的迴饗餐館", "snippet": "山的迴饗餐館,統編:92433454,地址:臺東縣關山鎮中福里中華路56號,負責人姓名:周偉慈,設立日期:112年11月15日."}, {"title": "山的迴饗餐館", "snippet": "山的迴饗餐館. 資本總額(元), 30,000. 負責人, 周偉慈. 登記地址, 看地圖 臺東縣關山鎮中福里中華路56號 郵遞區號查詢. 設立日期, 2023-11-15. 資料管理 ..."}, {"title": "山的迴饗餐館, 公司統一編號92433454 - 食品業者登錄資料集", "snippet": "公司或商業登記名稱山的迴饗餐館的公司統一編號是92433454, 登錄項目是餐飲場所, 業者地址是台東縣關山鎮中福里中華路56號, 食品業者登錄字號是V-202257990-00001-5."}, {"title": "山的迴饗餐館, 公司統一編號92433454 - 食品業者登錄資料集", "snippet": "公司或商業登記名稱山的迴饗餐館的公司統一編號是92433454, 登錄項目是公司/商業登記, 業者地址是台東縣關山鎮中福里中華路56號, 食品業者登錄字號是V-202257990-00000-4 ..."}, {"title": "山的迴饗餐館", "snippet": "負責人, 周偉慈 ; 登記地址, 台東縣關山鎮中福里中華路56號 ; 公司狀態, 核准設立 「查詢最新營業狀況請至財政部稅務入口網 」 ; 資本額, 30,000元 ; 所在縣市 ..."}, {"title": "山的迴饗 | 關山美食|焗烤飯|酒吧|義大利麵|台式三杯雞|滷肉飯|便當|CP美食", "顧客評價": "324晚餐餐點豬排簡餐加白醬焗烤等等餐點。\t店家也提供免費的紅茶 綠茶 白開水 多種的調味料自取 總而言之 CP值真的很讚\t空間舒適涼爽,店員服務周到"}, {"title": "類似的店", "snippet": "['中國菜']\t['客家料理']\t['餐廳']\t['熟食店']\t['餐廳']"}, {"telephone_number": "0975 957 056"}])
110
-
111
- # query = "大吃一斤泰國蝦麻辣牛肉爐"
112
- # search_results = str([{"title": "大吃一斤泰國蝦麻辣牛肉爐", "snippet": "... 一支、本店特賣價600元免費代料理、 保證、活的!歡迎來電預定0975-147-848大吃一斤活蝦料理店新北市三重區自強路一段222號泰國蝦活蝦現場料理不漲價一斤維持一斤480元."}, {"title": "大吃一斤泰國蝦麻辣牛肉爐", "snippet": "... 一支、本店特賣價600元免費代料理、 保證、活的!歡迎來電預定0975-147-848大吃一斤活蝦料理店新北市三重區自強路一段222號泰國蝦活蝦現場料理不漲價一斤維持一斤480元."}, {"title": "大吃一斤", "snippet": "大吃一斤在foodpanda點的到,更多New Taipei City 推薦美食,線上訂立即送,下載foodpanda APP,20分鐘外送上門!瀏覽菜單和獨家優惠折扣."}, {"title": "大吃一斤(新北板橋店)菜單", "snippet": "大吃一斤(新北板橋店) 在foodpanda點的到,更多New Taipei City 推薦美食,線上訂立即送,下載foodpanda APP,20分鐘外送上門!"}, {"title": "大吃一斤活蝦餐廳- 店家介紹", "snippet": "大吃一斤活蝦餐廳. 資本總額. 200000. 代表人. 李錦鴻. 所在區域. 新北市. 所在地. 新北市三重區自強路1段222號(1樓). 商業類型. 獨資. 異動紀錄. 1111108. 營業狀態為: ..."}, {"title": "新北市| 三重區大吃一斤(泰國蝦牛肉料理店)", "snippet": "大吃一斤(泰國蝦牛肉料理店) 餐廳介紹 ; phone icon 電話, 0975 147 848 ; 營業時間, 星期一17:00–04:00 星期二17:00–04:00 星期三17:00–04:00 星期四17:00– ..."}, {"title": "大吃一斤活蝦餐廳", "snippet": "大吃一斤活蝦餐廳. 負責人姓名, 李錦鴻. 地址, 新北市三重區自強路1段222號(1樓). 現況, 核准設立. 資本額(元), 200,000. 組織類型, 獨資. 登記機關, 新北市政府經濟發展局."}, {"title": "【大吃一斤(泰國蝦牛肉料理店)】網友評價- 新北三重區合菜餐廳", "snippet": "大吃一斤(泰國蝦牛肉料理店) - 網友評論、最新食記(132則) 評分: 4.4分。大吃一斤(泰國蝦牛肉料理店)是位於新北三重區的餐廳,地址: 新北市 ... 生猛活海鮮."}, {"title": "大吃一斤生猛海鮮/活魚料理超值優惠方案", "snippet": "大吃一斤生猛海鮮/活魚料理. 電話:0975-147-848. 地址:新北市三重區自強路一段222號. 營業時間:週一至週日17: ..."}, {"title": "大吃一斤三重店 (泰國蝦料理.平價快炒熱炒.各式海鮮)", "顧客評價": "塔香蛤蜊、胡椒蝦、檸檬蝦、胡椒鳳螺 口味不錯食材新鮮 拍照時蛤蜊已經快被小孩吃光\t蝦子不大,店面不大,魚腥味很重,廁所很多蚊子,連菜裡面也有蚊子🦟,根本吃不下去\t新鮮好吃😋老闆人很Nice 推薦鹽烤蝦以及蒜味奶油蝦👍👍👍"}, {"title": "類似的店", "snippet": "['海鮮']\t['海鮮']\t['海鮮']\t['海鮮']"}, {"telephone_number": "0975 147 848"}])
113
-
114
- if args.provider == "openai":
115
- client = OpenAI( organization = os.getenv('ORGANIZATION_ID'))
116
- # categories = ", ".join([ "`"+x+"`" for x in args.classes if x!='早餐' ])+ " or " + "`早餐`"
117
- user_content = f'''
118
- `query`: `{query}`,
119
- `search_results`: {search_results}
120
- '''
121
- resp = llm( args.provider, args.model, system_prompt, user_content)
122
- print(f"resp -> {resp}")
123
-
124
-
125
- elif args.provider == "anthropic":
126
- client = anthropic.Client(api_key=os.getenv('ANTHROPIC_APIKEY'))
127
  user_content = f'''
128
  `query`: `{query}`,
129
  `search_results`: {search_results}
130
  '''
131
  print(f"user_content -> {user_content}")
132
- resp = llm( args.provider, args.model, system_prompt, user_content)
133
  print(resp)
134
 
 
 
 
135
  else:
136
- raise Exception("Invalid provider")
 
 
 
1
  import os
2
+ import sys
3
+ import json
4
  import time
5
+ import yaml
6
+ import joblib
7
+ import argparse
8
 
9
+ import jinja2
10
  import anthropic
11
+ import pandas as pd
12
+ from tqdm import tqdm
13
+ from loguru import logger
14
  from openai import OpenAI
15
+ from dotenv import load_dotenv
16
+ import google.generativeai as genai
17
+ from google.generativeai.types import HarmCategory, HarmBlockThreshold
18
 
19
+ from utils import parse_json_garbage, compose_query
20
+
21
+ try:
22
+ logger.remove(0)
23
+ logger.add(sys.stderr, level="INFO")
24
+ except ValueError:
25
+ pass
26
 
27
  load_dotenv()
28
 
 
61
  model = model,
62
  response_format = {"type": "json_object"},
63
  temperature = 0,
64
+ max_tokens = 4096,
65
  # stream = True
66
  )
67
  response = chat_completion.choices[0].message.content
68
 
69
  elif provider=='anthropic':
70
+ client = anthropic.Client(api_key=os.getenv('ANTHROPIC_API_KEY'))
71
  response = client.messages.create(
72
  model= model,
73
  system= system_prompt,
74
  messages=[
75
  {"role": "user", "content": user_content} # <-- user prompt
76
  ],
77
+ max_tokens = 4000
78
  )
79
  response = response.content[0].text
80
+
81
+ elif provider=='google':
82
+ genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))
83
+ model = genai.GenerativeModel(
84
+ model_name = model,
85
+ system_instruction = system_prompt,
86
+ generation_config={
87
+ "temperature": 0,
88
+ "max_output_tokens": 8192,
89
+ "response_mime_type": "application/json"
90
+ })
91
+ safety_settings = {
92
+ HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH,
93
+ HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
94
+ HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
95
+ HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
96
+ }
97
+ messages = []
98
+ # messages.append({
99
+ # 'role':'user',
100
+ # 'parts': [f"System instruction: {system_prompt}"]
101
+ # })
102
+ # response = model.generate_content(messages, safety_settings=safety_settings)
103
+ # try:
104
+ # messages.append({
105
+ # 'role': 'model',
106
+ # 'parts': [response.text]
107
+ # })
108
+ # except Exception as e:
109
+ # logger.error(f"response.candidates -> {response.candidates}")
110
+ # logger.error(f"error -> {e}")
111
+ # messages.append({
112
+ # 'role': 'model',
113
+ # 'parts': ["OK. I'm ready to help you."]
114
+ # })
115
+ messages.append({
116
+ 'role': 'user',
117
+ 'parts': [user_content]
118
+ })
119
+ try:
120
+ response = model.generate_content(messages, safety_settings=safety_settings, )
121
+ response = response.text
122
+ except Exception as e:
123
+ logger.error(f"Error (will still return response) -> {e}")
124
+ logger.error(f"response.candidates -> {response.candidates}")
125
+ return response
126
  else:
127
  raise Exception("Invalid provider")
128
 
129
  return response
130
 
 
131
  if __name__ == "__main__":
132
  parser = argparse.ArgumentParser()
133
+ parser.add_argument( "-c", "--config", type=str, default='config/config.yml', help="Path to the configuration file")
134
+ parser.add_argument( "-t", "--task", type=str, default='prepare_batch', choices=['extract', 'classify'])
135
+ parser.add_argument( "-i", "--input_path", type=str, default='', )
136
+ parser.add_argument( "-o", "--output_path", type=str, default='', )
137
+ parser.add_argument( "-topn", "--topn", type=int, default=None )
138
  args = parser.parse_args()
139
+ # classes = ['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)', ]
140
+ # backup_classes = [ '中式', '西式']
141
 
142
+ assert os.path.exists(args.config), f"File not found: {args.config}"
143
+ config = yaml.safe_load(open(args.config, "r").read())
144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  if args.task == 'extract':
146
+ jenv = jinja2.Environment()
147
+ template = jenv.from_string(config['extraction_prompt'])
148
+ system_prompt = template.render( classes = config['classes'], traits = config['traits'])
149
+ query = "山の迴饗"
150
+ search_results = str([{"title": "山の迴饗", "snippet": "謝謝大家這麼支持山の迴饗 我們會繼續努力用心做出美味的料理 ————————— ⛰️ 山の迴饗地址:台東縣關山鎮中華路56號訂位專線:0975-957-056 · #山的迴饗 · #夢想起飛"}, {"title": "山的迴饗餐館- 店家介紹", "snippet": "營業登記資料 · 統一編號. 92433454 · 公司狀況. 營業中 · 公司名稱. 山的迴饗餐館 · 公司類型. 獨資 · 資本總額. 30000 · 所在地. 臺東縣關山鎮中福里中華路56號 · 使用發票."}, {"title": "關山漫遊| 💥山の迴饗x night bar", "snippet": "山の迴饗x night bar 即將在12/1號台東關山開幕! 別再煩惱池上、鹿野找不到宵夜餐酒館 各位敬請期待並關注我們✨ night bar❌山的迴饗 12/1 ..."}, {"title": "山的迴饗| 中西複合式餐廳|焗烤飯|義大利麵 - 台灣美食網", "snippet": "山的迴饗| 中西複合式餐廳|焗烤飯|義大利麵|台式三杯雞|滷肉飯|便當|CP美食營業時間 ; 星期一, 休息 ; 星期二, 10:00–14:00 16:00–21:00 ; 星期三, 10:00–14:00 16:00– ..."}, {"title": "便當|CP美食- 山的迴饗| 中西複合式餐廳|焗烤飯|義大利麵", "snippet": "餐廳山的迴饗| 中西複合式餐廳|焗烤飯|義大利麵|台式三杯雞|滷肉飯|便當|CP美食google map 導航. 臺東縣關山鎮中華路56號 +886 975 957 056 ..."}, {"title": "山的迴饗餐館", "snippet": "山的迴饗餐館,統編:92433454,地址:臺東縣關山鎮中福里中華路56號,負責人姓名:周偉慈,設立日期:112年11月15日."}, {"title": "山的迴饗餐館", "snippet": "山的迴饗餐館. 資本總額(元), 30,000. 負責人, 周偉慈. 登記地址, 看地圖 臺東縣關山鎮中福里中華路56號 郵遞區號查詢. 設立日期, 2023-11-15. 資料管理 ..."}, {"title": "山的迴饗餐館, 公司統一編號92433454 - 食品業者登錄資料集", "snippet": "公司或商業登記名稱山的迴饗餐館的公司統一編號是92433454, 登錄項目是餐飲場所, 業者地址是台東縣關山鎮中福里中華路56號, 食品業者登錄字號是V-202257990-00001-5."}, {"title": "山的迴饗餐館, 公司統一編號92433454 - 食品業者登錄資料集", "snippet": "公司或商業登記名稱山的迴饗餐館的公司統一編號是92433454, 登錄項目是公司/商業登記, 業者地址是台東縣關山鎮中福里中華路56號, 食品業者登錄字號是V-202257990-00000-4 ..."}, {"title": "山的迴饗餐館", "snippet": "負責人, 周偉慈 ; 登記地址, 台東縣關山鎮中福里中華路56號 ; 公司狀態, 核准設立 「查詢最新營業狀況請至財政部稅務入口網 」 ; 資本額, 30,000元 ; 所在縣市 ..."}, {"title": "山的迴饗 | 關山美食|焗烤飯|酒吧|義大利麵|台式三杯雞|滷肉飯|便當|CP美食", "顧客評價": "324晚餐餐點豬排簡餐加白醬焗烤等等餐點。\t店家也提供免費的紅茶 綠茶 白開水 多種的調味料自取 總而言之 CP值真的很讚\t空間舒適涼爽,店員服務周到"}, {"title": "類似的店", "snippet": "['中國菜']\t['客家料理']\t['餐廳']\t['熟食店']\t['餐廳']"}, {"telephone_number": "0975 957 056"}])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  user_content = f'''
152
  `query`: `{query}`,
153
  `search_results`: {search_results}
154
  '''
155
  print(f"user_content -> {user_content}")
156
+ resp = llm( config['provider'], config['model'], system_prompt, user_content)
157
  print(resp)
158
 
159
+ elif args.task == 'classify':
160
+ system_prompt = config['classification_prompt']
161
+
162
  else:
163
+ raise Exception("Invalid task")
164
+
165
+
notebooks/chaining.ipynb ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 5,
6
+ "id": "f9e39a86-a9db-4571-bdc1-bf2a14675345",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "\n",
11
+ "import os\n",
12
+ "import json\n",
13
+ "\n",
14
+ "from dotenv import load_dotenv\n",
15
+ "load_dotenv()\n",
16
+ "\n",
17
+ "TAVILY_API_KEY = os.environ.get(\"TAVILY_API_KEY\")\n",
18
+ "OPENAI_API_KEY = os.environ.get(\"OPENAI_API_KEY\")"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "markdown",
23
+ "id": "5348dd2b-a2ae-4fa6-8268-be6da402898c",
24
+ "metadata": {},
25
+ "source": [
26
+ "### Use tools ###"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": 4,
32
+ "id": "e47f6678-69ff-4ead-96c7-d4b7cd59e561",
33
+ "metadata": {},
34
+ "outputs": [
35
+ {
36
+ "data": {
37
+ "text/plain": [
38
+ "[{'url': 'https://maps.google.com.tw/',\n",
39
+ " 'content': 'Find local businesses, view maps and get driving directions in Google Maps.'},\n",
40
+ " {'url': 'https://www.google.com.tw/maps?hl=zh-TW&tab=wl&output=classic&dg=brw',\n",
41
+ " 'content': '利用「Google 地圖」尋找本地商家、檢視地圖或規劃行車路線。'},\n",
42
+ " {'url': 'https://twfile.com/五角味食堂/92277974',\n",
43
+ " 'content': '五角味食堂,商業統一編號:92277974,地址:彰化縣北斗鎮文昌里復興路臨100-1號,負責人姓名:李雅筑,OpenData(4) 五角味食堂(24項情報)-台灣公司情報網 台灣公司情報網'},\n",
44
+ " {'url': 'https://www.findcompany.com.tw/五角味食堂',\n",
45
+ " 'content': '五角味食堂. 資本總額 (元) 30,000. 負責人. 李雅筑. 登記地址. 彰化縣北斗鎮文昌里復興路臨100-1號 郵遞區號查詢. 設立日期. 2023-03-31.'},\n",
46
+ " {'url': 'https://twypage.com/sd-1654806-B-彰化-五角味食堂.html',\n",
47
+ " 'content': '五角味食堂,統編:92277974,地址:彰化縣北斗鎮文昌里復興路臨100-1號,負責人:李雅筑,設立日期:2023-03-31,變更日期:2023-03-31,公司狀態:核准設立 「查詢最新營業狀況請至財政部稅務入口網 」,營業項目:食品什貨、飲料零售業,餐館業'}]"
48
+ ]
49
+ },
50
+ "execution_count": 4,
51
+ "metadata": {},
52
+ "output_type": "execute_result"
53
+ }
54
+ ],
55
+ "source": [
56
+ "from langchain_community.tools.tavily_search import TavilySearchResults\n",
57
+ "\n",
58
+ "\n",
59
+ "tool = TavilySearchResults()\n",
60
+ "tool.invoke({\"query\": \"五角味食堂\t彰化縣北斗鎮文昌里復興路臨100-1號\"})\n"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "markdown",
65
+ "id": "2c1f236e-88bf-46e7-8e39-cbc9d7330f56",
66
+ "metadata": {},
67
+ "source": [
68
+ "### Define agent - AgentExecutor ###"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": 8,
74
+ "id": "22837788-d2f0-48a0-ad71-ff6226d5bd39",
75
+ "metadata": {},
76
+ "outputs": [],
77
+ "source": [
78
+ "from langchain import hub\n",
79
+ "from langchain.agents import AgentExecutor, create_openai_functions_agent\n",
80
+ "from langchain_openai import ChatOpenAI\n",
81
+ "\n",
82
+ "\n",
83
+ "instructions = \"\"\"As a helpful and rigorous retail analyst, given the provided query and a list of search results for the query, your task is to first use store name and address to identify relevant information. After that, from the relevant information, extract `store_name`, `address`, `description`, `category`, `provide_alcohol` and `phone_number` from the found relevant information. Note that `category` can only be `小吃店`, `日式料理(含居酒屋,串燒)`, `火(鍋/爐)`, `東南亞料理(不含日韓)`, `海鮮熱炒`, `特色餐廳(含雞、鵝、牛、羊肉)`, `傳統餐廳`, `燒烤`, `韓式料理(含火鍋,烤肉)`, `西餐廳(含美式,義式,墨式)`, `西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)`, `西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)` or `早餐`. According to our experience,`provide_alcohol` can be inferred based on whether a store is suitable for scenarios such as Gathering, Chill, Enjoying Together, Drinking Freely, Winery, Wine Cellar, Wine Storage, Relaxing, Unwinding, Lyrical, Romantic, Pleasant, Stress Relief, Wine and Dine, Light Drinking Gatherings, Birthday Celebrations, Socializing, Parties, Networking, After Work Relaxation with a Drink, Relaxing Places Suitable for Drinking, Every Dish Goes Well with Beer, Shared Dishes, Dining Together, Atmosphere Suitable for Celebratory Drinking, Places Suitable for Light Drinking Gatherings with Friends, Small Shops Suitable for Relaxing and Light Drinking. `description` is a summary of key piece of evidence and reasons that lead you decide `category` and `provide_alcohol` .\n",
84
+ "\n",
85
+ " It's very important to omit unrelated results. Do not make up any assumption.\n",
86
+ " Please think step by step, and output a single json that starts with `{` and ends with `}`. An example output json is like {\"store_name\": \"...\", \"address\": \"...\", \"description\": \"... products, service or highlights ...\", \"category\": \"...\", \"phone_number\": \"...\", \"provide_alcohol\": true or false}\n",
87
+ " If no relevant information has been found, simply output json with empty values.\"\"\"\n",
88
+ "base_prompt = hub.pull(\"langchain-ai/openai-functions-template\")\n",
89
+ "prompt = base_prompt.partial(instructions=instructions)\n",
90
+ "llm = ChatOpenAI(temperature=0, )\n",
91
+ "tavily_tool = TavilySearchResults()\n",
92
+ "tools = [tavily_tool]\n",
93
+ "agent = create_openai_functions_agent(llm, tools, prompt)\n",
94
+ "agent_executor = AgentExecutor(\n",
95
+ " agent=agent,\n",
96
+ " tools=tools,\n",
97
+ " verbose=True,\n",
98
+ ")\n",
99
+ "agent_executor.invoke({\"input\": \"五角味食堂\t彰化縣北斗鎮文昌里復興路臨100-1號\"})\n"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "execution_count": 9,
105
+ "id": "38770f28-9950-49dc-a305-0ea180ae417c",
106
+ "metadata": {},
107
+ "outputs": [
108
+ {
109
+ "name": "stdout",
110
+ "output_type": "stream",
111
+ "text": [
112
+ "\n",
113
+ "\n",
114
+ "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
115
+ "\u001b[32;1m\u001b[1;3m\n",
116
+ "Invoking: `tavily_search_results_json` with `{'query': '五角味食堂 彰化縣北斗鎮文昌里復興路臨100-1號'}`\n",
117
+ "\n",
118
+ "\n",
119
+ "\u001b[0m\u001b[36;1m\u001b[1;3m[{'url': 'https://www.facebook.com/people/五角味食堂/100088207986612/', 'content': '五角味食堂, 彰化縣. 209 likes · 1 talking about this. 五角味食堂~ 一個結合.台灣.日本.中國.韓國.泰國.口味料理的複合式美食餐廳。 裡面以餃子類為主打!'}, {'url': 'https://twfile.com/五角味食堂/92277974', 'content': '五角味食堂,商業統一編號:92277974,地址:彰化縣北斗鎮文昌里復興路臨100-1號,負責人姓名:李雅筑,OpenData(4) 五角味食堂(24項情報)-台灣公司情報網 台灣公司情報網'}, {'url': 'https://opengovtw.com/ban/92277974', 'content': '所在地為彰化縣北斗鎮文昌里復興路臨100-1號。 ... 五角味食堂: 彰化縣北斗鎮文昌里復興路臨100-1號: 李雅筑: 30000: 獨資: 1120816972: 財政部營業稅籍資料. 營業人名稱: 五角味食堂 : 營業地址: 彰化縣北斗鎮文昌里復興路臨100-1號 :'}, {'url': 'https://www.google.com.tw/maps?hl=zh-TW&tab=wl&output=classic&dg=brw', 'content': '利用「Google 地圖」尋找本地商家、檢視地圖或規劃行車路線。'}, {'url': 'https://maps.google.com.tw/', 'content': 'Find local businesses, view maps and get driving directions in Google Maps.'}]\u001b[0m\u001b[32;1m\u001b[1;3m{\n",
120
+ "\"store_name\": \"五角味食堂\",\n",
121
+ "\"address\": \"彰化縣北斗鎮文昌里復興路臨100-1號\",\n",
122
+ "\"description\": \"五角味食堂是一個結合台灣、日本、中國、韓國、泰國口味料理的複合式美食餐廳,主打餃子類食物。\",\n",
123
+ "\"category\": \"小吃店\",\n",
124
+ "\"phone_number\": \"\",\n",
125
+ "\"provide_alcohol\": false\n",
126
+ "}\u001b[0m\n",
127
+ "\n",
128
+ "\u001b[1m> Finished chain.\u001b[0m\n"
129
+ ]
130
+ },
131
+ {
132
+ "data": {
133
+ "text/plain": [
134
+ "{'input': '五角味食堂\\t彰化縣北斗鎮文昌里復興路臨100-1號',\n",
135
+ " 'output': '{\\n\"store_name\": \"五角味食堂\",\\n\"address\": \"彰化縣北斗鎮文昌里復興路臨100-1號\",\\n\"description\": \"五角味食堂是一個結合台灣、日本、中國、韓國、泰國口味料理的複合式美食餐廳,主打餃子類食物。\",\\n\"category\": \"小吃店\",\\n\"phone_number\": \"\",\\n\"provide_alcohol\": false\\n}'}"
136
+ ]
137
+ },
138
+ "execution_count": 9,
139
+ "metadata": {},
140
+ "output_type": "execute_result"
141
+ }
142
+ ],
143
+ "source": []
144
+ },
145
+ {
146
+ "cell_type": "markdown",
147
+ "id": "479c0691-72b1-4f43-9120-43007ea4c041",
148
+ "metadata": {},
149
+ "source": [
150
+ "### Utilities - external API wrapper (NOT TOOL!) ###"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "code",
155
+ "execution_count": 28,
156
+ "id": "3d5884e2-c6b4-4454-bf21-f7b6bbfe2193",
157
+ "metadata": {},
158
+ "outputs": [],
159
+ "source": [
160
+ "from langchain_community.utilities import SerpAPIWrapper\n",
161
+ "SERPAPI_API_KEY=\"9d4e074bca8f9f7fa9ca5e05b874ea7da4f07cb505292ab29a0e1b91ffa0340a\"\n",
162
+ "os.environ[\"SERPAPI_API_KEY\"] = SERPAPI_API_KEY\n",
163
+ "SERPER_API_KEY=\"37128e339289d0e855c54f9afa9aa489bd7c23da\"\n",
164
+ "os.environ['SERPER_API_KEY']=SERPER_API_KEY\n",
165
+ "\n",
166
+ "from langchain_community.utilities import GoogleSerperAPIWrapper\n",
167
+ "\n",
168
+ "### SERPER ###\n",
169
+ "serper = GoogleSerperAPIWrapper(gl = 'tw', lr = 'lang_zh-TW', hl='zh-tw', k = 30) \n",
170
+ "# search.run(\"Obama's first name?\")\n",
171
+ "# serper_results = serper.results(\"宜窩餐飲有限公司 台北市 -inurl:twincn.com -inurl:findcompany.com.tw -inurl:iyp.com.tw -inurl:twypage.com -inurl:alltwcompany.com -inurl:zhupiter.com -inurl:twinc.com.tw\")\n",
172
+ "serper_results = serper.results(\"致仙飲食店 澎湖縣 -inurl:twincn.com -inurl:findcompany.com.tw -inurl:iyp.com.tw -inurl:twypage.com -inurl:alltwcompany.com -inurl:zhupiter.com -inurl:twinc.com.tw\")\n",
173
+ "print( serper_results.keys() )\n",
174
+ "print( serper_results['knowledgeGraph'])\n",
175
+ "print( serper_results['organic'] )\n",
176
+ "\n",
177
+ "\n",
178
+ "### SERP API ###\n",
179
+ "serp = SerpAPIWrapper(params = {\n",
180
+ " \"gl\": 'tw', \n",
181
+ " \"lr\": 'lang_zh-TW', \n",
182
+ " \"hl\": 'zh-tw',\n",
183
+ " \"google_domain\": \"google.com.tw\"\n",
184
+ "}) # utilities\n",
185
+ "# serp_results = serp.results(\"宜窩餐飲有限公司 台北市 -inurl:twincn.com -inurl:findcompany.com.tw -inurl:iyp.com.tw -inurl:twypage.com -inurl:alltwcompany.com -inurl:zhupiter.com -inurl:twinc.com.tw\")\n",
186
+ "# serp_results = serp.results(\"吉祥餛飩麵食館 台北市 -inurl:twincn.com -inurl:findcompany.com.tw -inurl:iyp.com.tw -inurl:twypage.com -inurl:alltwcompany.com -inurl:zhupiter.com -inurl:twinc.com.tw\")\n",
187
+ "serp_results = serp.results(\"致仙飲食店 澎湖縣 -inurl:twincn.com -inurl:findcompany.com.tw -inurl:iyp.com.tw -inurl:twypage.com -inurl:alltwcompany.com -inurl:zhupiter.com -inurl:twinc.com.tw\")\n",
188
+ "\n",
189
+ "\n",
190
+ "print( serp_results.keys() )\n",
191
+ "print( serp_results['knowledge_graph'] \n",
192
+ " "
193
+ ]
194
+ },
195
+ {
196
+ "cell_type": "code",
197
+ "execution_count": 22,
198
+ "id": "4c7290fb-931a-4f93-8469-ab4821b6c033",
199
+ "metadata": {},
200
+ "outputs": [
201
+ {
202
+ "name": "stdout",
203
+ "output_type": "stream",
204
+ "text": [
205
+ "\n",
206
+ "\n",
207
+ "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n"
208
+ ]
209
+ },
210
+ {
211
+ "ename": "ValueError",
212
+ "evalue": "An output parsing error occurred. In order to pass this error back to the agent and have it try again, pass `handle_parsing_errors=True` to the AgentExecutor. This is the error: Could not parse output: No.",
213
+ "output_type": "error",
214
+ "traceback": [
215
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
216
+ "\u001b[0;31mOutputParserException\u001b[0m Traceback (most recent call last)",
217
+ "File \u001b[0;32m/data1/env/py310helperbot/lib/python3.10/site-packages/langchain/agents/agent.py:1167\u001b[0m, in \u001b[0;36mAgentExecutor._iter_next_step\u001b[0;34m(self, name_to_tool_map, color_mapping, inputs, intermediate_steps, run_manager)\u001b[0m\n\u001b[1;32m 1166\u001b[0m \u001b[38;5;66;03m# Call the LLM to see what to do.\u001b[39;00m\n\u001b[0;32m-> 1167\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43magent\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mplan\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1168\u001b[0m \u001b[43m \u001b[49m\u001b[43mintermediate_steps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1169\u001b[0m \u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrun_manager\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_child\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mrun_manager\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1170\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1171\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1172\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m OutputParserException \u001b[38;5;28;01mas\u001b[39;00m e:\n",
218
+ "File \u001b[0;32m/data1/env/py310helperbot/lib/python3.10/site-packages/langchain/agents/agent.py:732\u001b[0m, in \u001b[0;36mAgent.plan\u001b[0;34m(self, intermediate_steps, callbacks, **kwargs)\u001b[0m\n\u001b[1;32m 731\u001b[0m full_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mllm_chain\u001b[38;5;241m.\u001b[39mpredict(callbacks\u001b[38;5;241m=\u001b[39mcallbacks, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfull_inputs)\n\u001b[0;32m--> 732\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moutput_parser\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparse\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfull_output\u001b[49m\u001b[43m)\u001b[49m\n",
219
+ "File \u001b[0;32m/data1/env/py310helperbot/lib/python3.10/site-packages/langchain/agents/output_parsers/self_ask.py:41\u001b[0m, in \u001b[0;36mSelfAskOutputParser.parse\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfinish_string \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m last_line:\n\u001b[0;32m---> 41\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m OutputParserException(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCould not parse output: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtext\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m AgentFinish({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutput\u001b[39m\u001b[38;5;124m\"\u001b[39m: last_line[\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfinish_string) :]}, text)\n",
220
+ "\u001b[0;31mOutputParserException\u001b[0m: Could not parse output: No.",
221
+ "\nDuring handling of the above exception, another exception occurred:\n",
222
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
223
+ "Cell \u001b[0;32mIn[22], line 22\u001b[0m\n\u001b[1;32m 12\u001b[0m tools \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 13\u001b[0m Tool(\n\u001b[1;32m 14\u001b[0m name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIntermediate Answer\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 17\u001b[0m )\n\u001b[1;32m 18\u001b[0m ]\n\u001b[1;32m 19\u001b[0m self_ask_with_search \u001b[38;5;241m=\u001b[39m initialize_agent(\n\u001b[1;32m 20\u001b[0m tools, llm, agent\u001b[38;5;241m=\u001b[39mAgentType\u001b[38;5;241m.\u001b[39mSELF_ASK_WITH_SEARCH, verbose\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 21\u001b[0m )\n\u001b[0;32m---> 22\u001b[0m \u001b[43mself_ask_with_search\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m五角味食堂\t彰化縣北斗鎮文昌里復興路臨100-1號\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
224
+ "File \u001b[0;32m/data1/env/py310helperbot/lib/python3.10/site-packages/langchain_core/_api/deprecation.py:168\u001b[0m, in \u001b[0;36mdeprecated.<locals>.deprecate.<locals>.warning_emitting_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 166\u001b[0m warned \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 167\u001b[0m emit_warning()\n\u001b[0;32m--> 168\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mwrapped\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
225
+ "File \u001b[0;32m/data1/env/py310helperbot/lib/python3.10/site-packages/langchain/chains/base.py:600\u001b[0m, in \u001b[0;36mChain.run\u001b[0;34m(self, callbacks, tags, metadata, *args, **kwargs)\u001b[0m\n\u001b[1;32m 598\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(args) \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 599\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m`run` supports only one positional argument.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 600\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcallbacks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtags\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtags\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetadata\u001b[49m\u001b[43m)\u001b[49m[\n\u001b[1;32m 601\u001b[0m _output_key\n\u001b[1;32m 602\u001b[0m ]\n\u001b[1;32m 604\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m kwargs \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m args:\n\u001b[1;32m 605\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m(kwargs, callbacks\u001b[38;5;241m=\u001b[39mcallbacks, tags\u001b[38;5;241m=\u001b[39mtags, metadata\u001b[38;5;241m=\u001b[39mmetadata)[\n\u001b[1;32m 606\u001b[0m _output_key\n\u001b[1;32m 607\u001b[0m ]\n",
226
+ "File \u001b[0;32m/data1/env/py310helperbot/lib/python3.10/site-packages/langchain_core/_api/deprecation.py:168\u001b[0m, in \u001b[0;36mdeprecated.<locals>.deprecate.<locals>.warning_emitting_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 166\u001b[0m warned \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 167\u001b[0m emit_warning()\n\u001b[0;32m--> 168\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mwrapped\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
227
+ "File \u001b[0;32m/data1/env/py310helperbot/lib/python3.10/site-packages/langchain/chains/base.py:383\u001b[0m, in \u001b[0;36mChain.__call__\u001b[0;34m(self, inputs, return_only_outputs, callbacks, tags, metadata, run_name, include_run_info)\u001b[0m\n\u001b[1;32m 351\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Execute the chain.\u001b[39;00m\n\u001b[1;32m 352\u001b[0m \n\u001b[1;32m 353\u001b[0m \u001b[38;5;124;03mArgs:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 374\u001b[0m \u001b[38;5;124;03m `Chain.output_keys`.\u001b[39;00m\n\u001b[1;32m 375\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 376\u001b[0m config \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 377\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcallbacks\u001b[39m\u001b[38;5;124m\"\u001b[39m: callbacks,\n\u001b[1;32m 378\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtags\u001b[39m\u001b[38;5;124m\"\u001b[39m: tags,\n\u001b[1;32m 379\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmetadata\u001b[39m\u001b[38;5;124m\"\u001b[39m: metadata,\n\u001b[1;32m 380\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrun_name\u001b[39m\u001b[38;5;124m\"\u001b[39m: run_name,\n\u001b[1;32m 381\u001b[0m }\n\u001b[0;32m--> 383\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minvoke\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 384\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 385\u001b[0m \u001b[43m \u001b[49m\u001b[43mcast\u001b[49m\u001b[43m(\u001b[49m\u001b[43mRunnableConfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43mk\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mv\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mv\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mitems\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mv\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 386\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_only_outputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_only_outputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 387\u001b[0m \u001b[43m \u001b[49m\u001b[43minclude_run_info\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minclude_run_info\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 388\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
228
+ "File \u001b[0;32m/data1/env/py310helperbot/lib/python3.10/site-packages/langchain/chains/base.py:166\u001b[0m, in \u001b[0;36mChain.invoke\u001b[0;34m(self, input, config, **kwargs)\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 165\u001b[0m run_manager\u001b[38;5;241m.\u001b[39mon_chain_error(e)\n\u001b[0;32m--> 166\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[1;32m 167\u001b[0m run_manager\u001b[38;5;241m.\u001b[39mon_chain_end(outputs)\n\u001b[1;32m 169\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m include_run_info:\n",
229
+ "File \u001b[0;32m/data1/env/py310helperbot/lib/python3.10/site-packages/langchain/chains/base.py:156\u001b[0m, in \u001b[0;36mChain.invoke\u001b[0;34m(self, input, config, **kwargs)\u001b[0m\n\u001b[1;32m 153\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 154\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_inputs(inputs)\n\u001b[1;32m 155\u001b[0m outputs \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m--> 156\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrun_manager\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrun_manager\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 157\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m new_arg_supported\n\u001b[1;32m 158\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call(inputs)\n\u001b[1;32m 159\u001b[0m )\n\u001b[1;32m 161\u001b[0m final_outputs: Dict[\u001b[38;5;28mstr\u001b[39m, Any] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprep_outputs(\n\u001b[1;32m 162\u001b[0m inputs, outputs, return_only_outputs\n\u001b[1;32m 163\u001b[0m )\n\u001b[1;32m 164\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n",
230
+ "File \u001b[0;32m/data1/env/py310helperbot/lib/python3.10/site-packages/langchain/agents/agent.py:1433\u001b[0m, in \u001b[0;36mAgentExecutor._call\u001b[0;34m(self, inputs, run_manager)\u001b[0m\n\u001b[1;32m 1431\u001b[0m \u001b[38;5;66;03m# We now enter the agent loop (until it returns something).\u001b[39;00m\n\u001b[1;32m 1432\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_should_continue(iterations, time_elapsed):\n\u001b[0;32m-> 1433\u001b[0m next_step_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_take_next_step\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1434\u001b[0m \u001b[43m \u001b[49m\u001b[43mname_to_tool_map\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1435\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolor_mapping\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1436\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1437\u001b[0m \u001b[43m \u001b[49m\u001b[43mintermediate_steps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1438\u001b[0m \u001b[43m \u001b[49m\u001b[43mrun_manager\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrun_manager\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1439\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1440\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(next_step_output, AgentFinish):\n\u001b[1;32m 1441\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_return(\n\u001b[1;32m 1442\u001b[0m next_step_output, intermediate_steps, run_manager\u001b[38;5;241m=\u001b[39mrun_manager\n\u001b[1;32m 1443\u001b[0m )\n",
231
+ "File \u001b[0;32m/data1/env/py310helperbot/lib/python3.10/site-packages/langchain/agents/agent.py:1139\u001b[0m, in \u001b[0;36mAgentExecutor._take_next_step\u001b[0;34m(self, name_to_tool_map, color_mapping, inputs, intermediate_steps, run_manager)\u001b[0m\n\u001b[1;32m 1130\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_take_next_step\u001b[39m(\n\u001b[1;32m 1131\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 1132\u001b[0m name_to_tool_map: Dict[\u001b[38;5;28mstr\u001b[39m, BaseTool],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1136\u001b[0m run_manager: Optional[CallbackManagerForChainRun] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1137\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Union[AgentFinish, List[Tuple[AgentAction, \u001b[38;5;28mstr\u001b[39m]]]:\n\u001b[1;32m 1138\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_consume_next_step(\n\u001b[0;32m-> 1139\u001b[0m [\n\u001b[1;32m 1140\u001b[0m a\n\u001b[1;32m 1141\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m a \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_iter_next_step(\n\u001b[1;32m 1142\u001b[0m name_to_tool_map,\n\u001b[1;32m 1143\u001b[0m color_mapping,\n\u001b[1;32m 1144\u001b[0m inputs,\n\u001b[1;32m 1145\u001b[0m intermediate_steps,\n\u001b[1;32m 1146\u001b[0m run_manager,\n\u001b[1;32m 1147\u001b[0m )\n\u001b[1;32m 1148\u001b[0m ]\n\u001b[1;32m 1149\u001b[0m )\n",
232
+ "File \u001b[0;32m/data1/env/py310helperbot/lib/python3.10/site-packages/langchain/agents/agent.py:1139\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 1130\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_take_next_step\u001b[39m(\n\u001b[1;32m 1131\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 1132\u001b[0m name_to_tool_map: Dict[\u001b[38;5;28mstr\u001b[39m, BaseTool],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1136\u001b[0m run_manager: Optional[CallbackManagerForChainRun] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1137\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Union[AgentFinish, List[Tuple[AgentAction, \u001b[38;5;28mstr\u001b[39m]]]:\n\u001b[1;32m 1138\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_consume_next_step(\n\u001b[0;32m-> 1139\u001b[0m [\n\u001b[1;32m 1140\u001b[0m a\n\u001b[1;32m 1141\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m a \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_iter_next_step(\n\u001b[1;32m 1142\u001b[0m name_to_tool_map,\n\u001b[1;32m 1143\u001b[0m color_mapping,\n\u001b[1;32m 1144\u001b[0m inputs,\n\u001b[1;32m 1145\u001b[0m intermediate_steps,\n\u001b[1;32m 1146\u001b[0m run_manager,\n\u001b[1;32m 1147\u001b[0m )\n\u001b[1;32m 1148\u001b[0m ]\n\u001b[1;32m 1149\u001b[0m )\n",
233
+ "File \u001b[0;32m/data1/env/py310helperbot/lib/python3.10/site-packages/langchain/agents/agent.py:1178\u001b[0m, in \u001b[0;36mAgentExecutor._iter_next_step\u001b[0;34m(self, name_to_tool_map, color_mapping, inputs, intermediate_steps, run_manager)\u001b[0m\n\u001b[1;32m 1176\u001b[0m raise_error \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 1177\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m raise_error:\n\u001b[0;32m-> 1178\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 1179\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAn output parsing error occurred. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1180\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIn order to pass this error back to the agent and have it try \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1181\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124magain, pass `handle_parsing_errors=True` to the AgentExecutor. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1182\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThis is the error: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mstr\u001b[39m(e)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1183\u001b[0m )\n\u001b[1;32m 1184\u001b[0m text \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(e)\n\u001b[1;32m 1185\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandle_parsing_errors, \u001b[38;5;28mbool\u001b[39m):\n",
234
+ "\u001b[0;31mValueError\u001b[0m: An output parsing error occurred. In order to pass this error back to the agent and have it try again, pass `handle_parsing_errors=True` to the AgentExecutor. This is the error: Could not parse output: No."
235
+ ]
236
+ }
237
+ ],
238
+ "source": [
239
+ "from langchain.agents import AgentType, Tool, initialize_agent\n",
240
+ "\n",
241
+ "instructions = \"\"\"As a helpful and rigorous retail analyst, given the provided query and a list of search results for the query, your task is to first use store name and address to identify relevant information. After that, from the relevant information, extract `store_name`, `address`, `description`, `category`, `provide_alcohol` and `phone_number` from the found relevant information. Note that `category` can only be `小吃店`, `日式料理(含居酒屋,串燒)`, `火(鍋/爐)`, `東南亞料理(不含日韓)`, `海鮮熱炒`, `特色餐廳(含雞、鵝、牛、羊肉)`, `傳統餐廳`, `燒烤`, `韓式料理(含火鍋,烤肉)`, `西餐廳(含美式,義式,墨式)`, `西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)`, `西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)` or `早餐`. According to our experience,`provide_alcohol` can be inferred based on whether a store is suitable for scenarios such as Gathering, Chill, Enjoying Together, Drinking Freely, Winery, Wine Cellar, Wine Storage, Relaxing, Unwinding, Lyrical, Romantic, Pleasant, Stress Relief, Wine and Dine, Light Drinking Gatherings, Birthday Celebrations, Socializing, Parties, Networking, After Work Relaxation with a Drink, Relaxing Places Suitable for Drinking, Every Dish Goes Well with Beer, Shared Dishes, Dining Together, Atmosphere Suitable for Celebratory Drinking, Places Suitable for Light Drinking Gatherings with Friends, Small Shops Suitable for Relaxing and Light Drinking. `description` is a summary of key piece of evidence and reasons that lead you decide `category` and `provide_alcohol` .\n",
242
+ "\n",
243
+ " It's very important to omit unrelated results. Do not make up any assumption.\n",
244
+ " Please think step by step, and output a single json that starts with `{` and ends with `}`. An example output json is like {\"store_name\": \"...\", \"address\": \"...\", \"description\": \"... products, service or highlights ...\", \"category\": \"...\", \"phone_number\": \"...\", \"provide_alcohol\": true or false}\n",
245
+ " If no relevant information has been found, simply output json with empty values.\"\"\"\n",
246
+ "base_prompt = hub.pull(\"langchain-ai/openai-functions-template\")\n",
247
+ "prompt = base_prompt.partial(instructions=instructions)\n",
248
+ "llm = ChatOpenAI(temperature=0, )\n",
249
+ "serp = SerpAPIWrapper() # utilities\n",
250
+ "# serp.results(\"五角味食堂\t彰化縣北斗鎮文昌里復興路臨100-1號\")\n",
251
+ "tools = [\n",
252
+ " Tool(\n",
253
+ " name=\"Intermediate Answer\",\n",
254
+ " func=serp.run,\n",
255
+ " description=\"useful for when you need to get factual information with search\",\n",
256
+ " )\n",
257
+ "]\n",
258
+ "self_ask_with_search = initialize_agent(\n",
259
+ " tools, llm, agent=AgentType.SELF_ASK_WITH_SEARCH, verbose=True\n",
260
+ ")\n",
261
+ "self_ask_with_search.run(\"五角味食堂\t彰化縣北斗鎮文昌里復興路臨100-1號\")\n"
262
+ ]
263
+ },
264
+ {
265
+ "cell_type": "code",
266
+ "execution_count": null,
267
+ "id": "68b499ad-3014-4d52-a30a-d750980d030e",
268
+ "metadata": {},
269
+ "outputs": [],
270
+ "source": []
271
+ },
272
+ {
273
+ "cell_type": "markdown",
274
+ "id": "3a4c34ba-7af8-4e63-aa82-946cb27651ec",
275
+ "metadata": {},
276
+ "source": [
277
+ "### LCEL ###"
278
+ ]
279
+ },
280
+ {
281
+ "cell_type": "code",
282
+ "execution_count": null,
283
+ "id": "03543ce2-1406-4534-8071-12e172b770a8",
284
+ "metadata": {},
285
+ "outputs": [],
286
+ "source": [
287
+ "import getpass\n",
288
+ "import os\n",
289
+ "\n",
290
+ "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass()\n",
291
+ "\n",
292
+ "from langchain_openai import ChatOpenAI\n",
293
+ "\n",
294
+ "model = ChatOpenAI(model=\"gpt-4\")"
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "code",
299
+ "execution_count": null,
300
+ "id": "3980df79-f7b2-465f-9c88-d6bf401eb1e2",
301
+ "metadata": {},
302
+ "outputs": [],
303
+ "source": []
304
+ },
305
+ {
306
+ "cell_type": "code",
307
+ "execution_count": null,
308
+ "id": "4505be03-1225-41bf-a62b-9bf29784a24c",
309
+ "metadata": {},
310
+ "outputs": [],
311
+ "source": []
312
+ },
313
+ {
314
+ "cell_type": "code",
315
+ "execution_count": null,
316
+ "id": "e48659bc-d39c-425e-a452-39f0c6e16428",
317
+ "metadata": {},
318
+ "outputs": [],
319
+ "source": []
320
+ },
321
+ {
322
+ "cell_type": "code",
323
+ "execution_count": null,
324
+ "id": "72723ffe-4b96-4bfb-b9ae-9718b3da4ea6",
325
+ "metadata": {},
326
+ "outputs": [],
327
+ "source": []
328
+ }
329
+ ],
330
+ "metadata": {
331
+ "kernelspec": {
332
+ "display_name": "Python 3 (ipykernel)",
333
+ "language": "python",
334
+ "name": "python3"
335
+ },
336
+ "language_info": {
337
+ "codemirror_mode": {
338
+ "name": "ipython",
339
+ "version": 3
340
+ },
341
+ "file_extension": ".py",
342
+ "mimetype": "text/x-python",
343
+ "name": "python",
344
+ "nbconvert_exporter": "python",
345
+ "pygments_lexer": "ipython3",
346
+ "version": "3.10.14"
347
+ }
348
+ },
349
+ "nbformat": 4,
350
+ "nbformat_minor": 5
351
+ }
classify POI.ipynb → notebooks/classify POI.ipynb RENAMED
The diff for this file is too large to render. See raw diff
 
pipeline.py ADDED
@@ -0,0 +1,699 @@
1
+ import os
2
+ import sys
3
+ import time
4
+ import json
5
+ import joblib
6
+ import math
7
+ import itertools
8
+ import argparse
9
+ import multiprocessing as mp
10
+ from typing import List
11
+ from pathlib import Path
12
+
13
+ import jinja2
14
+ import requests
15
+ import pandas as pd
16
+ from dotenv import load_dotenv
17
+ from serpapi import GoogleSearch
18
+ import tiktoken
19
+ from openai import OpenAI
20
+ from tqdm import tqdm
21
+ from loguru import logger
22
+
23
+ from model import llm
24
+ from data import get_leads, format_search_results
25
+ from utils import (parse_json_garbage, split_dataframe, merge_results,
26
+ combine_results, split_dict, format_df,
27
+ clean_quotes, compose_query)
28
+ from batch import postprocess_result
29
+
30
+ load_dotenv()
31
+ ORGANIZATION_ID = os.getenv('OPENAI_ORGANIZATION_ID')
32
+ SERP_API_KEY = os.getenv('SERP_APIKEY')
33
+ SERPER_API_KEY = os.getenv('SERPER_API_KEY')
34
+
35
+ try:
36
+ logger.remove(0)
37
+ logger.add(sys.stderr, level="INFO")
38
+ except ValueError:
39
+ pass
40
+
41
+ def get_serp( query: str, google_domain: str, gl: str, lr: str, top_k: int = 20, hl: str = "zh-tw", location: str = 'Taiwan', provider: str = 'serp') -> dict:
42
+ """
43
+ """
44
+ results = []
45
+
46
+ if provider == 'serp':
47
+ search = GoogleSearch({
48
+ "q": query,
49
+ 'google_domain': google_domain,
50
+ 'gl': gl,
51
+ 'lr': lr,
52
+ "api_key": SERP_API_KEY
53
+ })
54
+ result = search.get_dict()
55
+ # print(result['organic_results'][0])
56
+ # return result['organic_results'][0]
57
+ return result
58
+ elif provider == 'serper':
59
+ try:
60
+ payload = json.dumps({
61
+ "q": query,
62
+ "location": "Taiwan",
63
+ "gl": gl,
64
+ "hl": hl,
65
+ "num": top_k,
66
+ "autocorrect": False
67
+ })
68
+ response = requests.request(
69
+ "POST",
70
+ "https://google.serper.dev/search",
71
+ headers = { 'X-API-KEY': SERPER_API_KEY, 'Content-Type': 'application/json'},
72
+ data = payload
73
+ )
74
+ except Exception as e:
75
+ logger.error(f"SERPER request failed: {e}")
76
+ raise Exception(f"SERPER error -> {e}, query -> {query}")
77
+ result = response.json()
78
+ if 'searchParamters' in result:
79
+ result['search_parameters'] = result.pop('searchParamters')
80
+ if 'knowledgeGraph' in result:
81
+ result['knowledge_graph'] = result.pop('knowledgeGraph')
82
+ if 'organic' in result:
83
+ result['organic_results'] = result.pop('organic')
84
+ return result
85
+ else:
86
+ raise Exception(f"Unknown provider: {provider}")
87
+
88
+ def get_condensed_result(result: dict):
89
+ """
90
+ Argument
91
+ result
92
+ Return
93
+ condensed_result:
94
+ Example:
95
+
96
+ """
97
+ filtered_results = [
98
+ {"title": r.get('title',""), 'snippet': r.get('snippet',"")} for r in result['organic_results']
99
+ ]
100
+
101
+ condensed_result = json.dumps(filtered_results, ensure_ascii=False)
102
+ # print( condensed_results )
103
+ return condensed_result
104
+
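+ # Illustrative shape of the condensed result, assuming the SERP response contains `organic_results`
+ # entries with `title`/`snippet`; the values below are placeholders:
+ # '[{"title": "某某食堂 - 店家資訊", "snippet": "地址、電話與營業時間 ..."}]'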
105
+ def get_googlemap_results(result: dict):
106
+ """Get a store's google map results (in knowledge_graph)
107
+ Argument
108
+ result: dict
109
+ - 'knowledge_graph'
110
+ - 'title', 'thumbnail', 'type', 'entity_type', 'kgmid', 'knowledge_graph_search_link', 'serpapi_knowledge_graph_search_link', 'tabs', 'place_id', 'directions', 'local_map', 'rating', 'review_count', '服務項目', '地址', '地址_links', 'raw_hours', 'hours', '電話號碼', '電話號碼_links', 'popular_times', 'user_reviews', 'reviews_from_the_web', 'unclaimed_listing', '個人資料', '其他人也搜尋了以下項目', '其他人也搜尋了以下項目_link', '其他人也搜尋了以下項目_stick'
111
+ Return
112
+ googlemap_result: str
113
+ """
114
+ googlemap_result = "## Google map results\n"
115
+ if 'knowledge_graph' in result:
116
+ if 'user_reviews' in result['knowledge_graph']:
117
+ user_review = "\t".join([ _.get('summary', '') for _ in result['knowledge_graph']['user_reviews']])
118
+ store_name = result['knowledge_graph']['title']
119
+ googlemap_result += ( f"### store name: {store_name}\n")
120
+ googlemap_result += ( f"\t- 顧客評價: {user_review}\n")
121
+ if '其他人也搜尋了以下項目' in result['knowledge_graph']:
122
+ similar_store_types = "\t".join([ str(_.get('extensions', '')) for _ in result['knowledge_graph']['其他人也搜尋了以下項目']])
123
+ googlemap_result += ( f"\t- 類似店面類型: {similar_store_types}\n")
124
+ if '暫停營業' in result['knowledge_graph']:
125
+ store_status = '暫停營業' if result['knowledge_graph']['暫停營業'] else '營業中'
126
+ googlemap_result += ( f"\t- 營業狀態: {store_status}\n")
127
+ if '電話號碼' in result['knowledge_graph']:
128
+ phone_number = result['knowledge_graph']['電話號碼']
129
+ googlemap_result += ( f"\t- 電話號碼: {phone_number}\n")
130
+ if 'type' in result['knowledge_graph']:
131
+ store_type = result['knowledge_graph']['type']
132
+ googlemap_result += ( f"\t- 餐飲屬性: {store_type}\n")
133
+
134
+ else:
135
+ googlemap_result += ("empty\n")
136
+ return clean_quotes(googlemap_result)
137
+
138
+ def get_organic_result(result: dict) -> str:
139
+ """Get a store's organic search results
140
+ Argument
141
+ result: str
142
+ Return
143
+ organic_result: str
144
+ """
145
+ organic_result = "## Search results\n"
146
+ # filtered_results = [
147
+ # {"title": r.get('title',""), 'snippet': r.get('snippet',"")} for r in result['organic_results']
148
+ # ]
149
+ for r in result['organic_results']:
150
+ organic_result += ( f"### {r.get('title','')}: {r.get('snippet','')}\n")
151
+ return clean_quotes(organic_result)
152
+
153
+ def compose_classification( user_content, config: dict) -> str:
154
+ """
155
+ Argument
156
+ user_content: str
157
+ config: dict
158
+ classes: list
159
+ backup_classes: list
160
+ provider: e.g. 'google', 'openai'
161
+ model: e.g. 'gemini-1.5-flash', 'gpt-3.5-turbo-0125', 'gpt-4-0125-preview'
162
+ Return
163
+ response: str
164
+ Example
165
+ system_prompt =
166
+ As a helpful and rigorous retail analyst, given the provided information about a store,
167
+ your task is two-fold. First, classify provided evidence below into the mostly relevant category from the following: {classes}.
168
+ Second, if no relevant information has been found, classify the evidence into the mostly relevant supercategory from the following: {backup_classes}.
169
+ It's very important to omit unrelated piece of evidence and don't make up any assumption.
170
+ Please think step by step, and must output in json format. An example output json is like {{"category": "..."}}
171
+ If no relevant piece of information can ever be found at all, simply output json with empty string "".
172
+ I'll tip you and guarantee a place in heaven you do a great job completely according to my instruction.
173
+ """
174
+
175
+ classes = config['classes']
+ if isinstance(classes, list):
176
+ classes = ", ".join([ f"`{x}`" for x in classes])
177
+ elif isinstance(classes, str):
178
+ pass
179
+ else:
180
+ raise Exception(f"Incorrect classes type: {type(classes)}")
181
+ template = jinja2.Environment().from_string(config['classification_prompt'])
182
+ system_prompt = template.render( classes=classes, backup_classes=config['backup_classes'])
183
+ response = llm(
184
+ provider = config['provider'],
185
+ model = config['model'],
186
+ system_prompt = system_prompt,
187
+ user_content = user_content,
188
+ )
189
+ return response
190
+
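+ # A minimal sketch of the `config` this function expects, inferred from the keys read above;
+ # the prompt template and class lists below are placeholders, not the production values:
+ # classification_config = {
+ #     "classification_prompt": "...jinja2 template referencing {{ classes }} and {{ backup_classes }}...",
+ #     "classes": ["小吃店", "燒烤", "火(鍋/爐)"],
+ #     "backup_classes": ["餐廳"],
+ #     "provider": "openai",
+ #     "model": "gpt-4-0125-preview",
+ # }
+ # pred = compose_classification("`evidence`: `...`", classification_config)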
191
+ def classify_results(
192
+ analysis_results: pd.DataFrame,
193
+ config: dict,
194
+ input_column: str = 'evidence',
195
+ output_column: str = 'classified_category',
196
+ ):
197
+ """Classify the results
198
+ Argument
199
+ analysis_results: dataframe
200
+ config: dict
201
+ classes: list,
202
+ backup_classes: list,
203
+ provider: str,
204
+ model: str,
205
+ input_column: str
206
+ output_column: str
207
+ Return
208
+ analysis_results: dataframe
209
+ """
210
+ classified_results = analysis_results.copy()
211
+ labels, empty_indices = [], []
212
+ for idx, evidence in zip( analysis_results['index'], analysis_results[input_column]):
213
+ try:
214
+ user_content = f'''`evidence`: `{evidence}`'''
215
+ pred_cls = compose_classification( user_content, config)
216
+ label = parse_json_garbage(pred_cls)['category']
217
+ labels.append(label)
218
+ except Exception as e:
219
+ logger.error(f"# CLASSIFICATION error: e -> {e}, user_content -> {user_content}, evidence: {evidence}")
220
+ labels.append("")
221
+ empty_indices.append(idx)
222
+
223
+ classified_results[output_column] = labels
224
+ return {
225
+ "classified_results": classified_results,
226
+ "empty_indices": empty_indices
227
+ }
228
+
229
+ def classify_results_mp( extracted_results: pd.DataFrame, classified_file_path: str, config: dict, n_processes: int = 4):
230
+ """
231
+ Argument
232
+ extracted_results:
233
+ classified_file_path:
234
+ config: dict
235
+ classes: list
236
+ backup_classes: list
237
+ provider: str
238
+ model: str,
239
+ n_processes: int
240
+ Return
241
+ classified_results: dataframe
242
+ Reference
243
+ 200 records, 4 processes, 122.4695s
244
+ """
245
+ st = time.time()
246
+ # classified_file_path = "data/classified_result.joblib"
247
+ if not os.path.exists(classified_file_path):
248
+ split_data = split_dataframe(extracted_results)
249
+ with mp.Pool(n_processes) as pool:
250
+ classified_results = pool.starmap(
251
+ classify_results,
252
+ [ (
253
+ d, config, 'evidence', 'classified_category'
254
+ ) for d in split_data]
255
+ )
256
+ classified_results = merge_results( classified_results, dataframe_columns=['classified_results'], list_columns=['empty_indices'])
257
+ try:
258
+ with open( classified_file_path, "wb") as f:
259
+ joblib.dump( classified_results, f)
260
+ except FileNotFoundError as e:
261
+ logger.error(f"# CLASSIFICATION error: e -> {e}")
262
+ with open( f"./{Path(classified_file_path).name}.joblib", "wb") as f:
263
+ joblib.dump( classified_results, f)
264
+
265
+ else:
266
+ with open( classified_file_path, "rb") as f:
267
+ classified_results = joblib.load(f)
268
+ print( f"total time: {time.time() - st}")
269
+ return classified_results
270
+
271
+ def compose_filter( query, search_results, config: dict):
272
+ """Filter the search results based on the query (store name and address)
273
+ Argument
274
+ query: str
275
+ search_results: str
276
+ system_prompt: str
277
+ config: dict
278
+ provider: default to be "google"
279
+ model: default to be "gemini-1.5-flash"
280
+ Return
281
+ response: str
282
+ """
283
+ system_prompt = f'''As a helpful and rigorous retail analyst, given the provided query and a list of search results for the query, execute this task step by step. Search results contains a list of search results, where some could be totally irrelevant to our query.
284
+ Steps:
285
+ First, use store name and address to identify relevant and irrelevant information from search results.
286
+ Second, look through the list of search results; keep relevant ones and drop irrelevant ones.
287
+ Third, filter the results and only output relevant ones. Original numbering must be kept.
288
+
289
+ Output in json format such as {{ "relevant_results": [ "result ...", "result ...", "result ..." ], "irrelevant_results": ["result ..."]}}. It's very important to omit unrelated results. Do not make up any assumption.
290
+ '''
291
+ user_content = f"## query: `{query}`\n## search_results: {search_results}"
292
+ response = llm(
293
+ provider = config['provider'],
294
+ model = config['model'],
295
+ system_prompt = system_prompt,
296
+ user_content = user_content
297
+ )
298
+ return response
299
+
300
+ def filter_results( results: pd.DataFrame, config: dict):
301
+ """Filter the results
302
+ Argument
303
+ results: dataframe
304
+ config: dict
305
+ provider
306
+ model:
307
+ Return
308
+ analysis_results: dataframe
309
+ """
310
+ results = results.copy()
311
+ relevant_results, empty_indices = [], []
312
+ for i, d in tqdm(enumerate(results.itertuples())):
313
+ idx = d.index # d[1]
314
+ search_results = d.search_results
315
+ # evidence = d.googlemap_results +"\n" + d.search_results
316
+ # business_id = d.business_id # d[2]
317
+ business_name = d.business_name # d[3]
318
+ address = d.address # d[7]
319
+ try:
320
+ query = compose_query( address, business_name, use_exclude=False)
321
+ filtered_results = compose_filter( query = query, search_results=search_results, config=config)
322
+ relevant_result = parse_json_garbage(filtered_results)['relevant_results']
323
+ relevant_result = "### 搜尋結果: \n" + "\n".join([ "- " + r for r in relevant_result])
324
+ relevant_results.append(relevant_result)
325
+ except Exception as e:
326
+ logger.error(f"# FILTER error (add to empty_indices): e -> {e}, query -> {query}, filtered_results: {filtered_results}")
327
+ relevant_results.append(search_results)
328
+ empty_indices.append(idx)
329
+
330
+ results.loc[ :, "search_results"] = relevant_results
331
+ return {
332
+ "filtered_results": results,
333
+ "empty_indices": empty_indices
334
+ }
335
+
336
+ def filter_results_mp( data: pd.DataFrame, filtered_file_path: str, config: dict, n_processes: int = 4):
337
+ """Filter results in parallel
338
+ Argument
339
+ data: dataframe
340
+ filtered_file_path: str
341
+ config: dict
342
+ provider: str
343
+ model: str
344
+ n_processes: int
345
+ Return
346
+ filtered_results: dataframe
347
+ """
348
+ st = time.time()
349
+ # crawl_file_path = "data/crawled_results.joblib"
350
+ if not os.path.exists(filtered_file_path):
351
+ split_data = split_dataframe( data )
352
+ with mp.Pool(n_processes) as pool:
353
+ filtered_results = pool.starmap(
354
+ filter_results,
355
+ [ (d, config) for d in split_data]
356
+ )
357
+ filtered_results = merge_results( filtered_results, dataframe_columns=['filtered_results'], list_columns=['empty_indices'])
358
+ # with open( filtered_file_path, "wb") as f:
359
+ # joblib.dump( filtered_results, f)
360
+ filtered_results['filtered_results'].to_csv( filtered_file_path, index=False)
361
+ else:
362
+ # with open( filtered_file_path, "rb") as f:
363
+ # filtered_results = joblib.load(f)
364
+ filtered_results = { 'filtered_results': pd.read_csv( filtered_file_path)}
365
+ logger.debug( f"total time: {time.time() - st}")
366
+ return filtered_results
367
+
368
+ def crawl_results( data: pd.DataFrame, serp_provider: str = 'serp', google_domain: str = 'google.com.tw', gl: str = 'tw', lr: str = 'lang_zh-TW'):
369
+ """
370
+ Argument
371
+ data: dataframe
372
+ google_domain: str
373
+ gl: str
374
+ lr: str
375
+ Return
376
+ {
377
+ `crawled_results`: df
378
+ `empty_indices`: list
379
+ }
380
+ df format:
381
+ [
382
+ {'title': '達米娜魚料理食堂',
383
+ 'snippet': 'Zhenzhen · 台東縣台東市洛陽街204號 · 08 934 1662 · 其他美食 · 外送・提供廁所・免費Wifi · ・休息中・將於20:00 開始營業 · NT$120 · 座位數15 · 現金.'},
384
+ ...
385
+ {'title': '台東美食推薦》25間台東市美食小吃/特色餐廳/早餐伴手禮',
386
+ 'snippet': '好漁日鬼頭刀專屬料理MAHI MAHI TODAY如店名所說,是間專賣鬼頭刀料理的餐廳,台灣主要盛產鬼頭刀的地方就位於台東的成功新港漁港,所以推薦大家來台東 ...'},
387
+ {'title': '類似的店', 'snippet': "['餐廳']\t['早午餐']\t['餐廳']"},
388
+ {'status': '暫停營業'},
389
+ {'telephone_number': '08 934 1662'}
390
+ ]
391
+ Reference
392
+ 200 records, 4 processes, 171.36490321159363s
393
+ """
394
+ # serp_results = []
395
+ # condensed_results = []
396
+ crawled_results = []
397
+ empty_indices = []
398
+ for i, d in tqdm(enumerate(data.itertuples())):
399
+ idx = d[0]
400
+ address = d[1]
401
+ business_id = d[2]
402
+ business_name = d[4]
403
+ query = compose_query(address, business_name)
404
+ try:
405
+ res = get_serp( query, google_domain, gl, lr, provider=serp_provider)
406
+ # serp_results.append(res)
407
+ except:
408
+ logger.warning( f"# SERP error (will add to empty indices): i = {i}, idx = {idx}, query = {query}")
409
+ empty_indices.append(i)
410
+ continue
411
+ try:
412
+ # cond_res = get_condensed_result(res)
413
+ googlemap_res = get_googlemap_results(res)
414
+ search_res = get_organic_result(res)
415
+ # condensed_results.append(cond_res)
416
+ except:
417
+ logger.warning(f"# get googlemap & organic results error (will add to empty indices): i = {i}, idx = {idx}, res = {res}")
418
+ empty_indices.append(i)
419
+ continue
420
+
421
+ crawled_results.append( {
422
+ "index": idx,
423
+ "business_id": business_id,
424
+ "business_name": business_name,
425
+ "serp": res,
426
+ # "evidence": cond_res,
427
+ "googlemap_results": googlemap_res,
428
+ "search_results": search_res,
429
+ "address": address,
430
+ } )
431
+ crawled_results = pd.DataFrame(crawled_results)
432
+
433
+ return {
434
+ "crawled_results": crawled_results,
435
+ "empty_indices": empty_indices
436
+ }
437
+
438
+ def crawl_results_mp( data: pd.DataFrame, crawl_file_path: str, serp_provider: str, n_processes: int = 4):
439
+ st = time.time()
440
+ # crawl_file_path = "data/crawled_results.joblib"
441
+ if not os.path.exists(crawl_file_path):
442
+ split_data = split_dataframe( data )
443
+ with mp.Pool(n_processes) as pool:
444
+ crawled_results = pool.starmap(
445
+ crawl_results,
446
+ [( d, serp_provider) for d in split_data]
447
+ )
448
+ crawled_results = merge_results( crawled_results, dataframe_columns=['crawled_results'], list_columns=['empty_indices'])
449
+ try:
450
+ with open( crawl_file_path, "wb") as f:
451
+ joblib.dump( crawled_results, f)
452
+ except FileNotFoundError as e:
453
+ logger.error(f"# CRAWL error: e = {e}")
454
+ with open( f"./{Path(crawl_file_path).name}.joblib", "wb") as f:
455
+ joblib.dump( crawled_results, f)
456
+ else:
457
+ with open( crawl_file_path, "rb") as f:
458
+ crawled_results = joblib.load(f)
459
+ logger.debug( f"total time: {time.time() - st}")
460
+ return crawled_results
461
+
462
+ def compose_extraction( query, search_results, config: dict):
463
+ """
464
+ Argument
465
+ query: str
466
+ search_results: str
467
+ config: dict
468
+ system_prompt: str
469
+ classes: list. e.g. `小吃店`,`日式料理(含居酒屋,串燒)`,`火(鍋/爐)`,`東南亞料理(不含日韓)`,`海鮮熱炒`,`特色餐廳(含雞、鵝、牛、羊肉)`,`釣蝦場`,`傳統餐廳`,`燒烤`,`韓式料理(含火鍋,烤肉)`,`PUB(Live Band)`,`PUB(一般,含Lounge)`,`PUB(電音\舞場)`,`五星級飯店`,`自助KTV(含連鎖,庭園自助)`,`西餐廳(含美式,義式,墨式)`,`咖啡廳(泡沫紅茶)`,`飯店(星級/旅館,不含五星級)`,`運動休閒館(含球類練習場,飛鏢等)`,`西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)`,`西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)`,`早餐`
470
+ provider: "openai"
471
+ model: "gpt-4-0125-preview" or 'gpt-3.5-turbo-0125'
472
+ Return
473
+ response: str
474
+
475
+ Example
476
+ classes = ", ".join([ "`"+x+"`" for x in classes if x!='早餐' ])+ " or " + "`早餐`"
477
+ traits = "Gathering, Chill, Enjoying Together, Drinking Freely, Winery, Wine Cellar, Wine Storage, Relaxing, Unwinding, Lyrical, Romantic, Pleasant, Stress Relief, Wine and Dine, Light Drinking Gatherings, Birthday Celebrations, Socializing, Parties, Networking, After Work Relaxation with a Drink, Relaxing Places Suitable for Drinking, Every Dish Goes Well with Beer, Shared Dishes, Dining Together, Atmosphere Suitable for Celebratory Drinking, Places Suitable for Light Drinking Gatherings with Friends, Small Shops Suitable for Relaxing and Light Drinking"
478
+ system_prompt = f'''
479
+ As a helpful and rigorous retail analyst, given the provided query and a list of search results for the query, your task is to first use store name and address to identify relevant information.
480
+ After that, from the relevant information, extract `store_name`, `address`, `description`, `category`, `provide_alcohol` and `phone_number` from the found relevant information.
481
+ Note that `category` can only be {classes}.
482
+ According to our experience,`provide_alcohol` can be inferred based on whether a store is suitable for scenarios such as {traits}.
483
+ `description` is a summary of key piece of evidence and reasons that lead you decide `category` and `provide_alcohol` .
484
+
485
+ It's very important to omit unrelated results. Do not make up any assumption.
486
+ Please think step by step, and output a single json that starts with `{{` and ends with `}}`. An example output json is like {{"store_name": "...", "address": "...", "description": "... products, service or highlights ...", "category": "...", "phone_number": "...", "provide_alcohol": true or false}}
487
+ If no relevant information has been found, simply output json with empty values.
488
+ '''
489
+ """
490
+ classes = ", ".join([ "`"+x+"`" for x in config['classes'] if x!='早餐' ])+ " or " + "`早餐`"
491
+ traits = config['traits']
492
+ system_prompt = config['extraction_prompt']
493
+ jenv = jinja2.Environment()
494
+ template = jenv.from_string(system_prompt)
495
+ system_prompt = template.render( classes=classes, traits=traits)
496
+ user_content = f"`query`: `{query}`\n`search_results`: {search_results}"
497
+ response = llm(
498
+ provider = config['provider'],
499
+ model = config['model'],
500
+ system_prompt = system_prompt,
501
+ user_content = user_content
502
+ )
503
+ return response
504
+
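+ # A hedged sketch of one extraction call, assuming `config` carries the documented keys
+ # (`extraction_prompt` as a jinja2 template plus `classes`, `traits`, `provider`, `model`);
+ # all literal values here are placeholders:
+ # extraction_config = {"extraction_prompt": "...", "classes": ["小吃店"], "traits": "Gathering, Chill, ...",
+ #                      "provider": "openai", "model": "gpt-4-0125-preview"}
+ # raw = compose_extraction(
+ #     query=compose_query("彰化縣北斗鎮文昌里復興路臨100-1號", "五角味食堂", use_exclude=False),
+ #     search_results="## Search results\n### 店家標題: 摘要內容\n",
+ #     config=extraction_config,
+ # )
+ # fields = parse_json_garbage(raw)   # expected keys: store_name, address, category, provide_alcohol, ...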
505
+ def extract_results( data: pd.DataFrame, config: dict):
506
+ """
507
+ Argument
508
+ data: a dataframe
509
+ - "index", "business_id", "business_name", "serp", "googlemap_results", "search_results", "address"
510
+ # - `evidence`, `result`
511
+ config: dict
512
+ classes: list
513
+ provider: str
514
+ model: str
515
+ Return
516
+ extracted_results: dataframe of `extracted_evidence`
517
+ """
518
+ extracted_results, empty_indices, ext_res = [], [], []
519
+ for i, d in tqdm(enumerate(data.itertuples())):
520
+ idx = d.index # d[1]
521
+ # evidence = d.evidence
522
+ # evidence = d.formatted_evidence
523
+ evidence = d.googlemap_results +"\n" + d.search_results
524
+ business_id = d.business_id # d[2]
525
+ business_name = d.business_name # d[3]
526
+ address = d.address # d[7]
527
+ ana_res = None
528
+ query = compose_query( address, business_name, use_exclude=False)
529
+ try:
530
+ ext_res = compose_extraction(
531
+ query = query,
532
+ search_results = evidence,
533
+ config = config
534
+ )
535
+ ext_res = parse_json_garbage(ext_res)
536
+ except Exception as e:
537
+ logger.error(f"# ANALYSIS error (add to empty indices): e = {e}, i = {i}, q = {query}, ext_res = {ext_res}")
538
+ empty_indices.append(i)
539
+ continue
540
+
541
+ extracted_results.append( {
542
+ "index": idx,
543
+ "business_id": business_id,
544
+ "business_name": business_name,
545
+ "evidence": evidence,
546
+ ** ext_res
547
+ } )
548
+ extracted_results = pd.DataFrame(extracted_results)
549
+
550
+ return {
551
+ "extracted_results": extracted_results,
552
+ "empty_indices": empty_indices
553
+ }
554
+
555
+ def extract_results_mp( crawled_results, extracted_file_path, config: dict, n_processes: int = 4):
556
+ """
557
+ Argument
558
+ crawled_results: dataframe
559
+ extracted_file_path
560
+ config:
561
+ classes: list
562
+ model: str
563
+ provider: str
564
+ Return
565
+ Reference
566
+ 200 records, 4 processes, 502.26914715766907s
567
+ """
568
+ st = time.time()
569
+ # args.extracted_file_path = "data/extracted_results.joblib"
570
+ if not os.path.exists(extracted_file_path):
571
+ split_data = split_dataframe( crawled_results)
572
+ with mp.Pool(n_processes) as pool:
573
+ extracted_results = pool.starmap( extract_results, [ (x, config) for x in split_data])
574
+ extracted_results = merge_results( extracted_results, dataframe_columns=['extracted_results'], list_columns=['empty_indices'])
575
+ try:
576
+ with open( extracted_file_path, "wb") as f:
577
+ joblib.dump( extracted_results, f)
578
+ except FileNotFoundError as e:
579
+ logger.error(f"# EXTRACT error: e = {e}")
580
+ with open( f"./{Path(extracted_file_path).name}.joblib", "wb") as f:
581
+ joblib.dump( extracted_results, f)
582
+ else:
583
+ with open( extracted_file_path, "rb") as f:
584
+ extracted_results = joblib.load(f)
585
+ logger.info( f"total time: {time.time() - st}")
586
+ return extracted_results
587
+
588
+ def compose_regularization( category: str, config: dict):
589
+ """
590
+ Argument
591
+ category: str
592
+ config: dict
593
+ provider: str
594
+ model: str
595
+ Return
596
+ response: str
597
+ """
598
+ system_prompt = f"""
599
+ As a helpful and factual assistant, your task is to classify the provided raw cuisine category into a conformed category. The definition of each conformed category is show below (in the format of `category`: `... definition ...`):
600
+ - `小吃店`:小吃、擔仔麵、小吃攤、街邊小店、傳統小吃、麵食、麵攤、炒飯、餃子館、鯊魚煙、黑白切、牛肉麵、銅板美食、小點心、簡餐、色小菜、開放空間攤販
601
+ - `日式料理(含居酒屋,串燒)`:居酒屋、酒場、水產、清酒、生魚片、壽司、日式啤酒、日式料理、代烤服務、日本餐飲場所、日本傳統食物、日式定食
602
+ - `火(鍋/爐)`:麻辣鍋、薑母鴨、鴨味仔、鍋物、湯底、滋補、冬令補、涮涮鍋、個人鍋、冬天圍爐、羊肉爐、鴛鴦鍋、炭火爐、氣火爐、燒酒雞、蒸氣海鮮鍋
603
+ - `東南亞料理(不含日韓)`:印尼、越式、泰式、沙嗲、海南雞、河粉、馬來西亞料理、新加坡料理、寮國料理、緬甸料理、南洋風味、印度料理、越南春捲、泰式綠咖哩、異國風情裝潢、滇緬料理
604
+ - `海鮮熱炒`:海鮮、現撈、活海鮮、生猛、大排檔、活魚活蝦、生猛海鮮、快炒、海產、台式海鮮、下酒菜
605
+ - `特色餐廳(含雞、鵝、牛、羊肉)`:烤鴨、燒鵝、甕仔雞、甕缸雞、桶仔雞、牛雜、蒙古烤肉、鵝肉城、金山鴨肉、生牛肉、全羊宴、活鱉、烤雞店、鵝肉餐廳、溫體牛、現宰羊肉、鹹水鵝、土羊肉
606
+ - `傳統餐廳`:江浙、台菜、合菜、桌菜、粵菜、中式、川菜、港式、上海菜、砂鍋魚頭、東北菜、北京烤鴨、一鴨三吃、婚宴、辦桌、老字號、宴會廳、台灣料理
607
+ - `燒烤`:燒烤、串燒、串串、烤魚、鮮蚵、炭烤、直火、碳火、和牛、戶外生火、烤肉、路邊燒烤
608
+ - `韓式料理(含火鍋,烤肉)`:韓國泡菜、韓式年糕、首爾、燒酒、韓式炸雞、春川辣炒雞、韓式炸醬麵、海鮮煎餅、烤三層肉、烤五花、烤韓牛、醬料和飯、石鍋拌飯、韓式風格、韓式清酒、啤酒、銅盤烤肉、韓流
609
+ - `PUB(Live Band)`:音樂餐廳、樂團表演、現場表演、LIVE表演、樂團駐唱、定期表演、有舞台場地、樂隊、專人駐唱
610
+ - `PUB(一般,含Lounge)`:酒吧、bar、lounge、飛鏢、調酒、運動酒吧、音樂酒吧、沙發聊天、女公關、互動調酒師、公關服務
611
+ - `PUB(電音\舞場)`:夜店、舞池電音、藝人、包廂低消制、電子音樂表演、DJ、派對狂歡
612
+ - `五星級飯店`:高級飯店、奢華酒店、連鎖五星級飯店、國際集團飯店、米其林飯店、高檔住宿
613
+ - `自助KTV(含連鎖,庭園自助)`:卡拉OK、唱歌、歌坊、歡唱吧、自行點歌、自助唱歌、唱歌包廂、慶生聯誼包廂
614
+ - `西餐廳(含美式,義式,墨式)`:牛排、餐酒、歐式、義式、西餐、義大利麵、凱薩沙拉、紅酒、白酒、調酒、墨西哥式料理、阿根廷式料理、漢堡、比薩
615
+ - `咖啡廳(泡沫紅茶)`:泡沫紅茶店、咖啡店、café、coffee、輕食、軟性飲料、簡餐、茶街
616
+ - `飯店(星級/旅館,不含五星級)`:飯店、酒店、商務旅館、平價住宿
617
+ - `運動休閒館(含球類練習場,飛鏢等)`:撞球、高爾夫、運動、保齡球、娛樂、高爾夫練習場、大魯閣棒球場、籃球、羽毛球、PHOENIX鳳凰、羽球館、看球賽
618
+ - `釣蝦場`:釣蝦、蝦寶、投幣卡拉OK、釣竿和餌料、蝦子現場烹煮食用、泰國蝦、現烤蝦子、包廂唱歌、現釣現烤、自備或租用釣竿。
619
+
620
+ Note that you must choose from the above categories; any other category is strictly prohibited.
621
+
622
+ Output in json format such as {{"category": "..."}}.
623
+
624
+ """
625
+ user_content = category
626
+ response = llm(
627
+ provider = config['provider'],
628
+ model = config['model'],
629
+ system_prompt = system_prompt,
630
+ user_content = user_content
631
+ )
632
+ return response
633
+
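A short, hypothetical sketch of how this prompt is consumed together with parse_json_garbage (the provider/model values below are placeholders, not part of the commit):

raw_category = "居酒屋"
config = {"provider": "openai", "model": "gpt-4o-mini"}          # assumed example values
response = compose_regularization(raw_category, config=config)
regular_category = parse_json_garbage(response)["category"]      # e.g. "日式料理(含居酒屋,串燒)"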
634
+ def regularize_results( results: pd.DataFrame, provider, model):
635
+ """Regularize the categories
636
+ Argument
637
+ results: dataframe
638
+ provider: str
639
+ model: str
640
+ Return
641
+ a dict of
642
+ - regularized_results: dataframe
643
+ - empty_indices: list
644
+ """
645
+ results = results.copy()
646
+ regular_categories, empty_indices = [], []
647
+ for i, d in tqdm(enumerate(results.itertuples())):
648
+ idx = d.index # d[1]
649
+ category = d.category
650
+ if pd.isna(category) or len(category)==0:
651
+ regular_categories.append("")
652
+ continue
653
+ try:
654
+ query = category
655
+ regularized_result = compose_regularization( category, config={"provider": provider, "model": model})
656
+ regular_category = parse_json_garbage(regularized_result)['category']
657
+ regular_categories.append(regular_category)
658
+ except Exception as e:
659
+ logger.error(f"# REGULARIZATION error (add to empty_indices): e -> {e}, query -> {query}, category: {category}")
660
+ regular_categories.append(category)
661
+ empty_indices.append(idx)
662
+
663
+ results.loc[ :, "category"] = regular_categories
664
+ return {
665
+ "regularized_results": results,
666
+ "empty_indices": empty_indices
667
+ }
668
+
669
+ def regularize_results_mp( data: pd.DataFrame, regularized_file_path, provider, model, n_processes: int = 4):
670
+ """Regularize categories in parallel
671
+ Argument
672
+ data: dataframe
673
+ regularized_file_path: str
674
+ provider: str
675
+ model: str
676
+ Return
677
+ regularized_results: dataframe
678
+ """
679
+ st = time.time()
680
+ if not os.path.exists(regularized_file_path):
681
+ split_data = split_dataframe( data )
682
+ with mp.Pool(n_processes) as pool:
683
+ regularized_results = pool.starmap(
684
+ regularize_results,
685
+ [ (
686
+ d, provider, model
687
+ ) for d in split_data]
688
+ )
689
+ regularized_results = merge_results( regularized_results, dataframe_columns=['regularized_results'], list_columns=['empty_indices'])
690
+ # with open( filtered_file_path, "wb") as f:
691
+ # joblib.dump( filtered_results, f)
692
+ regularized_results['regularized_results'].to_csv( regularized_file_path, index=False)
693
+ else:
694
+ # with open( filtered_file_path, "rb") as f:
695
+ # filtered_results = joblib.load(f)
696
+ regularized_results = { 'regularized_results': pd.read_csv( regularized_file_path)}
697
+ logger.debug( f"total time: {time.time() - st}")
698
+ return regularized_results
699
+
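Taken together, the multiprocessing helpers are meant to be chained per batch; a rough sketch under assumed paths, keys and model names (not part of the commit):

crawled = crawl_results_mp(data, "data/out/crawled_results.joblib", serp_provider="serp", n_processes=4)
config = {"classes": classes, "provider": "openai", "model": "gpt-4o-mini"}   # assumed keys
extracted = extract_results_mp(crawled["crawled_results"], "data/out/extracted_results.joblib", config, n_processes=4)
regularized = regularize_results_mp(extracted["extracted_results"], "data/out/regularized_results.csv",
                                    provider="openai", model="gpt-4o-mini", n_processes=4)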
requirements.txt CHANGED
@@ -17,4 +17,5 @@ tiktoken
17
  tqdm
18
  joblib
19
  google-search-results
20
- anthropic>=0.25.7,<0.26.0
 
 
17
  tqdm
18
  joblib
19
  google-search-results
20
+ anthropic>=0.25.7,<0.26.0
21
+ google-generativeai>=0.7.1,<0.8.0
scripts/run_batch.sh ADDED
@@ -0,0 +1,33 @@
 
 
 
 
1
+ start_index=0
2
+ total_files=17
3
+ wait_time_in_seconds=10
4
+ for i in $(seq $start_index $total_files); do
5
+ # data_dir="data"/"gpt-4o-mini"/"output_0402_1_篩選結果.xlsx - Sheet1_$i"
6
+ # data_dir="data"/"gpt-4o-mini"/"output_0402_2_篩選結果.xlsx - Sheet1_$i"
7
+ # data_dir="data"/"gpt-4o-mini"/"output_0402_3_篩選結果.xlsx - Sheet1_$i"
8
+ data_dir="data"/"gpt-4o-mini"/"output_0402_4_篩選結果.xlsx - Sheet1_$i"
9
+ # data_dir="data/tmp"
10
+
11
+ echo "Preparing batch data ..."
12
+ python batch.py -t prepare_batch \
13
+ -crp "$data_dir"/crawled_results.joblib \
14
+ -o "$data_dir"/batch.jsonl
15
+ # -topn 200
16
+
17
+ echo "Executing batch data ..."
18
+ python batch.py -t run_batch \
19
+ -i "$data_dir"/batch.jsonl \
20
+ -j "$data_dir"/job.joblib \
21
+ -jp "$data_dir"/output.jsonl
22
+
23
+ echo "Converting batch to extracted results ..."
24
+ python batch.py -t batch2extract \
25
+ -jp "$data_dir"/output.jsonl \
26
+ -crp "$data_dir"/crawled_results.joblib \
27
+ -erp "$data_dir"/extracted_results.joblib
28
+
29
+
30
+ sleep $wait_time_in_seconds
31
+ done
32
+
33
+ echo "All tasks completed."
scripts/run_combine.sh ADDED
@@ -0,0 +1,24 @@
 
 
 
 
1
+ if [ -z "$1" ]; then
2
+ echo "Please provide the batch index."
3
+ echo "Example: "
4
+ echo " sh ./run_combine.sh 1"
5
+ echo " sh ./run_combine.sh 2"
6
+ echo " sh ./run_combine.sh 3"
7
+ echo " sh ./run_combine.sh 4"
8
+ exit 1
9
+ else
10
+ index=$1
11
+ data_dir="data"/"gpt-4o-mini"
12
+
13
+ pattern="output_0402_"$index"_篩選結果.xlsx - Sheet1_*"
14
+ python batch.py \
15
+ -t combine \
16
+ -i "$data_dir" \
17
+ -prp "$pattern" \
18
+ -rp "data/staging/output_0402_"$index"_篩選結果.xlsx - Sheet1.csv" \
19
+ -o "data/staging/output_0402_"$index"_篩選結果.xlsx - Sheet1.postprocessed.csv"
20
+
21
+ echo "All tasks completed."
22
+ fi
23
+
24
+
scripts/run_crawl.sh ADDED
@@ -0,0 +1,42 @@
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Set the base file name
4
+ # base_file="output_0402_1_篩選結果.xlsx - Sheet1_"
5
+ # base_file="output_0402_2_篩選結果.xlsx - Sheet1_"
6
+ # base_file="output_0402_3_篩選結果.xlsx - Sheet1_"
7
+ base_file="output_0402_4_篩選結果.xlsx - Sheet1_"
8
+
9
+ # Set the total number of files to process
10
+ start_index=0
11
+ total_files=17
12
+
13
+ # Set the per-run consumption and the API rate limit
14
+ # consumption_per_run=1000
15
+ # api_rate_limit=3000
16
+ api_rate_limit=20000
17
+ wait_time_in_seconds=60 # 1500 # 25 mins
18
+
19
+ # Run the loop
20
+ for i in $(seq $start_index $total_files); do
21
+ # Dynamically generate the file name
22
+ file_name="${base_file}${i}.csv"
23
+ crawled_file_path="${base_file}${i}/crawled_results.joblib"
24
+
25
+ # Run the python command
26
+ python sheet.py --data_path "data/production/${file_name}" --task new \
27
+ --step crawl \
28
+ --output_dir data/gpt-4o-mini \
29
+ --n_processes 4 \
30
+ --serp_provider serp \
31
+ --crawled_file_path "${crawled_file_path}" \
32
+ --extraction_provider openai \
33
+ --extraction_model gpt-4o-mini \
34
+ --regularization_provider openai \
35
+ --regularization_model gpt-4o-mini
36
+
37
+ # Wait to avoid hitting the API rate limit
38
+ echo "Completed task for ${file_name}. Waiting for ${wait_time_in_seconds} seconds..."
39
+ sleep $wait_time_in_seconds
40
+ done
41
+
42
+ echo "All tasks completed."
scripts/run_postprocess.sh ADDED
@@ -0,0 +1,25 @@
 
 
 
 
1
+ if [ -z "$1" ] || [ -z "$2" ]; then
2
+ echo "Please provide the batch index."
3
+ echo "Example: "
4
+ echo " sh ./run_postprocess.sh 1 25"
5
+ echo " sh ./run_postprocess.sh 2 27"
6
+ echo " sh ./run_postprocess.sh 3 27"
7
+ echo " sh ./run_postprocess.sh 4 17"
8
+ exit 1
9
+ else
10
+ batch_num=$1
11
+ start_index=0
12
+ total_files=$2
13
+ wait_time_in_seconds=1
14
+ for i in $(seq $start_index $total_files); do
15
+ # data_dir="data"/"gpt-4o-mini"/"output_0402_1_篩選結果.xlsx - Sheet1_$i"
16
+ data_dir="data"/"gpt-4o-mini"/"output_0402_"$batch_num"_篩選結果.xlsx - Sheet1_$i"
17
+ # data_dir="data"/"gpt-4o-mini"/"output_0402_3_篩選結果.xlsx - Sheet1_$i"
18
+ # data_dir="data"/"gpt-4o-mini"/"output_0402_4_篩選結果.xlsx - Sheet1_$i"
19
+ python batch.py -t postprocess \
20
+ -rrp "$data_dir"/regularized_results.joblib \
21
+ -prp "$data_dir"/postprocessed_results.csv
22
+ done
23
+
24
+ echo "All tasks completed."
25
+ fi
scripts/run_regularization.sh ADDED
@@ -0,0 +1,38 @@
 
 
 
 
1
+ start_index=0
2
+ total_files=0
3
+ wait_time_in_seconds=10
4
+ for i in $(seq $start_index $total_files); do
5
+ # data_dir="data"/"gpt-4o-mini"/"output_0402_1_篩選結果.xlsx - Sheet1_$i"
6
+ # data_dir="data"/"gpt-4o-mini"/"output_0402_2_篩選結果.xlsx - Sheet1_$i"
7
+ data_dir="data"/"gpt-4o-mini"/"output_0402_3_篩選結果.xlsx - Sheet1_$i"
8
+ # data_dir="data"/"gpt-4o-mini"/"output_0402_4_篩選結果.xlsx - Sheet1_$i"
9
+ # data_dir="data/tmp"
10
+
11
+ # echo "Preparing batch data ..."
12
+ # python batch.py -t prepare_regularization \
13
+ # -erp "$data_dir"/extracted_results.joblib \
14
+ # -o "$data_dir"/regularization.jsonl
15
+ # -topn 200
16
+
17
+ # echo "Executing batch data ..."
18
+ # python batch.py -t run_batch \
19
+ # -i "$data_dir"/regularization.jsonl \
20
+ # -j "$data_dir"/reg_job.joblib \
21
+ # -jp "$data_dir"/reg_output.jsonl
22
+
23
+ # echo "Converting batch to extracted results ..."
24
+ # python model.py -t batch2extract \
25
+ # -jp "$data_dir"/output.jsonl \
26
+ # -crp "$data_dir"/crawled_results.joblib \
27
+ # -erp "$data_dir"/extracted_results.joblib
28
+
29
+ echo "Converting batch to regularized results ..."
30
+ python batch.py -t batch2reg \
31
+ -jp "$data_dir"/reg_output.jsonl \
32
+ -erp "$data_dir"/extracted_results.joblib \
33
+ -rrp "$data_dir"/regularized_results.joblib
34
+
35
+ sleep $wait_time_in_seconds
36
+ done
37
+
38
+ echo "All tasks completed."
sheet.py CHANGED
@@ -1,4 +1,5 @@
1
  import os
 
2
  import time
3
  import json
4
  import joblib
@@ -6,529 +7,63 @@ import math
6
  import itertools
7
  import argparse
8
  import multiprocessing as mp
 
 
9
 
 
 
 
10
  import pandas as pd
11
  from dotenv import load_dotenv
12
  from serpapi import GoogleSearch
13
  import tiktoken
14
  from openai import OpenAI
15
  from tqdm import tqdm
 
16
 
17
  from model import llm
18
- from utils import parse_json_garbage
19
-
 
 
 
 
 
 
 
 
 
20
  load_dotenv()
21
  ORGANIZATION_ID = os.getenv('OPENAI_ORGANIZATION_ID')
22
  SERP_API_KEY = os.getenv('SERP_APIKEY')
23
-
24
-
25
- def get_leads( file_path: str, names: list = ['營業地址', '統一編號', '總機構統一編號', '營業人名稱', '資本額', '設立日期', '組織別名稱', '使用統一發票',
26
- '行業代號', '名稱', '行業代號1', '名稱1', '行業代號2', '名稱2', '行業代號3', '名稱3']):
27
- """
28
- """
29
- assert os.path.exists(file_path)
30
- data = pd.read_csv( file_path, names=names)
31
- return data
32
 
33
- def get_serp( query: str, google_domain: str, gl: str, lr: str) -> dict:
34
- """
35
- """
36
- results = []
37
- search = GoogleSearch({
38
- "q": query,
39
- 'google_domain': google_domain,
40
- 'gl': gl,
41
- 'lr': lr,
42
- "api_key": SERP_API_KEY
43
- })
44
- result = search.get_dict()
45
- # print(result['organic_results'][0])
46
- # return result['organic_results'][0]
47
- return result
48
-
49
-
50
- def get_condensed_result(result):
51
- """
52
- Argument
53
- result
54
- Return
55
- condensed_result:
56
- Example:
57
- result['knowledge_graph'].keys() # 'title', 'thumbnail', 'type', 'entity_type', 'kgmid', 'knowledge_graph_search_link', 'serpapi_knowledge_graph_search_link', 'tabs', 'place_id', 'directions', 'local_map', 'rating', 'review_count', '服務項目', '地址', '地址_links', 'raw_hours', 'hours', '電話號碼', '電話號碼_links', 'popular_times', 'user_reviews', 'reviews_from_the_web', 'unclaimed_listing', '個人資料', '其他人也搜尋了以下項目', '其他人也搜尋了以下項目_link', '其他人也搜尋了以下項目_stick'
58
- """
59
- filtered_results = [
60
- {"title": r.get('title',""), 'snippet': r.get('snippet',"")} for r in result['organic_results']
61
- ]
62
- if 'knowledge_graph' in result:
63
- if 'user_reviews' in result['knowledge_graph']:
64
- filtered_results.append( {'title': result['knowledge_graph']['title'], '顧客評價': "\t".join([ _.get('summary', '') for _ in result['knowledge_graph']['user_reviews']]) })
65
- if '其他人也搜尋了以下項目' in result['knowledge_graph']:
66
- filtered_results.append( {'title': "類似的店", 'snippet': "\t".join([ str(_.get('extensions', '')) for _ in result['knowledge_graph']['其他人也搜尋了以下項目']]) })
67
- if '暫停營業' in result['knowledge_graph']:
68
- filtered_results.append( {'status': '暫停營業' if result['knowledge_graph']['暫停營業'] else '營業中'})
69
- if '電話號碼' in result['knowledge_graph']:
70
- filtered_results.append( {'telephone_number': result['knowledge_graph']['電話號碼']})
71
- condensed_result = json.dumps(filtered_results, ensure_ascii=False)
72
- # print( condensed_results )
73
- return condensed_result
74
-
75
- def compose_extraction( query, search_results, classes: list, provider: str, model: str):
76
- """
77
- Argument
78
- query: str
79
- search_results: str
80
- system_prompt: str
81
- classes: list, `小吃店`, `日式料理(含居酒屋,串燒)`, `火(鍋/爐)`, `東南亞料理(不含日韓)`, `海鮮熱炒`, `特色餐廳(含雞、鵝、牛、羊肉)`, `傳統餐廳`, `燒烤`, `韓式料理(含火鍋,烤肉)`, `西餐廳(含美式,義式,墨式)`, `西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)`, `西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)` or `早餐`
82
- provider: "openai"
83
- model: "gpt-4-0125-preview" or 'gpt-3.5-turbo-0125'
84
- Return
85
- response: str
86
- """
87
- classes = ", ".join([ "`"+x+"`" for x in classes if x!='早餐' ])+ " or " + "`早餐`"
88
- system_prompt = f'''
89
- As a helpful and rigorous retail analyst, given the provided query and a list of search results for the query,
90
- your task is to first identify relevant information of the identical store based on store name and proximity of address if known. After that, extract `store_name`, `address`, `description`, `category` and `phone_number` from the found relevant information, where `category` can only be {classes}.
91
- It's very important to omit unrelated results. Do not make up any assumption.
92
- Please think step by step, and output in json format. An example output json is like {{"store_name": "...", "address": "...", "description": "... products, service or highlights ...", "category": "...", "phone_number": "..."}}
93
- If no relevant information has been found, simply output json with empty values.
94
- I'll tip you and guarantee a place in heaven you do a great job completely according to my instruction.
95
- '''
96
- user_content = f"`query`: `{query}`\n`search_results`: {search_results}"
97
- response = llm(
98
- provider = provider,
99
- model = model,
100
- system_prompt = system_prompt,
101
- user_content = user_content
102
- )
103
- return response
104
-
105
-
106
- def compose_classication( user_content, classes: list, backup_classes: list, provider: str, model: str) -> str:
107
- """
108
- Argument
109
- client:
110
- evidence: str
111
- classes: list
112
- provider: e.g. 'openai'
113
- model: e.g. 'gpt-3.5-turbo-0125', 'gpt-4-0125-preview'
114
- Return
115
- response: str
116
- """
117
- if isinstance(classes, list):
118
- classes = ", ".join([ f"`{x}`" for x in classes])
119
- elif isinstance(classes, str):
120
- pass
121
- else:
122
- raise Exception(f"Incorrect classes type: {type(classes)}")
123
- system_prompt = f"""
124
- As a helpful and rigorous retail analyst, given the provided information about a store,
125
- your task is two-fold. First, classify provided evidence below into the mostly relevant category from the following: {classes}.
126
- Second, if no relevant information has been found, classify the evidence into the mostly relevant supercategory from the following: {backup_classes}.
127
- It's very important to omit unrelated piece of evidence and don't make up any assumption.
128
- Please think step by step, and must output in json format. An example output json is like {{"category": "..."}}
129
- If no relevant piece of information can ever be found at all, simply output json with empty string "".
130
- I'll tip you and guarantee a place in heaven you do a great job completely according to my instruction.
131
- """
132
- response = llm(
133
- provider = provider,
134
- model = model,
135
- system_prompt = system_prompt,
136
- user_content = user_content,
137
- )
138
- return response
139
-
140
-
141
- def classify_results(
142
- analysis_results: pd.DataFrame,
143
- classes: list,
144
- backup_classes: list,
145
- provider: str,
146
- model: str,
147
- input_column: str = 'evidence',
148
- output_column: str = 'classified_category',
149
- ):
150
- """Classify the results
151
- Argument
152
- analysis_results: dataframe
153
- input_column: str
154
- output_column: str
155
- classes: list
156
- Return
157
- analysis_results: dataframe
158
- """
159
- classified_results = analysis_results.copy()
160
- labels, empty_indices = [], []
161
- for idx, evidence in zip( analysis_results['index'], analysis_results[input_column]):
162
- try:
163
- user_content = f'''`evidence`: `{evidence}`'''
164
- pred_cls = compose_classication( user_content, classes=classes, backup_classes=backup_classes, provider=provider, model=model)
165
- label = parse_json_garbage(pred_cls)['category']
166
- labels.append(label)
167
- except Exception as e:
168
- print(f"# CLASSIFICATION error: e -> {e}, user_content -> {user_content}, evidence: {evidence}")
169
- labels.append("")
170
- empty_indices.append(idx)
171
-
172
- classified_results[output_column] = labels
173
- return {
174
- "classified_results": classified_results,
175
- "empty_indices": empty_indices
176
- }
177
-
178
- def classify_results_mp( extracted_results: pd.DataFrame, classified_file_path: str, classes: list, backup_classes: list, provider: str, model: str, n_processes: int = 4):
179
- """
180
- Argument
181
- extracted_results:
182
- classified_file_path:
183
- classes: e.g. ['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)']
184
- backup_classes: e.g. [ '中式', '西式']
185
- provider:
186
- model:
187
- n_processes: int
188
- Return
189
- classified_results: dataframe
190
- Reference
191
- 200 records, 4 processes, 122.4695s
192
- """
193
- st = time.time()
194
- # classified_file_path = "data/classified_result.joblib"
195
- if not os.path.exists(classified_file_path):
196
- split_data = split_dataframe(extracted_results)
197
- with mp.Pool(args.n_processes) as pool:
198
- classified_results = pool.starmap(
199
- classify_results,
200
- [ (
201
- d,
202
- classes, backup_classes,
203
- provider, model,
204
- 'evidence', 'classified_category',
205
- ) for d in split_data]
206
- )
207
- classified_results = merge_results( classified_results, dataframe_columns=['classified_results'], list_columns=['empty_indices'])
208
- with open( classified_file_path, "wb") as f:
209
- joblib.dump( classified_results, f)
210
- else:
211
- with open( classified_file_path, "rb") as f:
212
- classified_results = joblib.load(f)
213
- print( f"total time: {time.time() - st}")
214
- return classified_results
215
-
216
-
217
- def compose_query( address, name, with_index: bool = True, exclude: str = "-inurl:twincn.com -inurl:findcompany.com.tw -inurl:iyp.com.tw -inurl:twypage.com -inurl:alltwcompany.com -inurl:zhupiter.com -inurl:twinc.com.tw"):
218
- """
219
- Argumemnt
220
- # d: series with d[1]: 地址, d[4]: 營業人名稱 #
221
- address: str
222
- name: str
223
- with_index: bool
224
- Return
225
- query: `縣市` `營業人名稱`
226
- """
227
- # if with_index: # .itertuples()
228
- # query = f"{d[1][:3]} {d[4]}"
229
- # else:
230
- # query = f"{d[0][:3]} {d[3]}"
231
- query = f"{address[:3]} {name} {exclude}"
232
- return query
233
-
234
- def crawl_results( data: pd.DataFrame, google_domain: str = 'google.com.tw', gl: str = 'tw', lr: str = 'lang_zh-TW'):
235
- """
236
- Argument
237
- data: dataframe
238
- google_domain: str
239
- gl: str
240
- lr: str
241
- Return
242
- crawled_results
243
- Reference
244
- 200 records, 4 processes, 171.36490321159363
245
- """
246
- serp_results = []
247
- condensed_results = []
248
- crawled_results = []
249
- empty_indices = []
250
- for i, d in tqdm(enumerate(data.itertuples())):
251
- idx = d[0]
252
- address = d[1]
253
- business_id = d[2]
254
- business_name = d[4]
255
- query = compose_query(address, business_name)
256
- try:
257
- res = get_serp( query, google_domain, gl, lr)
258
- serp_results.append(res)
259
- except:
260
- print( f"# SERP error: i = {i}, idx = {idx}, query = {query}")
261
- empty_indices.append(i)
262
- continue
263
- try:
264
- cond_res = get_condensed_result(res)
265
- condensed_results.append(cond_res)
266
- except:
267
- print(f"# CONDENSE error: i = {i}, idx = {idx}, res = {res}")
268
- empty_indices.append(i)
269
- continue
270
-
271
- crawled_results.append( {
272
- "index": idx,
273
- "business_id": business_id,
274
- "business_name": business_name,
275
- "serp": res,
276
- "evidence": cond_res,
277
- "address": address,
278
- } )
279
- crawled_results = pd.DataFrame(crawled_results)
280
-
281
- return {
282
- "crawled_results": crawled_results,
283
- "empty_indices": empty_indices
284
- }
285
-
286
- def crawl_results_mp( data: pd.DataFrame, crawl_file_path: str, n_processes: int = 4):
287
- st = time.time()
288
- # crawl_file_path = "data/crawled_results.joblib"
289
- if not os.path.exists(crawl_file_path):
290
- split_data = split_dataframe( data )
291
- with mp.Pool(n_processes) as pool:
292
- crawled_results = pool.map( crawl_results, split_data)
293
- crawled_results = merge_results( crawled_results, dataframe_columns=['crawled_results'], list_columns=['empty_indices'])
294
- with open( crawl_file_path, "wb") as f:
295
- joblib.dump( crawled_results, f)
296
- else:
297
- with open( crawl_file_path, "rb") as f:
298
- crawled_results = joblib.load(f)
299
- print( f"total time: {time.time() - st}")
300
- return crawled_results
301
-
302
- def extract_results( data: pd.DataFrame, classes: list, provider: str, model: str):
303
- """
304
- Argument
305
- data: `evidence`, `result`
306
- Return
307
- extracted_results: dataframe of `extracted_evidence`
308
- """
309
- extracted_results, empty_indices, ext_res = [], [], []
310
- for i, d in tqdm(enumerate(data.itertuples())):
311
- idx = d[1]
312
- evidence = d.evidence
313
- business_id = d[2]
314
- business_name = d[3]
315
- address = d[6]
316
- ana_res = None
317
- query = compose_query( address, business_name)
318
- try:
319
- ext_res = compose_extraction( query = query, search_results = evidence, classes = classes, provider = provider, model = model)
320
- ext_res = parse_json_garbage(ext_res)
321
- except Exception as e:
322
- print(f"# ANALYSIS error: e = {e}, i = {i}, q = {query}, ext_res = {ext_res}")
323
- empty_indices.append(i)
324
- continue
325
-
326
- extracted_results.append( {
327
- "index": idx,
328
- "business_id": business_id,
329
- "business_name": business_name,
330
- "evidence": evidence,
331
- ** ext_res
332
- } )
333
- extracted_results = pd.DataFrame(extracted_results)
334
-
335
- return {
336
- "extracted_results": extracted_results,
337
- "empty_indices": empty_indices
338
- }
339
-
340
- def extract_results_mp( crawled_results, extracted_file_path, classes: list, provider: str, model: str, n_processes: int = 4):
341
- """
342
- Argument
343
- crawled_results: dataframe
344
- extracted_file_path
345
- classes: list
346
- Return
347
- Reference
348
- 200 records, 4 processes, 502.26914715766907
349
- """
350
- st = time.time()
351
- # args.extracted_file_path = "data/extracted_results.joblib"
352
- if not os.path.exists(extracted_file_path):
353
- split_data = split_dataframe( crawled_results)
354
- with mp.Pool(n_processes) as pool:
355
- extracted_results = pool.starmap( extract_results, [ (x, classes, provider, model) for x in split_data])
356
- extracted_results = merge_results( extracted_results, dataframe_columns=['extracted_results'], list_columns=['empty_indices'])
357
- with open( extracted_file_path, "wb") as f:
358
- joblib.dump( extracted_results, f)
359
- else:
360
- with open( extracted_file_path, "rb") as f:
361
- extracted_results = joblib.load(f)
362
- print( f"total time: {time.time() - st}")
363
- return extracted_results
364
-
365
-
366
- def postprocess_result( results: pd.DataFrame, postprocessed_results_path, category_hierarchy: dict, column_name: str = 'category'):
367
- """
368
- Argument
369
- analysis_result: `evidence`, `result`
370
- postprocessed_results_path
371
- Return
372
- """
373
- # index = analysis_result['result']['index']
374
- # store_name = data.loc[index]['營業人名稱'] if len(analysis_result['result'].get('store_name',''))==0 else analysis_result['result']['store_name']
375
- # address = data.loc[index]['營業地址'] if len(analysis_result['result'].get('address',''))==0 else analysis_result['result']['address']
376
- # post_res = {
377
- # "evidence": analysis_result['evidence'],
378
- # "index": index,
379
- # "begin_date": data.loc[index]['設立日期'],
380
- # "store_name": store_name,
381
- # "address": address,
382
- # "description": analysis_result['result'].get('description', ""),
383
- # "phone_number": analysis_result['result'].get('phone_number', ""),
384
- # "category": analysis_result['result'].get('category', ""),
385
- # "supercategory": category_hierarchy.get(analysis_result['result'].get('category', ""), analysis_result['result'].get('category',"")),
386
- # }
387
- if not os.path.exists(postprocessed_results_path):
388
- postprocessed_results = results.copy()
389
- postprocessed_results['supercategory'] = postprocessed_results[column_name].apply(lambda x: category_hierarchy.get(x, ''))
390
- with open( postprocessed_results_path, "wb") as f:
391
- joblib.dump( postprocessed_results, f)
392
- else:
393
- with open( postprocessed_results_path, "rb") as f:
394
- postprocessed_results = joblib.load(f)
395
- return postprocessed_results
396
-
397
-
398
- def combine_results( results: pd.DataFrame, combined_results_path: str, src_column: str = 'classified_category', tgt_column: str = 'category', strategy: str = 'replace'):
399
- """
400
- Argument
401
- classified_results_df: dataframe
402
- combined_results_path
403
- src_column: str
404
- strategy: str, 'replace' or 'patch'
405
- Return
406
- combined_results: dataframe
407
- """
408
- if not os.path.exists(combined_results_path):
409
- combined_results = results.copy()
410
- if strategy == 'replace':
411
- condition = (combined_results[tgt_column]=='') | (combined_results[src_column]!=combined_results[tgt_column])
412
- combined_results.loc[ condition, tgt_column] = combined_results[condition][src_column].values
413
- elif strategy == 'patch':
414
- condition = (combined_results[tgt_column]=='')
415
- combined_results.loc[ condition, tgt_column] = combined_results[condition][src_column].values
416
- else:
417
- raise Exception(f"Strategy {strategy} not implemented")
418
- with open( combined_results_path, "wb") as f:
419
- joblib.dump( combined_results, f)
420
- else:
421
- with open( combined_results_path, "rb") as f:
422
- combined_results = joblib.load(f)
423
- return combined_results
424
-
425
- def format_evidence(evidence):
426
- """
427
- """
428
- formatted = []
429
- evidence = json.loads(evidence)
430
- # print( len(evidence) )
431
- for i in range(len(evidence)):
432
- if 'title' in evidence[i] and '顧客評價' in evidence[i]:
433
- f = f"\n> 顧客評價: {evidence[i]['顧客評價']}"
434
- elif 'title' in evidence[i] and evidence[i]['title']=='類似的店':
435
- f = f"\n> 類似的店: {evidence[i]['snippet']}"
436
- elif 'status' in evidence[i]:
437
- f = f"\n> 經營狀態: {evidence[i]['status']}"
438
- elif 'telephone_number' in evidence[i]:
439
- f = f"\n> 電話號碼: {evidence[i]['telephone_number']}"
440
- else:
441
- try:
442
- f = f"{i+1}. {evidence[i]['title']} ({evidence[i].get('snippet','')})"
443
- except KeyError:
444
- print( evidence[i] )
445
- raise KeyError
446
- formatted.append(f)
447
- return "\n".join(formatted)
448
-
449
- def format_output( df: pd.DataFrame, input_column: str = 'evidence', output_column: str = 'formatted_evidence', format_func = format_evidence):
450
- """
451
- Argument
452
- df: `evidence`, `result`
453
- input_column:
454
- output_column:
455
- format_func:
456
- Return
457
- formatted_df: dataframe of `formatted_evidence`
458
- """
459
- formatted_df = df.copy()
460
- formatted_df[output_column] = formatted_df[input_column].apply(format_evidence)
461
- return formatted_df
462
-
463
- def merge_results( results: list, dataframe_columns: list, list_columns: list):
464
- """
465
- Argument
466
- results: a list of dataframes
467
- dataframe_columns: list
468
- list_columns: list
469
- """
470
- assert len(results) > 0, "No results to merge"
471
- merged_results = {}
472
- for result in results:
473
- for key in dataframe_columns:
474
- mer_res = pd.concat([ r[key] for r in results], ignore_index=True)
475
- merged_results[key] = mer_res
476
-
477
- for key in list_columns:
478
- mer_res = list(itertools.chain(*[ r[key] for r in results]))
479
- merged_results[key] = mer_res
480
-
481
- return merged_results
482
-
483
-
484
- def split_dataframe( df: pd.DataFrame, n_processes: int = 4) -> list:
485
- """
486
- """
487
- n = df.shape[0]
488
- n_per_process = math.ceil(n / n_processes)
489
- return [ df.iloc[i:i+n_per_process] for i in range(0, n, n_per_process)]
490
-
491
-
492
  def continue_missing(args):
493
  """
494
  """
495
  data = get_leads(args.data_path)
496
  n_data = data.shape[0]
497
 
498
- formatted_results_path = os.path.join( args.output_dir, args.formatted_results_path)
499
- formatted_results = pd.read_csv(formatted_results_path)
500
  missing_indices = []
501
  for i in range(n_data):
502
  if i not in formatted_results['index'].unique():
503
- print(f"{i} is not found")
504
  missing_indices.append(i)
 
 
 
 
 
 
 
505
 
506
- crawled_results_path = os.path.join( args.output_dir, args.crawled_file_path)
507
- crawled_results = joblib.load( open( crawled_results_path, "rb"))
508
- crawled_results = crawled_results['crawled_results'].query( f"index in {missing_indices}")
509
- print( crawled_results)
510
-
511
- er = extract_results( crawled_results, classes = args.classes, provider = args.provider, model = args.model)
512
- er = er['extracted_results']
513
- print(er['category'])
514
-
515
- postprossed_results = postprocess_result(
516
- er,
517
- "/tmp/postprocessed_results.joblib",
518
- category2supercategory
519
- )
520
-
521
- out_formatted_results = format_output(
522
- postprossed_results,
523
- input_column = 'evidence',
524
- output_column = 'formatted_evidence',
525
- format_func = format_evidence
526
- )
527
-
528
- out_formatted_results.to_csv( "/tmp/formatted_results.missing.csv", index=False)
529
- formatted_results = pd.concat([formatted_results, out_formatted_results], ignore_index=True)
530
- formatted_results.sort_values(by='index', ascending=True, inplace=True)
531
- formatted_results.to_csv( "/tmp/formatted_results.csv", index=False)
532
 
533
 
534
  def main(args):
@@ -541,125 +76,150 @@ def main(args):
541
  extract: 2791.631685256958(delay = 10)
542
  classify: 2374.4915606975555(delay = 10)
543
  """
544
- crawled_file_path = os.path.join( args.output_dir, args.crawled_file_path)
545
- extracted_file_path = os.path.join( args.output_dir, args.extracted_file_path)
546
- classified_file_path = os.path.join( args.output_dir, args.classified_file_path)
547
- combined_file_path = os.path.join( args.output_dir, args.combined_file_path)
548
- postprocessed_results = os.path.join( args.output_dir, args.postprocessed_results)
549
- formatted_results_path = os.path.join( args.output_dir, args.formatted_results_path)
 
 
 
550
 
551
  ## 讀取資料名單 ##
552
  data = get_leads(args.data_path)
553
 
554
  ## 進行爬蟲與分析 ##
555
- crawled_results = crawl_results_mp( data, crawled_file_path, n_processes=args.n_processes)
556
- # crawled_results = { k:v[-5:] for k,v in crawled_results.items()}
 
 
 
 
 
 
 
 
557
 
 
 
 
 
 
 
 
 
 
 
 
 
 
558
  ## 方法 1: 擷取關鍵資訊與分類 ##
559
- extracted_results = extract_results_mp(
560
- crawled_results = crawled_results['crawled_results'],
561
- extracted_file_path = extracted_file_path,
562
- classes = args.classes,
563
- provider = args.provider,
564
- model = args.model,
565
- n_processes = args.n_processes
566
- )
 
 
 
 
 
567
 
568
  ## 方法2: 直接對爬蟲結果分類 ##
569
- classified_results = classify_results_mp(
570
- extracted_results['extracted_results'],
571
- classified_file_path,
572
- classes = args.classes,
573
- backup_classes = args.backup_classes,
574
- provider = args.provider,
575
- model = args.model,
576
- n_processes = args.n_processes
577
- )
578
 
579
  ## 合併分析結果 ##
580
- combined_results = combine_results(
581
- classified_results['classified_results'],
582
- combined_file_path,
583
- src_column = 'classified_category',
584
- tgt_column = 'category',
585
- strategy = args.strategy
586
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
587
 
588
  ## 後處理分析結果 ##
589
- postprossed_results = postprocess_result(
590
- combined_results,
591
- postprocessed_results,
592
- category2supercategory
593
- )
594
-
595
- formatted_results = format_output( postprossed_results, input_column = 'evidence', output_column = 'formatted_evidence', format_func = format_evidence)
596
- formatted_results.to_csv( formatted_results_path, index=False)
597
-
598
-
599
- category2supercategory = {
600
- "小吃店": "中式",
601
- "日式料理(含居酒屋,串燒)": "中式",
602
- "火(鍋/爐)": "中式",
603
- "東南亞料��(不含日韓)": "中式",
604
- "海鮮熱炒": "中式",
605
- "特色餐廳(含雞、鵝、牛、羊肉)": "中式",
606
- "傳統餐廳": "中式",
607
- "燒烤": "中式",
608
- "韓式料理(含火鍋,烤肉)": "中式",
609
- "西餐廳(含美式,義式,墨式)": "西式",
610
- "中式": "中式",
611
- "西式": "西式",
612
- "西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)": "西式",
613
- "西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)": "西式",
614
- "早餐": ""
615
- }
616
 
617
- supercategory2category = {
618
- "中式": [
619
- "小吃店",
620
- "日式料理(含居酒屋,串燒)",
621
- "火(鍋/爐)",
622
- "東南亞料理(不含日韓)",
623
- "海鮮熱炒",
624
- "特色餐廳(含雞、鵝、牛、羊肉)",
625
- "傳統餐廳",
626
- "燒烤",
627
- "韓式料理(含火鍋,烤肉)"
628
- ],
629
- "西式": ["西餐廳(含美式,義式,墨式)", "西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)", "西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)"],
630
- "": ["早餐"]
631
- }
632
 
633
  if __name__=='__main__':
634
 
635
  base = "https://serpapi.com/search.json"
636
  engine = 'google'
637
- # query = "Coffee"
638
  google_domain = 'google.com.tw'
639
  gl = 'tw'
640
  lr = 'lang_zh-TW'
641
- # url = f"{base}?engine={engine}&q={query}&google_domain={google_domain}&gl={gl}&lr={lr}"
642
  n_processes = 4
643
  client = OpenAI( organization = ORGANIZATION_ID)
644
 
645
  parser = argparse.ArgumentParser()
 
646
  parser.add_argument("--data_path", type=str, default="data/餐廳類型分類.xlsx - 測試清單.csv")
 
647
  parser.add_argument("--task", type=str, default="new", choices = ["new", "continue"], help="new or continue")
 
648
  parser.add_argument("--output_dir", type=str, help='output directory')
 
649
  parser.add_argument("--classified_file_path", type=str, default="classified_results.joblib")
650
  parser.add_argument("--extracted_file_path", type=str, default="extracted_results.joblib")
651
  parser.add_argument("--crawled_file_path", type=str, default="crawled_results.joblib")
652
  parser.add_argument("--combined_file_path", type=str, default="combined_results.joblib")
653
- parser.add_argument("--postprocessed_results", type=str, default="postprocessed_results.joblib")
 
654
  parser.add_argument("--formatted_results_path", type=str, default="formatted_results.csv")
655
- parser.add_argument("--classes", type=list, default=['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)', '西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)', '西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)', '早餐'])
 
 
 
656
  parser.add_argument("--backup_classes", type=list, default=['中式', '西式'])
657
  parser.add_argument("--strategy", type=str, default='patch', choices=['replace', 'patch'])
658
- parser.add_argument("--provider", type=str, default='openai', choices=['openai', 'anthropic'])
659
- parser.add_argument("--model", type=str, default='gpt-4-0125-preview', choices=['claude-3-sonnet-20240229', 'claude-3-haiku-20240307', 'gpt-3.5-turbo-0125', 'gpt-4-0125-preview'])
 
 
 
 
 
660
  parser.add_argument("--n_processes", type=int, default=4)
661
  args = parser.parse_args()
662
 
 
 
 
 
663
  if args.task == 'new':
664
  main(args)
665
  elif args.task == 'continue':
 
1
  import os
2
+ import sys
3
  import time
4
  import json
5
  import joblib
 
7
  import itertools
8
  import argparse
9
  import multiprocessing as mp
10
+ from typing import List
11
+ from pathlib import Path
12
 
13
+ import yaml
14
+ import jinja2
15
+ import requests
16
  import pandas as pd
17
  from dotenv import load_dotenv
18
  from serpapi import GoogleSearch
19
  import tiktoken
20
  from openai import OpenAI
21
  from tqdm import tqdm
22
+ from loguru import logger
23
 
24
  from model import llm
25
+ from data import get_leads, format_search_results
26
+ from utils import (parse_json_garbage, split_dataframe, merge_results,
27
+ combine_results, split_dict, format_df,
28
+ clean_quotes, compose_query, reverse_category2supercategory)
29
+ from batch import postprocess_result
30
+ from pipeline import (get_serp, get_condensed_result, get_organic_result, get_googlemap_results,
31
+ crawl_results, crawl_results_mp,
32
+ compose_extraction, extract_results, extract_results_mp,
33
+ compose_classification, classify_results, classify_results_mp,
34
+ compose_regularization, regularize_results, regularize_results_mp,
35
+ compose_filter, filter_results, filter_results_mp)
36
  load_dotenv()
37
  ORGANIZATION_ID = os.getenv('OPENAI_ORGANIZATION_ID')
38
  SERP_API_KEY = os.getenv('SERP_APIKEY')
39
+ SERPER_API_KEY = os.getenv('SERPER_API_KEY')
40
+
 
 
 
 
 
 
 
41
 
 
 
 
 
 
 
 
 
42
  def continue_missing(args):
43
  """
44
  """
45
  data = get_leads(args.data_path)
46
  n_data = data.shape[0]
47
 
48
+ formatted_results = pd.read_csv(os.path.join( args.output_dir, args.formatted_results_path))
 
49
  missing_indices = []
50
  for i in range(n_data):
51
  if i not in formatted_results['index'].unique():
52
+ logger.debug(f"{i} is not found")
53
  missing_indices.append(i)
54
+ if len(missing_indices)==0:
55
+ logger.debug("No missing data")
56
+ return
57
+ missing_data = data.loc[missing_indices]
58
+ if not os.path.exists(args.output_missing_dir):
59
+ os.makedirs(args.output_missing_dir)
60
+ missing_data.to_csv( args.missing_data_path, index=False, header=False)
61
 
62
+ args.data_path = args.missing_data_path
63
+ args.output_dir = args.output_missing_dir
64
+ if missing_data.shape[0]<args.n_processes:
65
+ args.n_processes = 1
66
+ main(args)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
 
69
  def main(args):
 
76
  extract: 2791.631685256958(delay = 10)
77
  classify: 2374.4915606975555(delay = 10)
78
  """
79
+ steps = args.steps
80
+ crawled_file_path = os.path.join( args.output_dir, args.crawled_file_path) if args.crawled_file_path is not None else None
81
+ extracted_file_path = os.path.join( args.output_dir, args.extracted_file_path) if args.extracted_file_path is not None else None
82
+ # classified_file_path = os.path.join( args.output_dir, args.classified_file_path)
83
+ # combined_file_path = os.path.join( args.output_dir, args.combined_file_path)
84
+ postprocessed_file_path = os.path.join( args.output_dir, args.postprocessed_file_path) if args.postprocessed_file_path is not None else None
85
+ # formatted_results_path = os.path.join( args.output_dir, args.formatted_results_path)
86
+ filtered_file_path = os.path.join( args.output_dir, args.filtered_file_path) if args.filtered_file_path is not None else None
87
+ regularized_file_path = os.path.join( args.output_dir, args.regularized_file_path) if args.regularized_file_path is not None else None
88
 
89
  ## Load the lead list ##
90
  data = get_leads(args.data_path)
91
 
92
  ## Run crawling and analysis ##
93
+ if steps=='all' or steps=='crawl':
94
+ Path(crawled_file_path).parent.mkdir(parents=True, exist_ok=True)
95
+ crawled_results = crawl_results_mp(
96
+ data,
97
+ crawled_file_path,
98
+ serp_provider=args.serp_provider,
99
+ n_processes=args.n_processes
100
+ )
101
+ else:
102
+ sys.exit(0)
103
 
104
+ # crawled_results = { k:v[-5:] for k,v in crawled_results.items()}
105
+ # crawled_results['crawled_results'].to_csv( formatted_results_path, index=False)
106
+
107
+ ## Filter the crawled results ##
108
+ # filtered_results = filter_results_mp(
109
+ # data = crawled_results['crawled_results'],
110
+ # filtered_file_path = filtered_file_path,
111
+ # provider = args.filter_provider,
112
+ # model = args.filter_model,
113
+ # n_processes = args.n_processes
114
+ # )
115
+ # sys.exit(0)
116
+
117
  ## Method 1: extract key information and classify ##
118
+ if steps=='all' or steps=='extract':
119
+ assert os.path.exists(crawled_file_path), f"# CRAWLED file not found: {crawled_file_path}"
120
+ crawled_results = joblib.load( open(crawled_file_path, "rb"))
121
+ extracted_results = extract_results_mp(
122
+ crawled_results = crawled_results['crawled_results'], # filtered_results['filtered_results'], # crawled_results['crawled_results'],
123
+ extracted_file_path = extracted_file_path,
124
+ classes = args.classes,
125
+ provider = args.extraction_provider, # 'openai', # args.provider,
126
+ model = args.extraction_model, # 'gpt-3.5-turbo-0125', # args.model,
127
+ n_processes = args.n_processes
128
+ )
129
+ else:
130
+ sys.exit(0)
131
 
132
  ## Method 2: classify the crawled results directly ##
133
+ # classified_results = classify_results_mp(
134
+ # extracted_results['extracted_results'],
135
+ # classified_file_path,
136
+ # classes = args.classes,
137
+ # backup_classes = args.backup_classes,
138
+ # provider = args.provider,
139
+ # model = args.model,
140
+ # n_processes = args.n_processes
141
+ # )
142
 
143
  ## Merge the analysis results ##
144
+ # combined_results = combine_results(
145
+ # classified_results['classified_results'],
146
+ # combined_file_path,
147
+ # src_column = 'classified_category',
148
+ # tgt_column = 'category',
149
+ # strategy = args.strategy
150
+ # )
151
+
152
+ ## Regularize the classified categories ##
153
+ if steps=='all' or steps=='regularize':
154
+ assert os.path.exists(extracted_file_path), f"# extracted result file not found: {extracted_file_path}"
155
+ extracted_results = joblib.load( open(extracted_file_path, "rb"))
156
+ regularize_results = regularize_results_mp(
157
+ extracted_results['extracted_results'],
158
+ regularized_file_path,
159
+ provider = args.regularization_provider, # 'google', # 'openai', # args.provider,
160
+ model = args.regularization_model # 'gemini-1.5-flash' # 'gpt-3.5-turbo-0125' # args.model
161
+ )
162
+ else:
163
+ sys.exit(0)
164
 
165
  ## Post-process the analysis results ##
166
+ if steps=='all' or steps=='postprocess':
167
+ assert os.path.exists(regularized_file_path), f"# regularized result file not found: {regularized_file_path}"
168
+ regularize_results = joblib.load( open(regularized_file_path, "rb"))
169
+ postprossed_results = postprocess_result(
170
+ regularize_results['regularized_results'], # extracted_results['extracted_results'], # combined_results,
171
+ postprocessed_file_path,
172
+ category2supercategory
173
+ )
174
+ else:
175
+ sys.exit(0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
 
178
  if __name__=='__main__':
179
 
180
  base = "https://serpapi.com/search.json"
181
  engine = 'google'
 
182
  google_domain = 'google.com.tw'
183
  gl = 'tw'
184
  lr = 'lang_zh-TW'
 
185
  n_processes = 4
186
  client = OpenAI( organization = ORGANIZATION_ID)
187
 
188
  parser = argparse.ArgumentParser()
189
+ parser.add_argument("--config", type=str, default='config/config.yml', help="Path to the configuration file")
190
  parser.add_argument("--data_path", type=str, default="data/餐廳類型分類.xlsx - 測試清單.csv")
191
+ parser.add_argument("--missing_data_path", type=str, default="data/missing/missing.csv")
192
  parser.add_argument("--task", type=str, default="new", choices = ["new", "continue"], help="new or continue")
193
+ parser.add_argument("--steps", type=str, default="all", choices = ["all", "crawl", "extract", "regularize", "postprocess"], help="new or continue")
194
  parser.add_argument("--output_dir", type=str, help='output directory')
195
+ parser.add_argument("--output_missing_dir", type=str, help='output missing directory')
196
  parser.add_argument("--classified_file_path", type=str, default="classified_results.joblib")
197
  parser.add_argument("--extracted_file_path", type=str, default="extracted_results.joblib")
198
  parser.add_argument("--crawled_file_path", type=str, default="crawled_results.joblib")
199
  parser.add_argument("--combined_file_path", type=str, default="combined_results.joblib")
200
+ parser.add_argument("--regularized_file_path", type=str, default="regularized_results.joblib")
201
+ parser.add_argument("--postprocessed_file_path", type=str, default="postprocessed_results.csv")
202
  parser.add_argument("--formatted_results_path", type=str, default="formatted_results.csv")
203
+ parser.add_argument("--filtered_file_path", type=str, default="filtered_results.csv")
204
+ # parser.add_argument("--classes", type=list, default=['小吃店', '日式料理(含居酒屋,串燒)', '火(鍋/爐)', '東南亞料理(不含日韓)', '海鮮熱炒', '特色餐廳(含雞、鵝、牛、羊肉)', '傳統餐廳', '燒烤', '韓式料理(含火鍋,烤肉)', '西餐廳(含美式,義式,墨式)', '西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)', '西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)', '早餐'])
205
+ parser.add_argument("--classes", type=list, default=['小吃店','日式料理(含居酒屋,串燒)','火(鍋/爐)','東南亞料理(不含日韓)','海鮮熱炒','特色餐廳(含雞、鵝、牛、羊肉)','釣蝦場','傳統餐廳','燒烤','韓式料理(含火鍋,烤肉)','PUB(Live Band)','PUB(一般,含Lounge)','PUB(電音\舞場)','五星級飯店','自助KTV(含連鎖,庭��自助)','西餐廳(含美式,義式,墨式)','咖啡廳(泡沫紅茶)','飯店(星級/旅館,不含五星級)','運動休閒館(含球類練習場,飛鏢等)','西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)','西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)','早餐'] )
206
+ # `小吃店`,`日式料理(含居酒屋,串燒)`,`火(鍋/爐)`,`東南亞料理(不含日韓)`,`海鮮熱炒`,`特色餐廳(含雞、鵝、牛、羊肉)`,`釣蝦場`,`傳統餐廳`,`燒烤`,`韓式料理(含火鍋,烤肉)`,`PUB(Live Band)`,`PUB(一般,含Lounge)`,`PUB(電音\舞場)`,`五星級飯店`,`自助KTV(含連鎖,庭園自助)`,`西餐廳(含美式,義式,墨式)`,`咖啡廳(泡沫紅茶)`,`飯店(星級/旅館,不含五星級)`,`運動休閒館(含球類練習場,飛鏢等)`,`西餐廳(餐酒館、酒吧、飛鏢吧、pub、lounge bar)`,`西餐廳(土耳其、漢堡、薯條、法式、歐式、印度)`,`早餐`
207
  parser.add_argument("--backup_classes", type=list, default=['中式', '西式'])
208
  parser.add_argument("--strategy", type=str, default='patch', choices=['replace', 'patch'])
209
+ parser.add_argument("--filter_provider", type=str, default='google', choices=['google', 'openai', 'anthropic'])
210
+ parser.add_argument("--filter_model", type=str, default='gemini-1.5-flash', choices=[ 'claude-3-5-sonnet-20240620', 'claude-3-sonnet-20240229', 'claude-3-haiku-20240307', 'gpt-3.5-turbo-0125', 'gpt-4-0125-preview', 'gpt-4o', 'gpt-4o-mini', 'gemini-1.5-flash'])
211
+ parser.add_argument("--extraction_provider", type=str, default='openai', choices=['google', 'openai', 'anthropic'])
212
+ parser.add_argument("--extraction_model", type=str, default='gpt-3.5-turbo-0125', choices=[ 'claude-3-5-sonnet-20240620', 'claude-3-sonnet-20240229', 'claude-3-haiku-20240307', 'gpt-3.5-turbo-0125', 'gpt-4-0125-preview', 'gpt-4o', 'gpt-4o-mini', 'gemini-1.5-flash'])
213
+ parser.add_argument("--regularization_provider", type=str, default='google', choices=['google', 'openai', 'anthropic'])
214
+ parser.add_argument("--regularization_model", type=str, default='gemini-1.5-flash', choices=['claude-3-5-sonnet-20240620', 'claude-3-sonnet-20240229', 'claude-3-haiku-20240307', 'gpt-3.5-turbo-0125', 'gpt-4-0125-preview', 'gpt-4o', 'gpt-4o-mini', 'gemini-1.5-flash'])
215
+ parser.add_argument("--serp_provider", type=str, default='serp', choices=['serp', 'serper'])
216
  parser.add_argument("--n_processes", type=int, default=4)
217
  args = parser.parse_args()
218
 
219
+ config = yaml.safe_load(open(args.config,"r").read())
220
+ category2supercategory = config['category2supercategory']
221
+ supercategory2category = reverse_category2supercategory(category2supercategory)
222
+
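The structure of config/config.yml is not shown in this commit; from this block it is assumed to parse into something like the following (only `category2supercategory` is read here):

# Assumed result of yaml.safe_load on config/config.yml
config = {
    "category2supercategory": {
        "小吃店": "中式",
        "西餐廳(含美式,義式,墨式)": "西式",
        "早餐": "",
    }
}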
223
  if args.task == 'new':
224
  main(args)
225
  elif args.task == 'continue':
utils.py CHANGED
@@ -1,13 +1,198 @@
 
1
  import re
2
  import json
 
 
 
 
3
 
4
- def parse_json_garbage(s):
5
- s = s[next(idx for idx, c in enumerate(s) if c in "{["):]
6
- print(s)
7
- s = s[:next(idx for idx, c in enumerate(s) if c in "}]")+1]
8
- print(s)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  try:
10
  return json.loads(re.sub("[//#].*","",s,flags=re.MULTILINE))
11
  except json.JSONDecodeError as e:
12
- return json.loads(re.sub("[//#].*","",s,flags=re.MULTILINE))
 
13
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
  import re
3
  import json
4
+ import itertools
5
+ import math
6
+ import joblib
7
+ from typing import List
8
 
9
+ import pandas as pd
10
+ from loguru import logger
11
+
12
+
13
+ def parse_json_garbage(s, start="{", end="}"):
14
+ """Parse JSON string without comments
15
+ Argument
16
+ s: str
17
+ start: str
18
+ end: str
19
+ Return
20
+ json_obj: dict
21
+ """
22
+ s = s[next(idx for idx, c in enumerate(s) if c in start):]
23
+ # print(f"fix head -> {s}")
24
+ s = s[:next(idx for idx, c in enumerate(s) if c in end)+1]
25
+ # print(f"fix tail -> {s}")
26
+ if s.startswith("json"):
27
+ s = s[4:]
28
  try:
29
  return json.loads(re.sub("[//#].*","",s,flags=re.MULTILINE))
30
  except json.JSONDecodeError as e:
31
+ logger.warning(f"Error parsing JSON (trying another regex...): {e}")
32
+ return json.loads(re.sub("^[//#].*","",s,flags=re.MULTILINE))
33
 
34
+
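A quick illustration on hypothetical LLM output (not part of the commit); note that the comment-stripping regex also truncates any value containing `/` or `#`, e.g. URLs:

messy = 'Sure, here you go:\n{"category": "小吃店"} trailing text'
parse_json_garbage(messy)                                 # -> {"category": "小吃店"}
parse_json_garbage("[1, 2] extra", start="[", end="]")    # -> [1, 2]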
35
+ def merge_results( results: list, dataframe_columns: list, list_columns: list):
36
+ """
37
+ Argument
38
+ results: a list of dataframes
39
+ dataframe_columns: list
40
+ list_columns: list
41
+ Return
42
+ merged_results: dict
43
+ """
44
+ assert len(results) > 0, "No results to merge"
45
+ merged_results = {}
46
+ for result in results:
47
+ for key in dataframe_columns:
48
+ mer_res = pd.concat([ r[key] for r in results], ignore_index=True)
49
+ merged_results[key] = mer_res
50
+
51
+ for key in list_columns:
52
+ mer_res = list(itertools.chain(*[ r[key] for r in results]))
53
+ merged_results[key] = mer_res
54
+
55
+ return merged_results
56
+
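For example, merging two worker outputs (hypothetical values):

r1 = {"extracted_results": pd.DataFrame({"index": [0]}), "empty_indices": [3]}
r2 = {"extracted_results": pd.DataFrame({"index": [1]}), "empty_indices": []}
merged = merge_results([r1, r2], dataframe_columns=["extracted_results"], list_columns=["empty_indices"])
# merged["extracted_results"] has two rows; merged["empty_indices"] == [3]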
57
+
58
+ def split_dataframe( df: pd.DataFrame, n_processes: int = 4) -> list:
59
+ """
60
+ """
61
+ n = df.shape[0]
62
+ n_per_process = max( math.ceil(n / n_processes), 1)
63
+ return [ df.iloc[i:i+n_per_process] for i in range(0, n, n_per_process)]
64
+
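For a 10-row frame and the default n_processes=4, n_per_process = ceil(10/4) = 3, so the chunks hold 3, 3, 3 and 1 rows (hypothetical example):

chunks = split_dataframe(pd.DataFrame({"x": range(10)}), n_processes=4)
# [len(c) for c in chunks] == [3, 3, 3, 1]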
65
+
66
+ def combine_results( results: pd.DataFrame, combined_results_path: str, src_column: str = 'classified_category', tgt_column: str = 'category', strategy: str = 'replace'):
67
+ """
68
+ Argument
69
+ classified_results_df: dataframe
70
+ combined_results_path
71
+ src_column: str
72
+ strategy: str, 'replace' or 'patch'
73
+ Return
74
+ combined_results: dataframe
75
+ """
76
+ if not os.path.exists(combined_results_path):
77
+ combined_results = results.copy()
78
+ if strategy == 'replace':
79
+ condition = (combined_results[tgt_column]=='') | (combined_results[src_column]!=combined_results[tgt_column])
80
+ combined_results.loc[ condition, tgt_column] = combined_results[condition][src_column].values
81
+ elif strategy == 'patch':
82
+ condition = (combined_results[tgt_column]=='')
83
+ combined_results.loc[ condition, tgt_column] = combined_results[condition][src_column].values
84
+ else:
85
+ raise Exception(f"Strategy {strategy} not implemented")
86
+ with open( combined_results_path, "wb") as f:
87
+ joblib.dump( combined_results, f)
88
+ else:
89
+ with open( combined_results_path, "rb") as f:
90
+ combined_results = joblib.load(f)
91
+ return combined_results
92
+
93
+
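Ignoring the joblib caching, the two strategies differ as follows on a toy frame (hypothetical values): 'patch' only fills empty `category` cells from `classified_category`, while 'replace' also overwrites disagreeing ones.

df = pd.DataFrame({"category": ["", "小吃店"], "classified_category": ["燒烤", "火(鍋/爐)"]})
# strategy='patch'   -> category becomes ["燒烤", "小吃店"]
# strategy='replace' -> category becomes ["燒烤", "火(鍋/爐)"]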
94
+ def split_dict( information: dict | List[dict], keys1: List[str], keys2: List[str]):
95
+ """[ { key1: value1, key2: value2}, { key1: value1, key2: value2}] -> [ {key1: value1}, {key1: value1}], [{key2: value2, key2: value2}]
96
+ Argument
97
+ information: dict | List[dict], dim -> N
98
+ keys1: List[str], dim -> K1
99
+ keys2: List[str], dim -> K2
100
+ Example:
101
+ >> split_dict( [ {"a": 1, "b":2, "c": 3}, {"a": 1, "b":2, "c": 3}, {"a": 1, "b":2, "c": 3}], ['a','b'], ['c'])
102
+ >> ( [{'a': 1, 'b': 2}, {'a': 1, 'b': 2}, {'a': 1, 'b': 2}], [{'c': 3}, {'c': 3}, {'c': 3}] )
103
+ """
104
+ assert len(keys1)>0 and len(keys2)>0
105
+ results1, results2 = [], []
106
+ if isinstance( information, dict):
107
+ information = [ information]
108
+ for info in information: # N
109
+ split_results1 = {} # K1
110
+ for key in keys1:
111
+ if key in info:
112
+ split_results1[key] = info[key]
113
+ else:
114
+ split_results1[key] = None
115
+ results1.append( split_results1)
116
+ split_results2 = {} # K2
117
+ for key in keys2:
118
+ if key in info:
119
+ split_results2[key] = info[key]
120
+ else:
121
+ split_results2[key] = None
122
+ results2.append( split_results2)
123
+ # results.append( [ split_results1, split_results2])
124
+ assert len(results1)==len(results2)
125
+ if len(results1)==1:
126
+ return results1[0], results2[0]
127
+ return results1, results2
128
+
129
+
130
+ def format_df( df: pd.DataFrame, input_column: str = 'evidence', output_column: str = 'formatted_evidence', format_func = lambda x: x):
131
+ """
132
+ Argument
133
+ df: `evidence`, `result`
134
+ input_column:
135
+ output_column:
136
+ format_func:
137
+ Return
138
+ formatted_df: dataframe of `formatted_evidence`
139
+ """
140
+ formatted_df = df.copy()
141
+ formatted_df[output_column] = formatted_df[input_column].apply(format_func)
142
+ return formatted_df
143
+
144
+
145
+ def clean_quotes( text: str):
146
+ """
147
+ """
148
+ return text.strip().replace("\u3000","").replace("\r","").replace("\"", "").replace("'", "")
149
+
150
+
151
+ def compose_query( address, name, with_index: bool = True, exclude: str = "-inurl:twincn.com -inurl:findcompany.com.tw -inurl:iyp.com.tw -inurl:twypage.com -inurl:alltwcompany.com -inurl:zhupiter.com -inurl:twinc.com.tw", use_exclude: bool = True):
152
+ """
153
+ Argument
154
+ # d: series with d[1]: 地址, d[4]: 營業人名稱 #
155
+ address: str
156
+ name: str
157
+ with_index: bool
158
+ Return
159
+ query: `縣市` `營業人名稱`
160
+ """
161
+ # if with_index: # .itertuples()
162
+ # query = f"{d[1][:3]} {d[4]}"
163
+ # else:
164
+ # query = f"{d[0][:3]} {d[3]}"
165
+ if use_exclude:
166
+ query = f"{address[:3]} {name} {exclude}"
167
+ else:
168
+ query = f"{address[:3]} {name}"
169
+ return query
170
+
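For a made-up lead, only the county/city prefix of the address is kept:

compose_query("台北市中山區南京東路一段1號", "某某小吃店", use_exclude=False)
# -> '台北市 某某小吃店'
compose_query("台北市中山區南京東路一段1號", "某某小吃店")
# -> '台北市 某某小吃店 -inurl:twincn.com ...'  (site-exclusion suffix appended)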
171
+
172
+ def reverse_category2supercategory(category2supercategory):
173
+ """
174
+ Argument
175
+ category2supercategory: dict
176
+ Return
177
+ supercategory2category: dict
178
+ """
179
+ supercategory2category = {}
180
+ for key, value in category2supercategory.items():
181
+ if value not in supercategory2category:
182
+ supercategory2category[value] = [key]
183
+ else:
184
+ supercategory2category[value].append(key)
185
+ return supercategory2category
186
+
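A small worked example, using values from the category mapping elsewhere in the repo:

reverse_category2supercategory({"小吃店": "中式", "燒烤": "中式", "西餐廳(含美式,義式,墨式)": "西式"})
# -> {"中式": ["小吃店", "燒烤"], "西式": ["西餐廳(含美式,義式,墨式)"]}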
187
+ def concat_df( list_df: List[pd.DataFrame], axis: int = 0):
188
+ """
189
+ Argument
190
+ list_df: List[pd.DataFrame]
191
+ axis: int
192
+ Return
193
+ df: pd.DataFrame
194
+ """
195
+ assert len(list_df)>0, "Empty list of dataframes"
196
+ if len(list_df)==1:
197
+ return list_df[0]
198
+ return pd.concat( list_df, axis=axis)