File size: 5,591 Bytes
4925baf
 
 
 
 
 
 
 
 
 
07d2942
4925baf
 
 
 
 
07d2942
 
4925baf
7cfd43a
4925baf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
07d2942
4925baf
 
 
 
 
8aa4241
4925baf
 
 
 
07d2942
 
4925baf
 
 
 
8aa4241
 
4925baf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8aa4241
4925baf
07d2942
60274d1
07d2942
4925baf
 
 
 
60274d1
07d2942
4925baf
 
 
07d2942
 
 
 
 
 
 
 
 
 
 
 
4925baf
 
 
 
 
 
 
 
 
07d2942
 
 
 
 
 
 
 
4925baf
07d2942
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4925baf
 
 
 
07d2942
 
 
 
4925baf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import os

import gradio as gr
import pandas as pd
from dotenv import load_dotenv
import jieba
# Load jieba's dictionary once at import time so the first request does not
# pay the start-up cost. The previous bare `jieba.cut('你好')` call could not
# do this: `jieba.cut` is a generator function, so calling it without
# iterating the result executes nothing.
jieba.initialize()
from wordcloud import WordCloud
from PIL import Image
import matplotlib.pyplot as plt
from loguru import logger

from sheet import compose_query, get_serp, get_condensed_result, extract_results, postprocess_result, format_output, category2supercategory

# Load variables from a .env file (python-dotenv) before any of the
# os.environ / os.getenv lookups further down run.
load_dotenv()

# logger = logging.getLogger(__name__) 
# logger.setLevel(logging.DEBUG)

# Category labels handed to extract_results(): every non-empty key of the
# category -> supercategory mapping imported from `sheet`.
classes = [label for label in category2supercategory.keys() if len(label) > 0]

def plot_wordcloud( text):
    """
    Build a word cloud from *text* and return it as an image.

    The text is segmented with ``jieba.cut`` and the tokens are re-joined with
    single spaces before being passed to ``WordCloud.generate``. When the
    ``FONT_PATH`` environment variable is set, its value is forwarded to
    ``WordCloud`` as ``font_path``; otherwise ``WordCloud`` is built with its
    defaults.

    Args:
        text: The text to visualise.

    Returns:
        Whatever ``to_image()`` of the generated cloud returns.
    """
    font_path = os.getenv("FONT_PATH", None)
    cloud_options = {} if font_path is None else {"font_path": font_path}

    tokens = jieba.cut(text)
    cloud = WordCloud(**cloud_options).generate(" ".join(tokens))
    return cloud.to_image()

def format_category( formatted_results):
    """
    Render the first row of *formatted_results* as a block of quoted lines.

    Reads the ``supercategory``, ``category``, ``provide_alcohol``,
    ``store_name``, ``phone_number`` and ``description`` columns (first value
    of each) and emits one ``> label:value`` line per field, separated by
    blank lines. ``provide_alcohol`` is shown as 是 when truthy and 否
    otherwise.

    Args:
        formatted_results: Column-indexable table (e.g. a pandas DataFrame)
            with at least one row and the columns listed above.

    Returns:
        The six lines joined with ``"\\n\\n"``.
    """
    def first(column):
        # First value of the requested column.
        return formatted_results[column].values[0]

    lines = [
        f"> 大類別:{first('supercategory')}",
        f"> 小類別:{first('category')}",
        f"> 推測提供酒品:{'是' if first('provide_alcohol') else '否'}",
        f"> 商家名稱:{first('store_name')}",
        f"> 電話:{first('phone_number')}",
        f"> 描述:{first('description')}",
    ]
    return "\n\n".join(lines)

def do( business_name: str, address: str):
    """
    Search Google for one business and classify it.

    Pipeline: compose a query from the address and name, fetch the SERP,
    condense it, run ``extract_results`` with the provider/model taken from
    the ``DEFAULT_PROVIDER`` / ``DEFAULT_MODEL`` environment variables,
    post-process against ``category2supercategory``, format the result and
    draw a word cloud of the formatted evidence.

    Args:
        business_name: Name of the store or company.
        address: Address of the business.

    Returns:
        A 3-tuple matching the three outputs this callback is wired to:
        (search-result text, word-cloud image, classification text).
        If the SERP request raises, the tuple is
        ``("Error: <message>", None, "")`` so the number of returned values
        still matches the number of output components.
    """
    import tempfile  # local import: only this callback needs it

    provider = os.environ.get("DEFAULT_PROVIDER", "openai")
    # The fallback used to be "'gpt-4o'", i.e. with literal quote characters
    # inside the string, which is not the model's name.
    model = os.environ.get("DEFAULT_MODEL", "gpt-4o")

    google_domain = "google.com.tw"
    gl = 'tw'
    lr = 'lang_zh-TW'
    business_id = 12345678  # fixed placeholder id for the single ad-hoc row

    query = compose_query(address, business_name)
    try:
        res = get_serp( query, google_domain, gl, lr)
    except Exception as e:
        logger.exception("SERP request failed")
        # One value per output component (text, image, text).
        return f"Error: {e}", None, ""

    cond_res = get_condensed_result(res)

    # extract_results works on a table, so wrap the single lookup in a
    # one-row DataFrame.
    crawled_results = pd.DataFrame([{
        "index": 0,
        "business_id": business_id,
        "business_name": business_name,
        "serp": res,
        "evidence": cond_res,
        "address": address,
    }])

    extracted = extract_results( crawled_results, classes=classes, provider = provider, model = model)
    extracted_results = extracted['extracted_results'][ [ 'business_id', 'business_name', 'address', 'category', 'evidence', 'phone_number', 'description', 'store_name', 'provide_alcohol'] ]
    logger.debug( extracted_results['category'])

    # postprocess_result takes a path for its results file. Give every call
    # its own scratch directory instead of a shared /tmp file: concurrent
    # requests cannot touch each other's file, and the directory is removed
    # even when post-processing raises or writes nothing.
    with tempfile.TemporaryDirectory() as scratch_dir:
        postprocessed_results = postprocess_result(
            extracted_results,
            postprocessed_results_path=os.path.join(scratch_dir, "postprocessed_results.joblib"),
            category_hierarchy=category2supercategory,
        )

    formatted_results = format_output( postprocessed_results)
    logger.debug( formatted_results)

    formatted_output = format_category( formatted_results)

    evidence_text = formatted_results['formatted_evidence'].values[0]
    img = plot_wordcloud(evidence_text)
    # The first 6 characters of the evidence text are dropped for display
    # (presumably a fixed prefix added by format_output; TODO confirm).
    return f"【搜尋結果】\n{evidence_text[6:]}", img, f"【判斷結果】\n{formatted_output}"

def load( blob, progress=gr.Progress()):
    """
    Read an uploaded header-less CSV into a DataFrame with ``COLUMNS`` as
    its column names.

    Args:
        blob: Either a file path string or an object exposing the path as
            ``.name`` (the two shapes the upload component may hand over).
            ``None`` means nothing was uploaded.
        progress: Gradio progress object; not used by the body, kept so the
            signature stays unchanged.

    Returns:
        The parsed DataFrame, or an empty DataFrame with the ``COLUMNS``
        headers when no file was uploaded.
    """
    if blob is None:
        # Button clicked before any upload: show an empty table instead of
        # failing on `blob.name`.
        return pd.DataFrame(columns=COLUMNS)

    csv_path = blob if isinstance(blob, str) else blob.name
    df = pd.read_csv(csv_path, names=COLUMNS, header=None)
    logger.debug( df.head())
    return df

## --- interface --- ##
# outputs = [gr.Dataframe(row_count = (1, "dynamic"), col_count=(6,"dynamic"), label="output data", interactive=1)]
# demo = gr.Interface(
#         fn=do,
#         inputs=[ "text", "text", "text"],
#         outputs=outputs,
#     )

# Column names for the batch CSV: applied by load() to the header-less file
# and used as the headers of the batch result table.
COLUMNS = [
    '營業地址',
    '統一編號',
    '總機構統一編號',
    '營業人名稱',
    '資本額',
    '設立日期',
    '組織別名稱',
    '使用統一發票',
    '行業代號',
    '名稱',
    '行業代號1',
    '名稱1',
    '行業代號2',
    '名稱2',
    '行業代號3',
    '名稱3',
]

# Stylesheet passed to gr.Blocks(css=...): centres <h1> headings.
CSS = """
h1 {
    text-align: center;
    display:block;
}
"""
## --- block --- ##
with gr.Blocks(css=CSS) as demo:
    gr.Markdown("# 🌟 自動分類餐廳型態 🌟")

    # Tab 1: classify a single business typed in by the user.
    with gr.Tab('單筆'):
        with gr.Row():
            name_box = gr.Textbox(label="商家名稱", placeholder="輸入商家或公司名稱")
            address_box = gr.Textbox(label="地址", placeholder="至少輸入縣市,完整地址更好")
        inputs = [name_box, address_box]

        with gr.Row():
            btn = gr.Button("Submit")

        with gr.Row():
            evidence_md = gr.Markdown(label="參考資料(google search)")
            cloud_img = gr.Image(label="文字雲")
            category_md = gr.Markdown(label="類別")
        outputs = [evidence_md, cloud_img, category_md]

        # do() receives (business name, address) and fills the three outputs.
        btn.click(fn=do, inputs=inputs, outputs=outputs)

    # Tab 2: upload a CSV and display it as a table.
    with gr.Tab('批次'):
        with gr.Row():
            upload_btn = gr.UploadButton("上傳檔案", file_count="single")
        batch_inputs = [upload_btn]

        with gr.Row():
            batch_btn = gr.Button("批量處理")

        with gr.Row():
            batch_table = gr.Dataframe(
                headers=COLUMNS,
                datatype=["str"] * 16,
            )
        batch_outputs = [batch_table]

        # load() parses the uploaded file into the table.
        batch_btn.click(fn=load, inputs=batch_inputs, outputs=batch_outputs)
        


if __name__ == "__main__":
    # Login credentials for the app are read from the environment.
    # NOTE(review): either entry is None when its variable is unset; confirm
    # how Gradio treats an auth tuple containing None.
    credentials = (os.environ.get('USERNAME'), os.environ.get('PASSWORD'))

    # Bind to all interfaces; public sharing stays disabled.
    demo.launch(
        # share=True,
        server_name='0.0.0.0',
        auth=credentials,
    )