File size: 2,207 Bytes
07d2942
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import os
import json
import math

import pandas as pd
from loguru import logger


# Default column order applied to the leads CSV. Stored as an immutable tuple
# so the function default can never be mutated and leak between calls.
_DEFAULT_LEAD_COLUMNS = (
    '營業地址', '統一編號', '總機構統一編號', '營業人名稱', '資本額', '設立日期', '組織別名稱', '使用統一發票',
    '行業代號', '名稱', '行業代號1', '名稱1', '行業代號2', '名稱2', '行業代號3', '名稱3',
)


def get_leads(file_path: str, names: "list | None" = None) -> pd.DataFrame:
    """Load a leads CSV file into a DataFrame with explicit column names.

    Because ``names`` is passed to ``pandas.read_csv`` without a ``header``
    argument, every line of the file (including the first) is read as data.

    Argument
        file_path: str. Path of the CSV file to read.
        names: list or None. Column names to assign, in file order. When
            omitted (None), the 16 default lead columns are used.
    Return
        data: pd.DataFrame. The '統一編號' column is read as ``str`` so that
            leading zeros are preserved.
    Raise
        AssertionError: if ``file_path`` does not exist.
    """
    if names is None:
        # Fresh list per call; the shared tuple itself is never handed out.
        names = list(_DEFAULT_LEAD_COLUMNS)
    if not os.path.exists(file_path):
        # Raised explicitly instead of via `assert` so the check is not
        # stripped under `python -O`; the exception type is kept as
        # AssertionError for callers that already handle it.
        raise AssertionError(f"File not found: {file_path}")
    return pd.read_csv(file_path, names=names, dtype={"統一編號": str})

def format_search_results(evidence):
    """Format evidence (serp_results) into a two-section text report.

    Each decoded entry is routed to one of two sections:

    * store information: entries that have both 'title' and '顧客評價',
      entries whose 'title' equals '類似的店', entries with 'status', and
      entries with 'telephone_number' (checked in that order);
    * web search results: every other entry, rendered as
      "<position>. <title> (<snippet>)".

    The position is the entry's 1-based index in the full evidence list, so
    the numbers in the search-result section skip over entries that were
    routed to the store section.

    Argument
        evidence: str. A JSON document that decodes to a list of dictionaries.
    Return
        formatted_evidence: str
    Raise
        KeyError: if a search-result entry has no 'title' (the entry is
            logged first), or if a '類似的店' entry has no 'snippet'.
    """
    store_lines = []
    result_lines = []
    for position, item in enumerate(json.loads(evidence), start=1):
        if 'title' in item and '顧客評價' in item:
            store_lines.append(f"\n> 顧客評價: {item['顧客評價']}")
        elif 'title' in item and item['title'] == '類似的店':
            store_lines.append(f"\n> 類似的店: {item['snippet']}")
        elif 'status' in item:
            store_lines.append(f"\n> 經營狀態: {item['status']}")
        elif 'telephone_number' in item:
            store_lines.append(f"\n> 電話號碼: {item['telephone_number']}")
        else:
            try:
                title = item['title']
            except KeyError:
                logger.error( item )
                # Bare re-raise keeps the original KeyError (with the missing
                # key name and traceback) instead of an empty replacement.
                raise
            result_lines.append(f"{position}. {title} ({item.get('snippet','')})")
    return "## 店面資訊: " + "\n".join(store_lines) + "\n" + "\n## 網路搜尋結果: " + "\n".join(result_lines)

def split_data( data: pd.DataFrame, samples: int = 4000):
    """Cut ``data`` into consecutive slices of at most ``samples`` rows.

    Argument
        data: pd.DataFrame. The table to split.
        samples: int. Maximum number of rows per slice.
    Return
        data_list: list. The slices in their original order; the last one
            holds the remainder and may be shorter. Empty when ``data`` has
            no rows.
    """
    total_rows = len(data)
    chunk_count = math.ceil(total_rows / samples)
    chunks = []
    for chunk_no in range(chunk_count):
        start = chunk_no * samples
        chunks.append(data[start: start + samples])
    return chunks