import itertools
import json
import math
import os
import re
from typing import Callable, List

import joblib
import pandas as pd
from loguru import logger


def parse_json_garbage(s, start="{", end="}"):
    """Extract and parse the first JSON object embedded in a noisy string.

    Trims everything before the first `start` character and after the first
    `end` character, strips `//`/`#` comments, then parses the remainder.

    Argument
        s: str, raw text that contains a JSON object somewhere inside
        start: str, character(s) that may open the object (default "{")
        end: str, character(s) that may close the object (default "}")
    Return
        json_obj: dict
    Raises
        StopIteration: if no `start`/`end` character is present in `s`
        json.JSONDecodeError: if the trimmed text is still not valid JSON
    """
    # Drop any prefix (e.g. a ```json fence) before the first opening char.
    s = s[next(idx for idx, c in enumerate(s) if c in start):]
    # Drop any suffix after the first closing char.
    s = s[:next(idx for idx, c in enumerate(s) if c in end) + 1]
    # Defensive: strip a leading "json" tag. Dead for the default start="{"
    # (s now begins with "{"), but reachable with a caller-supplied `start`.
    if s.startswith("json"):
        s = s[4:]
    try:
        # Remove `//` and `#` comments anywhere on a line, then parse.
        # (The original class "[//#]" duplicated "/"; "[/#]" is equivalent.)
        return json.loads(re.sub(r"[/#].*", "", s, flags=re.MULTILINE))
    except json.JSONDecodeError as e:
        logger.warning(f"Error parsing JSON (trying another regex...): {e}")
        # Fallback: only strip comments that begin a line, which preserves
        # "/" and "#" characters occurring inside string values.
        return json.loads(re.sub(r"^[/#].*", "", s, flags=re.MULTILINE))


def merge_results(
        results: list,
        dataframe_columns: list,
        list_columns: list):
    """Merge a list of per-chunk result dicts into a single dict.

    DataFrame-valued keys are concatenated row-wise; list-valued keys are
    chained into one flat list.

    Argument
        results: list of dicts, each mapping key -> DataFrame or list
        dataframe_columns: list of keys whose values are DataFrames
        list_columns: list of keys whose values are lists
    Return
        merged_results: dict
    """
    assert len(results) > 0, "No results to merge"
    merged_results = {}
    # FIX: the original wrapped the merge in `for result in results:` without
    # ever using `result`, recomputing the identical merge len(results) times.
    for key in dataframe_columns:
        merged_results[key] = pd.concat(
            [r[key] for r in results], ignore_index=True)
    for key in list_columns:
        merged_results[key] = list(
            itertools.chain.from_iterable(r[key] for r in results))
    return merged_results


def split_dataframe(
        df: pd.DataFrame,
        n_processes: int = 4) -> list:
    """Split `df` into up to `n_processes` roughly equal row chunks.

    Argument
        df: pd.DataFrame
        n_processes: int, desired number of chunks
    Return
        list of DataFrame slices covering `df` in order
    """
    n = df.shape[0]
    # At least one row per chunk so a small frame still splits cleanly.
    n_per_process = max(math.ceil(n / n_processes), 1)
    return [df.iloc[i:i + n_per_process] for i in range(0, n, n_per_process)]


def combine_results(
        results: pd.DataFrame,
        combined_results_path: str,
        src_column: str = 'classified_category',
        tgt_column: str = 'category',
        strategy: str = 'replace'):
    """Fold classified values into the target column and cache to disk.

    If `combined_results_path` already exists, the cached result is loaded
    and returned instead of being recomputed.

    Argument
        results: DataFrame containing `src_column` and `tgt_column`
        combined_results_path: str, joblib cache file path
        src_column: str, column holding the newly classified values
        tgt_column: str, column to be filled or overwritten
        strategy: str, 'replace' (overwrite empty or differing targets) or
            'patch' (fill only empty targets)
    Return
        combined_results: DataFrame
    Raises
        Exception: when `strategy` is neither 'replace' nor 'patch'
    """
    if not os.path.exists(combined_results_path):
        combined_results = results.copy()
        if strategy == 'replace':
            condition = (combined_results[tgt_column] == '') | \
                (combined_results[src_column] != combined_results[tgt_column])
        elif strategy == 'patch':
            condition = (combined_results[tgt_column] == '')
        else:
            raise Exception(f"Strategy {strategy} not implemented")
        # Single assignment shared by both strategies (the original duplicated
        # this line verbatim in each branch).
        combined_results.loc[condition, tgt_column] = \
            combined_results[condition][src_column].values
        with open(combined_results_path, "wb") as f:
            joblib.dump(combined_results, f)
    else:
        with open(combined_results_path, "rb") as f:
            combined_results = joblib.load(f)
    return combined_results


def split_dict(
        information: dict | List[dict],
        keys1: List[str],
        keys2: List[str]):
    """Split dict(s) into two projections keyed by `keys1` and `keys2`.

    Keys missing from an input dict are filled with None. A single input
    dict yields a pair of dicts; a list of dicts yields a pair of lists.

    Argument
        information: dict | List[dict], dim -> N
        keys1: List[str], dim -> K1
        keys2: List[str], dim -> K2

    Example:
        >> split_dict(
            [{"a": 1, "b": 2, "c": 3},
             {"a": 1, "b": 2, "c": 3},
             {"a": 1, "b": 2, "c": 3}],
            ['a', 'b'], ['c'])
        >> ([{'a': 1, 'b': 2}, {'a': 1, 'b': 2}, {'a': 1, 'b': 2}],
            [{'c': 3}, {'c': 3}, {'c': 3}])
    """
    assert len(keys1) > 0 and len(keys2) > 0
    if isinstance(information, dict):
        information = [information]
    # dict.get returns None for a missing key, matching the original padding.
    results1 = [{key: info.get(key) for key in keys1} for info in information]
    results2 = [{key: info.get(key) for key in keys2} for info in information]
    assert len(results1) == len(results2)
    if len(results1) == 1:
        return results1[0], results2[0]
    return results1, results2


def format_df(
        df: pd.DataFrame,
        input_column: str = 'evidence',
        output_column: str = 'formatted_evidence',
        format_func: Callable = lambda x: x):
    """Apply `format_func` element-wise, writing results to `output_column`.

    Argument
        df: DataFrame containing `input_column`
        input_column: str, source column
        output_column: str, destination column
        format_func: callable applied to each element (FIX: the original
            annotated this parameter as `str`)
    Return
        formatted_df: copy of `df` with `output_column` added/overwritten
    """
    formatted_df = df.copy()
    formatted_df[output_column] = formatted_df[input_column].apply(format_func)
    return formatted_df


def clean_quotes(text: str):
    """Strip the text and remove ideographic spaces, CRs and quote chars.

    Argument
        text: str
    Return
        str with '\u3000', '\r', double quotes and single quotes removed
    """
    # str.translate removes all four characters in a single pass, replacing
    # the original chain of four .replace() calls (identical result).
    return text.strip().translate(str.maketrans('', '', '\u3000\r"\''))


def compose_query(
        address,
        name,
        with_index: bool = True,
        exclude: str = "-inurl:twincn.com -inurl:findcompany.com.tw -inurl:iyp.com.tw -inurl:twypage.com -inurl:alltwcompany.com -inurl:zhupiter.com -inurl:twinc.com.tw",
        use_exclude: bool = True):
    """Compose a search query: city prefix of the address plus the name.

    Argument
        address: str, business address; only the first 3 characters
            (the city/county, e.g. "台北市") are used
        name: str, business name
        with_index: bool, kept for interface compatibility (unused here)
        exclude: str, `-inurl:` clauses filtering directory-site results
        use_exclude: bool, whether to append `exclude` to the query
    Return
        query: str, "<city> <name>" optionally followed by `exclude`
    """
    base = f"{address[:3]} {name}"
    return f"{base} {exclude}" if use_exclude else base


def reverse_category2supercategory(category2supercategory):
    """Invert a category -> supercategory mapping.

    Argument
        category2supercategory: dict mapping category -> supercategory
    Return
        supercategory2category: dict mapping supercategory -> list of
            categories (insertion order preserved)
    """
    supercategory2category = {}
    for category, supercategory in category2supercategory.items():
        supercategory2category.setdefault(supercategory, []).append(category)
    return supercategory2category


def concat_df(
        list_df: List[pd.DataFrame],
        axis: int = 0):
    """Concatenate DataFrames along `axis`; a single frame is returned as-is.

    Argument
        list_df: List[pd.DataFrame], must be non-empty
        axis: int, 0 to stack rows, 1 to stack columns
    Return
        df: pd.DataFrame
    """
    assert len(list_df) > 0, "Empty list of dataframes"
    if len(list_df) == 1:
        return list_df[0]
    return pd.concat(list_df, axis=axis)