"""Lead-search pipeline driver: locates rows missing from a previous run's
formatted results and re-points the CLI args at them for reprocessing.

NOTE(review): the original file had all newlines stripped; formatting below is
reconstructed. Tokens are unchanged from the source.
"""
import os
import sys
import time
import json
import joblib
import math
import itertools
import argparse
import multiprocessing as mp
from typing import List
from pathlib import Path

import yaml
import jinja2
import requests
import pandas as pd
from dotenv import load_dotenv
from serpapi import GoogleSearch
import tiktoken
from openai import OpenAI
from tqdm import tqdm
from loguru import logger

# Project-local modules (not visible from this file).
from model import llm
from data import get_leads, format_search_results
from utils import (parse_json_garbage, split_dataframe, merge_results,
                   combine_results, split_dict, format_df, clean_quotes,
                   compose_query, reverse_category2supercategory)
from batch import postprocess_result
from pipeline import (get_serp, get_condensed_result, get_organic_result,
                      get_googlemap_results, crawl_results, crawl_results_mp,
                      compose_extraction, extract_results, extract_results_mp,
                      compose_classification, classify_results, classify_results_mp,
                      compose_regularization, regularize_results, regularize_results_mp,
                      compose_filter, filter_results, filter_results_mp)

# Pull API credentials from a local .env file into the process environment.
load_dotenv()
ORGANIZATION_ID = os.getenv('OPENAI_ORGANIZATION_ID')  # OpenAI org id
SERP_API_KEY = os.getenv('SERP_APIKEY')                # SerpAPI key
SERPER_API_KEY = os.getenv('SERPER_API_KEY')           # Serper.dev key


def continue_missing(args):
    """Find leads absent from the formatted-results CSV and stage them for a rerun.

    Reads the full lead set via ``get_leads(args.data_path)`` and the previous
    run's output at ``args.output_dir/args.formatted_results_path``; any row
    index 0..n_data-1 not present in the output's ``index`` column is treated
    as missing. Missing rows are written (no header, no index) to
    ``args.missing_data_path``, and ``args.data_path`` / ``args.output_dir``
    are mutated in place to point at the missing data and
    ``args.output_missing_dir`` respectively, so a subsequent pipeline pass
    processes only the missing leads. Returns early if nothing is missing.

    Args:
        args: argparse-style namespace; reads ``data_path``, ``output_dir``,
            ``formatted_results_path``, ``output_missing_dir``,
            ``missing_data_path`` and mutates ``data_path`` / ``output_dir``.
    """
    data = get_leads(args.data_path)
    n_data = data.shape[0]
    # Prior run's combined output; its 'index' column records which source
    # rows were successfully processed.
    formatted_results = pd.read_csv(os.path.join(
        args.output_dir, args.formatted_results_path))
    missing_indices = []
    for i in range(n_data):
        # NOTE(review): .unique() is recomputed every iteration — presumably
        # acceptable for small n_data; confirm before scaling up.
        if i not in formatted_results['index'].unique():
            logger.debug(f"{i} is not found")
            missing_indices.append(i)
    if len(missing_indices)==0:
        logger.debug("No missing data")
        return
    # .loc works here because get_leads' frame is assumed to use a default
    # 0..n-1 RangeIndex — TODO confirm against get_leads.
    missing_data = data.loc[missing_indices]
    if not os.path.exists(args.output_missing_dir):
        os.makedirs(args.output_missing_dir)
    # header=False/index=False: downstream re-read expects raw rows only.
    missing_data.to_csv(
        args.missing_data_path, index=False, header=False)
    # Redirect the pipeline args at the missing subset for the rerun.
    args.data_path = args.missing_data_path
    args.output_dir = args.output_missing_dir
    if missing_data.shape[0]
    # NOTE(review): SOURCE is truncated mid-statement above — the condition's
    # comparison and the remainder of this function are missing from the
    # visible file; recover them from version control before relying on this.