File size: 1,300 Bytes
07d2942
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/bin/bash
#
# Batch-runs sheet.py (crawl step) over a sequence of numbered CSV files,
# pausing between runs to stay under the SERP / OpenAI API rate limit.
#
# Usage: bash <this-script>   (no arguments; edit the variables below)

set -u  # error on use of undefined variables

# Base name of the input files; the sheet index is appended per iteration.
# (The filename strings themselves must match the files on disk exactly.)
# base_file="output_0402_1_篩選結果.xlsx - Sheet1_"
# base_file="output_0402_2_篩選結果.xlsx - Sheet1_"
# base_file="output_0402_3_篩選結果.xlsx - Sheet1_"
base_file="output_0402_4_篩選結果.xlsx - Sheet1_"

# Index range of files to process (inclusive on both ends: 0..17 = 18 files).
start_index=0
total_files=17

# API rate-limit bookkeeping (consumption/limit kept for reference).
# consumption_per_run=1000
# api_rate_limit=3000
api_rate_limit=20000
wait_time_in_seconds=60 # 1500 # 25 mins

# Process each numbered file in turn.
for (( i = start_index; i <= total_files; i++ )); do
    # Derive the per-index input CSV and crawl-cache paths.
    file_name="${base_file}${i}.csv"
    crawled_file_path="${base_file}${i}/crawled_results.joblib"

    # Run the crawl step. A failure on one file logs a warning to stderr
    # instead of aborting the whole batch (matches original continue-on-
    # failure behavior, but no longer silent).
    python sheet.py --data_path "data/production/${file_name}" --task new \
        --step crawl \
        --output_dir data/gpt-4o-mini \
        --n_processes 4 \
        --serp_provider serp \
        --crawled_file_path "${crawled_file_path}" \
        --extraction_provider openai \
        --extraction_model gpt-4o-mini \
        --regularization_provider openai \
        --regularization_model gpt-4o-mini \
        || echo "WARNING: sheet.py failed for ${file_name}" >&2

    # Throttle between runs to avoid hitting the API rate limit.
    echo "Completed task for ${file_name}. Waiting for ${wait_time_in_seconds} seconds..."
    sleep "${wait_time_in_seconds}"
done

echo "All tasks completed."