# NOTE: "Spaces: Sleeping" - status banner captured from the hosting page; not part of the script.
# ---------------------- Library Imports ----------------------
import functools
import json
import logging
import os
import time

import pandas as pd
import requests
from dotenv import load_dotenv
# ---------------------- Environment Variables ----------------------
# Read configuration from a .env file / the process environment.
load_dotenv()
url_etherscan = os.getenv("URL_ETHERSCAN")
api_key_etherscan = os.getenv("API_KEY_ETHERSCAN")

# Fall back to a local "logs" folder when LOG_FOLDER is unset or empty:
# os.makedirs(None) would otherwise abort the import with a TypeError.
log_folder = os.getenv("LOG_FOLDER") or "logs"
os.makedirs(log_folder, exist_ok=True)
log_file = os.path.join(log_folder, "etherscan_scrap.log")
log_format = "%(asctime)s [%(levelname)s] - %(message)s"
logging.basicConfig(filename=log_file, level=logging.INFO, format=log_format)

# Load the JSON file into a dictionary. It is iterated elsewhere in this file
# as (token symbol, contract address) pairs.
with open("ressources/dict_tokens_addr.json", "r", encoding="utf-8") as file:
    dict_addresses = json.load(file)

# Paths of the CSV files created / updated during a run (used for the summary log).
L_created = []
L_updated = []

# Number of blocks to look back on a first fetch, and the matching number of
# 10,000-block windows requested from the API.
n_blocks = 20000
n_loop = n_blocks // 10_000
# ---------------------- Helper Functions ----------------------
def log_execution_time(func):
    """Decorate *func* so that every successful call logs its duration.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        *func*, returns its result unchanged, and writes an INFO log entry
        with the elapsed time. Exceptions raised by *func* propagate to the
        caller and no timing entry is logged for that call.
    """
    # functools.wraps keeps func.__name__ / __doc__ on the wrapper so that
    # introspection and log output still refer to the original function.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measurement is not affected by
        # system clock adjustments.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        logging.info(f"Function {func.__name__} executed in {elapsed:.2f} seconds")
        return result

    return wrapper
def latest_block(start_block=None):
    """Fetch the latest block number through the API's ``eth_blockNumber`` proxy action.

    Args:
        start_block: Optional block number. When given, the distance between
            the latest block and this block is returned as well.

    Returns:
        * ``start_block is None``: the latest block number as ``int``, or
          ``None`` when the request or the response parsing fails.
        * ``start_block`` given: the tuple ``(latest, latest - start_block)``,
          or ``(None, None)`` on failure.
    """
    # The failure value mirrors the shape of the success value, so callers
    # that expect a single number can test ``is None`` and callers that
    # unpack a pair still can.
    failure = None if start_block is None else (None, None)

    params = {
        "module": "proxy",
        "action": "eth_blockNumber",
        "apikey": api_key_etherscan,
    }
    try:
        # A timeout prevents the script from hanging forever on a stalled connection.
        response = requests.get(url_etherscan, params=params, timeout=30)
    except requests.RequestException as exc:
        logging.error(f"API call failed: {exc}")
        return failure

    if response.status_code != 200:
        # Log the raw text: the body of an error response is not guaranteed to be JSON.
        logging.error(f"API call failed with status code {response.status_code}: {response.text}")
        return failure

    try:
        # The result is parsed as a hexadecimal string.
        latest_block_number = int(response.json()["result"], 16)
    except (ValueError, KeyError, TypeError):
        # ValueError: body is not JSON or result is not hex; KeyError: no
        # "result" field; TypeError: result is null or the payload is not a dict.
        logging.error(f"Invalid response format or missing data in response: {response.text}")
        return failure

    if start_block is None:
        return latest_block_number
    return latest_block_number, latest_block_number - start_block
def _fetch_transactions_window(contractAddr, start_block, end_block):
    """Request one block window of 'tokentx' records; return a list of dicts or None on error."""
    params = {
        "module": "account",
        "action": "tokentx",
        "contractaddress": contractAddr,
        "startblock": start_block,
        "endblock": end_block,
        "sort": "asc",
        "apikey": api_key_etherscan,
    }
    try:
        response = requests.get(url_etherscan, params=params, timeout=30)
        payload = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Network failure, timeout, or a body that is not valid JSON.
        logging.error(f"Request for blocks {start_block}-{end_block} failed: {exc}")
        return None

    transactions = payload.get("result", []) if isinstance(payload, dict) else payload
    if not isinstance(transactions, list) or not all(isinstance(item, dict) for item in transactions):
        # On errors the API may put a plain message string in "result".
        logging.error(f"Invalid data format for transactions: {transactions}")
        return None
    return transactions


def get_coin_data(contractAddr, n):
    """Download the token transactions of a contract over the last ``n`` block windows.

    The range ending at the latest block is split into ``n`` windows of
    10,000 blocks each, and every window is requested separately.

    Args:
        contractAddr: Contract address sent as ``contractaddress`` to the API.
        n: Number of 10,000-block windows to fetch, counted back from the
            latest block.

    Returns:
        A ``pandas.DataFrame`` with one row per returned record, where
        ``timeStamp`` is converted to datetime and ``value`` is divided by
        1e18. An empty DataFrame is returned when the latest block cannot be
        determined or no usable data comes back.
    """
    latest_block_number = latest_block()
    # latest_block() signals failure with None (or a (None, None) tuple);
    # anything that is not an int cannot be used for the arithmetic below.
    if not isinstance(latest_block_number, int):
        logging.error(f"Could not retrieve latest block number for contract address {contractAddr}")
        return pd.DataFrame()  # Return an empty DataFrame

    blocks_per_call = 10_000
    frames = []
    for i in range(n):
        # Consecutive windows share their boundary block.
        start_block = latest_block_number - (n - i) * blocks_per_call
        end_block = latest_block_number - (n - 1 - i) * blocks_per_call

        transactions = _fetch_transactions_window(contractAddr, start_block, end_block)
        if transactions:
            df_temp = pd.DataFrame(transactions)
            if not df_temp.empty:
                frames.append(df_temp)

        # Pause between calls (presumably to stay under the API rate limit),
        # also after a failed call so errors do not trigger a burst of requests.
        time.sleep(1)

    if not frames:
        logging.warning(f"No transactions retrieved for contract address {contractAddr}")
        return pd.DataFrame()

    # Concatenate once instead of growing a DataFrame inside the loop.
    df_transactions = pd.concat(frames)

    if 'timeStamp' not in df_transactions:
        logging.error("'timeStamp' key not found in the response data.")
        return pd.DataFrame()  # Return an empty DataFrame if key is missing

    df_transactions['timeStamp'] = pd.to_datetime(df_transactions['timeStamp'].astype(int), unit='s')
    # NOTE(review): dividing by 1e18 assumes every token uses 18 decimals;
    # tokens with a different decimals count would be mis-scaled - confirm.
    df_transactions['value'] = df_transactions['value'].astype(float) / 1e18
    return df_transactions
# ---------------------- Main Function ----------------------
def fetch_and_update_etherscan():
    """Create or refresh one transactions CSV per token in ``dict_addresses``.

    For each ``(tokenSymbol, contractAddr)`` pair:

    * If ``output/transactions_<symbol>.csv`` does not exist, the last
      ``n_loop`` block windows are downloaded and written to a new file.
    * Otherwise the existing file is read, the blocks between its highest
      ``blockNumber`` and the latest block are downloaded, and the merged,
      de-duplicated (by ``hash``) data is written back sorted by
      ``blockNumber``.

    The ``confirmations`` and ``input`` columns are dropped before writing.
    Paths that were actually written are appended to the module-level
    ``L_created`` / ``L_updated`` lists and summarised in the log.
    """
    dropped_columns = ["confirmations", "input"]
    # Make sure the target directory exists before the first to_csv call.
    os.makedirs("output", exist_ok=True)

    for tokenSymbol, contractAddr in dict_addresses.items():
        file = f"output/transactions_{tokenSymbol}.csv"

        if not os.path.exists(file):
            df_transactions = get_coin_data(contractAddr, n_loop)
            if df_transactions.empty:
                # Nothing to write; dropping columns from an empty frame would raise.
                logging.error(f"No transactions retrieved for token: {tokenSymbol}")
                continue
            df_transactions_no_dup = (
                df_transactions
                .drop(columns=dropped_columns, errors="ignore")
                .drop_duplicates(subset="hash")
            )
            df_transactions_no_dup.to_csv(file, sep=",", index=False)
            # Record the file only once it has really been written.
            L_created.append(file)
            continue

        df_temp = pd.read_csv(file, sep=",")
        if df_temp.empty:
            # Header-only file: there is no last block to resume from, so
            # fall back to the default look-back window.
            n_loop_to_concat = n_loop
        else:
            df_temp = df_temp.sort_values("blockNumber", ascending=False)
            start_block = int(df_temp["blockNumber"].iloc[0])
            latest_block_number, diff = latest_block(start_block)
            if latest_block_number is None:
                logging.error(f"Failed to retrieve latest block number for token: {tokenSymbol}")
                continue
            # +1 so the partially filled last window is covered as well.
            n_loop_to_concat = (diff // 10000) + 1

        df_transactions = get_coin_data(contractAddr, n_loop_to_concat)
        # errors="ignore": the stored CSV no longer has the dropped columns,
        # and the fresh download may be empty.
        df_latest = pd.concat([df_transactions, df_temp]).drop(columns=dropped_columns, errors="ignore")
        # Fresh rows come first, so on a duplicated hash the fresh row is kept.
        # .copy() avoids assigning into a view of df_latest.
        df_latest_no_dup = df_latest.drop_duplicates(subset="hash").copy()
        df_latest_no_dup["blockNumber"] = df_latest_no_dup["blockNumber"].astype(int)
        df_latest_no_dup = df_latest_no_dup.sort_values(by="blockNumber")
        df_latest_no_dup.to_csv(file, sep=",", index=False)
        L_updated.append(file)

    logging.info("Created files: " + ", ".join(L_created))
    logging.info("Updated files: " + ", ".join(L_updated))
    logging.info("Etherscan scraping script execution completed.")
# ---------------------- Script Execution ----------------------
# Run the scraper only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    fetch_and_update_etherscan()