import os
import sqlite3
import json
import csv
import hashlib
from datetime import datetime

from huggingface_hub import HfApi, hf_hub_download

# Settings
HF_TOKEN = os.environ.get("HF_TOKEN")
REPO_ID = "Imsidag-community/libretranslate-suggestions"
DEST_JSON_PATH_IN_REPO = "suggestions.json"
DEST_CSV_PATH_IN_REPO = "suggestions.csv"
REPO_TYPE = "dataset"
JSON_OUTPUT_PATH = "/tmp/suggestions.json"
CSV_OUTPUT_PATH = "/tmp/suggestions.csv"
CHECKSUM_FILE_JSON = "/tmp/.last_suggestions_json_checksum"
CHECKSUM_FILE_CSV = "/tmp/.last_suggestions_csv_checksum"

# Known locations of the LibreTranslate suggestions database, depending on
# how the image was built and which user runs the server.
possible_paths = [
    "/app/db/suggestions.db",
    "/app/suggestions.db",
    "/root/.local/share/db/suggestions.db",
    "/home/libretranslate/.local/share/db/suggestions.db",
]


def find_db():
    """Return the first existing suggestions.db path, or None."""
    print(f"Running in CWD: {os.getcwd()}")
    for path in possible_paths:
        if os.path.exists(path):
            print(f"Found suggestions.db at {path}")
            return path
    print("suggestions.db not found in any known path.")
    return None


def extract_suggestions(db_path):
    """Read all rows from the suggestions table and normalize them to dicts.

    The id is an MD5 digest of the concatenated fields, so the same
    suggestion always maps to the same id across runs.
    """
    suggestions = []
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        cursor.execute("SELECT q, s, source, target FROM suggestions")
        rows = cursor.fetchall()
        conn.close()
        for row in rows:
            unique_id = hashlib.md5(
                (row[0] + row[1] + row[2] + row[3]).encode()
            ).hexdigest()
            suggestions.append({
                "id": unique_id,
                "source_text": row[0],
                "suggested_text": row[1],
                "source_lang": row[2],
                "target_lang": row[3],
                "timestamp": datetime.now().isoformat(),
            })
    except sqlite3.Error as e:
        print(f"SQLite error: {e}")
    return suggestions


def download_existing_json():
    """Fetch the previously uploaded suggestions.json from the HF repo, if any."""
    try:
        path = hf_hub_download(
            repo_id=REPO_ID,
            repo_type=REPO_TYPE,
            filename=DEST_JSON_PATH_IN_REPO,
            token=HF_TOKEN,
            local_dir="/tmp",
        )
        print("Downloaded existing suggestions from Hugging Face.")
        return path
    except Exception as e:
        print(f"Could not fetch existing suggestions from HF: {e}")
        return None


def merge_with_existing(suggestions, existing_json_path):
    """Merge fresh suggestions into the previously exported set, keyed by id.

    Returns the path of the merged JSON file, or None if nothing changed.
    """
    existing = {}
    if existing_json_path and os.path.exists(existing_json_path):
        try:
            with open(existing_json_path, "r", encoding="utf-8") as f:
                for item in json.load(f):
                    existing[item["id"]] = {
                        "source_text": item["source_text"],
                        "suggested_text": item["suggested_text"],
                        "source_lang": item["source_lang"],
                        "target_lang": item["target_lang"],
                        "timestamp": item.get("timestamp", datetime.now().isoformat()),
                    }
        except Exception as e:
            print(f"Failed to read existing JSON: {e}")

    changed = False
    for s in suggestions:
        s_clean = {
            "source_text": s["source_text"],
            "suggested_text": s["suggested_text"],
            "source_lang": s["source_lang"],
            "target_lang": s["target_lang"],
        }
        if not existing.get(s["id"]):
            changed = True
            existing[s["id"]] = {**s_clean, "timestamp": datetime.now().isoformat()}

    if not changed:
        print("No new suggestions — skipping write/upload.")
        return None

    # Write merged JSON
    final = [{**data, "id": id_} for id_, data in existing.items()]
    with open(JSON_OUTPUT_PATH, "w", encoding="utf-8") as f:
        json.dump(final, f, indent=2, ensure_ascii=False)

    # Also write CSV
    write_csv(final, CSV_OUTPUT_PATH)

    return JSON_OUTPUT_PATH


def write_csv(suggestions, csv_path):
    """Write the merged suggestions as a flat CSV file."""
    with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=[
            "id", "source_text", "suggested_text",
            "source_lang", "target_lang", "timestamp",
        ])
        writer.writeheader()
        for item in suggestions:
            writer.writerow(item)


def get_checksum(filepath):
    """MD5 of a file's contents, used only to detect changes between runs."""
    if not os.path.exists(filepath):
        return None
    with open(filepath, "rb") as f:
        return hashlib.md5(f.read()).hexdigest()
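
# A minimal alternative fingerprint (an assumption, not part of the original
# script): environments with FIPS-restricted OpenSSL may refuse MD5, in which
# case a SHA-256 digest serves the same change-detection purpose. Unused by
# default; swap it in for get_checksum if needed.
def get_checksum_sha256(filepath):
    if not os.path.exists(filepath):
        return None
    with open(filepath, "rb") as f:
        return hashlib.sha256(f.read()).hexdigest()
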
def upload_if_updated(filepath, dest_path, checksum_file):
    """Upload filepath to the dataset repo only if its checksum changed."""
    if not filepath or not os.path.exists(filepath):
        return
    new_checksum = get_checksum(filepath)
    old_checksum = None
    if os.path.exists(checksum_file):
        with open(checksum_file, "r") as f:
            old_checksum = f.read().strip()
    if new_checksum != old_checksum:
        print(f"Uploading updated {os.path.basename(dest_path)} to Hugging Face...")
        try:
            api = HfApi()
            api.upload_file(
                path_or_fileobj=filepath,
                path_in_repo=dest_path,
                repo_id=REPO_ID,
                repo_type=REPO_TYPE,
                token=HF_TOKEN,
            )
            # Remember what was uploaded so unchanged files are skipped next run.
            with open(checksum_file, "w") as f:
                f.write(new_checksum)
            print(f"Upload successful: {dest_path} at {datetime.now().isoformat()}")
        except Exception as e:
            print(f"Upload failed for {dest_path}: {e}")
    else:
        print(f"No changes in {os.path.basename(dest_path)} — skipping upload.")


def main():
    print(f"===== Application Startup at {datetime.now().isoformat()} =====")
    if not HF_TOKEN:
        print("HF_TOKEN not set — skipping upload.")
        return
    db_path = find_db()
    if not db_path:
        return
    suggestions = extract_suggestions(db_path)
    if not suggestions:
        print("No suggestions found — skipping.")
        return
    existing_path = download_existing_json()
    merged_json = merge_with_existing(suggestions, existing_path)
    if merged_json:
        upload_if_updated(JSON_OUTPUT_PATH, DEST_JSON_PATH_IN_REPO, CHECKSUM_FILE_JSON)
        upload_if_updated(CSV_OUTPUT_PATH, DEST_CSV_PATH_IN_REPO, CHECKSUM_FILE_CSV)


if __name__ == "__main__":
    main()
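
# Usage sketch (an assumption, not documented in the original script): the
# exporter is designed to run repeatedly next to the LibreTranslate server.
# A simple polling wrapper would look like the following; run_forever and the
# 30-minute interval are illustrative names/values, not part of the original:
#
#     import time
#
#     def run_forever(interval_seconds=1800):
#         while True:
#             main()
#             time.sleep(interval_seconds)
#
# An external scheduler such as cron would work just as well, since the
# checksum files make repeated runs idempotent.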