import hashlib
import csv
import os


def hash_md5(filepath: str) -> str:
    """Return the hexadecimal MD5 digest of the file at *filepath*.

    The file is read in chunks of ``128 * block_size`` bytes so that large
    files are never loaded into memory in one piece.

    NOTE(review): on any failure (missing file, permission error, ...) this
    returns the exception message as a string instead of raising. Callers
    cannot tell a digest from an error message by type alone; the behaviour
    is kept as-is for backward compatibility.
    """
    md5_hash = hashlib.md5()
    try:
        with open(filepath, "rb") as file:
            for chunk in iter(lambda: file.read(128 * md5_hash.block_size), b''):
                md5_hash.update(chunk)
        return md5_hash.hexdigest()
    except Exception as e:
        return str(e)


CSV_FILE_PATH = "audio_plus_hash_uniq_07102024.csv"


def update_csv_with_files(csv_file_path: str, audio_old_path: str,
                          audio_16000_path: str, new_transcription: str) -> None:
    """Hash both audio files and record the result in the CSV.

    The MD5 of *audio_old_path* is used as the lookup key, the MD5 of
    *audio_16000_path* is stored in the ``hash_16000`` column, and
    *audio_old_path* is stored as the file path if a new row has to be
    appended (see ``update_csv``).
    """
    hash_old = hash_md5(audio_old_path)
    hash_new = hash_md5(audio_16000_path)
    update_csv(csv_file_path, hash_old, hash_new, audio_old_path, new_transcription)


def _needs_leading_newline(csv_file_path: str) -> bool:
    """Return True if the file is non-empty and does not end with a newline."""
    with open(csv_file_path, 'rb') as raw:
        raw.seek(0, os.SEEK_END)
        if raw.tell() == 0:
            return False
        raw.seek(-1, os.SEEK_END)
        return raw.read(1) not in (b'\n', b'\r')


def update_csv(csv_file_path: str, search_hash: str, hash_16000: str,
               new_path: str, new_transcription: str) -> None:
    """Update the row matching *search_hash*, or append a new row.

    A row matches when its ``hash`` or ``hash_16000`` column equals
    *search_hash*. Only the first matching row is updated: its
    ``hash_16000`` and ``transcription`` columns are overwritten and the
    whole file is rewritten in place. When no row matches, a new row with
    ``hash``, ``hash_16000``, ``filepath`` and ``transcription`` is appended.

    Raises:
        OSError: if *csv_file_path* cannot be opened for reading/writing.
        KeyError: if the CSV has rows but lacks a ``hash`` or ``hash_16000``
            column.
    """
    with open(csv_file_path, mode='r+', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        fieldnames = reader.fieldnames
        rows = list(reader)

        # Locate the first row whose original or 16 kHz hash matches.
        match = next(
            (row for row in rows
             if row['hash'] == search_hash or row['hash_16000'] == search_hash),
            None,
        )

        if match is not None:
            match['hash_16000'] = hash_16000
            match['transcription'] = new_transcription

            # Rewrite the whole file from the start (header + every row).
            file.seek(0)
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)
            # The new content may be shorter than the old one; without this
            # the tail of the previous content would survive as garbage rows.
            file.truncate()
            return

    # No match: append a new row (the read/write handle is closed by now).
    write_header = fieldnames is None  # completely empty file, no header yet
    if write_header:
        fieldnames = ['hash', 'hash_16000', 'filepath', 'transcription']

    # If the last record is not newline-terminated, appending directly would
    # glue the new row onto it.
    needs_newline = _needs_leading_newline(csv_file_path)

    with open(csv_file_path, mode='a', newline='', encoding='utf-8') as append_file:
        if needs_newline:
            append_file.write('\r\n')
        writer = csv.DictWriter(append_file, fieldnames=fieldnames)
        if write_header:
            writer.writeheader()
        writer.writerow({
            'hash': search_hash,
            'hash_16000': hash_16000,
            'filepath': new_path,
            'transcription': new_transcription
        })