# misinfo/src/evidence/corpus_utils.py
# Page header residue: "gyigit's picture", commit message "update", commit 54e8a79
import os
import shutil
from src.utils.path_utils import get_project_root
def separate_evidence_images(base_dir, datasets=("train", "test")):
    """
    Copy evidence images from each dataset folder into a shared 'evidence_corpus' folder.

    A file counts as an evidence image when the last "_"-separated part of its
    name, cut at its first ".", is exactly "evidence" (e.g. "0_evidence.jpg").
    Each match in ``base_dir/<dataset>`` is copied to
    ``base_dir/evidence_corpus/<dataset>_<original filename>``; the dataset
    prefix keeps same-named files from different datasets apart.

    Args:
        base_dir (str): Directory containing the dataset folders.
        datasets (Iterable[str] | str): Names of the dataset folders to scan.
            Defaults to ("train", "test"). A single string is treated as one
            dataset name.

    Raises:
        FileNotFoundError: If one of the dataset folders does not exist
            (raised by ``os.listdir``).
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(datasets, str):
        datasets = (datasets,)
    else:
        datasets = tuple(datasets)

    evidence_corpus_dir = os.path.join(base_dir, "evidence_corpus")
    # Create the evidence_corpus directory if it doesn't exist
    os.makedirs(evidence_corpus_dir, exist_ok=True)

    for dataset in datasets:
        dataset_dir = os.path.join(base_dir, dataset)
        for filename in os.listdir(dataset_dir):
            if filename.split("_")[-1].split(".")[0] != "evidence":
                continue
            source_path = os.path.join(dataset_dir, filename)
            # shutil.copy cannot copy a directory; skip anything that is not a file.
            if not os.path.isfile(source_path):
                continue
            target_path = os.path.join(evidence_corpus_dir, f"{dataset}_{filename}")
            shutil.copy(source_path, target_path)

    # Report the datasets that were actually scanned, not just "train".
    print(
        f"All evidence images in the {' and '.join(datasets)} set(s) have been copied."
    )
import pickle
# Module-level path of the evidence features pickle. update_pickle_keys takes
# its own parameter of the same name, and the `__main__` section at the bottom
# of the file rebinds this variable.
pickle_file_path = "evidence_features.pkl"
# Function to update the keys in the pickle
def update_pickle_keys(pickle_file_path, output_pickle_path=None):
    """
    Rewrite the keys of a pickled feature dictionary to relative image paths.

    Each key is reduced to its basename, which must look like
    ``<dataset>_<original filename>`` with ``<dataset>`` being exactly "test"
    or "train" (e.g. "test_0_evidence.jpg"). The new key is
    ``data/raw/factify/extracted/images/<dataset>/<original filename>``
    joined with the platform's path separator. Values are carried over
    unchanged.

    Args:
        pickle_file_path (str): Path of the pickle to read.
        output_pickle_path (str | None): Where to write the updated pickle.
            When falsy, the input file is overwritten.

    Raises:
        ValueError: If a key's basename does not start with "test_" or
            "train_", or has nothing after that prefix.
    """
    # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    with open(pickle_file_path, "rb") as f:
        feature_dict = pickle.load(f)

    images_root = os.path.join("data", "raw", "factify", "extracted", "images")
    updated_dict = {}
    for old_path, features in feature_dict.items():
        # Extract the filename (e.g., test_0_evidence.jpg)
        filename = os.path.basename(old_path)
        # Split off the dataset prefix; partition never raises, so a name
        # without "_" ends up in the ValueError below instead of an IndexError.
        dataset, separator, original_name = filename.partition("_")
        if dataset not in ("test", "train") or not separator or not original_name:
            raise ValueError(f"Unexpected filename format: {filename}")
        # Keys sharing a basename map to the same new key; the later one wins.
        updated_dict[os.path.join(images_root, dataset, original_name)] = features

    # Nothing is written unless every key was converted successfully.
    output_path = output_pickle_path if output_pickle_path else pickle_file_path
    with open(output_path, "wb") as f:
        pickle.dump(updated_dict, f)
    print(f"Updated pickle saved at: {output_path}")
# Script entry point: collect the evidence images of the factify dataset.
if __name__ == "__main__":
    pickle_file_path = "/evidence_features.pkl"
    project_root = get_project_root()
    # Location of the extracted images, relative to the project root.
    image_dir_parts = ("data", "raw", "factify", "extracted", "images")
    base_dir = os.path.join(project_root, *image_dir_parts)
    separate_evidence_images(base_dir)
    # The pickle key rewrite is currently disabled:
    # out_pkl_path = "C:\\Users\\defne\\Desktop\\2024-2025FallSemester\\Applied NLP\\multimodal-misinformation-detection\\data\\raw\\factify\\extracted\\images"
    # update_pickle_keys(pickle_file_path, output_pickle_path=out_pkl_path)