Spaces:
Sleeping
Sleeping
File size: 2,555 Bytes
de37907 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
import os
import pandas as pd
import json
from typing import Callable
# Directory that contains every raw and processed dataset file
base_path = "/mnt/d/Semester7/NLP/RAG/Data"


def _data_file(filename: str) -> str:
    """Return the path of *filename* placed directly under ``base_path``."""
    return os.path.join(base_path, filename)


# Each dataset has a raw input file and a processed JSON output file
reddit_jokes_1_path = _data_file("reddit_jokes1.csv")
reddit_jokes_1_path_processed = _data_file("reddit_jokes1_processed.json")

hate_speech_path = _data_file("hate_speech.csv")
hate_speech_path_processed = _data_file("hate_speech_processed.json")

reddit_jokes_2_path = _data_file("reddit_jokes2.json")
reddit_jokes_2_processed_path = _data_file("reddit_jokes2_processed.json")

stupidstuff_path = _data_file("stupidstuff.json")
stupidstuff_path_processed = _data_file("stupidstuff_processed.json")

wocka_path = _data_file("wocka.json")
wocka_path_processed = _data_file("wocka_processed.json")
def csv_to_json(in_path: str, out_path: str, preprocess_function: Callable[[list], None] = None) -> None:
    """Convert a CSV file into a JSON file containing one object per row.

    Args:
        in_path: Path of the CSV file to read.
        out_path: Path of the JSON file to write (overwritten if present).
        preprocess_function: Optional callable that receives the list of row
            dictionaries and modifies it in place before it is saved.
    """
    # Read the CSV file
    df = pd.read_csv(in_path)
    # orient='records' turns every row into its own {column: value} dictionary
    rows = df.to_dict(orient='records')
    # pandas represents empty cells as NaN/NA; json.dump would write those as
    # the bare token NaN, which is not valid JSON, so store them as None (null)
    data = [
        {column: (None if pd.isna(value) else value) for column, value in row.items()}
        for row in rows
    ]
    # Preprocess the data
    if preprocess_function is not None:
        preprocess_function(data)
    # Save the list to a JSON file
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)
def preprocess_json(in_path: str, out_path: str, preprocess_function: Callable[[list], None]) -> None:
    """Load a JSON file, preprocess its content in place, and save the result.

    Args:
        in_path: Path of the JSON file to read.
        out_path: Path of the JSON file to write (overwritten if present).
        preprocess_function: Callable that receives the loaded data and
            modifies it in place; its return value is ignored.
    """
    # Read json file; the encoding is explicit so decoding does not depend on
    # the platform's locale default
    with open(in_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Preprocess the data
    preprocess_function(data)
    # Save the modified list to a new JSON file
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)
def delete_id(data: list) -> None:
    """Strip the "id" entry, in place, from every dictionary in *data*.

    Dictionaries that have no "id" key are left as they are.
    """
    for entry in data:
        # pop with a default silently skips dictionaries lacking the key
        entry.pop('id', None)
def delete_Content_int(data: list) -> None:
    """Strip the "Content_int" entry, in place, from every dictionary in *data*.

    Dictionaries that have no "Content_int" key are left as they are.
    """
    for entry in data:
        # pop with a default silently skips dictionaries lacking the key
        entry.pop('Content_int', None)
if __name__ == "__main__":
    # Conversion calls for each dataset. All of them are currently commented
    # out, so running this file as a script does nothing; uncomment a line to
    # run that conversion.
    # preprocess_json(reddit_jokes_2_path, reddit_jokes_2_processed_path, delete_id)
    # preprocess_json(stupidstuff_path, stupidstuff_path_processed, delete_id)
    # preprocess_json(wocka_path, wocka_path_processed, delete_id)
    # csv_to_json(reddit_jokes_1_path, reddit_jokes_1_path_processed)
    # csv_to_json(hate_speech_path, hate_speech_path_processed, delete_Content_int)
    pass
|