Spaces:
Sleeping
Sleeping
File size: 1,672 Bytes
628d1d2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import pandas as pd
import json
import re
def remove_prefix(text, prefix_pattern):
    """
    Delete whatever `prefix_pattern` matches in `text` and trim the result.

    Every match of the regular expression is replaced with an empty string,
    so the pattern should be anchored with ``^`` when only a leading prefix
    is meant to go. Leading and trailing whitespace is stripped from the
    returned string.
    """
    matcher = re.compile(prefix_pattern)
    without_prefix = matcher.sub("", text)
    return without_prefix.strip()
def _cell_text(value):
    """Return a spreadsheet cell as text, mapping a missing cell to ""."""
    # pandas reports empty cells as NaN; str(NaN) would leak the literal
    # text "nan" into the exported questions and answers.
    return "" if pd.isna(value) else str(value)


def main():
    """
    Export the Q&A pairs in data/manabi.xlsx to data/qa_data.json.

    Reads the "質問" (question) and "回答" (answer) columns, strips the
    leading "Q<number>." / "A." markers from each cell, and writes the
    result as a JSON list of {"question": ..., "answer": ...} objects.
    Empty cells are exported as empty strings.

    Problems (missing file, unreadable workbook, missing columns, write
    failure) are reported on stdout and end the run early instead of
    raising; the function always returns None.
    """
    # Column headers expected in the workbook. The previous literals were
    # mis-decoded (mojibake) forms of these names and could never match.
    question_col = "質問"
    answer_col = "回答"

    # Read the Excel file
    try:
        df = pd.read_excel("data/manabi.xlsx")
    except FileNotFoundError:
        print("The file 'data/manabi.xlsx' was not found. Please check the file path.")
        return
    except Exception as e:
        print(f"An error occurred while reading the Excel file: {e}")
        return
    print("Excel file read successfully.")

    # Check if the necessary columns exist
    if question_col not in df.columns or answer_col not in df.columns:
        print(f"The Excel file must contain '{question_col}' and '{answer_col}' columns.")
        return

    # Build one record per row, removing the "Q<n>." / "A." prefixes
    qa_list = [
        {
            "question": remove_prefix(_cell_text(raw_question), r"^Q\d+\.\s*"),
            "answer": remove_prefix(_cell_text(raw_answer), r"^A\.\s*"),
        }
        for raw_question, raw_answer in zip(df[question_col], df[answer_col])
    ]

    # Save the list to a JSON file
    try:
        with open("data/qa_data.json", "w", encoding="utf-8") as json_file:
            # ensure_ascii=False keeps the Japanese text readable in the file
            json.dump(qa_list, json_file, ensure_ascii=False, indent=2)
        print("Data has been successfully saved to 'data/qa_data.json'.")
    except Exception as e:
        print(f"An error occurred while writing to JSON file: {e}")
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|