import os
import tempfile

# fugashi and unidic_lite are not referenced directly below, but importing
# them verifies they are installed: the Japanese BERT tokenizer depends on
# them for MeCab-based word segmentation.
import fugashi
import unidic_lite
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")
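
# Quick sanity check (illustrative, assuming the model files download
# successfully): the tokenizer segments Japanese text with MeCab before
# applying WordPiece.
#
#     tokenizer.tokenize("日本語のテキストです")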


def save_dataframe_to_file(dataframe, file_format="csv"):
    """Write a DataFrame to a temp file as CSV or XLSX and return the path."""
    temp_dir = tempfile.gettempdir()
    file_path = os.path.join(temp_dir, f"output.{file_format}")
    if file_format == "csv":
        # utf-8-sig prepends a BOM so Excel detects the encoding of
        # Japanese text correctly when opening the CSV.
        dataframe.to_csv(file_path, index=False, encoding='utf-8-sig')
    elif file_format == "xlsx":
        # to_excel() takes no encoding argument (it was removed in pandas
        # 2.0); the openpyxl writer produces UTF-8 output by default.
        dataframe.to_excel(file_path, index=False)
    return file_path
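
# Minimal usage sketch (the sample rows are hypothetical; requires pandas,
# plus openpyxl for the "xlsx" branch):
#
#     import pandas as pd
#     df = pd.DataFrame({"prompt": ["質問です"], "response": ["回答です"]})
#     print(save_dataframe_to_file(df, file_format="xlsx"))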


def tokenize_Df(examples):
    # Tokenize prompt/response pairs jointly: pad every example to exactly
    # 60 tokens, and when a pair exceeds the limit, truncate the longer of
    # the two sequences first.
    return tokenizer(list(examples['prompt']), list(examples['response']),
                     return_tensors="pt",
                     padding='max_length',
                     max_length=60,
                     truncation='longest_first')
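
# Minimal usage sketch, assuming a Hugging Face datasets.Dataset with
# "prompt" and "response" columns (the sample rows are hypothetical).
# In batched mode, map() passes a dict of lists, which is the shape
# tokenize_Df expects:
#
#     from datasets import Dataset
#     ds = Dataset.from_dict({"prompt": ["質問です"], "response": ["回答です"]})
#     encoded = ds.map(tokenize_Df, batched=True)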