import os
import tempfile

# fugashi and unidic_lite are not referenced below; importing them up front
# fails fast if the MeCab dependencies the Japanese tokenizer needs are
# missing from the environment.
import fugashi  # noqa: F401
import unidic_lite  # noqa: F401

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")


def save_dataframe_to_file(dataframe, file_format="csv"):
    """Write `dataframe` to a temporary file and return the file's path."""
    temp_dir = tempfile.gettempdir()
    file_path = os.path.join(temp_dir, f"output.{file_format}")
    if file_format == "csv":
        dataframe.to_csv(file_path, index=False)
    elif file_format == "xlsx":
        # to_excel needs an Excel writer backend such as openpyxl.
        dataframe.to_excel(file_path, index=False)
    else:
        # Fail loudly rather than return a path to a file that was never written.
        raise ValueError(f"Unsupported file_format: {file_format!r}")
    return file_path


def tokenize_Df(examples):
    """Tokenize prompt/response pairs into fixed-length 60-token PyTorch tensors."""
    return tokenizer(
        list(examples["prompt"]),
        list(examples["response"]),
        return_tensors="pt",
        padding="max_length",
        max_length=60,
        truncation="longest_first",
    )
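

if __name__ == "__main__":
    # Minimal usage sketch: assumes pandas is installed and a DataFrame with
    # the 'prompt'/'response' columns tokenize_Df expects. The sample rows
    # below are illustrative only.
    import pandas as pd

    df = pd.DataFrame(
        {
            "prompt": ["こんにちは", "天気はどうですか"],
            "response": ["こんにちは!", "晴れです"],
        }
    )

    # Persist the frame to a temp file and show where it landed.
    print(save_dataframe_to_file(df, file_format="csv"))

    # Tokenize the pairs; every row is padded/truncated to exactly 60 tokens.
    encoded = tokenize_Df(df)
    print(encoded["input_ids"].shape)  # torch.Size([2, 60])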