File size: 811 Bytes
06ae167 e1af10a 06ae167 e1af10a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 |
import os
import tempfile
# fugashi and unidic_lite supply the Japanese morphological analyzer and
# dictionary that the Japanese BERT tokenizer below depends on; they are
# imported only for their side effect of being installed/available.
import fugashi
import unidic_lite
from transformers import AutoTokenizer
# Module-level tokenizer for Tohoku University's Japanese BERT, created once
# at import time and reused by tokenize_Df below. NOTE(review): this hits the
# network/model cache on first import — confirm that is acceptable here.
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")
def save_dataframe_to_file(dataframe, file_format="csv"):
    """Write ``dataframe`` to a file in the system temporary directory.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The data to persist.
    file_format : str, optional
        Output format: ``"csv"`` (default) or ``"xlsx"``.

    Returns
    -------
    str
        Path of the written file. The name is always ``output.<format>``,
        so repeated calls overwrite the previous output.

    Raises
    ------
    ValueError
        If ``file_format`` is not one of the supported formats.
    """
    temp_dir = tempfile.gettempdir()  # system temporary directory
    file_path = os.path.join(temp_dir, f"output.{file_format}")
    if file_format == "csv":
        dataframe.to_csv(file_path, index=False)
    elif file_format == "xlsx":
        dataframe.to_excel(file_path, index=False)
    else:
        # The original fell through silently and returned a path to a file
        # that was never written; fail loudly instead.
        raise ValueError(f"Unsupported file_format: {file_format!r}")
    return file_path
def tokenize_Df(examples):
    """Tokenize paired ``prompt``/``response`` columns as sequence pairs.

    Each (prompt, response) pair is encoded jointly, padded/truncated to a
    fixed length of 60 tokens, and returned as PyTorch tensors via the
    module-level ``tokenizer``.
    """
    prompts = list(examples['prompt'])
    responses = list(examples['response'])
    return tokenizer(
        prompts,
        responses,
        return_tensors="pt",
        padding='max_length',
        max_length=60,
        truncation='longest_first',
    )