import os
import tempfile

# fugashi and unidic_lite are not used directly, but the Japanese BERT
# tokenizer below needs them installed for MeCab-based word segmentation.
import fugashi
import unidic_lite
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")

def save_dataframe_to_file(dataframe, file_format="csv"):
    """Write a pandas DataFrame to the system temp directory and return the path."""
    temp_dir = tempfile.gettempdir()  # get the system temp directory
    file_path = os.path.join(temp_dir, f"output.{file_format}")
    if file_format == "csv":
        dataframe.to_csv(file_path, index=False)
    elif file_format == "xlsx":
        dataframe.to_excel(file_path, index=False)  # requires openpyxl
    else:
        raise ValueError(f"Unsupported file format: {file_format}")
    return file_path

def tokenize_Df(examples):
    """Tokenize prompt/response pairs as two-segment inputs, padded to 60 tokens."""
    return tokenizer(
        list(examples["prompt"]),
        list(examples["response"]),
        return_tensors="pt",
        padding="max_length",
        max_length=60,
        truncation="longest_first",
    )
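

# A minimal usage sketch, assuming pandas is available and the data has
# "prompt" and "response" columns (the names tokenize_Df expects). The
# Japanese sample sentences are illustrative only.
if __name__ == "__main__":
    import pandas as pd

    df = pd.DataFrame(
        {
            "prompt": ["こんにちは", "お元気ですか"],
            "response": ["こんにちは、世界", "元気です"],
        }
    )

    # Tokenize the prompt/response pairs into fixed-length PyTorch tensors.
    encoded = tokenize_Df(df)
    print(encoded["input_ids"].shape)  # torch.Size([2, 60])

    # Persist the DataFrame as CSV in the system temp directory.
    print(save_dataframe_to_file(df, file_format="csv"))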