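# Gradio demo: sample random words from a JSON word bank and generate a
# beginner-level example sentence for each with EleutherAI/pythia-410m.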
import gradio as gr
import json
import random
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import re

# Initialize the model and tokenizer
model_name = "EleutherAI/pythia-410m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
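# pythia-410m is small enough to run on CPU; on a GPU box the model could be
# moved with model.to("cuda"), provided the tokenized inputs are moved too.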

# Directory holding the word-bank JSON files
DATA_DIR = "./data"
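
# Each word bank is assumed to be a JSON array of objects with "word" and
# "phonetic" keys; a hypothetical data/common3000.json excerpt:
#
#   [
#     {"word": "apple", "phonetic": "/ˈæp.əl/"},
#     {"word": "river", "phonetic": "/ˈrɪv.ɚ/"}
#   ]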

# Scan the data directory to build the dropdown choices
def get_sources():
    files = os.listdir(DATA_DIR)
    sources = [f.split(".json")[0] for f in files if f.endswith(".json")]
    return sources

# Strip prompt echoes and numbering noise from a generated sentence
def clean_sentence(output):
    # Defensive: drop any echo of the instruction prompt used below
    output = re.sub(r"Use the word.*?Output only the sentence\.", "", output, flags=re.IGNORECASE).strip()
    # Drop markdown-style numbering such as "**1.**"
    output = re.sub(r"\*\*?\d+\.*\*\*", "", output).strip()
    if not output.endswith("."):
        output += "."
    return output
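
# Hypothetical example of the cleanup above:
#   clean_sentence("**1.** The apple is red")  ->  "The apple is red."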

# Core function: sample n words and generate one example sentence per word
def get_words_with_sentences(source, n):
    status = []
    try:
        n = int(n)  # gr.Number delivers a float; random.sample needs an int
        data_path = os.path.join(DATA_DIR, f"{source}.json")
        with open(data_path, 'r', encoding='utf-8') as f:
            words = json.load(f)

        selected_words = random.sample(words, n)
        results = []

        for i, word_data in enumerate(selected_words):
            status.append(f"正在生成第 {i+1}/{n} 個單字 [{word_data['word']}] 例句...")
            word = word_data['word']

            prompt = f"Use the word '{word}' in a simple English sentence suitable for beginners. Output only the sentence."

            inputs = tokenizer(prompt, return_tensors="pt")
            outputs = model.generate(**inputs, max_new_tokens=30, pad_token_id=tokenizer.eos_token_id)
            # Decode only the newly generated tokens so the prompt itself is not echoed back
            new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
            sentence = tokenizer.decode(new_tokens, skip_special_tokens=True)

            clean_output = clean_sentence(sentence)

            results.append({
                "word": word,
                "phonetic": word_data["phonetic"],
                "sentence": clean_output
            })

        status.append("✅ 完成!")
        return results, status

    except Exception as e:
        status.append(f"❌ 發生錯誤: {str(e)}")
        return [], status
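
# The core function can also be exercised without the UI, e.g. (assuming a
# "common3000" word bank exists under ./data):
#
#   results, log = get_words_with_sentences("common3000", 3)
#   for item in results:
#       print(item["word"], item["phonetic"], "-", item["sentence"])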

# Gradio interface
demo = gr.Interface(
    fn=get_words_with_sentences,
    inputs=[
        gr.Dropdown(choices=get_sources(), value="common3000", label="Word bank", interactive=True, show_clear_button=False),
        gr.Number(value=10, precision=0, label="Number of words to sample")
    ],
    outputs=[
        gr.JSON(label="生成結果"),
        gr.JSON(label="生成進度")
    ]
)

demo.launch()
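
# Note: demo.launch(share=True) would additionally expose a temporary public
# URL when running locally.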