File size: 394 Bytes
4bb9d41
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
import pandas as pd
from typing import List, Dict
import os

def load_language_data(data_dir: str, language: str) -> List[str]:
    """Read every line of the text file belonging to *language*.

    The file is looked up at ``<data_dir>/<language lower-cased>/texts.txt``
    and decoded as UTF-8.

    Args:
        data_dir: Directory that holds one sub-directory per language.
        language: Language name; it is lower-cased before building the path.

    Returns:
        The file's lines in order, each still carrying its line ending
        (the last line has none if the file does not end with a newline).

    Raises:
        OSError: Propagated from ``open`` when the file cannot be opened
            (for example ``FileNotFoundError`` if it does not exist).
    """
    relative_path = f"{language.lower()}/texts.txt"
    source_path = os.path.join(data_dir, relative_path)
    with open(source_path, mode='r', encoding='utf-8') as handle:
        # Iterating a text file yields the same lines as readlines().
        lines = list(handle)
    return lines

def preprocess_text(text: str) -> str:
    """Return *text* with its whitespace normalised.

    Leading and trailing whitespace is removed and every internal run of
    whitespace (spaces, tabs, newlines, ...) is collapsed to a single space.

    Args:
        text: The raw string to clean up.

    Returns:
        The whitespace-separated tokens of *text* joined by single spaces;
        an empty string if *text* is empty or whitespace only.
    """
    # split() with no arguments discards leading/trailing whitespace and
    # treats consecutive whitespace as one separator, so no explicit strip
    # is needed before re-joining.
    tokens = text.split()
    return ' '.join(tokens)