# Spaces:
# Running
# Running
import bz2
import os
import pickle
import xml.etree.ElementTree as ET

import mwparserfromhell
import requests
from tqdm import tqdm
# Step 1: Download the latest dump
DUMP_URL = "https://dumps.wikimedia.org/tewiktionary/latest/tewiktionary-latest-pages-articles.xml.bz2"
dump_file = "tewiktionary-latest-pages-articles.xml.bz2"
print("Downloading the latest dump...")
# stream=True avoids loading the whole archive into memory. The timeout bounds
# the connect and each socket read so a stalled server cannot hang the script.
with requests.get(DUMP_URL, stream=True, timeout=60) as response:
    # Stop on 4xx/5xx instead of saving an HTTP error page as the ".bz2" file,
    # which would only fail later (and confusingly) during decompression.
    response.raise_for_status()
    total_size = int(response.headers.get('content-length', 0))
    # When the server sends no content-length, pass None rather than 0 so the
    # progress bar does not claim a zero-byte total.
    with open(dump_file, 'wb') as file, \
            tqdm(total=total_size or None, unit='B', unit_scale=True) as progress_bar:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)
            progress_bar.update(len(chunk))
# Step 2: Decompress the downloaded archive to a plain XML file
print("\nExtracting the dump...")
with bz2.open(dump_file, 'rb') as compressed:
    # The output path is the archive name without its 4-character ".bz2" suffix.
    with open(dump_file[:-4], 'wb') as plain:
        # writelines() iterates the decompressed stream line by line and
        # writes each line unchanged.
        plain.writelines(compressed)
# Step 3: Parse the XML dump and extract translations
# Builds `translations`: page title -> list of link targets taken from every
# wikilink on that page whose title starts with "en:".
print("Parsing the XML dump to extract translations...")
tree = ET.parse(dump_file[:-4])
root = tree.getroot()
# The export-schema version embedded in the XML namespace differs between
# dumps (export-0.10, export-0.11, ...). Read it from the root tag, which
# ElementTree spells "{uri}mediawiki", instead of hard-coding one version.
# An un-namespaced root falls back to the previously hard-coded URI.
if root.tag.startswith('{'):
    namespace_uri = root.tag[1:root.tag.index('}')]
else:
    namespace_uri = 'http://www.mediawiki.org/xml/export-0.10/'
ns = {'ns': namespace_uri}
translations = {}
for page in root.findall('ns:page', ns):
    # findtext() returns None when the element is missing and '' when it is
    # empty. Do NOT truth-test the Element itself: an Element with no child
    # elements (such as <text>) is falsy even when it holds text.
    title = page.findtext('ns:title', namespaces=ns)
    wikitext = page.findtext('ns:revision/ns:text', namespaces=ns)
    if not title or not wikitext:
        continue
    # Parse the wikitext
    wikicode = mwparserfromhell.parse(wikitext)
    english_translations = []
    for link in wikicode.filter_wikilinks():
        target = str(link.title)
        if target.startswith("en:"):
            # Split only on the first colon so a target that itself contains
            # a colon is kept whole.
            english_translations.append(target.split(':', 1)[1])
    if english_translations:
        translations[title] = english_translations
# Show a preview: at most the first 1000 entries, in insertion order
print("\nDisplaying the first 1000 translations:")
preview_entries = list(translations.items())[:1000]
for telugu_word, english_words in preview_entries:
    print(f"Telugu Word: {telugu_word}, English Translations: {', '.join(english_words)}")
# Persist the translations dict to disk as a pickle
print("\nSaving translations to pickle file...")
pickle_filename = "telugu_english_translations.pkl"
with open(pickle_filename, 'wb') as pickle_handle:
    # Pickler(file).dump(obj) is the documented equivalent of pickle.dump(obj, file).
    pickle.Pickler(pickle_handle).dump(translations)
print(f"Translations saved to {pickle_filename}")
# Optional: uncomment the lines below to delete the downloaded archive and the extracted XML file
# os.remove(dump_file)
# os.remove(dump_file[:-4])