import requests
import bz2
import xml.etree.ElementTree as ET
import os
import pickle
from tqdm import tqdm
import mwparserfromhell
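
# Overview: this script downloads the latest Telugu Wiktionary XML dump,
# decompresses it, pulls the [[en:...]] interlanguage links out of each
# page's wikitext, and saves the resulting Telugu -> English mapping as a pickle.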

# Step 1: Download the latest dump
DUMP_URL = "https://dumps.wikimedia.org/tewiktionary/latest/tewiktionary-latest-pages-articles.xml.bz2"

print("Downloading the latest dump...")
response = requests.get(DUMP_URL, stream=True)
response.raise_for_status()  # abort early if the dump could not be fetched
total_size = int(response.headers.get('content-length', 0))
progress_bar = tqdm(total=total_size, unit='B', unit_scale=True)

dump_file = "tewiktionary-latest-pages-articles.xml.bz2"
with open(dump_file, 'wb') as file:
    for chunk in response.iter_content(chunk_size=8192):
        progress_bar.update(len(chunk))
        file.write(chunk)
progress_bar.close()

# Step 2: Extract the dump
print("\nExtracting the dump...")
with bz2.open(dump_file, 'rb') as source, open(dump_file[:-4], 'wb') as dest:
    for line in source:
        dest.write(line)

# Step 3: Parse the XML dump and extract translations
print("Parsing the XML dump to extract translations...")
tree = ET.parse(dump_file[:-4])
root = tree.getroot()

# The export schema version differs between dumps (e.g. export-0.10 vs export-0.11),
# so derive the XML namespace from the root element instead of hardcoding it.
ns = {'ns': root.tag[1:root.tag.index('}')]}

translations = {}

for page in root.findall('ns:page', ns):
    title = page.find('ns:title', ns).text
    revision = page.find('ns:revision', ns)
    # Use explicit None checks: an Element with no children is falsy, so a bare
    # `if revision:` / `if text_data:` would skip pages whose <text> has content.
    if revision is not None:
        text_data = revision.find('ns:text', ns)
        if text_data is not None and text_data.text:
            # Parse the wikitext and keep only interlanguage links to English ([[en:...]])
            wikicode = mwparserfromhell.parse(text_data.text)
            links = [str(link.title) for link in wikicode.filter_wikilinks()
                     if str(link.title).startswith("en:")]
            if links:
                # Split on the first colon only, so titles containing ':' stay intact
                english_translations = [link.split(':', 1)[1] for link in links]
                translations[title] = english_translations

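# At this point `translations` maps each Telugu page title to the list of
# English page titles its wikitext links to via [[en:...]].
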
# Display the first 1000 translations
print("\nDisplaying the first 1000 translations:")
for i, (telugu_word, english_words) in enumerate(translations.items()):
    if i >= 1000:
        break
    print(f"Telugu Word: {telugu_word}, English Translations: {', '.join(english_words)}")

# Save the translations to a pickle file
print("\nSaving translations to pickle file...")
pickle_filename = "telugu_english_translations.pkl"
with open(pickle_filename, 'wb') as file:
    pickle.dump(translations, file)

print(f"Translations saved to {pickle_filename}")

# Optional: Remove the downloaded files if you want
# os.remove(dump_file)
# os.remove(dump_file[:-4])