File size: 13,752 Bytes
360dd3d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 |
import selenium
import os, warnings
import urllib.request
from time import sleep
import pandas as pd, ssl
from selenium import webdriver
from urllib.error import HTTPError
# Ignore every UserWarning and FutureWarning raised anywhere in this process.
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)
# Format floats with two decimal places when pandas displays them.
pd.options.display.float_format = '{:.2f}'.format
# NOTE(review): this swaps the ssl module's default HTTPS context factory for
# the unverified one, so standard-library HTTPS clients in this process stop
# validating server certificates. Both names are private ssl APIs. Confirm the
# bypass is still required before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context
def _fallback_wikis_codes():
    """Return the built-in snapshot of Wikipedia codes mapped to language names."""
    return {'en': 'English', 'ceb': 'Cebuano', 'de': 'German', 'sv': 'Swedish', 'fr': 'French', 'nl': 'Dutch', 'ru': 'Russian',
            'es': 'Spanish', 'it': 'Italian', 'arz': 'Egyptian Arabic', 'pl': 'Polish', 'ja': 'Japanese', 'zh': 'Chinese', 'vi':
            'Vietnamese', 'uk': 'Ukrainian', 'war': 'Waray', 'ar': 'Arabic', 'pt': 'Portuguese', 'fa': 'Persian', 'ca': 'Catalan',
            'sr': 'Serbian', 'id': 'Indonesian', 'ko': 'Korean', 'no': 'Norwegian (Bokmål)', 'ce': 'Chechen', 'fi': 'Finnish', 'cs':
            'Czech', 'tr': 'Turkish', 'hu': 'Hungarian', 'tt': 'Tatar', 'sh': 'Serbo-Croatian', 'ro': 'Romanian', 'zh-min-nan':
            'Southern Min', 'eu': 'Basque', 'ms': 'Malay', 'eo': 'Esperanto', 'he': 'Hebrew', 'hy': 'Armenian', 'da': 'Danish', 'bg':
            'Bulgarian', 'cy': 'Welsh', 'sk': 'Slovak', 'azb': 'South Azerbaijani', 'uz': 'Uzbek', 'et': 'Estonian', 'simple':
            'Simple English', 'be': 'Belarusian', 'kk': 'Kazakh', 'min': 'Minangkabau', 'el': 'Greek', 'hr': 'Croatian', 'lt': 'Lithuanian',
            'gl': 'Galician', 'az': 'Azerbaijani', 'ur': 'Urdu', 'sl': 'Slovene', 'lld': 'Ladin', 'ka': 'Georgian', 'nn': 'Norwegian (Nynorsk)',
            'hi': 'Hindi', 'th': 'Thai', 'ta': 'Tamil', 'bn': 'Bengali', 'la': 'Latin', 'mk': 'Macedonian', 'zh-yue': 'Cantonese', 'ast':
            'Asturian', 'lv': 'Latvian', 'af': 'Afrikaans', 'tg': 'Tajik', 'my': 'Burmese', 'mg': 'Malagasy', 'mr': 'Marathi', 'sq': 'Albanian',
            'bs': 'Bosnian', 'oc': 'Occitan', 'te': 'Telugu', 'ml': 'Malayalam', 'nds': 'Low German', 'be-tarask': 'Belarusian (Taraškievica)',
            'br': 'Breton', 'ky': 'Kyrgyz', 'sw': 'Swahili', 'jv': 'Javanese', 'lmo': 'Lombard', 'new': 'Newar', 'pnb': 'Western Punjabi', 'vec':
            'Venetian', 'ht': 'Haitian Creole', 'pms': 'Piedmontese', 'ba': 'Bashkir', 'lb': 'Luxembourgish', 'su': 'Sundanese', 'ku': 'Kurdish (Kurmanji)',
            'ga': 'Irish', 'szl': 'Silesian', 'is': 'Icelandic', 'fy': 'West Frisian', 'cv': 'Chuvash', 'ckb': 'Kurdish (Sorani)', 'pa': 'Punjabi', 'tl':
            'Tagalog', 'an': 'Aragonese', 'wuu': 'Wu Chinese', 'diq': 'Zaza', 'io': 'Ido', 'sco': 'Scots', 'vo': 'Volapük', 'yo': 'Yoruba', 'ne': 'Nepali',
            'ia': 'Interlingua', 'kn': 'Kannada', 'gu': 'Gujarati', 'als': 'Alemannic German', 'ha': 'Hausa', 'avk': 'Kotava', 'bar': 'Bavarian', 'crh':
            'Crimean Tatar', 'scn': 'Sicilian', 'bpy': 'Bishnupriya Manipuri', 'qu': 'Quechua (Southern Quechua)', 'nv': 'Navajo', 'mn': 'Mongolian', 'xmf':
            'Mingrelian', 'ban': 'Balinese', 'si': 'Sinhala', 'tum': 'Tumbuka', 'ps': 'Pashto', 'frr': 'North Frisian', 'os': 'Ossetian', 'mzn': 'Mazanderani',
            'bat-smg': 'Samogitian', 'or': 'Odia', 'ig': 'Igbo', 'sah': 'Yakut', 'cdo': 'Eastern Min', 'gd': 'Scottish Gaelic', 'bug': 'Buginese', 'yi': 'Yiddish',
            'sd': 'Sindhi', 'ilo': 'Ilocano', 'am': 'Amharic', 'nap': 'Neapolitan', 'li': 'Limburgish', 'bcl': 'Central Bikol', 'fo': 'Faroese', 'gor': 'Gorontalo',
            'hsb': 'Upper Sorbian', 'map-bms': 'Banyumasan', 'mai': 'Maithili', 'shn': 'Shan', 'eml': 'Emilian-Romagnol', 'ace': 'Acehnese', 'zh-classical':
            'Classical Chinese', 'sa': 'Sanskrit', 'as': 'Assamese', 'wa': 'Walloon', 'ie': 'Interlingue', 'hyw': 'Western Armenian', 'lij': 'Ligurian', 'mhr':
            'Meadow Mari', 'zu': 'Zulu', 'sn': 'Shona', 'hif': 'Fiji Hindi', 'mrj': 'Hill Mari', 'bjn': 'Banjarese', 'mni': 'Meitei', 'km': 'Khmer', 'hak':
            'Hakka Chinese', 'roa-tara': 'Tarantino', 'pam': 'Kapampangan', 'sat': 'Santali', 'rue': 'Rusyn', 'nso': 'Northern Sotho', 'bh': 'Bihari (Bhojpuri)',
            'so': 'Somali', 'mi': 'Māori', 'se': 'Northern Sámi', 'myv': 'Erzya', 'vls': 'West Flemish', 'nds-nl': 'Dutch Low Saxon', 'dag': 'Dagbani', 'sc':
            'Sardinian', 'ary': 'Moroccan Arabic', 'co': 'Corsican', 'kw': 'Cornish', 'bo': 'Lhasa Tibetan', 'vep': 'Veps', 'glk': 'Gilaki', 'tk': 'Turkmen', 'kab':
            'Kabyle', 'gan': 'Gan Chinese', 'rw': 'Kinyarwanda', 'fiu-vro': 'Võro', 'ab': 'Abkhaz', 'gv': 'Manx', 'ug': 'Uyghur', 'nah': 'Nahuatl', 'zea': 'Zeelandic',
            'skr': 'Saraiki', 'frp': 'Franco-Provençal', 'udm': 'Udmurt', 'pcd': 'Picard', 'mt': 'Maltese', 'kv': 'Komi', 'csb': 'Kashubian', 'gn': 'Guarani', 'smn':
            'Inari Sámi', 'ay': 'Aymara', 'nrm': 'Norman', 'ks': 'Kashmiri', 'lez': 'Lezgian', 'lfn': 'Lingua Franca Nova', 'olo': 'Livvi-Karelian', 'mwl': 'Mirandese',
            'stq': 'Saterland Frisian', 'lo': 'Lao', 'ang': 'Old English', 'mdf': 'Moksha', 'fur': 'Friulian', 'rm': 'Romansh', 'lad': 'Judaeo-Spanish', 'kaa': 'Karakalpak',
            'gom': 'Konkani (Goan Konkani)', 'ext': 'Extremaduran', 'koi': 'Permyak', 'tyv': 'Tuvan', 'pap': 'Papiamento', 'av': 'Avar', 'dsb': 'Lower Sorbian', 'ln':
            'Lingala', 'dty': 'Doteli', 'tw': 'Twi', 'cbk-zam': 'Chavacano (Zamboanga)', 'dv': 'Maldivian', 'ksh': 'Ripuarian', 'za': 'Zhuang (Standard Zhuang)', 'gag':
            'Gagauz', 'bxr': 'Buryat (Russia Buriat)', 'pfl': 'Palatine German', 'lg': 'Luganda', 'szy': 'Sakizaya', 'pag': 'Pangasinan', 'blk': "Pa'O", 'pi': 'Pali',
            'tay': 'Atayal', 'haw': 'Hawaiian', 'awa': 'Awadhi', 'inh': 'Ingush', 'krc': 'Karachay-Balkar', 'xal': 'Kalmyk Oirat', 'pdc': 'Pennsylvania Dutch', 'to':
            'Tongan', 'atj': 'Atikamekw', 'tcy': 'Tulu', 'arc': 'Aramaic (Syriac)', 'mnw': 'Mon', 'jam': 'Jamaican Patois', 'shi': 'Shilha', 'kbp': 'Kabiye', 'wo':
            'Wolof', 'anp': 'Angika', 'kbd': 'Kabardian', 'nia': 'Nias', 'nov': 'Novial', 'om': 'Oromo', 'ki': 'Kikuyu', 'nqo': "N'Ko", 'bi': 'Bislama', 'xh': 'Xhosa',
            'tpi': 'Tok Pisin', 'tet': 'Tetum', 'ff': 'Fula', 'roa-rup': 'Aromanian', 'jbo': 'Lojban', 'fj': 'Fijian', 'kg': 'Kongo (Kituba)', 'lbe': 'Lak', 'ty': 'Tahitian',
            'guw': 'Gun', 'cu': 'Old Church Slavonic', 'trv': 'Seediq', 'ami': 'Amis', 'srn': 'Sranan Tongo', 'sm': 'Samoan', 'mad': 'Madurese', 'alt': 'Southern Altai',
            'ltg': 'Latgalian', 'gcr': 'French Guianese Creole', 'chr': 'Cherokee', 'tn': 'Tswana', 'ny': 'Chewa', 'st': 'Sotho', 'pih': 'Norfuk', 'rmy': 'Romani (Vlax Romani)',
            'got': 'Gothic', 'ee': 'Ewe', 'pcm': 'Nigerian Pidgin', 'bm': 'Bambara', 'ss': 'Swazi', 'ts': 'Tsonga', 've': 'Venda', 'kcg': 'Tyap', 'chy': 'Cheyenne', 'rn':
            'Kirundi', 'ch': 'Chamorro', 'gur': 'Frafra', 'ik': 'Iñupiaq', 'ady': 'Adyghe', 'pnt': 'Pontic Greek', 'guc': 'Wayuu', 'iu': 'Inuktitut', 'pwn': 'Paiwan', 'sg':
            'Sango', 'din': 'Dinka', 'ti': 'Tigrinya', 'kl': 'Greenlandic', 'dz': 'Dzongkha', 'cr': 'Cree', 'ak': 'Akan'}


def _wikis_codes_from_tables(tables):
    """Build a ``{wiki code: language}`` dict from the first suitable table.

    A table is suitable when it has both a 'Wiki' and a 'Language' column.
    Returns ``None`` when no table qualifies.
    """
    for table in tables:
        if not {'Language', 'Wiki'}.issubset(table.columns):
            continue
        codes = table[['Wiki', 'Language']]
        # regex=False: "(closed)" is a literal marker, not a capture group.
        # na=True: rows without a language name are dropped with the closed ones.
        closed = codes['Language'].str.contains('(closed)', regex=False, na=True)
        return codes[~closed].set_index('Wiki')['Language'].to_dict()
    return None


def fetch_wikis_codes(urls=None):
    """Return a dict mapping Wikipedia language codes to language names.

    Each URL is read with ``pandas.read_html`` and searched for a table that
    has 'Wiki' and 'Language' columns; wikis whose language is marked
    "(closed)" are left out. The first URL that yields a non-empty mapping
    wins. If no URL does, a built-in snapshot is returned, so the function
    always returns a usable mapping.

    Args:
        urls: Optional iterable of pages to try, in order of preference.
            Defaults to the Wikimedia meta list followed by the English
            Wikipedia list.

    Returns:
        dict: ``{wiki_code: language_name}``.
    """
    if urls is None:
        urls = (r'https://meta.wikimedia.org/wiki/List_of_Wikipedias',
                r'https://en.wikipedia.org/wiki/Statistics_of_Wikipedias')
    for url in urls:
        try:
            wikis_codes = _wikis_codes_from_tables(pd.read_html(url))
        except Exception as error:
            # Best-effort boundary: a network, parser or layout problem on one
            # source must not abort the run, but it should not be silent.
            print(f' [!] Could not read wiki codes from {url}: {error}')
            continue
        if wikis_codes:
            return wikis_codes
    return _fallback_wikis_codes()
def _stats_request_url(wiki, metric, submetric):
    """Build the stats.wikimedia.org table URL for one wiki, metric and submetric.

    Raises:
        ValueError: If ``metric`` is neither 'pages' nor 'edits'.
    """
    if metric == 'pages':
        base_url = f'https://stats.wikimedia.org/#/{wiki}.wikipedia.org/content/pages-to-date/full|table|'
    elif metric == 'edits':
        base_url = f'https://stats.wikimedia.org/#/{wiki}.wikipedia.org/contributing/edits/full|table|'
    else:
        raise ValueError(f'Error: this metric: {metric} is not supported!')
    parameters = f'1-month|editor_type~anonymous*group-bot*name-bot*user+(page_type)~{submetric}|monthly'
    return "".join([base_url, parameters])


def fetch_wiki_metadata(wiki, metric, submetric, timeout):
    """Download one statistics table as CSV using headless Firefox.

    Opens the stats.wikimedia.org table page for the given wiki, waits
    ``timeout`` seconds, clicks the page's download button and renames the
    downloaded ``undefined.csv`` in the current working directory.

    Args:
        wiki: Wikipedia language code used as the subdomain.
        metric: 'pages' or 'edits'.
        submetric: Page-type filter inserted into the URL.
        timeout: Seconds to sleep after loading the page before clicking.

    Returns:
        str: Name of the CSV file, ``"{wiki}--{metric}--{submetric}.csv"``
        with spaces replaced by dashes.

    Raises:
        ValueError: If ``metric`` is not supported. Raised before any browser
            is started.
    """
    # Validate and build everything that needs no browser first, so a bad
    # argument never leaves a Firefox process behind.
    request_url = _stats_request_url(wiki, metric, submetric)
    csvFilename = f"{wiki}--{metric}--{submetric}.csv".replace(' ', '-')

    options = webdriver.FirefoxOptions()
    options.add_argument("--headless")
    profile = webdriver.FirefoxProfile()
    # Save downloads straight into the working directory without prompting.
    profile.set_preference("browser.download.folderList", 2)
    profile.set_preference("browser.download.manager.showWhenStarting", False)
    profile.set_preference("browser.download.dir", f"{os.getcwd()}")
    profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream")
    driver = webdriver.Firefox(options=options, firefox_profile=profile, executable_path='geckodriver', service_log_path=os.devnull)
    try:
        driver.implicitly_wait(3)
        driver.get(request_url)
        # NOTE(review): the value is unused; presumably kept as an extra round
        # trip to the browser before waiting. Confirm before removing.
        driver.page_source
        sleep(timeout)
        driver.find_element_by_class_name("ui.icon.button.tooltipped.tooltipped-n").click()
        # Give the browser time to finish writing the download.
        sleep(3)
        os.rename("undefined.csv", csvFilename)
    finally:
        # Always end the session, including when the click or rename raises,
        # so callers that retry do not accumulate browser processes.
        driver.quit()
    print(f' [+] Metadata Exported to `{wiki}/{csvFilename}`.')
    return csvFilename
# Driver script: for every wiki, download the four statistics tables
# (pages/edits x content/non-content), summarise the latest month as
# bots-vs-humans totals, and file the raw CSVs under all-metadata/<wiki>/.
wiki_codes = fetch_wikis_codes()
labels = [f"{value} ({key})" for key, value in wiki_codes.items()]
wikis = list(wiki_codes.keys())
metrics = ['pages', 'edits']
submetrics = ['content', 'non-content']
# Display names used in the summary table.
metric_labels = {'pages': 'Pages', 'edits': 'Edits'}
submetric_labels = {'content': 'Articles', 'non-content': 'Non-Articles'}
timeout = 3

os.makedirs('all-metadata', exist_ok=True)
for counter, wiki in enumerate(wikis, start=1):
    print(f'{counter}## {wiki_codes[wiki]} Wikipedia Files:')
    wiki_dir = f'{wiki}'
    os.makedirs(wiki_dir, exist_ok=True)
    selected_language = f'{wiki_codes[wiki]} ({wiki})'
    rows = []
    for metric in metrics:
        for submetric in submetrics:
            try:
                csvFilename = fetch_wiki_metadata(wiki, metric, submetric, timeout)
            except selenium.common.exceptions.ElementClickInterceptedException:
                # Retry once with twice the wait; the doubled wait is kept for
                # all later fetches as well.
                timeout *= 2
                csvFilename = fetch_wiki_metadata(wiki, metric, submetric, timeout)
            # Only the last row of the downloaded table is used.
            dataframe = pd.read_csv(csvFilename).iloc[-1]
            retrieval_date = pd.to_datetime(dataframe['timeRange.end']).strftime('%Y-%m-%d')
            editor_totals = (
                ('Bots', dataframe['total.group-bot'] + dataframe['total.name-bot']),
                ('Humans', dataframe['total.user'] + dataframe['total.anonymous']),
            )
            for editors, value in editor_totals:
                rows.append({'Wiki': selected_language,
                             'Metric': metric_labels[metric],
                             'Sub-Metric': submetric_labels[submetric],
                             'Editors': editors,
                             'Values': value})
            # Move with os.replace rather than a shell `mv`: wiki codes come
            # from a scraped table and must not be interpolated into a shell
            # command, and a failed move should raise instead of passing silently.
            os.replace(csvFilename, os.path.join(wiki_dir, csvFilename))
    wiki_metadata = pd.DataFrame(rows, columns=['Wiki', 'Metric', 'Sub-Metric', 'Editors', 'Values'])
    wiki_metadata['Retrieval-Date'] = retrieval_date
    wiki_metadata.to_csv(f'{wiki_codes[wiki].replace(" ","-")}--Wikipedia--Metadata.csv', index=False)
    # File the finished per-wiki folder under all-metadata/, overwriting any
    # files left there by an earlier run.
    destination = os.path.join('all-metadata', wiki_dir)
    os.makedirs(destination, exist_ok=True)
    for filename in os.listdir(wiki_dir):
        os.replace(os.path.join(wiki_dir, filename), os.path.join(destination, filename))
    os.rmdir(wiki_dir)
    sleep(1)
|