import os

import wikipedia


def get_raw_wikipedia_article(entity):
    """Return the plain-text content of the best-matching Wikipedia page,
    or None if no usable page exists."""
    try:
        results = wikipedia.search(entity)
        if not results:
            # No search hits at all; treat this like a missing page.
            print("The requested page does not exist on Wikipedia.")
            return None
        best_result = results[0]
        page = wikipedia.page(best_result, auto_suggest=False)
        return page.content
    except wikipedia.exceptions.DisambiguationError:
        # The search term can't be disambiguated, so we try again with
        # a more specific search term by adding ' (arts)'.
        if entity.endswith(' (arts)'):
            # Already retried once; stop instead of recursing forever.
            return None
        return get_raw_wikipedia_article(entity + ' (arts)')
    except wikipedia.exceptions.PageError:
        # The page doesn't exist under this exact title.
        print("The requested page does not exist on Wikipedia.")
        return None
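

# Illustrative usage, assuming the third-party `wikipedia` package is
# installed and the network is reachable ('Impressionism' is only an
# example query, not taken from this repo):
#
#   >>> text = get_raw_wikipedia_article('Impressionism')
#   >>> print(text[:60])  # first characters of the raw article body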


def clean_article(raw_article):
    """Drop section headings, blank lines, and everything from the
    'See also' / 'References' sections onward."""
    lines = raw_article.split('\n')
    clean_lines = []
    for line in lines:
        # The trailing 'See also' and 'References' sections are metadata,
        # so stop once either heading appears.
        if line.startswith('== See also') or line.startswith('== References'):
            break
        # Skip section headings and empty lines.
        if line.startswith('='):
            continue
        if len(line.strip()) == 0:
            continue
        clean_lines.append(line.strip())
    return '\n'.join(clean_lines)
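

# A sketch of the cleaning behavior on a made-up snippet (the headings and
# blank line mimic typical `page.content` output; the text is invented):
#
#   >>> raw = '== History ==\nFirst paragraph.\n\n== See also ==\nOther page'
#   >>> clean_article(raw)
#   'First paragraph.'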


def save_article(content, path):
    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)


def load_entities(entities_path):
    """Read one entity name per line, skipping blank lines."""
    with open(entities_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]
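

# Expected format of the entities file (an assumption inferred from
# load_entities): plain UTF-8 text with one entity name per line, e.g.
#
#   Claude Monet
#   Impressionism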


def scrape(entities_path, save_path):
    entities = load_entities(entities_path)
    # Make sure the output directory exists before writing into it.
    os.makedirs(save_path, exist_ok=True)
    for entity in entities:
        raw_article = get_raw_wikipedia_article(entity)
        if raw_article is None:
            print(f'Article on Wikipedia not found for entity {entity} :(')
            continue
        cleaned_article = clean_article(raw_article)
        save_article(cleaned_article, os.path.join(save_path, f'{entity}.txt'))


if __name__ == '__main__':
    scrape('src/data/entities.txt', 'data/wiki_articles')
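

# To run the scraper end to end (assuming this file lives at the repository
# root as e.g. scrape.py, next to the src/ and data/ directories):
#
#   $ pip install wikipedia
#   $ python scrape.py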