import wikipediaapi


class Article:
    """Collect the paragraphs of a Wikipedia article as a flat list of records.

    Every record is a dict with three keys:

    * ``'id'`` -- running number, starting at 1 for each fetched article;
    * ``'section'`` -- the section title, or ``'Summary'`` for the summary;
    * ``'text'`` -- one newline-delimited piece of text, lower-cased.

    Typical use is a single call to :meth:`get_article_data`; the other
    methods are the individual steps it runs.
    """

    # Titles of sections whose records clean_data() throws away.
    _UNWANTED_SECTIONS = frozenset({'See also', 'External links'})

    def __init__(self):
        self.article = None      # object returned by .page() for the last request
        self.article_data = []   # records collected for that article
        self.id_counter = 0      # id assigned to the most recently added record

    def _add_paragraphs(self, section_title, text):
        """Split *text* on newlines and append one record per piece."""
        for paragraph in text.split('\n'):
            self.id_counter += 1
            self.article_data.append(
                {
                    'id': self.id_counter,
                    'section': section_title,
                    'text': paragraph.lower(),
                }
            )

    def set_summary(self):
        """Append the article summary, if non-empty, under section 'Summary'.

        Requires ``self.article`` to have been set (see get_article_data).
        """
        summary = self.article.summary
        if summary:
            self._add_paragraphs('Summary', summary)

    def set_sections_and_texts(self, sections):
        """Append the text of every section in *sections*, depth first.

        Args:
            sections: iterable of section objects exposing ``title``,
                ``text`` and ``sections`` (the nested subsections).
        """
        for section in sections:
            if section.text:
                self._add_paragraphs(section.title, section.text)
            # Subsections are emitted right after their parent's own text.
            if section.sections:
                self.set_sections_and_texts(section.sections)

    def clean_data(self):
        """Drop near-empty records and records from unwanted sections.

        A record is kept only if its text is longer than one character and
        its section is not 'See also' or 'External links'.  Ids are not
        renumbered, so the surviving records may have gaps in their ids.
        """
        self.article_data = [
            record
            for record in self.article_data
            if len(record['text']) > 1
            and record['section'] not in self._UNWANTED_SECTIONS
        ]

    def get_article_data(self, article_name):
        """Fetch *article_name* and return its cleaned paragraph records.

        Args:
            article_name: title of the page to look up.

        Returns:
            The list of record dicts (also stored in ``self.article_data``),
            or an empty list when the page does not exist.
        """
        # Start from a clean slate so that reusing the instance for another
        # article does not mix in (or keep returning) the previous records.
        self.article_data = []
        self.id_counter = 0

        # NOTE(review): some Wikipedia-API releases appear to expect a user
        # agent string in the constructor -- confirm against the installed
        # version before changing this call.
        self.article = wikipediaapi.Wikipedia('en').page(article_name)
        if not self.article.exists():
            return []

        self.set_summary()
        self.set_sections_and_texts(self.article.sections)
        self.clean_data()
        return self.article_data