File size: 1,525 Bytes
9f23e0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import wikipediaapi

class Article:
    """Collect the text of a Wikipedia page, grouped by section title.

    ``article_data`` maps a section title (plus the synthetic key
    ``'Summary'``) to a list of lower-cased lines ("docs") obtained by
    splitting that section's text on newlines.
    """

    def __init__(self, article_name: str) -> None:
        """Create a page handle for *article_name* on the English Wikipedia.

        Args:
            article_name: Title of the page to look up.
        """
        # Section title -> list of lower-cased text lines.
        self.article_data = {}
        # NOTE(review): newer wikipedia-api releases appear to expect a user
        # agent as the first positional argument -- confirm against the
        # installed version before changing this call.
        self.article = wikipediaapi.Wikipedia('en').page(article_name)

    def article_exists(self) -> bool:
        """Return True if the page exists, False otherwise.

        Any error raised by the lookup is treated as "does not exist"
        (best effort), so this method always returns a real ``bool``.
        """
        try:
            return bool(self.article.exists())
        except Exception:
            # Deliberate best effort: a failed lookup counts as "missing".
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt
            # and SystemExit propagate.
            return False

    def get_sections_and_texts(self, sections) -> None:
        """Recursively store the text of *sections* in ``article_data``.

        The page summary is stored once under ``'Summary'``. Each section
        with non-empty text is stored under its title; nested sections are
        visited depth-first. A later section with the same title overwrites
        an earlier one.

        Args:
            sections: Iterable of section objects exposing ``title``,
                ``text`` and ``sections`` attributes.
        """
        if 'Summary' not in self.article_data:
            summary = self.article.summary
            # Always a list, so every value in article_data has one type.
            self.article_data['Summary'] = (
                summary.lower().split('\n') if summary else []
            )

        for section in sections:
            if section.text:
                self.article_data[section.title] = (
                    section.text.lower().split('\n')
                )
            if section.sections:
                self.get_sections_and_texts(section.sections)

    def remove_empty_sections(self) -> None:
        """Drop empty strings from every doc list in ``article_data``.

        Keys are kept even when their list ends up empty.
        """
        for title, docs in self.article_data.items():
            # Build a new list rather than calling remove() while iterating:
            # removing during iteration skips the element that follows each
            # removal, so consecutive empty lines would survive.
            # Re-assigning an existing key does not resize the dict, so it
            # is safe during iteration.
            self.article_data[title] = [doc for doc in docs if doc]

    def get_article_data(self) -> dict:
        """Gather all section texts and return them with basic statistics.

        Returns:
            A dict with keys ``'article_data'`` (title -> list of lines),
            ``'num_docs'`` (total number of lines) and ``'avg_doc_len'``
            (mean whitespace-separated token count per line; ``0.0`` when
            there are no lines).
        """
        self.get_sections_and_texts(self.article.sections)
        self.remove_empty_sections()

        all_docs = [
            doc for docs in self.article_data.values() for doc in docs
        ]
        num_docs = len(all_docs)
        total_tokens = sum(len(doc.split()) for doc in all_docs)
        # Guard against a missing/empty page, which yields no docs at all.
        avg_doc_len = total_tokens / num_docs if num_docs else 0.0

        return {
            'article_data': self.article_data,
            'num_docs': num_docs,
            'avg_doc_len': avg_doc_len,
        }