File size: 1,818 Bytes
ef3de03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f150f6b
ef3de03
 
f150f6b
 
 
 
 
ef3de03
f150f6b
ef3de03
 
 
 
f150f6b
ef3de03
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from bs4 import BeautifulSoup
from pathlib import Path


class SearchResultsExtractor:
    """Print search results and related questions found in a saved HTML page.

    Typical use is ``SearchResultsExtractor().extract(html_path)``, which
    parses the file and prints every result block followed by every related
    question. An element that lacks one of the expected child tags is printed
    with an empty field instead of aborting the whole run.
    """

    def __init__(self) -> None:
        # Parsed document; stays None until load_html() has been called.
        self.soup = None

    @staticmethod
    def _text_of(element):
        """Return ``element.text``, or ``""`` when the lookup found nothing."""
        return element.text if element is not None else ""

    def load_html(self, html_path):
        """Read ``html_path`` as UTF-8 and parse it into ``self.soup``.

        Args:
            html_path: Path of the HTML file to parse.
        """
        with open(html_path, "r", encoding="utf-8") as f:
            html = f.read()
        self.soup = BeautifulSoup(html, "html.parser")

    def extract_search_results(self):
        """Print title, site, link and abstract for every ``div.g`` element.

        After the individual results, the number of matched ``div.g``
        elements is printed. Nothing is returned.
        """
        search_result_elements = self.soup.find_all("div", class_="g")

        for result in search_result_elements:
            # The site name is the <span> that precedes the <cite> element;
            # without a <cite> there is nothing to anchor the lookup on.
            cite = result.find("cite")
            site_element = cite.find_previous("span") if cite is not None else None
            site = self._text_of(site_element)

            # .get() tolerates an <a> without an href attribute.
            anchor = result.find("a")
            link = anchor.get("href", "") if anchor is not None else ""

            title = self._text_of(result.find("h3"))

            # Preferred abstract container first, then the class-based fallback.
            abstract_element = result.find("div", {"data-sncf": "1"})
            if abstract_element is None:
                abstract_element = result.find("div", class_="ITZIwc")
            abstract = self._text_of(abstract_element).strip()

            print(f"{title}\n  - {site}\n  - {link}\n  - {abstract}\n\n")
        print(len(search_result_elements))

    def extract_related_questions(self):
        """Print the text of every ``div.related-question-pair`` element.

        The question text is taken from the first ``<span>`` inside each
        element (empty when there is none). The number of matched elements is
        printed last. Nothing is returned.
        """
        related_question_elements = self.soup.find_all(
            "div", class_="related-question-pair"
        )
        for question_element in related_question_elements:
            question = self._text_of(question_element.find("span")).strip()
            print(question)
        print(len(related_question_elements))

    def extract(self, html_path):
        """Load ``html_path`` and print its search results and related questions.

        Args:
            html_path: Path of the HTML file to process.
        """
        self.load_html(html_path)
        self.extract_search_results()
        self.extract_related_questions()


if __name__ == "__main__":
    # Sample pages are read from a "files" directory located one level above
    # the directory that contains this script.
    # Alternative sample page: "python教程"
    html_filename = "python_tutorials"
    html_path_root = Path(__file__).parents[1].joinpath("files")
    html_path = html_path_root.joinpath(f"{html_filename}.html")

    # Parse the chosen page and print everything that was extracted.
    extractor = SearchResultsExtractor()
    extractor.extract(html_path)