File size: 6,844 Bytes
58d33f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
"""Loader that loads iFixit data."""
from typing import List, Optional

import requests

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.web_base import WebBaseLoader

IFIXIT_BASE_URL = "https://www.ifixit.com/api/2.0"


class IFixitLoader(BaseLoader):
    """Load iFixit repair guides, device wikis and answers.

    iFixit is the largest, open repair community on the web. The site contains nearly
    100k repair manuals, 200k Questions & Answers on 42k devices, and all the data is
    licensed under CC-BY.

    This loader will allow you to download the text of a repair guide, text of Q&A's
    and wikis from devices on iFixit using their open APIs and web scraping.
    """

    def __init__(self, web_path: str):
        """Initialize with an iFixit page URL.

        Args:
            web_path: Full URL of an iFixit Device, Guide, Teardown or
                Answers page.

        Raises:
            ValueError: If the URL is not an iFixit URL, or is not one of
                the supported page types.
        """
        if not web_path.startswith("https://www.ifixit.com"):
            raise ValueError("web path must start with 'https://www.ifixit.com'")

        path = web_path.replace("https://www.ifixit.com", "")

        allowed_paths = ["/Device", "/Guide", "/Answers", "/Teardown"]

        # TODO: Add /Wiki
        if not any(path.startswith(allowed_path) for allowed_path in allowed_paths):
            raise ValueError(
                "web path must start with /Device, /Guide, /Teardown or /Answers"
            )

        pieces = [x for x in path.split("/") if x]

        # Teardowns are just guides by a different name.
        self.page_type = pieces[0] if pieces[0] != "Teardown" else "Guide"

        # Guide/Answers URLs look like /<type>/<title>/<id>; Device URLs
        # look like /Device/<id>.
        if self.page_type in ("Guide", "Answers"):
            self.id = pieces[2]
        else:
            self.id = pieces[1]

        self.web_path = web_path

    def load(self) -> List[Document]:
        """Load documents for the configured page, dispatching on page type."""
        if self.page_type == "Device":
            return self.load_device()
        elif self.page_type in ("Guide", "Teardown"):
            return self.load_guide()
        elif self.page_type == "Answers":
            return self.load_questions_and_answers()
        else:
            raise ValueError("Unknown page type: " + self.page_type)

    @staticmethod
    def load_suggestions(query: str = "", doc_type: str = "all") -> List[Document]:
        """Load suggested devices/guides/answers for a search query.

        Args:
            query: Search string to send to the iFixit suggest endpoint.
            doc_type: Which document types to request (e.g. "device",
                "guide", or "all").

        Returns:
            Documents loaded from every suggestion whose URL this loader
            understands; unsupported URLs are skipped.

        Raises:
            ValueError: If the suggest request fails.
        """
        res = requests.get(
            IFIXIT_BASE_URL + "/suggest/" + query + "?doctypes=" + doc_type
        )

        if res.status_code != 200:
            # res.json() returns a dict/list; stringify it so the string
            # concatenation does not itself raise TypeError and hide the
            # real error.
            raise ValueError(
                'Could not load suggestions for "' + query + '"\n' + str(res.json())
            )

        data = res.json()

        results = data["results"]
        output = []

        for result in results:
            try:
                loader = IFixitLoader(result["url"])
                if loader.page_type == "Device":
                    # Skip the linked guides here; each guide may also show
                    # up as its own suggestion.
                    output += loader.load_device(include_guides=False)
                else:
                    output += loader.load()
            except ValueError:
                # Suggestion URL is a page type we don't support; skip it.
                continue

        return output

    def load_questions_and_answers(
        self, url_override: Optional[str] = None
    ) -> List[Document]:
        """Scrape a Q&A page and return it as a single markdown Document.

        Args:
            url_override: Optional URL to scrape instead of ``self.web_path``.

        Returns:
            A one-element list with the question, its answers and their
            headings rendered as markdown text.
        """
        loader = WebBaseLoader(self.web_path if url_override is None else url_override)
        soup = loader.scrape()

        output = []

        title = soup.find("h1", "post-title").text

        output.append("# " + title)
        output.append(soup.select_one(".post-content .post-text").text.strip())

        answers_header = soup.find("div", "post-answers-header")
        if answers_header:
            output.append("\n## " + answers_header.text.strip())

        for answer in soup.select(".js-answers-list .post.post-answer"):
            if answer.has_attr("itemprop") and "acceptedAnswer" in answer["itemprop"]:
                output.append("\n### Accepted Answer")
            elif "post-helpful" in answer["class"]:
                output.append("\n### Most Helpful Answer")
            else:
                output.append("\n### Other Answer")

            output += [
                a.text.strip() for a in answer.select(".post-content .post-text")
            ]
            output.append("\n")

        text = "\n".join(output).strip()

        metadata = {"source": self.web_path, "title": title}

        return [Document(page_content=text, metadata=metadata)]

    def load_device(
        self, url_override: Optional[str] = None, include_guides: bool = True
    ) -> List[Document]:
        """Load a device wiki and, optionally, all guides it links to.

        Args:
            url_override: Optional API URL to fetch instead of the default
                category-wiki endpoint for ``self.id``.
            include_guides: When True, also load a Document for each guide
                linked from the device.

        Returns:
            The device wiki Document, followed by one Document per linked
            guide when ``include_guides`` is set.

        Raises:
            ValueError: If the wiki request fails.
        """
        documents = []
        if url_override is None:
            url = IFIXIT_BASE_URL + "/wikis/CATEGORY/" + self.id
        else:
            url = url_override

        res = requests.get(url)

        # Fail loudly and consistently with load_guide/load_suggestions
        # instead of crashing later with a KeyError on the error payload.
        if res.status_code != 200:
            raise ValueError(
                "Could not load device: " + self.web_path + "\n" + str(res.json())
            )

        data = res.json()
        text = "\n".join(
            [
                data[key]
                for key in ["title", "description", "contents_raw"]
                if key in data
            ]
        ).strip()

        metadata = {"source": self.web_path, "title": data["title"]}
        documents.append(Document(page_content=text, metadata=metadata))

        if include_guides:
            # Load and return documents for each guide linked to from the
            # device; tolerate a payload with no "guides" key.
            guide_urls = [guide["url"] for guide in data.get("guides", [])]
            for guide_url in guide_urls:
                documents.append(IFixitLoader(guide_url).load()[0])

        return documents

    def load_guide(self, url_override: Optional[str] = None) -> List[Document]:
        """Load a repair guide as a single markdown Document.

        Args:
            url_override: Optional API URL to fetch instead of the default
                guides endpoint for ``self.id``.

        Returns:
            A one-element list with the guide's introduction, tools, parts,
            steps and conclusion rendered as markdown text.

        Raises:
            ValueError: If the guide request fails.
        """
        if url_override is None:
            url = IFIXIT_BASE_URL + "/guides/" + self.id
        else:
            url = url_override

        res = requests.get(url)

        if res.status_code != 200:
            # Stringify the JSON error payload; concatenating the raw
            # dict/list would raise TypeError and mask the real error.
            raise ValueError(
                "Could not load guide: " + self.web_path + "\n" + str(res.json())
            )

        data = res.json()

        doc_parts = ["# " + data["title"], data["introduction_raw"]]

        doc_parts.append("\n\n###Tools Required:")
        if not data["tools"]:
            doc_parts.append("\n - None")
        else:
            for tool in data["tools"]:
                doc_parts.append("\n - " + tool["text"])

        doc_parts.append("\n\n###Parts Required:")
        if not data["parts"]:
            doc_parts.append("\n - None")
        else:
            for part in data["parts"]:
                doc_parts.append("\n - " + part["text"])

        for row in data["steps"]:
            # Untitled steps fall back to their order number.
            doc_parts.append(
                "\n\n## "
                + (
                    row["title"]
                    if row["title"] != ""
                    else "Step {}".format(row["orderby"])
                )
            )

            for line in row["lines"]:
                doc_parts.append(line["text_raw"])

        doc_parts.append(data["conclusion_raw"])

        text = "\n".join(doc_parts)

        metadata = {"source": self.web_path, "title": data["title"]}

        return [Document(page_content=text, metadata=metadata)]