Spaces:
Runtime error
Runtime error
File size: 6,844 Bytes
58d33f0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 |
"""Loader that loads iFixit data."""
from typing import List, Optional
import requests
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.web_base import WebBaseLoader
IFIXIT_BASE_URL = "https://www.ifixit.com/api/2.0"
class IFixitLoader(BaseLoader):
"""Load iFixit repair guides, device wikis and answers.
iFixit is the largest, open repair community on the web. The site contains nearly
100k repair manuals, 200k Questions & Answers on 42k devices, and all the data is
licensed under CC-BY.
This loader will allow you to download the text of a repair guide, text of Q&A's
and wikis from devices on iFixit using their open APIs and web scraping.
"""
def __init__(self, web_path: str):
"""Initialize with web path."""
if not web_path.startswith("https://www.ifixit.com"):
raise ValueError("web path must start with 'https://www.ifixit.com'")
path = web_path.replace("https://www.ifixit.com", "")
allowed_paths = ["/Device", "/Guide", "/Answers", "/Teardown"]
""" TODO: Add /Wiki """
if not any(path.startswith(allowed_path) for allowed_path in allowed_paths):
raise ValueError(
"web path must start with /Device, /Guide, /Teardown or /Answers"
)
pieces = [x for x in path.split("/") if x]
"""Teardowns are just guides by a different name"""
self.page_type = pieces[0] if pieces[0] != "Teardown" else "Guide"
if self.page_type == "Guide" or self.page_type == "Answers":
self.id = pieces[2]
else:
self.id = pieces[1]
self.web_path = web_path
def load(self) -> List[Document]:
if self.page_type == "Device":
return self.load_device()
elif self.page_type == "Guide" or self.page_type == "Teardown":
return self.load_guide()
elif self.page_type == "Answers":
return self.load_questions_and_answers()
else:
raise ValueError("Unknown page type: " + self.page_type)
@staticmethod
def load_suggestions(query: str = "", doc_type: str = "all") -> List[Document]:
res = requests.get(
IFIXIT_BASE_URL + "/suggest/" + query + "?doctypes=" + doc_type
)
if res.status_code != 200:
raise ValueError(
'Could not load suggestions for "' + query + '"\n' + res.json()
)
data = res.json()
results = data["results"]
output = []
for result in results:
try:
loader = IFixitLoader(result["url"])
if loader.page_type == "Device":
output += loader.load_device(include_guides=False)
else:
output += loader.load()
except ValueError:
continue
return output
def load_questions_and_answers(
self, url_override: Optional[str] = None
) -> List[Document]:
loader = WebBaseLoader(self.web_path if url_override is None else url_override)
soup = loader.scrape()
output = []
title = soup.find("h1", "post-title").text
output.append("# " + title)
output.append(soup.select_one(".post-content .post-text").text.strip())
answersHeader = soup.find("div", "post-answers-header")
if answersHeader:
output.append("\n## " + answersHeader.text.strip())
for answer in soup.select(".js-answers-list .post.post-answer"):
if answer.has_attr("itemprop") and "acceptedAnswer" in answer["itemprop"]:
output.append("\n### Accepted Answer")
elif "post-helpful" in answer["class"]:
output.append("\n### Most Helpful Answer")
else:
output.append("\n### Other Answer")
output += [
a.text.strip() for a in answer.select(".post-content .post-text")
]
output.append("\n")
text = "\n".join(output).strip()
metadata = {"source": self.web_path, "title": title}
return [Document(page_content=text, metadata=metadata)]
def load_device(
self, url_override: Optional[str] = None, include_guides: bool = True
) -> List[Document]:
documents = []
if url_override is None:
url = IFIXIT_BASE_URL + "/wikis/CATEGORY/" + self.id
else:
url = url_override
res = requests.get(url)
data = res.json()
text = "\n".join(
[
data[key]
for key in ["title", "description", "contents_raw"]
if key in data
]
).strip()
metadata = {"source": self.web_path, "title": data["title"]}
documents.append(Document(page_content=text, metadata=metadata))
if include_guides:
"""Load and return documents for each guide linked to from the device"""
guide_urls = [guide["url"] for guide in data["guides"]]
for guide_url in guide_urls:
documents.append(IFixitLoader(guide_url).load()[0])
return documents
def load_guide(self, url_override: Optional[str] = None) -> List[Document]:
if url_override is None:
url = IFIXIT_BASE_URL + "/guides/" + self.id
else:
url = url_override
res = requests.get(url)
if res.status_code != 200:
raise ValueError(
"Could not load guide: " + self.web_path + "\n" + res.json()
)
data = res.json()
doc_parts = ["# " + data["title"], data["introduction_raw"]]
doc_parts.append("\n\n###Tools Required:")
if len(data["tools"]) == 0:
doc_parts.append("\n - None")
else:
for tool in data["tools"]:
doc_parts.append("\n - " + tool["text"])
doc_parts.append("\n\n###Parts Required:")
if len(data["parts"]) == 0:
doc_parts.append("\n - None")
else:
for part in data["parts"]:
doc_parts.append("\n - " + part["text"])
for row in data["steps"]:
doc_parts.append(
"\n\n## "
+ (
row["title"]
if row["title"] != ""
else "Step {}".format(row["orderby"])
)
)
for line in row["lines"]:
doc_parts.append(line["text_raw"])
doc_parts.append(data["conclusion_raw"])
text = "\n".join(doc_parts)
metadata = {"source": self.web_path, "title": data["title"]}
return [Document(page_content=text, metadata=metadata)]
|