support target_versions
- app.py +5 -27
- cfg.py +10 -4
- rtd_scraper/scrape_rtd.py +23 -26
- rtd_scraper/tutorial/spiders/docs_spider.py +31 -5
app.py
CHANGED
@@ -1,5 +1,4 @@
 import logging
-import os
 from typing import Optional, Tuple
 
 import gradio as gr
@@ -19,12 +18,6 @@ handler = (
 handler.setLevel(logging.INFO)
 logging.basicConfig(level=logging.INFO)
 
-# Check if an openai key is set as an env. variable
-if os.getenv("OPENAI_API_KEY") is None:
-    print(
-        "Warning: No openai key detected. You can set it with 'export OPENAI_API_KEY=sk-...'."
-    )
-
 # Typehint for chatbot history
 ChatHistory = list[list[Optional[str], Optional[str]]]
 
@@ -114,21 +107,21 @@ with demo:
     examples = gr.Examples(
         examples=[
             "How can I install the library?",
-            "
-            "How do I deal with noisy data in 2 words?",
+            "What dependencies are required?",
         ],
        inputs=question,
     )
 
     gr.Markdown(
-        "This
+        "This app uses [Buster 🤖](github.com/jerpint/buster) and ChatGPT to search the docs for relevant info and answer questions."
     )
 
     response = gr.State()
 
     # fmt: off
-    submit.click(
-        add_user_question,
+    gr.on(
+        triggers=[submit.click, question.submit],
+        fn=add_user_question,
         inputs=[question],
         outputs=[chatbot]
     ).then(
@@ -141,21 +134,6 @@ with demo:
         outputs=[chatbot]
     )
 
-    question.submit(
-        add_user_question,
-        inputs=[question],
-        outputs=[chatbot],
-    ).then(
-        chat,
-        inputs=[chatbot],
-        outputs=[chatbot, response]
-    ).then(
-        add_sources,
-        inputs=[chatbot, response],
-        outputs=[chatbot]
-    )
-    # fmt: on
-
 
 demo.queue(concurrency_count=16)
 demo.launch(share=False)
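The app.py change above collapses the two duplicated event chains (submit.click and question.submit) into a single gr.on registration. Below is a minimal sketch of that pattern, assuming Gradio 3.44+ where gr.on is available; the components and handler here are simplified stand-ins rather than the full chain from app.py.

import gradio as gr

def add_user_question(question):
    # Start a new chat history entry: the user's question with no answer yet.
    return [[question, None]]

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    question = gr.Textbox(label="Ask a question")
    submit = gr.Button("Send")

    # One registration covers both triggers, replacing the duplicated
    # submit.click(...) and question.submit(...) chains.
    gr.on(
        triggers=[submit.click, question.submit],
        fn=add_user_question,
        inputs=[question],
        outputs=[chatbot],
    )

demo.launch()

Further .then(...) steps can be chained off the gr.on(...) call exactly as in the diff.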
cfg.py
CHANGED
@@ -1,5 +1,5 @@
+import os
 import logging
-import sys
 
 from buster.busterbot import Buster, BusterConfig
 from buster.completers import ChatGPTCompleter, DocumentAnswerer
@@ -14,11 +14,17 @@ from rtd_scraper.scrape_rtd import scrape_rtd
 # Set the root logger's level to INFO
 logging.basicConfig(level=logging.INFO)
 
+# Check if an openai key is set as an env. variable
+if os.getenv("OPENAI_API_KEY") is None:
+    print(
+        "Warning: No openai key detected. You can set it with 'export OPENAI_API_KEY=sk-...'."
+    )
 
-homepage_url = "https://
-
+homepage_url = os.getenv("RTD_URL", "https://orion.readthedocs.io/")
+target_version = os.getenv("RTD_VERSION", "en/stable")
 
-
+# scrape and embed content from readthedocs website
+scrape_rtd(homepage_url=homepage_url, save_directory="outputs/", target_version=target_version)
 
 # Disable logging for third-party libraries at DEBUG level
 for name in logging.root.manager.loggerDict:
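cfg.py now reads the scraping target from the environment instead of a hard-coded URL, falling back to the Orion docs. A small sketch of how those variables resolve; the RTD_URL / RTD_VERSION names come from the diff above, while the buster.readthedocs.io URL (taken from a comment in scrape_rtd.py) is used only to illustrate an override.

import os

# Hypothetical override, set before cfg.py is imported.
os.environ["RTD_URL"] = "https://buster.readthedocs.io/"
os.environ["RTD_VERSION"] = "en/latest"

# Same lookups as cfg.py: env var if present, otherwise the default.
homepage_url = os.getenv("RTD_URL", "https://orion.readthedocs.io/")
target_version = os.getenv("RTD_VERSION", "en/stable")
print(homepage_url, target_version)  # https://buster.readthedocs.io/ en/latest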
rtd_scraper/scrape_rtd.py
CHANGED
@@ -5,10 +5,10 @@ from buster.docparser import get_all_documents
 from buster.documents_manager import DeepLakeDocumentsManager
 from buster.parser import SphinxParser
 from scrapy.crawler import CrawlerProcess
-from scrapy.exceptions import CloseSpider
 from scrapy.utils.project import get_project_settings
 
 from rtd_scraper.tutorial.spiders.docs_spider import DocsSpider
+# from tutorial.spiders.docs_spider import DocsSpider
 
 # When using scrapy it seems to set logging for all apps at DEBUG, so simply shut it off here...
 for name in logging.root.manager.loggerDict:
@@ -16,12 +16,9 @@ for name in logging.root.manager.loggerDict:
     logger.setLevel(logging.INFO)
 
 
-def run_spider(homepage_url, save_directory):
-    # settings_file_path = 'rtd_scraper.tutorial.settings'  # The path seen from top-level, ie. from cfg.py
-    # os.environ.setdefault('SCRAPY_SETTINGS_MODULE', settings_file_path)
-
+def run_spider(homepage_url, save_directory, target_version=None):
     process = CrawlerProcess(settings=get_project_settings())
-    process.crawl(DocsSpider, homepage_url=homepage_url, save_dir=save_directory)
+    process.crawl(DocsSpider, homepage_url=homepage_url, save_dir=save_directory, target_version=target_version)
 
     # Start the crawling process
     process.start()
@@ -30,11 +27,11 @@ def run_spider(homepage_url, save_directory):
     process.stop()
 
 
-def scrape_rtd(homepage_url, save_directory):
+def scrape_rtd(homepage_url, save_directory, target_version=None):
     # Crawl the website using scrapy
-    run_spider(homepage_url, save_directory=save_directory)
+    run_spider(homepage_url, save_directory=save_directory, target_version=target_version)
 
-    # Convert the .html pages into chunks using Buster's SphinxParser
+    # # Convert the .html pages into chunks using Buster's SphinxParser
     root_dir = os.path.join(save_directory, homepage_url.split("https://")[1])
 
     # root_dir is the folder containing the scraped content e.g. crawled_outputs/buster.readthedocs.io/
@@ -49,23 +46,23 @@ def scrape_rtd(homepage_url, save_directory):
     # Add the source column
     df["source"] = "readthedocs"
 
-    #
-
-
-
-
-
-
-    #
-
-
-
-
-
-
-
+    # Initialize the DeepLake vector store
+    dm = DeepLakeDocumentsManager(
+        vector_store_path=os.path.join(save_directory, "deeplake_store"),
+        overwrite=True,
+        required_columns=["url", "content", "source", "title"],
+    )
+
+    # Add all embeddings to the vector store
+    dm.batch_add(
+        df=df,
+        batch_size=3000,
+        min_time_interval=60,
+        num_workers=32,
+    )
+
 
 
 if __name__ == "__main__":
-    homepage_url = "https://
-    scrape_rtd(homepage_url=homepage_url, save_directory="outputs/")
+    homepage_url = "https://orion.readthedocs.io/"
+    scrape_rtd(homepage_url=homepage_url, target_version="v0.2.7", save_directory="outputs/")
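For a standalone run of the updated scraper, the __main__ block above already shows the intended call; restated here as a usage example, assuming the rtd_scraper package is importable from the repo root.

from rtd_scraper.scrape_rtd import scrape_rtd

# Crawl only pages whose URL contains "v0.2.7", then chunk and embed them
# into outputs/deeplake_store (same values as the __main__ block above).
scrape_rtd(
    homepage_url="https://orion.readthedocs.io/",
    target_version="v0.2.7",
    save_directory="outputs/",
)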
rtd_scraper/tutorial/spiders/docs_spider.py
CHANGED
@@ -6,20 +6,39 @@ import scrapy
 
 logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.ERROR)
 
+from urllib.parse import urlparse
+
+def extract_domain(url):
+    """
+    Extract the domain (including subdomains) from a given URL.
+
+    Args:
+    - url (str): The URL from which the domain needs to be extracted.
+
+    Returns:
+    - str: The domain (with subdomains) extracted from the URL.
+      For example, 'www.example.com' for the URL 'https://www.example.com/path/to/something'.
+
+    """
+    parsed_uri = urlparse(url)
+    # The netloc attribute will contain the domain name
+    domain = parsed_uri.netloc
+    return domain
+
 
 class DocsSpider(scrapy.Spider):
     name = "docs"
 
-    def __init__(self, homepage_url: str, save_dir="crawled_pages", *args, **kwargs):
+    def __init__(self, homepage_url: str, save_dir="crawled_pages", target_version=None, *args, **kwargs):
         super(DocsSpider, self).__init__(*args, **kwargs)
 
         if not homepage_url.startswith("https://"):
             homepage_url = "https://" + homepage_url
 
-
-        self.allowed_domains = [f"{project}.readthedocs.io"]
+        self.allowed_domains = [extract_domain(homepage_url)]
         self.start_urls = [homepage_url]
         self.base_dir = Path(save_dir)
+        self.target_version = target_version
 
     def parse(self, response):
         parsed_uri = urlparse(response.url)
@@ -39,6 +58,13 @@ class DocsSpider(scrapy.Spider):
         with open(filepath, "wb") as f:
             f.write(response.body)
 
-        # Follow links to other documentation pages
+        # Follow links to other documentation pages only if they contain the target version in the full URL
         for href in response.css("a::attr(href)").getall():
-            yield response.follow(href, self.parse)
+            if self.target_version:
+                # A version was specified, check to see if it's the correct version from url
+                full_url = response.urljoin(href)  # Expand href to a full URL
+                if self.target_version in full_url:
+                    yield response.follow(href, self.parse)
+            else:
+                # no version specified, follow all links
+                yield response.follow(href, self.parse)
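Two behaviours the spider now relies on are worth spelling out: extract_domain keeps the full host, so allowed_domains is derived from the homepage URL instead of being hard-coded to *.readthedocs.io, and the version filter is a plain substring match on the expanded URL. The snippet below restates the helper from the diff so it runs on its own; the example URLs are illustrative.

from urllib.parse import urlparse

def extract_domain(url):
    # Same logic as the helper above: netloc is the host, subdomains included.
    return urlparse(url).netloc

print(extract_domain("https://orion.readthedocs.io/en/stable/"))  # orion.readthedocs.io
print(extract_domain("https://docs.example.com/path/to/page"))    # docs.example.com

# The spider follows a link only when target_version appears somewhere in the full URL:
target_version = "v0.2.7"
full_url = "https://orion.readthedocs.io/en/v0.2.7/install.html"
print(target_version in full_url)  # True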