Delete scrape.py
scrape.py
DELETED
@@ -1,41 +0,0 @@
-import os
-
-from apify_client import ApifyClient
-from langchain.document_loaders import ApifyDatasetLoader
-from langchain.document_loaders.base import Document
-from langchain.embeddings.openai import OpenAIEmbeddings
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.vectorstores import Chroma
-
-# Access variables and secrets as environment variables
-WEBSITE_URL = os.environ.get('WEBSITE_URL')
-OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
-APIFY_API_TOKEN = os.environ.get('APIFY_API_TOKEN')
-
-if __name__ == '__main__':
-    apify_client = ApifyClient(APIFY_API_TOKEN)
-    print(f'Extracting data from "{WEBSITE_URL}". Please wait...')
-    actor_run_info = apify_client.actor('apify/website-content-crawler').call(
-        run_input={'startUrls': [{'url': WEBSITE_URL}]}
-    )
-    print('Saving data into the vector database. Please wait...')
-    loader = ApifyDatasetLoader(
-        dataset_id=actor_run_info['defaultDatasetId'],
-        dataset_mapping_function=lambda item: Document(
-            page_content=item['text'] or '', metadata={'source': item['url']}
-        ),
-    )
-    documents = loader.load()
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
-    docs = text_splitter.split_documents(documents)
-
-    # Ensure the OPENAI_API_KEY is used correctly in OpenAIEmbeddings
-    embedding = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
-
-    vectordb = Chroma.from_documents(
-        documents=docs,
-        embedding=embedding,
-        persist_directory='db2',
-    )
-    vectordb.persist()
-    print('All done!')
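For reference, the deleted scrape.py implemented a small ingestion pipeline: it ran Apify's apify/website-content-crawler actor against WEBSITE_URL, loaded the crawl results as LangChain Documents, split them into 1500-character chunks with 100-character overlap, embedded the chunks with OpenAI, and persisted the vectors to a local Chroma store at 'db2'. A companion query script would look roughly like the sketch below. This is a minimal illustration, not code from this commit; it assumes the same LangChain 0.0.x-era imports that scrape.py used, and the query string is a hypothetical example:

import os

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Reopen the store that scrape.py persisted to 'db2'
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
vectordb = Chroma(persist_directory='db2', embedding_function=embedding)

# similarity_search returns the stored chunks closest to the query text;
# the query below is made up for illustration
for doc in vectordb.similarity_search('What does this website offer?', k=3):
    print(doc.metadata['source'], doc.page_content[:100])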