antfraia committed on
Commit
e4f375b
·
1 Parent(s): 9b0553b

Delete scrape.py

Browse files
Files changed (1) hide show
  1. scrape.py +0 -41
scrape.py DELETED
@@ -1,41 +0,0 @@
1
- import os
2
-
3
- from apify_client import ApifyClient
4
- from langchain.document_loaders import ApifyDatasetLoader
5
- from langchain.document_loaders.base import Document
6
- from langchain.embeddings.openai import OpenAIEmbeddings
7
- from langchain.text_splitter import RecursiveCharacterTextSplitter
8
- from langchain.vectorstores import Chroma
9
-
10
# Runtime configuration comes in through environment variables (set as
# secrets in the deployment environment). Each value is None when unset.
WEBSITE_URL = os.getenv('WEBSITE_URL')        # root URL to crawl
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')  # key for the embedding model
APIFY_API_TOKEN = os.getenv('APIFY_API_TOKEN')  # token for the Apify crawler
14
-
15
if __name__ == '__main__':
    # Fail fast with a clear message if any required secret is missing,
    # instead of failing later inside the Apify/OpenAI client calls with
    # an opaque authentication or type error.
    missing = [name for name, value in (
        ('WEBSITE_URL', WEBSITE_URL),
        ('OPENAI_API_KEY', OPENAI_API_KEY),
        ('APIFY_API_TOKEN', APIFY_API_TOKEN),
    ) if not value]
    if missing:
        raise SystemExit(
            f'Missing required environment variables: {", ".join(missing)}'
        )

    apify_client = ApifyClient(APIFY_API_TOKEN)
    print(f'Extracting data from "{WEBSITE_URL}". Please wait...')
    # Run Apify's website-content-crawler actor against the target site;
    # .call() blocks until the crawl finishes and returns run metadata.
    actor_run_info = apify_client.actor('apify/website-content-crawler').call(
        run_input={'startUrls': [{'url': WEBSITE_URL}]}
    )

    print('Saving data into the vector database. Please wait...')
    # Map each crawled dataset item to a LangChain Document. 'text' can be
    # falsy for pages with no extracted content, hence the `or ''` fallback;
    # the page URL is kept as the Document's source metadata.
    loader = ApifyDatasetLoader(
        dataset_id=actor_run_info['defaultDatasetId'],
        dataset_mapping_function=lambda item: Document(
            page_content=item['text'] or '', metadata={'source': item['url']}
        ),
    )
    documents = loader.load()

    # Split pages into overlapping chunks; the 100-char overlap preserves
    # context across chunk boundaries for retrieval quality.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500, chunk_overlap=100
    )
    docs = text_splitter.split_documents(documents)

    # NOTE(review): langchain's OpenAIEmbeddings historically takes
    # `openai_api_key=`; confirm `api_key=` is accepted by the pinned version.
    embedding = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

    # Embed every chunk and store it in a local Chroma DB under ./db2.
    vectordb = Chroma.from_documents(
        documents=docs,
        embedding=embedding,
        persist_directory='db2',
    )
    vectordb.persist()
    print('All done!')