antfraia committed on
Commit
9b0553b
·
1 Parent(s): 54f82fd

Update scrape.py

Browse files
Files changed (1) hide show
  1. scrape.py +10 -9
scrape.py CHANGED
@@ -1,22 +1,22 @@
1
  import os
2
 
3
  from apify_client import ApifyClient
4
- from dotenv import load_dotenv
5
  from langchain.document_loaders import ApifyDatasetLoader
6
  from langchain.document_loaders.base import Document
7
  from langchain.embeddings.openai import OpenAIEmbeddings
8
  from langchain.text_splitter import RecursiveCharacterTextSplitter
9
  from langchain.vectorstores import Chroma
10
 
11
- # Load environment variables from a .env file
12
- load_dotenv()
 
 
13
 
14
  if __name__ == '__main__':
15
- apify_client = ApifyClient(os.environ.get('APIFY_API_TOKEN'))
16
- website_url = os.environ.get('WEBSITE_URL')
17
- print(f'Extracting data from "{website_url}". Please wait...')
18
  actor_run_info = apify_client.actor('apify/website-content-crawler').call(
19
- run_input={'startUrls': [{'url': website_url}]}
20
  )
21
  print('Saving data into the vector database. Please wait...')
22
  loader = ApifyDatasetLoader(
@@ -29,7 +29,8 @@ if __name__ == '__main__':
29
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
30
  docs = text_splitter.split_documents(documents)
31
 
32
- embedding = OpenAIEmbeddings()
 
33
 
34
  vectordb = Chroma.from_documents(
35
  documents=docs,
@@ -37,4 +38,4 @@ if __name__ == '__main__':
37
  persist_directory='db2',
38
  )
39
  vectordb.persist()
40
- print('All done!')
 
1
  import os
2
 
3
  from apify_client import ApifyClient
 
4
  from langchain.document_loaders import ApifyDatasetLoader
5
  from langchain.document_loaders.base import Document
6
  from langchain.embeddings.openai import OpenAIEmbeddings
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain.vectorstores import Chroma
9
 
10
+ # Access variables and secrets as environment variables
11
+ WEBSITE_URL = os.environ.get('WEBSITE_URL')
12
+ OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
13
+ APIFY_API_TOKEN = os.environ.get('APIFY_API_TOKEN')
14
 
15
  if __name__ == '__main__':
16
+ apify_client = ApifyClient(APIFY_API_TOKEN)
17
+ print(f'Extracting data from "{WEBSITE_URL}". Please wait...')
 
18
  actor_run_info = apify_client.actor('apify/website-content-crawler').call(
19
+ run_input={'startUrls': [{'url': WEBSITE_URL}]}
20
  )
21
  print('Saving data into the vector database. Please wait...')
22
  loader = ApifyDatasetLoader(
 
29
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
30
  docs = text_splitter.split_documents(documents)
31
 
32
+ # Ensure the OPENAI_API_KEY is used correctly in OpenAIEmbeddings
33
+ embedding = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
34
 
35
  vectordb = Chroma.from_documents(
36
  documents=docs,
 
38
  persist_directory='db2',
39
  )
40
  vectordb.persist()
41
+ print('All done!')