antfraia committed on
Commit
fec0bf6
·
1 Parent(s): bd7062e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -2
app.py CHANGED
@@ -7,17 +7,32 @@ import os
7
  # Update with your OpenAI API key
8
  os.environ["OPENAI_API_KEY"] = "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"
9
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  # Function to fetch website content using the updated actor
11
  def fetch_website_content(website_url):
12
  apify_client = ApifyClient("apify_api_uz0y556N4IG2aLcESj67kmnGSUpHF12XAkLp")
13
- run_input = {"startUrls": [{"url": website_url}]}
 
 
 
14
  run = apify_client.actor("moJRLRc85AitArpNN").call(run_input=run_input)
15
  items = list(apify_client.dataset(run["defaultDatasetId"]).iterate_items())
16
  return items if items else None
17
 
18
  # Fetch and index website content
19
  content = fetch_website_content("https://python.langchain.com/en/latest/")
20
- documents = [Document(page_content=item["text"] or "", metadata={"source": item["url"]}) for item in content]
21
  index = VectorstoreIndexCreator().from_loaders([documents])
22
 
23
  # Function for the Gradio UI
 
7
  # Update with your OpenAI API key
8
  os.environ["OPENAI_API_KEY"] = "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"
9
 
10
# JavaScript page function sent to the scraping actor as the "pageFunction"
# input. For each page it returns the text of the <title> element and the
# text of the <body> element.
page_function_code = "\n".join([
    "",
    "function pageFunction(context) {",
    "const $ = context.jQuery;",
    "const data = {",
    "title: $('title').text(),",
    "content: $('body').text()",
    "};",
    "return data;",
    "}",
    "",
])
21
+
22
  # Function to fetch website content using the updated actor
23
  def fetch_website_content(website_url):
24
  apify_client = ApifyClient("apify_api_uz0y556N4IG2aLcESj67kmnGSUpHF12XAkLp")
25
+ run_input = {
26
+ "startUrls": [{"url": website_url}],
27
+ "pageFunction": page_function_code
28
+ }
29
  run = apify_client.actor("moJRLRc85AitArpNN").call(run_input=run_input)
30
  items = list(apify_client.dataset(run["defaultDatasetId"]).iterate_items())
31
  return items if items else None
32
 
33
  # Fetch and index website content
34
  content = fetch_website_content("https://python.langchain.com/en/latest/")
35
+ documents = [Document(page_content=item["content"] or "", metadata={"source": website_url}) for item in content]
36
  index = VectorstoreIndexCreator().from_loaders([documents])
37
 
38
  # Function for the Gradio UI