pabloce committed on
Commit
452bb3a
·
verified ·
1 Parent(s): f04b8f8

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +44 -0
utils.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import time
3
+ from typing import List
4
+ from datetime import datetime, timezone
5
+
6
+ from pydantic import BaseModel, Field
7
+ from trafilatura import fetch_url, extract
8
+
9
def get_server_time():
    """
    Report the current time as a UTC timestamp string.
    Returns:
        str: The present moment in UTC, rendered as "YYYY-MM-DD HH:MM:SS".
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    # Request an aware datetime pinned to UTC so the output does not depend on
    # the host machine's local time zone.
    return datetime.now(tz=timezone.utc).strftime(timestamp_format)
12
+
13
def get_website_content_from_url(url: str) -> str:
    """
    Download a web page and return its extracted text wrapped in labelled banners.

    The page is fetched with trafilatura's ``fetch_url`` and its content is
    extracted with trafilatura's ``extract`` using JSON output.

    Args:
        url (str): URL to get website content from.
    Returns:
        str: A block containing the page title, the URL and the extracted text,
            each framed by "===========" banners. An empty string is returned
            when nothing could be downloaded or extracted. If any exception is
            raised, a message starting with "An error occurred: " is returned
            instead of propagating the exception.
    """
    try:
        downloaded = fetch_url(url)
        # A failed download yields None; there is nothing to extract in that case.
        if downloaded is None:
            return ""

        result = extract(
            downloaded,
            include_formatting=True,
            include_links=True,
            output_format='json',
            url=url,
        )
        if not result:
            return ""

        page = json.loads(result)
        # NOTE(review): which keys the JSON output carries appears to depend on
        # the trafilatura version/metadata settings - confirm. Read them
        # tolerantly so a missing "title" or "raw_text" does not turn a
        # successful extraction into an error message.
        title = page.get("title") or ""
        body = page.get("raw_text") or page.get("text") or ""
        return f'=========== Website Title: {title} ===========\n\n=========== Website URL: {url} ===========\n\n=========== Website Content ===========\n\n{body}\n\n=========== Website Content End ===========\n\n'
    except Exception as e:
        # Deliberate best-effort boundary: the function always hands back a
        # string, so failures are reported in-band rather than raised.
        return f"An error occurred: {str(e)}"
34
+
35
+
36
+
37
class CitingSources(BaseModel):
    """
    This represents the citing of the sources you used to answer the user query.
    """
    # NOTE(review): the docstring above and the description below are presumably
    # surfaced in the schema generated from this model - keep their wording
    # stable unless a prompt change is intended.

    # Required field (Ellipsis as the default means no default value exists):
    # one string per cited source.
    sources: List[str] = Field(
        default=...,
        description="List of sources to cite. Should be an URL of the source. E.g. GitHub URL, Blogpost URL or Newsletter URL.",
    )