Arafath10 committed on
Commit 0f0c7dc
1 Parent(s): 29a9fac

Update main.py

Files changed (1)
  1. main.py +44 -10
main.py CHANGED
@@ -7,6 +7,8 @@ from fastapi.responses import FileResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from io import StringIO
+from bs4 import BeautifulSoup
+import requests  # used by the new /get_scraped_data endpoint below
 import os
 
 app = FastAPI()
@@ -18,7 +20,7 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-async def scrape_links():
+async def power_scrapper(url):
     async with async_playwright() as p:
         browser = await p.chromium.launch(headless=True)
         page = await browser.new_page()
@@ -27,7 +29,7 @@ async def scrape_links():
         await page.route("**/*", lambda route: route.continue_() if route.request.resource_type in ["document", "script"] else route.abort())
 
         # Open the target website
-        await page.goto('https://www.fool.com/earnings/call-transcripts/2024/01/24/tesla-tsla-q4-2023-earnings-call-transcript/', wait_until='domcontentloaded')
+        await page.goto(url, wait_until='domcontentloaded')
 
         # Wait for a short time to ensure dynamic content is loaded
         await page.wait_for_timeout(10)
@@ -50,11 +52,43 @@ async def scrape_links():
         await browser.close()
         return result
 
-@app.post("/get_webscrapet_data")
-async def get_webscrapet_data(url: str):
-    try:
-        # Run the scraping function
-        results = await scrape_links()
-        return results
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
+
+def get_links(soup):
+    # Collect every href on the page
+    links = []
+    for link in soup.find_all('a'):
+        href = link.get('href')
+        links.append(href)
+    return links
+
+
+def get_text_content(soup):
+    text_elements = []
+    for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span']:
+        elements = soup.find_all(tag)
+        for element in elements:
+            text_elements.append(element.get_text())
+    return text_elements
+
+
+def get_title(soup):
+    title_tag = soup.find('title')
+    title = title_tag.get_text() if title_tag else ""
+    return title
+
+
+@app.get("/get_scraped_data")
+async def get_data(url: str):
+    headers = {'User-Agent': 'Mozilla/5.0'}
+    response = requests.get(url, headers=headers)
+    soup = BeautifulSoup(response.content, 'html.parser')
+
+    title = get_title(soup)
+    links = get_links(soup)
+    text_content = get_text_content(soup)
+
+    if not links:
+        print("running alternative scraper")
+        links = await power_scrapper(url)
+
+    return {"title": title, "content": links + text_content}
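
For reference, a minimal sketch of calling the new endpoint from a client. The host, port, and target URL here are illustrative assumptions (a local uvicorn server on port 8000), not part of this commit:

import requests

# Assumed base URL; adjust to wherever the app is actually served.
BASE = "http://localhost:8000"

resp = requests.get(
    f"{BASE}/get_scraped_data",
    params={"url": "https://example.com"},  # page to scrape (illustrative)
)
resp.raise_for_status()
data = resp.json()
print(data["title"])                  # text of the page's <title> tag
print(len(data["content"]), "items")  # hrefs followed by extracted text blocks

The endpoint tries a plain requests + BeautifulSoup fetch first and only falls back to the heavier Playwright path (power_scrapper) when that fetch yields no links, so static pages stay cheap to scrape.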