Arafath10 committed on
Commit
37c190c
·
verified ·
1 Parent(s): 0860d6e

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +0 -34
main.py CHANGED
@@ -19,46 +19,12 @@ app.add_middleware(
19
  allow_headers=["*"],
20
  )
21
 
22
- def get_links(soup):
23
- links = []
24
- for link in soup.find_all('a'):
25
- href = link.get('href')
26
- links.append(href)
27
- return links
28
-
29
-
30
- def get_text_content(soup):
31
- text_elements = []
32
- for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span']:
33
- elements = soup.find_all(tag)
34
- for element in elements:
35
- text_elements.append(element.get_text())
36
- return text_elements
37
-
38
- def get_title(soup):
39
- title = soup.find('title').get_text()
40
- return title
41
 
42
  @app.get("/get_scraped_data")
43
  async def get_data(url: str):
44
- import requests
45
- from bs4 import BeautifulSoup
46
- headers = {'User-Agent': 'Mozilla/5.0'}
47
- response = requests.get(url)
48
- soup = BeautifulSoup(response.content, 'html.parser')
49
-
50
- title = Scraper.get_title(soup)
51
- links = Scraper.get_links(soup)
52
- text_content = Scraper.get_text_content(soup)
53
-
54
- if not links:
55
- print("Running alternative scrapper")
56
-
57
  try:
58
  data = await Scraper.scrape(url)
59
  return data
60
  except Exception as e:
61
  raise HTTPException(status_code=500, detail=str(e))
62
- else:
63
- return {"title": title, "URL": links, "Content": text_content}
64
 
 
19
  allow_headers=["*"],
20
  )
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  @app.get("/get_scraped_data")
24
  async def get_data(url: str):
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  try:
26
  data = await Scraper.scrape(url)
27
  return data
28
  except Exception as e:
29
  raise HTTPException(status_code=500, detail=str(e))
 
 
30