Update main.py
main.py
CHANGED
@@ -19,44 +19,46 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
+def get_links(soup):
+    links = []
+    for link in soup.find_all('a'):
+        href = link.get('href')
+        links.append(href)
+    return links
+
+
+def get_text_content(soup):
+    text_elements = []
+    for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span']:
+        elements = soup.find_all(tag)
+        for element in elements:
+            text_elements.append(element.get_text())
+    return text_elements
+
+
+def get_title(soup):
+    title = soup.find('title').get_text()
+    return title
+
 @app.get("/get_scraped_data")
 async def get_data(url: str):
     import requests
     from bs4 import BeautifulSoup
-    ...
-        soup = BeautifulSoup(response.content, 'html.parser')
-
-        # Extract all text content (paragraphs, headers, etc.)
-        elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
-        body_text = "\n".join([element.get_text().strip() for element in elements])
-
-        # Extract all links
-        links = []
-        for a_tag in soup.find_all('a', href=True):
-            links.append(a_tag['href'])
-
-        # Print the extracted information
-        print("Body Text:")
-        print(body_text)
-        print("\nLinks:")
-        for link in links:
-            print(link)
-    else:
-        print("Failed to retrieve the webpage")
-    return "done"
-    try:
-        data = await Scraper.scrape(url)
-        return data
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
+    headers = {'User-Agent': 'Mozilla/5.0'}
+    response = requests.get(url, headers=headers)
+    soup = BeautifulSoup(response.content, 'html.parser')
+
+    title = get_title(soup)
+    links = get_links(soup)
+    text_content = get_text_content(soup)
+
+    if not links:
+        print("Running alternative scraper")
+
+        try:
+            data = await Scraper.scrape(url)
+            return data
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=str(e))
+    else:
+        return {"title": title, "URL": links, "Content": text_content}
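As a quick sanity check of the helpers added above, the sketch below parses a small hand-written HTML snippet and prints what get_title, get_links, and get_text_content return. It assumes main.py is importable as a module (FastAPI and BeautifulSoup installed); the sample markup is invented purely for illustration.

# Exercise the new helpers outside the FastAPI app.
# The HTML string below is a made-up example, not data from the Space.
from bs4 import BeautifulSoup

from main import get_title, get_links, get_text_content

html = """
<html>
  <head><title>Example page</title></head>
  <body>
    <h1>Heading</h1>
    <p>First paragraph.</p>
    <a href="https://example.com/a">A</a>
    <a href="https://example.com/b">B</a>
  </body>
</html>
"""

soup = BeautifulSoup(html, 'html.parser')

print(get_title(soup))         # Example page
print(get_links(soup))         # ['https://example.com/a', 'https://example.com/b']
print(get_text_content(soup))  # ['First paragraph.', 'Heading'] (grouped by tag, not document order)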
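For reference, this is one way the updated endpoint might be exercised once the app is running. The base URL below assumes a local uvicorn server on its default port, and the target URL is only a placeholder.

# Call the /get_scraped_data endpoint; host, port, and target URL are assumptions.
import requests

resp = requests.get(
    "http://127.0.0.1:8000/get_scraped_data",
    params={"url": "https://example.com"},
)
print(resp.status_code)
print(resp.json())  # {"title": ..., "URL": [...], "Content": [...]} when links are found;
                    # otherwise the result of the Scraper.scrape fallback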