Arafath10 committed on
Commit 0f0c7dc
1 Parent(s): 29a9fac

Update main.py

Files changed (1)
  1. main.py +44 -10
main.py CHANGED
@@ -7,6 +7,8 @@ from fastapi.responses import FileResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from io import StringIO
+from bs4 import BeautifulSoup
+import requests  # used by the new /get_scraped_data endpoint below
 import os
 
 app = FastAPI()
@@ -18,7 +20,7 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-async def scrape_links():
+async def power_scrapper(url):
     async with async_playwright() as p:
         browser = await p.chromium.launch(headless=True)
         page = await browser.new_page()
@@ -27,7 +29,7 @@ async def scrape_links():
         await page.route("**/*", lambda route: route.continue_() if route.request.resource_type in ["document", "script"] else route.abort())
 
         # Open the target website
-        await page.goto('https://www.fool.com/earnings/call-transcripts/2024/01/24/tesla-tsla-q4-2023-earnings-call-transcript/', wait_until='domcontentloaded')
+        await page.goto(url, wait_until='domcontentloaded')
 
         # Wait for a short time to ensure dynamic content is loaded
         await page.wait_for_timeout(10)
@@ -50,11 +52,43 @@ async def scrape_links():
         await browser.close()
         return result
 
-@app.post("/get_webscrapet_data")
-async def get_webscrapet_data(url: str):
-    try:
-        # Run the scraping function
-        results = await scrape_links()
-        return results
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
+
+def get_links(soup):
+    # Collect every href on the page
+    links = []
+    for link in soup.find_all('a'):
+        href = link.get('href')
+        links.append(href)
+    return links
+
+
+def get_text_content(soup):
+    text_elements = []
+    for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span']:
+        elements = soup.find_all(tag)
+        for element in elements:
+            text_elements.append(element.get_text())
+    return text_elements
+
+
+def get_title(soup):
+    title_tag = soup.find('title')
+    title = title_tag.get_text() if title_tag else ""
+    return title
+
+
+@app.get("/get_scraped_data")
+async def get_data(url: str):
+    headers = {'User-Agent': 'Mozilla/5.0'}
+    response = requests.get(url, headers=headers)
+    soup = BeautifulSoup(response.content, 'html.parser')
+
+    title = get_title(soup)
+    links = get_links(soup)
+    text_content = get_text_content(soup)
+
+    if not links:
+        print("running alternative scraper")
+        links = await power_scrapper(url)
+
+    return {"title": title, "content": links + text_content}
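
For reference, a minimal sketch of calling the new endpoint from a client. The host, port, and target URL here are illustrative assumptions (a local uvicorn server on port 8000), not part of this commit:

import requests

# Assumed base URL; adjust to wherever the app is actually served.
BASE = "http://localhost:8000"

resp = requests.get(
    f"{BASE}/get_scraped_data",
    params={"url": "https://example.com"},  # page to scrape (illustrative)
)
resp.raise_for_status()
data = resp.json()
print(data["title"])                  # text of the page's <title> tag
print(len(data["content"]), "items")  # hrefs followed by extracted text blocks

The endpoint tries a plain requests + BeautifulSoup fetch first and only falls back to the heavier Playwright path (power_scrapper) when that fetch yields no links, so static pages stay cheap to scrape.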