ocr_api

Paused

Arafath10 committed on Jul 22, 2024

Commit

37c190c

verified ·

1 Parent(s): 0860d6e

Update main.py

Files changed (1) hide show

main.py CHANGED Viewed

@@ -19,46 +19,12 @@ app.add_middleware(
     allow_headers=["*"],
 )
-def get_links(soup):
-        links = []
-        for link in soup.find_all('a'):
-            href = link.get('href')
-            links.append(href)
-        return links
-def get_text_content(soup):
-        text_elements = []
-        for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span']:
-            elements = soup.find_all(tag)
-            for element in elements:
-                text_elements.append(element.get_text())
-        return text_elements
-def get_title(soup):
-        title = soup.find('title').get_text()
-        return title
 @app.get("/get_scraped_data")
 async def get_data(url: str):
-    import requests
-    from bs4 import BeautifulSoup
-    headers = {'User-Agent': 'Mozilla/5.0'}
-    response = requests.get(url)
-    soup = BeautifulSoup(response.content, 'html.parser')
-    title = Scraper.get_title(soup)
-    links = Scraper.get_links(soup)
-    text_content = Scraper.get_text_content(soup)
-    if not links:
-        print("Running alternative scrapper")
         try:
             data = await Scraper.scrape(url)
             return data
         except Exception as e:
             raise HTTPException(status_code=500, detail=str(e))
-    else:
-        return {"title": title, "URL": links, "Content": text_content}

     allow_headers=["*"],
 )
 @app.get("/get_scraped_data")
 async def get_data(url: str):
     """Scrape ``url`` and return whatever ``Scraper.scrape`` produces.

     Served as ``GET /get_scraped_data``; ``url`` is taken from the request's
     query string.

     Raises:
         HTTPException: passed through unchanged when the scraper raises one;
             any other exception is reported as status 500 with the
             exception text as ``detail``.
     """
     try:
         return await Scraper.scrape(url)
     except HTTPException:
         # A status code chosen deliberately further down must not be
         # flattened into a generic 500 by the catch-all below.
         raise
     except Exception as e:
         # Chain the cause so the original traceback is kept in server logs.
         raise HTTPException(status_code=500, detail=str(e)) from e