urlcrawl

Sleeping

seawolf2357 committed on Apr 22, 2024

Commit

7e27e95

verified ·

1 Parent(s): cdbedd5

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -4,22 +4,23 @@ from bs4 import BeautifulSoup
 import re
 def fetch_pdf_links_and_titles():
-    url = "https://finance.naver.com/research/company_list.naver"
-    response = requests.get(url)
-    soup = BeautifulSoup(response.text, 'html.parser')
-    seen_urls = set()
-    links_html = ""
-    # 모든 PDF 링크와 제목을 찾습니다.
-    pdf_links = soup.find_all('a', href=re.compile("^https://ssl.pstatic.net/imgstock/upload/research/company/.*\.pdf$"))
-    for link in pdf_links:
-        title = link.text.strip()  # 링크 텍스트에서 제목 추출
-        full_url = link['href']
-        if full_url not in seen_urls:
-            seen_urls.add(full_url)
-            # HTML 문자열로 링크 추가
-            links_html += f"<div><a href='{full_url}' download='{full_url.split('/')[-1]}'>{title}</a></div>"
-    return links_html
 # Gradio 인터페이스
 with gr.Blocks() as app:

 import re
 def fetch_pdf_links_and_titles():
+    try:
+        url = "https://finance.naver.com/research/company_list.naver"
+        response = requests.get(url)
+        soup = BeautifulSoup(response.text, 'html.parser')
+        seen_urls = set()
+        links_html = ""
+        pdf_links = soup.find_all('a', href=re.compile("^https://ssl.pstatic.net/imgstock/upload/research/company/.*\.pdf$"))
+        for link in pdf_links:
+            title = link.text.strip()
+            full_url = link['href']
+            if full_url not in seen_urls:
+                seen_urls.add(full_url)
+                links_html += f"<div><a href='{full_url}' download='{full_url.split('/')[-1]}'>{title}</a></div>"
+        return links_html if links_html else "No links found."
+    except Exception as e:
+        return f"An error occurred: {str(e)}"
 # Gradio 인터페이스
 with gr.Blocks() as app: