urlcrawl

Sleeping

springwater committed on May 5, 2024

Commit

7327597

verified ·

1 Parent(s): b1a3ea2

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -3,36 +3,32 @@ import re
 import requests
 from bs4 import BeautifulSoup
-def extract_pdf_links_and_title(url):
     response = requests.get(url)
     soup = BeautifulSoup(response.text, 'html.parser')
-    # 페이지 제목 추출
-    page_title = soup.title.text if soup.title else "No title found"
     pdf_links = []
     for link in soup.find_all('a', href=True):
         if re.search(r'\.pdf', link['href']):
             pdf_links.append(link['href'])
-    # PDF 링크와 페이지 제목을 반환
-    return pdf_links[:100], page_title
-def generate_html(pdf_links_and_title):
-    pdf_links = pdf_links_and_title[0]  # PDF 링크 리스트
-    page_title = pdf_links_and_title[1]  # 페이지 제목
-    html = f"<h1>{page_title}</h1>"  # 제목을 HTML에 추가
     for link in pdf_links:
         html += f'<a href="{link}" target="_blank" download>{link}</a><br/>'
     return html
-title = "네이버 증권 리서치 링크-  https://finance.naver.com/research/company_list.naver"
-iface = gr.Interface(fn=extract_pdf_links_and_title,
                      inputs="text",
-                     outputs=["text", "html"],
                      title=title)
 iface.launch()

 import requests
 from bs4 import BeautifulSoup
+def extract_pdf_links(url):
     response = requests.get(url)
     soup = BeautifulSoup(response.text, 'html.parser')
     pdf_links = []
     for link in soup.find_all('a', href=True):
         if re.search(r'\.pdf', link['href']):
             pdf_links.append(link['href'])
+    return pdf_links[:100]
+def generate_html(pdf_links):
+    html = ""
     for link in pdf_links:
         html += f'<a href="{link}" target="_blank" download>{link}</a><br/>'
     return html
+def extract_and_download(url):
+    pdf_links = extract_pdf_links(url)
+    return generate_html(pdf_links)
+title = "네이버 증권 리서치 링크- https://finance.naver.com/research/company_list.naver"
+iface = gr.Interface(extract_and_download,
                      inputs="text",
+                     outputs="html",
                      title=title)
 iface.launch()