# urlcrawl / app.py — Gradio app that lists PDF research-report links from Naver Finance
# (source: seawolf2357's Hugging Face Space, commit a027460, 886 bytes)
import gradio as gr
import requests
from bs4 import BeautifulSoup
import re
def fetch_pdf_links():
    """Scrape Naver Finance's company-research page for PDF report links.

    Returns:
        list[list[str]]: one-element rows, each an HTML ``<a>`` tag pointing
        at a PDF, in the shape expected by a Gradio Dataframe.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
        error status.
    """
    url = "https://finance.naver.com/research/company_list.naver"
    # Timeout so a stalled connection cannot hang the UI indefinitely.
    response = requests.get(url, timeout=10)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Anchors whose href ends in ".pdf"; raw string avoids the invalid
    # "\." escape warning that "\.pdf$" triggers on modern Python.
    pdf_links = soup.find_all('a', href=re.compile(r"\.pdf$"))
    links = []
    for link in pdf_links:
        href = link['href']
        # Only prefix the host onto relative hrefs; leave absolute URLs intact.
        full_url = href if href.startswith("http") else "https://finance.naver.com" + href
        # Render each link as a clickable HTML anchor for the Dataframe cell.
        links.append([f"<a href='{full_url}' target='_blank'>{full_url}</a>"])
    return links
# Gradio interface: one button that triggers the scrape, and a read-only
# table that displays the resulting links as HTML anchors.
with gr.Blocks() as app:
    btn_fetch = gr.Button("PDF 링크 조회")
    # interactive=False: the table is display-only; cells contain <a> tags.
    output_links = gr.Dataframe(headers=["PDF 링크"], interactive=False)
    # Wire the button to the scraper; it takes no inputs and fills the table.
    btn_fetch.click(
        fn=fetch_pdf_links,
        outputs=output_links
    )
# Start the Gradio server (blocking call).
app.launch()