urlcrawl / app.py
seawolf2357's picture
Update app.py
cdbedd5 verified
raw
history blame
1.12 kB
import html
import re

import gradio as gr
import requests
from bs4 import BeautifulSoup
def _render_pdf_link(url, title):
    """Return one ``<div><a ...>`` entry for *url*, with all dynamic text escaped.

    The file name (last path segment of *url*) is used for the ``download``
    attribute and also as the visible label when *title* is empty, so the
    link never renders as an invisible anchor.
    """
    filename = url.split('/')[-1]
    label = title or filename
    # Both values come from a scraped page, so escape them before embedding
    # them in markup (attributes are single-quoted; quote=True covers ' and ").
    return (
        f"<div><a href='{html.escape(url, quote=True)}' "
        f"download='{html.escape(filename, quote=True)}'>"
        f"{html.escape(label)}</a></div>"
    )


def fetch_pdf_links_and_titles():
    """Fetch the Naver Finance company research list and return PDF links as HTML.

    Downloads the listing page, collects every ``<a>`` whose ``href`` matches
    the research-PDF URL pattern, drops duplicate URLs (first occurrence
    wins), and renders each as a ``<div>`` containing a download link.

    Returns:
        A string of concatenated ``<div><a ...></a></div>`` fragments (empty
        when no matching link is found), or a single ``<div>`` describing the
        failure when the page could not be fetched.
    """
    url = "https://finance.naver.com/research/company_list.naver"
    try:
        # A timeout keeps the UI callback from hanging on a stalled connection.
        response = requests.get(url, timeout=10)
        # Do not try to parse an HTTP error page as if it were the listing.
        response.raise_for_status()
    except requests.RequestException as exc:
        return (
            f"<div>Failed to fetch {html.escape(url)}: "
            f"{html.escape(str(exc))}</div>"
        )

    soup = BeautifulSoup(response.text, 'html.parser')

    # Raw string with escaped dots so that '.' matches only a literal dot.
    pdf_href = re.compile(
        r"^https://ssl\.pstatic\.net/imgstock/upload/research/company/.*\.pdf$"
    )

    seen_urls = set()
    entries = []
    # Find every PDF link together with its link text.
    for link in soup.find_all('a', href=pdf_href):
        full_url = link['href']
        if full_url in seen_urls:
            continue
        seen_urls.add(full_url)
        # The title is the anchor's own text; it may be empty.
        entries.append(_render_pdf_link(full_url, link.text.strip()))
    return "".join(entries)
# Gradio interface: a single button that fetches the research PDF links and
# an HTML panel that displays them.
with gr.Blocks() as app:
    # NOTE(review): the button label looks like mis-decoded (mojibake) text;
    # confirm the file's encoding and restore the intended label.
    btn_fetch = gr.Button("PDF ๋งํฌ ๋ฐ ์ •๋ณด ์กฐํšŒ")
    output_html = gr.HTML()
    # The callback takes no inputs; the HTML string it returns fills the panel.
    btn_fetch.click(
        fn=fetch_pdf_links_and_titles,
        outputs=output_html
    )

# Start the server only when run as a script, so that importing this module
# does not launch a web server as a side effect.
if __name__ == "__main__":
    app.launch()