springwater committed on
Commit
b1a3ea2
·
verified ·
1 Parent(s): dc39e39

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -21
app.py CHANGED
@@ -3,42 +3,36 @@ import re
3
  import requests
4
  from bs4 import BeautifulSoup
5
 
6
- def extract_pdf_links(url):
7
- # URL 유효성 검사 추가
8
- if not re.match(r'http[s]?://', url):
9
- return ["Invalid URL"]
10
-
11
  response = requests.get(url)
12
  soup = BeautifulSoup(response.text, 'html.parser')
13
 
 
 
 
14
  pdf_links = []
15
  for link in soup.find_all('a', href=True):
16
  if re.search(r'\.pdf', link['href']):
17
  pdf_links.append(link['href'])
18
 
19
- return pdf_links[:100]
 
20
 
21
- def filter_links_by_keyword(pdf_links, keyword):
22
- filtered_links = [link for link in pdf_links if keyword.lower() in link.lower()]
23
- return filtered_links
24
 
25
- def generate_html(pdf_links):
26
- html = ""
27
  for link in pdf_links:
28
  html += f'<a href="{link}" target="_blank" download>{link}</a><br/>'
29
- return html
30
 
31
- def main(url, keyword):
32
- pdf_links = extract_pdf_links(url)
33
- if keyword: # 키워드가 비어있지 않은 경우에만 필터링
34
- pdf_links = filter_links_by_keyword(pdf_links, keyword)
35
- return generate_html(pdf_links)
36
 
37
- title = "네이버 증권 리서치 링크 - https://finance.naver.com/research/company_list.naver"
38
 
39
- iface = gr.Interface(main,
40
- inputs=["text", "text"], # URL๊ณผ ํ‚ค์›Œ๋“œ ์ž…๋ ฅ
41
- outputs="text",
42
  title=title)
43
 
44
  iface.launch()
 
3
  import requests
4
  from bs4 import BeautifulSoup
5
 
6
+ def extract_pdf_links_and_title(url):
 
 
 
 
7
  response = requests.get(url)
8
  soup = BeautifulSoup(response.text, 'html.parser')
9
 
10
+ # 페이지 제목 추출
11
+ page_title = soup.title.text if soup.title else "No title found"
12
+
13
  pdf_links = []
14
  for link in soup.find_all('a', href=True):
15
  if re.search(r'\.pdf', link['href']):
16
  pdf_links.append(link['href'])
17
 
18
+ # PDF 링크와 페이지 제목을 반환
19
+ return pdf_links[:100], page_title
20
 
21
+ def generate_html(pdf_links_and_title):
22
+ pdf_links = pdf_links_and_title[0] # PDF 링크 리스트
23
+ page_title = pdf_links_and_title[1] # 페이지 제목
24
 
25
+ html = f"<h1>{page_title}</h1>" # ์ œ๋ชฉ์„ HTML์— ์ถ”๊ฐ€
 
26
  for link in pdf_links:
27
  html += f'<a href="{link}" target="_blank" download>{link}</a><br/>'
 
28
 
29
+ return html
 
 
 
 
30
 
31
+ title = "네이버 증권 리서치 링크- https://finance.naver.com/research/company_list.naver"
32
 
33
+ iface = gr.Interface(fn=extract_pdf_links_and_title,
34
+ inputs="text",
35
+ outputs=["text", "html"],
36
  title=title)
37
 
38
  iface.launch()