Update app.py
app.py
CHANGED
@@ -2,66 +2,67 @@ import gradio as gr
 import requests
 from bs4 import BeautifulSoup
 
-def …
-debug_logs = []  # for storing debug messages
-
+def scrape_naver_blog(url):
     try:
-        # …
-        …
-        if response.status_code == 200:
-            debug_logs.append("Successfully fetched the webpage.")
-            soup = BeautifulSoup(response.text, 'html.parser')
-
-            # Crawl the title
-            try:
-                title_element = soup.select_one("meta[property='og:title']")  # use the og:title meta tag
-                title = title_element["content"] if title_element else "Title not found"
-                debug_logs.append(f"Title extracted: {title}")
-            except Exception as e:
-                title = "Error extracting title"
-                debug_logs.append(f"Error extracting title: {e}")
-            …
-            except Exception as e:
-                content = "Error extracting content"
-                debug_logs.append(f"Error extracting content: {e}")
-            …
-            content = "Error accessing blog"
-            debug_logs.append(f"Error accessing blog: Status code {response.status_code}")
-        …
-        # …
-    gr.Markdown("# Naver Blog Crawler")
-    gr.Markdown("Enter a blog URL to extract its title and content.")
-    …
+        # Set a browser-like User-Agent
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"
+        }
+        response = requests.get(url, headers=headers)
+
+        # Check whether the request succeeded
+        if response.status_code != 200:
+            debug_message = f"HTTP request failed. Status code: {response.status_code}"
+            print(debug_message)
+            return debug_message
+
+        # Parse the HTML with BeautifulSoup
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Crawl the title
+        try:
+            title_element = soup.select_one(
+                "body > div:nth-of-type(7) > div:nth-of-type(1) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(1) > div:nth-of-type(1) > div > div:nth-of-type(8) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div:nth-of-type(1) > div > div:nth-of-type(1) > div > div > div:nth-of-type(2) > div > p > span"
+            )
+            title = title_element.get_text(strip=True) if title_element else "Title not found."
+        except Exception as e:
+            debug_message = f"Error while crawling the title: {e}"
+            print(debug_message)
+            title = debug_message
+
+        # Crawl the content
+        try:
+            content_element = soup.select_one(
+                "body > div:nth-of-type(7) > div:nth-of-type(1) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(1) > div:nth-of-type(1) > div > div:nth-of-type(8) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div:nth-of-type(1) > div > div:nth-of-type(3) > div:nth-of-type(4) > div > div > div > p:nth-of-type(1) > span"
+            )
+            content = content_element.get_text(strip=True) if content_element else "Content not found."
+        except Exception as e:
+            debug_message = f"Error while crawling the content: {e}"
+            print(debug_message)
+            content = debug_message
+
+        # Return the result
+        return {"title": title, "content": content}
+
+    except Exception as e:
+        debug_message = f"Error during crawling: {e}"
+        print(debug_message)
+        return debug_message
+
+def gradio_interface(url):
+    print(f"Input URL: {url}")
+    result = scrape_naver_blog(url)
+    print(f"Crawl result: {result}")
+    # On failure scrape_naver_blog returns a plain error string, not a dict
+    if isinstance(result, str):
+        return result
+    return f"Title: {result['title']}\nContent: {result['content']}"
+
+# Build the Gradio interface
+iface = gr.Interface(
+    fn=gradio_interface,
+    inputs=gr.Textbox(label="Enter a Naver blog URL"),
+    outputs=gr.Textbox(label="Crawl result"),
+    title="Naver Blog Crawler",
+    description="Crawls a Naver blog post and prints its title and content."
+)
+
+if __name__ == "__main__":
+    iface.launch()
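
A note on the new selectors: both are long nth-of-type chains of the kind copied from browser devtools, so they pin the exact position of a single span and fall back to "Title not found." / "Content not found." as soon as Naver changes its markup. The removed code instead read the og:title meta tag, which tends to survive layout changes. A minimal sketch of that style, assuming the target pages expose Open Graph tags (the og:description fallback is an assumption of this sketch, not something either version of app.py uses):

import requests
from bs4 import BeautifulSoup

def scrape_naver_blog_og(url):
    # Hypothetical helper, not part of this commit: prefer Open Graph
    # meta tags over deep positional CSS selectors.
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # raise on non-2xx instead of returning a string
    soup = BeautifulSoup(response.text, "html.parser")
    title_tag = soup.select_one("meta[property='og:title']")       # selector used by the removed code
    desc_tag = soup.select_one("meta[property='og:description']")  # assumption: present on post pages
    return {
        "title": title_tag.get("content") if title_tag else "Title not found.",
        "content": desc_tag.get("content") if desc_tag else "Content not found.",
    }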
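
To try the updated app locally, the usual Gradio workflow should apply (the script imports gradio, requests, and bs4; beautifulsoup4 is the pip name for bs4, and no versions are pinned in this commit):

pip install gradio requests beautifulsoup4
python app.py

iface.launch() then prints a local URL where the interface is served.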