Update app.py
app.py
CHANGED
@@ -1,6 +1,8 @@
 import requests
 from bs4 import BeautifulSoup
 import gradio as gr
+import random
+import time
 
 def convert_to_mobile_url(url):
     """
@@ -18,9 +20,21 @@ def scrape_naver_blog(url):
     try:
         # Convert URL to mobile format if necessary
         url = convert_to_mobile_url(url)
+
+        # HTTP request headers
+        headers = {
+            "User-Agent": (
+                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+                "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+            ),
+            "Referer": "https://www.naver.com/",
+        }
+
+        # Random delay between 1 and 3 seconds
+        time.sleep(random.uniform(1, 3))
 
         # Send a GET request to the URL
-        response = requests.get(url)
+        response = requests.get(url, headers=headers)
         response.raise_for_status()  # Raise an error for HTTP issues
 
         # Parse the HTML content
@@ -28,7 +42,7 @@ def scrape_naver_blog(url):
 
         # Extract the title
         title_div = soup.find('div', class_='se-module se-module-text se-title-text')
-        title = title_div.get_text(strip=True) if title_div else "
+        title = title_div.get_text(strip=True) if title_div else "제목을 찾을 수 없습니다."
 
         # Extract text content excluding images
         text_components = soup.find_all('div', class_='se-module se-module-text')
@@ -37,7 +51,7 @@ def scrape_naver_blog(url):
         return f"제목: {title}\n내용: {content}"
 
     except Exception as e:
-        return f"
+        return f"오류 발생: {e}"
 
 # Gradio interface
 def gradio_interface(url):
@@ -45,13 +59,14 @@ def gradio_interface(url):
 
 iface = gr.Interface(
     fn=gradio_interface,
-    inputs=gr.Textbox(label="
-    outputs=gr.Textbox(label="
-    title="
+    inputs=gr.Textbox(label="네이버 블로그 URL 입력 (표준 또는 모바일)"),
+    outputs=gr.Textbox(label="스크래핑된 블로그 내용"),
+    title="네이버 블로그 스크래퍼 (텍스트만)",
     description=(
-        "
-        "
+        "네이버 블로그 URL(표준 또는 모바일)을 입력하면 제목과 텍스트 내용을 스크래핑합니다. "
+        "스크립트는 표준 URL을 자동으로 모바일 형식으로 변환합니다."
     ),
+    theme="compact",  # Compact theme for the Gradio interface
 )
 
 if __name__ == "__main__":
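The substance of this commit is the request hardening inside scrape_naver_blog: a browser-like User-Agent, a Naver Referer, and a 1-3 second random delay before the GET. A minimal standalone sketch of that same pattern, pulled out for testing on its own; the fetch_html helper name and the timeout value are illustrative and not part of app.py:

import random
import time

import requests

# Headers matching the ones added in this commit
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    ),
    "Referer": "https://www.naver.com/",
}

def fetch_html(url: str) -> str:
    # Space out successive requests by a random 1-3 second pause
    time.sleep(random.uniform(1, 3))
    # Send the GET with the browser-like headers; timeout is an added safeguard
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.raise_for_status()  # Surface HTTP errors instead of parsing an error page
    return response.text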
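The title fallback added in this commit depends on Naver's SmartEditor class names, with 'se-module se-module-text se-title-text' marking the title block. A quick self-contained check of that selector against a made-up HTML fragment (the fragment is illustrative; the real markup is fetched at runtime):

from bs4 import BeautifulSoup

# Made-up fragment mimicking the SmartEditor markup the scraper targets
sample_html = """
<div class="se-module se-module-text se-title-text"><p>Sample title</p></div>
<div class="se-module se-module-text"><p>Body paragraph</p></div>
"""

soup = BeautifulSoup(sample_html, "html.parser")
title_div = soup.find('div', class_='se-module se-module-text se-title-text')
# Prints "Sample title"; falls back to the Korean message when the block is missing
print(title_div.get_text(strip=True) if title_div else "제목을 찾을 수 없습니다.")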