Update app.py
app.py CHANGED
@@ -4,7 +4,7 @@ import gradio as gr
 
 def convert_to_mobile_url(url):
     """
-
+    Convert a PC URL to a mobile URL.
     """
     if "m.blog.naver.com" not in url:
         if "blog.naver.com" in url:
@@ -16,41 +16,45 @@ def convert_to_mobile_url(url):
     return url
 
 def scrape_naver_blog(url):
+    """
+    Scrape the title, content, and image URLs of a Naver blog post.
+    """
     try:
-        #
-        print(f"Original URL: {url}")
-
-        # Convert to a mobile URL
+        # Mobile URL conversion
         mobile_url = convert_to_mobile_url(url)
         print(f"Converted Mobile URL: {mobile_url}")
-
+
         response = requests.get(mobile_url)
         response.raise_for_status()
-
-        # Debugging: check the HTTP response status
-        print(f"Response Status Code: {response.status_code}")
-
+
         soup = BeautifulSoup(response.text, 'html.parser')
-
+
         # Scrape the title
         title_element = soup.find("div", class_="se-module se-module-text se-title-text")
         title = title_element.get_text(strip=True) if title_element else "Title not found"
-
-        #
-
-
-        # Scrape the content
-        content_elements = soup.find_all("div", class_="se-module se-module-text se-quote")
+
+        # Scrape the body content
+        content_elements = soup.find_all("div", class_="se-module se-module-text")
         content = "\n".join(
             elem.get_text(strip=True) for elem in content_elements
         ) if content_elements else "Content not found"
-
-        #
+
+        # Scrape the image URLs
+        image_elements = soup.find_all("img", class_="se-image-resource")
+        image_urls = [
+            img["src"] for img in image_elements if "src" in img.attrs
+        ]
+
+        # Print debugging messages
+        print(f"Scraped Title: {title}")
         print(f"Scraped Content: {content}")
-
-
+        print(f"Scraped Images: {image_urls}")
+
+        # Return the result
+        result = f"Title: {title}\n\nContent: {content}\n\nImage URLs:\n" + "\n".join(image_urls)
+        return result
+
     except Exception as e:
-        # Debugging: print the error message
         print(f"Error: {e}")
         return f"Error: {e}"
 
@@ -63,7 +67,7 @@ interface = gr.Interface(
     inputs=gr.Textbox(label="Naver Blog URL"),
     outputs=gr.Textbox(label="Scraping Result"),
     title="Naver Blog Scraping",
-    description="Naver
+    description="Scrapes the title, content, and image URLs of a Naver blog."
 )
 
 if __name__ == "__main__":
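
The hunks above skip over the middle of convert_to_mobile_url, so the actual conversion step never appears in the diff. A minimal sketch of what the elided body plausibly does, assuming a plain host swap (only the guard clauses and the final return are taken from the diff; the replace call is an assumption):

def convert_to_mobile_url(url):
    """
    Convert a PC URL to a mobile URL (sketch of the elided body).
    """
    if "m.blog.naver.com" not in url:
        if "blog.naver.com" in url:
            # Assumption: swapping the host is all the helper does; the real
            # implementation may also normalize paths or query parameters.
            return url.replace("blog.naver.com", "m.blog.naver.com")
    return url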
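
The substantive change in scrape_naver_blog is the content selector: the old code collected only quote modules ("se-module se-module-text se-quote"), while the new code collects plain text modules ("se-module se-module-text"). BeautifulSoup matches a multi-class string against the exact value of the class attribute, so the new selector does not also sweep in the title module, whose class attribute carries the extra se-title-text token. A small self-contained demo of that matching behavior (the toy markup is an assumption mimicking Naver's SmartEditor classes):

from bs4 import BeautifulSoup

html = """
<div class="se-module se-module-text se-title-text">Post title</div>
<div class="se-module se-module-text">Paragraph text</div>
<div class="se-module se-module-text se-quote">A quote</div>
"""
soup = BeautifulSoup(html, "html.parser")

# A multi-class string is matched against the exact class attribute value,
# so this finds only the plain text module.
print([d.get_text() for d in soup.find_all("div", class_="se-module se-module-text")])
# -> ['Paragraph text']

# The old selector matched only the quote module.
print([d.get_text() for d in soup.find_all("div", class_="se-module se-module-text se-quote")])
# -> ['A quote']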
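
Because the updated scrape_naver_blog returns the formatted result string instead of only printing it, it can be smoke-tested without launching the Gradio interface; importing app only builds the interface, since launching sits behind the __main__ guard. A quick check, assuming the function is importable from app.py (the blog URL is a hypothetical placeholder, not one from the diff):

from app import scrape_naver_blog

# Hypothetical post URL; any public Naver blog post URL should work.
result = scrape_naver_blog("https://blog.naver.com/example_user/223000000000")
print(result)  # "Title: ...", "Content: ...", "Image URLs: ..." or "Error: ..."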