Spaces:

aliceblue11
/

naver_blog_00

Sleeping

App Files Community

aliceblue11 committed on Jan 13

Commit

b66294a

verified ·

1 Parent(s): 9b36fe6

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -23

app.py CHANGED Viewed

@@ -1,12 +1,11 @@
 import requests
 from bs4 import BeautifulSoup
 import gradio as gr
-import random
 import time
 def convert_to_mobile_url(url):
     """
-    Converts a standard Naver blog URL to its mobile version.
     """
     if url.startswith("https://blog.naver.com/"):
         url_parts = url.split("/")
@@ -14,61 +13,55 @@ def convert_to_mobile_url(url):
         post_id = url_parts[-1]
         mobile_url = f"https://m.blog.naver.com/{blog_id}/{post_id}"
         return mobile_url
-    return url  # Return the original URL if it's already in mobile format
 def scrape_naver_blog(url):
     try:
-        # Convert URL to mobile format if necessary
         url = convert_to_mobile_url(url)
-        # HTTP request headers
         headers = {
             "User-Agent": (
                 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                 "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
             ),
-            "Referer": "https://www.naver.com/",
         }
-        # Random delay between 1 to 3 seconds
-        delay = random.uniform(1, 3)
-        time.sleep(delay)
-        # Send a GET request to the URL
         response = requests.get(url, headers=headers)
-        response.raise_for_status()  # Raise an error for HTTP issues
-        # Parse the HTML content
         soup = BeautifulSoup(response.text, 'html.parser')
-        # Extract the title
         title_div = soup.find('div', class_='se-module se-module-text se-title-text')
         title = title_div.get_text(strip=True) if title_div else "제목을 찾을 수 없습니다."
-        # Extract text content excluding images
         text_components = soup.find_all('div', class_='se-module se-module-text')
         content = "\n".join(component.get_text(strip=True) for component in text_components if component)
         return f"제목: {title}\n내용: {content}"
     except Exception as e:
         return f"오류 발생: {e}"
-# Gradio interface
 def gradio_interface(url):
     return scrape_naver_blog(url)
 iface = gr.Interface(
     fn=gradio_interface,
     inputs=gr.Textbox(label="네이버 블로그 URL 입력 (표준 또는 모바일)"),
     outputs=gr.Textbox(label="스크래핑된 블로그 내용"),
     title="네이버 블로그 스크래퍼 (텍스트만)",
     description=(
-        "네이버 블로그 URL(표준 또는 모바일)을 입력하면 제목과 텍스트 내용을 스크래핑합니다. "
-        "스크립트는 표준 URL을 자동으로 모바일 형식으로 변환하며, 헤더와 랜덤 딜레이를 설정하여 요청을 자연스럽게 만듭니다."
-    ),
-    theme="compact",  # 간결한 Gradio 인터페이스 테마
-)
-if __name__ == "__main__":
-    iface.launch()

 import requests
 from bs4 import BeautifulSoup
 import gradio as gr
 import time
 def convert_to_mobile_url(url):
     """
+    표준 네이버 블로그 URL을 모바일 URL로 변환합니다.
     """
     if url.startswith("https://blog.naver.com/"):
         url_parts = url.split("/")
         post_id = url_parts[-1]
         mobile_url = f"https://m.blog.naver.com/{blog_id}/{post_id}"
         return mobile_url
+    return url  # 이미 모바일 URL이면 그대로 반환
 def scrape_naver_blog(url):
     try:
+        # 표준 URL을 모바일 URL로 변환
         url = convert_to_mobile_url(url)
+        # HTTP 요청에 필요한 헤더 설정
         headers = {
             "User-Agent": (
                 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                 "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
             ),
+            "Referer": "https://www.naver.com/",  # 네이버 메인 페이지를 참조 페이지로 설정
         }
+        # 요청 전 3초 지연
+        time.sleep(3)
+        # URL로 GET 요청 보내기
         response = requests.get(url, headers=headers)
+        response.raise_for_status()  # HTTP 문제 발생 시 예외 발생
+        # HTML 내용 파싱
         soup = BeautifulSoup(response.text, 'html.parser')
+        # 제목 추출
         title_div = soup.find('div', class_='se-module se-module-text se-title-text')
         title = title_div.get_text(strip=True) if title_div else "제목을 찾을 수 없습니다."
+        # 이미지 제외 텍스트 내용 추출
         text_components = soup.find_all('div', class_='se-module se-module-text')
         content = "\n".join(component.get_text(strip=True) for component in text_components if component)
         return f"제목: {title}\n내용: {content}"
     except Exception as e:
+        # 오류 발생 시 메시지 반환
         return f"오류 발생: {e}"
+# Gradio 인터페이스 함수
 def gradio_interface(url):
     return scrape_naver_blog(url)
+# Gradio 인터페이스 설정
 iface = gr.Interface(
     fn=gradio_interface,
     inputs=gr.Textbox(label="네이버 블로그 URL 입력 (표준 또는 모바일)"),
     outputs=gr.Textbox(label="스크래핑된 블로그 내용"),
     title="네이버 블로그 스크래퍼 (텍스트만)",
     description=(
+        "네이버 블로그 URL(표준 또는 모바일