Update app.py
app.py
CHANGED
@@ -1,6 +1,8 @@
 import requests
 from bs4 import BeautifulSoup
 import gradio as gr
+import random
+import time
 
 def convert_to_mobile_url(url):
     """
@@ -18,9 +20,21 @@ def scrape_naver_blog(url):
     try:
         # Convert URL to mobile format if necessary
         url = convert_to_mobile_url(url)
+
+        # HTTP request headers
+        headers = {
+            "User-Agent": (
+                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+                "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+            ),
+            "Referer": "https://www.naver.com/",
+        }
+
+        # Random delay between 1 and 3 seconds
+        time.sleep(random.uniform(1, 3))
 
         # Send a GET request to the URL
-        response = requests.get(url)
+        response = requests.get(url, headers=headers)
         response.raise_for_status()  # Raise an error for HTTP issues
 
         # Parse the HTML content
@@ -28,7 +42,7 @@ def scrape_naver_blog(url):
 
         # Extract the title
         title_div = soup.find('div', class_='se-module se-module-text se-title-text')
-        title = title_div.get_text(strip=True) if title_div else "
+        title = title_div.get_text(strip=True) if title_div else "제목을 찾을 수 없습니다."
 
         # Extract text content excluding images
         text_components = soup.find_all('div', class_='se-module se-module-text')
@@ -37,7 +51,7 @@ def scrape_naver_blog(url):
         return f"제목: {title}\n내용: {content}"
 
     except Exception as e:
-        return f"
+        return f"오류 발생: {e}"
 
 # Gradio interface
 def gradio_interface(url):
@@ -45,13 +59,14 @@ def gradio_interface(url):
 
 iface = gr.Interface(
     fn=gradio_interface,
-    inputs=gr.Textbox(label="
-    outputs=gr.Textbox(label="
-    title="
+    inputs=gr.Textbox(label="네이버 블로그 URL 입력 (표준 또는 모바일)"),
+    outputs=gr.Textbox(label="스크래핑된 블로그 내용"),
+    title="네이버 블로그 스크래퍼 (텍스트만)",
     description=(
-        "
-        "
+        "네이버 블로그 URL(표준 또는 모바일)을 입력하면 제목과 텍스트 내용을 스크래핑합니다. "
+        "스크립트는 표준 URL을 자동으로 모바일 형식으로 변환합니다."
     ),
+    theme="compact",  # Compact theme for the Gradio interface
 )
 
 if __name__ == "__main__":
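The substance of this commit is the request hardening inside scrape_naver_blog: a browser-like User-Agent, a Naver Referer, and a 1-3 second random delay before the GET. A minimal standalone sketch of that same pattern, pulled out for testing on its own; the fetch_html helper name and the timeout value are illustrative and not part of app.py:

import random
import time

import requests

# Headers matching the ones added in this commit
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    ),
    "Referer": "https://www.naver.com/",
}

def fetch_html(url: str) -> str:
    # Space out successive requests by a random 1-3 second pause
    time.sleep(random.uniform(1, 3))
    # Send the GET with the browser-like headers; timeout is an added safeguard
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.raise_for_status()  # Surface HTTP errors instead of parsing an error page
    return response.text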
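The title fallback added in this commit depends on Naver's SmartEditor class names, with 'se-module se-module-text se-title-text' marking the title block. A quick self-contained check of that selector against a made-up HTML fragment (the fragment is illustrative; the real markup is fetched at runtime):

from bs4 import BeautifulSoup

# Made-up fragment mimicking the SmartEditor markup the scraper targets
sample_html = """
<div class="se-module se-module-text se-title-text"><p>Sample title</p></div>
<div class="se-module se-module-text"><p>Body paragraph</p></div>
"""

soup = BeautifulSoup(sample_html, "html.parser")
title_div = soup.find('div', class_='se-module se-module-text se-title-text')
# Prints "Sample title"; falls back to the Korean message when the block is missing
print(title_div.get_text(strip=True) if title_div else "제목을 찾을 수 없습니다.")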