aliceblue11 committed on
Commit
fb38986
·
verified ·
1 Parent(s): 2f00f7a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -8
app.py CHANGED
@@ -1,6 +1,8 @@
1
  import requests
2
  from bs4 import BeautifulSoup
3
  import gradio as gr
 
 
4
 
5
  def convert_to_mobile_url(url):
6
  """
@@ -18,9 +20,21 @@ def scrape_naver_blog(url):
18
  try:
19
  # Convert URL to mobile format if necessary
20
  url = convert_to_mobile_url(url)
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  # Send a GET request to the URL
23
- response = requests.get(url)
24
  response.raise_for_status() # Raise an error for HTTP issues
25
 
26
  # Parse the HTML content
@@ -28,7 +42,7 @@ def scrape_naver_blog(url):
28
 
29
  # Extract the title
30
  title_div = soup.find('div', class_='se-module se-module-text se-title-text')
31
- title = title_div.get_text(strip=True) if title_div else "Title not found."
32
 
33
  # Extract text content excluding images
34
  text_components = soup.find_all('div', class_='se-module se-module-text')
@@ -37,7 +51,7 @@ def scrape_naver_blog(url):
37
  return f"제목: {title}\n내용: {content}"
38
 
39
  except Exception as e:
40
- return f"Error occurred: {e}"
41
 
42
  # Gradio interface
43
  def gradio_interface(url):
@@ -45,13 +59,14 @@ def gradio_interface(url):
45
 
46
  iface = gr.Interface(
47
  fn=gradio_interface,
48
- inputs=gr.Textbox(label="Enter Naver Blog URL (Standard or Mobile)"),
49
- outputs=gr.Textbox(label="Scraped Blog Content"),
50
- title="Naver Blog Scraper (Text Only)",
51
  description=(
52
- "Enter a Naver Blog URL (standard or mobile) to scrape the title and text content only. "
53
- "The script will automatically convert standard URLs to mobile format."
54
  ),
 
55
  )
56
 
57
  if __name__ == "__main__":
 
1
  import requests
2
  from bs4 import BeautifulSoup
3
  import gradio as gr
4
+ import random
5
+ import time
6
 
7
  def convert_to_mobile_url(url):
8
  """
 
20
  try:
21
  # Convert URL to mobile format if necessary
22
  url = convert_to_mobile_url(url)
23
+
24
+ # HTTP request headers
25
+ headers = {
26
+ "User-Agent": (
27
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
28
+ "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
29
+ ),
30
+ "Referer": "https://www.naver.com/",
31
+ }
32
+
33
+ # Random delay between 1 to 3 seconds
34
+ time.sleep(random.uniform(1, 3))
35
 
36
  # Send a GET request to the URL
37
+ response = requests.get(url, headers=headers)
38
  response.raise_for_status() # Raise an error for HTTP issues
39
 
40
  # Parse the HTML content
 
42
 
43
  # Extract the title
44
  title_div = soup.find('div', class_='se-module se-module-text se-title-text')
45
+ title = title_div.get_text(strip=True) if title_div else "제목을 찾을 수 없습니다."
46
 
47
  # Extract text content excluding images
48
  text_components = soup.find_all('div', class_='se-module se-module-text')
 
51
  return f"제목: {title}\n내용: {content}"
52
 
53
  except Exception as e:
54
+ return f"오류 발생: {e}"
55
 
56
  # Gradio interface
57
  def gradio_interface(url):
 
59
 
60
  iface = gr.Interface(
61
  fn=gradio_interface,
62
+ inputs=gr.Textbox(label="네이버 블로그 URL 입력 (표준 또는 모바일)"),
63
+ outputs=gr.Textbox(label="스크래핑된 블로그 내용"),
64
+ title="네이버 블로그 스크래퍼 (텍스트만)",
65
  description=(
66
+ "네이버 블로그 URL(표준 또는 모바일) 입력하면 제목과 텍스트 내용을 스크래핑합니다. "
67
+ "스크립트는 표준 URL을 자동으로 모바일 형식으로 변환합니다."
68
  ),
69
+ theme="compact", # Gradio 인터페이스의 간결한 테마 적용
70
  )
71
 
72
  if __name__ == "__main__":