aliceblue11 committed
Commit 15112ab · verified · 1 Parent(s): 97b8c76

Update app.py

Files changed (1)
  1. app.py +20 -34
app.py CHANGED
@@ -4,54 +4,40 @@ import gradio as gr
 
 def scrape_naver_blog(url):
     try:
-        # Debug log: check the URL
-        print(f"[DEBUG] 스크래핑할 URL: {url}")
+        # Check if the URL is a mobile URL
+        if not url.startswith("https://m.blog.naver.com"):
+            return "Error: Please provide a valid mobile URL (https://m.blog.naver.com)."
 
-        # Fetch the Naver blog HTML
-        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"}
-        response = requests.get(url, headers=headers)
+        # Send a GET request to the URL
+        response = requests.get(url)
+        response.raise_for_status()  # Raise an error for HTTP issues
 
-        # Check the HTTP status code
-        print(f"[DEBUG] HTTP 응답 상태 코드: {response.status_code}")
-        if response.status_code != 200:
-            return f"오류: 페이지에 접근할 수 없습니다. HTTP 상태 코드: {response.status_code}"
+        # Parse the HTML content
+        soup = BeautifulSoup(response.text, 'html.parser')
 
-        # Parse the HTML
-        soup = BeautifulSoup(response.text, "html.parser")
+        # Extract the title
+        title_div = soup.find('div', class_='se-module se-module-text se-title-text')
+        title = title_div.get_text(strip=True) if title_div else "Title not found."
 
-        # Scrape the title and content
-        title_xpath = "div > div > div > div:nth-of-type(10) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div > div > div > div:nth-of-type(1) > div > div > div:nth-of-type(2)"
-        content_xpath = "div > div > div > div:nth-of-type(10) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div > div > div > div:nth-of-type(2) > div:nth-of-type(2) > div > div"
+        # Extract the content
+        content_divs = soup.find_all('div', class_='se-module se-module-text se-quote')
+        content = "\n".join(div.get_text(strip=True) for div in content_divs) if content_divs else "Content not found."
 
-        # Convert to CSS selectors
-        title_element = soup.select_one(title_xpath.replace(" > ", " > "))
-        content_element = soup.select_one(content_xpath.replace(" > ", " > "))
-
-        if not title_element or not content_element:
-            return "오류: 제공된 XPath로 제목이나 내용을 찾을 수 없습니다."
-
-        # Extract the text
-        title = title_element.get_text(strip=True)
-        content = content_element.get_text(strip=True)
-
-        # Return the result
         return f"제목: {title}\n내용: {content}"
 
     except Exception as e:
-        # Debug log: on exception
-        print(f"[DEBUG] 예외 발생: {str(e)}")
-        return f"오류가 발생했습니다: {str(e)}"
+        return f"Error occurred: {e}"
 
-# Gradio interface setup
+# Gradio interface
 def gradio_interface(url):
     return scrape_naver_blog(url)
 
 iface = gr.Interface(
     fn=gradio_interface,
-    inputs=gr.Textbox(label="네이버 블로그 URL"),
-    outputs=gr.Textbox(label="스크래핑 결과"),
-    title="네이버 블로그 스크래퍼",
-    description="네이버 블로그 URL 입력하면 제목과 내용을 스크래핑하여 출력합니다.",
+    inputs=gr.Textbox(label="Enter Naver Blog Mobile URL"),
+    outputs=gr.Textbox(label="Scraped Blog Content"),
+    title="Naver Blog Scraper",
+    description="Enter a Naver Blog mobile URL to scrape the title and content (text only).",
 )
 
 if __name__ == "__main__":
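
The updated parser keys on exact class-attribute strings (se-module se-module-text se-title-text for the title, se-module se-module-text se-quote for the body). A quick way to sanity-check that lookup without requesting a live Naver page is to run the same find/find_all calls against a small inline snippet. This is only a minimal offline sketch: the SAMPLE_HTML below is invented for illustration and is not Naver's actual markup.

# Offline sketch of the extraction logic introduced in this commit.
from bs4 import BeautifulSoup

SAMPLE_HTML = """
<div class="se-module se-module-text se-title-text"><p>My sample post title</p></div>
<div class="se-module se-module-text se-quote"><p>First paragraph of the quote block.</p></div>
<div class="se-module se-module-text se-quote"><p>Second paragraph.</p></div>
"""

soup = BeautifulSoup(SAMPLE_HTML, "html.parser")

# class_ with a multi-class string matches only an identical class attribute
# value, which mirrors how the updated app.py looks up the title block.
title_div = soup.find("div", class_="se-module se-module-text se-title-text")
title = title_div.get_text(strip=True) if title_div else "Title not found."

content_divs = soup.find_all("div", class_="se-module se-module-text se-quote")
content = "\n".join(div.get_text(strip=True) for div in content_divs) if content_divs else "Content not found."

print(f"제목: {title}\n내용: {content}")

Because the multi-class string must match the class attribute exactly (same classes, same order), a post whose body blocks carry a different class string than se-module se-module-text se-quote will simply produce the "Content not found." fallback rather than an exception.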