AIRider committed on
Commit
34aa4af
·
verified ·
1 Parent(s): 5d11094

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -53
app.py CHANGED
@@ -2,66 +2,67 @@ import gradio as gr
2
  import requests
3
  from bs4 import BeautifulSoup
4
 
5
def scrape_blog(url):
    """Scrape the title and body text of a Naver blog post.

    Parameters:
        url: Blog post URL to fetch.

    Returns:
        dict with keys "title", "content", and "debug_logs" (a list of
        human-readable progress/error messages collected along the way).
    """
    debug_logs = []  # progress/error messages surfaced to the UI

    try:
        # Browser-like User-Agent avoids naive bot blocking; the timeout
        # keeps the UI from hanging forever on an unresponsive host
        # (the original call had no timeout).
        response = requests.get(
            url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10
        )
        debug_logs.append(f"Request sent to {url}")

        # Only parse on a successful response.
        if response.status_code == 200:
            debug_logs.append("Successfully fetched the webpage.")
            soup = BeautifulSoup(response.text, 'html.parser')

            # Title: the Open Graph meta tag is more stable than
            # positional selectors.
            try:
                title_element = soup.select_one("meta[property='og:title']")
                title = title_element["content"] if title_element else "Title not found"
                debug_logs.append(f"Title extracted: {title}")
            except Exception as e:
                title = "Error extracting title"
                debug_logs.append(f"Error extracting title: {e}")

            # Body: div#postViewArea is the classic Naver blog content
            # container. NOTE(review): newer Naver posts serve content in
            # an iframe — confirm this selector against live URLs.
            try:
                content_element = soup.select_one("div#postViewArea")
                content = content_element.get_text(strip=True) if content_element else "Content not found"
                debug_logs.append(f"Content extracted: {content}")
            except Exception as e:
                content = "Error extracting content"
                debug_logs.append(f"Error extracting content: {e}")
        else:
            title = "Error accessing blog"
            content = "Error accessing blog"
            debug_logs.append(f"Error accessing blog: Status code {response.status_code}")

    except Exception as e:
        # Network failure, timeout, invalid URL, parser error, etc.
        title = "Error accessing blog"
        content = "Error accessing blog"
        debug_logs.append(f"Error accessing blog: {e}")

    return {"title": title, "content": content, "debug_logs": debug_logs}
 
 
 
 
47
 
48
def interface_function(url):
    """Run scrape_blog on *url* and format its result for display."""
    scraped = scrape_blog(url)
    log_text = "\n".join(scraped["debug_logs"])
    summary = f"제목: {scraped['title']}\n\n내용: {scraped['content']}\n\n[Debug Logs]\n{log_text}"
    return summary
 
 
 
 
 
 
52
 
53
# Gradio interface setup: a Blocks layout with a URL textbox, a trigger
# button, and one text output wired to interface_function.
with gr.Blocks() as demo:
    gr.Markdown("# 네이버 블로그 크롤러")
    gr.Markdown("블로그 URL을 입력하면 제목과 내용을 추출합니다.")

    with gr.Row():
        url_input = gr.Textbox(label="네이버 블로그 URL")
        submit_button = gr.Button("크롤링 시작")

    output = gr.Textbox(label="결과")

    # Clicking the button scrapes the URL and shows the formatted result.
    submit_button.click(interface_function, inputs=url_input, outputs=output)

# Run the app (blocks until the server is stopped).
demo.launch()
 
2
  import requests
3
  from bs4 import BeautifulSoup
4
 
5
def scrape_naver_blog(url):
    """Scrape the title and first content span of a Naver blog post.

    Parameters:
        url: Blog post URL to fetch.

    Returns:
        dict with keys "제목" (title) and "내용" (content). On failure the
        error message is placed in both fields so callers can always index
        the same keys. (The original returned a bare string on error paths,
        which made the caller's dict lookups raise TypeError.)
    """
    try:
        # Browser-like User-Agent so Naver serves the normal page.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"
        }
        # timeout keeps the UI responsive if the host never answers
        # (the original call had no timeout).
        response = requests.get(url, headers=headers, timeout=10)

        # Bail out early on any non-200 response.
        if response.status_code != 200:
            debug_message = f"HTTP 요청 실패. 상태 코드: {response.status_code}"
            print(debug_message)
            return {"제목": debug_message, "내용": debug_message}

        # Parse the fetched HTML.
        soup = BeautifulSoup(response.text, 'html.parser')

        # Title: deep positional selector copied from the rendered page.
        # NOTE(review): nth-of-type chains are extremely brittle — any
        # layout/skin change breaks them; confirm against live markup.
        try:
            title_element = soup.select_one(
                "body > div:nth-of-type(7) > div:nth-of-type(1) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(1) > div:nth-of-type(1) > div > div:nth-of-type(8) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div:nth-of-type(1) > div > div:nth-of-type(1) > div > div > div:nth-of-type(2) > div > p > span"
            )
            title = title_element.get_text(strip=True) if title_element else "제목을 찾을 수 없습니다."
        except Exception as e:
            debug_message = f"제목 크롤링 중 오류 발생: {e}"
            print(debug_message)
            title = debug_message

        # Content: same caveat — positional selector for the first
        # paragraph span of the post body.
        try:
            content_element = soup.select_one(
                "body > div:nth-of-type(7) > div:nth-of-type(1) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(1) > div:nth-of-type(1) > div > div:nth-of-type(8) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div:nth-of-type(1) > div > div:nth-of-type(3) > div:nth-of-type(4) > div > div > div > p:nth-of-type(1) > span"
            )
            content = content_element.get_text(strip=True) if content_element else "내용을 찾을 수 없습니다."
        except Exception as e:
            debug_message = f"내용 크롤링 중 오류 발생: {e}"
            print(debug_message)
            content = debug_message

        return {"제목": title, "내용": content}

    except Exception as e:
        # Network failure, timeout, invalid URL, parser error, etc.
        debug_message = f"전체 크롤링 중 오류 발생: {e}"
        print(debug_message)
        return {"제목": debug_message, "내용": debug_message}
51
 
52
def gradio_interface(url):
    """Gradio callback: scrape *url* and return a display string.

    Handles both result shapes of scrape_naver_blog: a dict with
    "제목"/"내용" keys on success, or a plain error-message string from its
    failure paths (the original crashed with TypeError on the latter).
    """
    print(f"입력된 URL: {url}")
    result = scrape_naver_blog(url)
    # Original literal was mojibake ("크롤�� 결과"); restored to "크롤링 결과".
    print(f"크롤링 결과: {result}")
    if isinstance(result, str):
        # Error path: pass the message straight through to the UI.
        return result
    return f"제목: {result['제목']}\n내용: {result['내용']}"
57
 
58
# Gradio interface setup: one URL textbox in, one result textbox out,
# backed by gradio_interface.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(label="네이버 블로그 URL 입력"),
    outputs=gr.Textbox(label="크롤링 결과"),
    title="네이버 블로그 크롤러",
    description="네이버 블로그의 제목과 내용을 크롤링하여 출력합니다."
)

# Launch the web app only when executed as a script (not on import).
if __name__ == "__main__":
    iface.launch()