Update app.py
app.py
CHANGED
@@ -2,66 +2,67 @@ import gradio as gr
 import requests
 from bs4 import BeautifulSoup
 
-def …
-debug_logs = []  # for storing debug messages
-
+def scrape_naver_blog(url):
     try:
-        # …
-        …
-        if response.status_code == 200:
-            debug_logs.append("Successfully fetched the webpage.")
-            soup = BeautifulSoup(response.text, 'html.parser')
-
-            # Crawl the title
-            try:
-                title_element = soup.select_one("meta[property='og:title']")  # use the og:title meta tag
-                title = title_element["content"] if title_element else "Title not found"
-                debug_logs.append(f"Title extracted: {title}")
-            except Exception as e:
-                title = "Error extracting title"
-                debug_logs.append(f"Error extracting title: {e}")
-            …
-            except Exception as e:
-                content = "Error extracting content"
-                debug_logs.append(f"Error extracting content: {e}")
-            …
-            content = "Error accessing blog"
-            debug_logs.append(f"Error accessing blog: Status code {response.status_code}")
-        …
-        # …
-    gr.Markdown("# Naver Blog Crawler")
-    gr.Markdown("Enter a blog URL to extract its title and content.")
-    …
+        # Set a browser-like User-Agent
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"
+        }
+        response = requests.get(url, headers=headers)
+
+        # Check whether the request succeeded
+        if response.status_code != 200:
+            debug_message = f"HTTP request failed. Status code: {response.status_code}"
+            print(debug_message)
+            return debug_message
+
+        # Parse the HTML with BeautifulSoup
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Crawl the title
+        try:
+            title_element = soup.select_one(
+                "body > div:nth-of-type(7) > div:nth-of-type(1) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(1) > div:nth-of-type(1) > div > div:nth-of-type(8) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div:nth-of-type(1) > div > div:nth-of-type(1) > div > div > div:nth-of-type(2) > div > p > span"
+            )
+            title = title_element.get_text(strip=True) if title_element else "Title not found."
+        except Exception as e:
+            debug_message = f"Error while crawling the title: {e}"
+            print(debug_message)
+            title = debug_message
+
+        # Crawl the content
+        try:
+            content_element = soup.select_one(
+                "body > div:nth-of-type(7) > div:nth-of-type(1) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(1) > div:nth-of-type(1) > div > div:nth-of-type(8) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div:nth-of-type(1) > div > div:nth-of-type(3) > div:nth-of-type(4) > div > div > div > p:nth-of-type(1) > span"
+            )
+            content = content_element.get_text(strip=True) if content_element else "Content not found."
+        except Exception as e:
+            debug_message = f"Error while crawling the content: {e}"
+            print(debug_message)
+            content = debug_message
+
+        # Return the result
+        return {"title": title, "content": content}
+
+    except Exception as e:
+        debug_message = f"Error during crawling: {e}"
+        print(debug_message)
+        return debug_message
+
+def gradio_interface(url):
+    print(f"Input URL: {url}")
+    result = scrape_naver_blog(url)
+    print(f"Crawl result: {result}")
+    # On failure scrape_naver_blog returns a plain error string, not a dict
+    if isinstance(result, str):
+        return result
+    return f"Title: {result['title']}\nContent: {result['content']}"
+
+# Build the Gradio interface
+iface = gr.Interface(
+    fn=gradio_interface,
+    inputs=gr.Textbox(label="Enter a Naver blog URL"),
+    outputs=gr.Textbox(label="Crawl result"),
+    title="Naver Blog Crawler",
+    description="Crawls a Naver blog post and prints its title and content."
+)
+
+if __name__ == "__main__":
+    iface.launch()
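
A note on the new selectors: both are long nth-of-type chains of the kind copied from browser devtools, so they pin the exact position of a single span and fall back to "Title not found." / "Content not found." as soon as Naver changes its markup. The removed code instead read the og:title meta tag, which tends to survive layout changes. A minimal sketch of that style, assuming the target pages expose Open Graph tags (the og:description fallback is an assumption of this sketch, not something either version of app.py uses):

import requests
from bs4 import BeautifulSoup

def scrape_naver_blog_og(url):
    # Hypothetical helper, not part of this commit: prefer Open Graph
    # meta tags over deep positional CSS selectors.
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # raise on non-2xx instead of returning a string
    soup = BeautifulSoup(response.text, "html.parser")
    title_tag = soup.select_one("meta[property='og:title']")       # selector used by the removed code
    desc_tag = soup.select_one("meta[property='og:description']")  # assumption: present on post pages
    return {
        "title": title_tag.get("content") if title_tag else "Title not found.",
        "content": desc_tag.get("content") if desc_tag else "Content not found.",
    }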
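
To try the updated app locally, the usual Gradio workflow should apply (the script imports gradio, requests, and bs4; beautifulsoup4 is the pip name for bs4, and no versions are pinned in this commit):

pip install gradio requests beautifulsoup4
python app.py

iface.launch() then prints a local URL where the interface is served.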