aliceblue11 committed on
Commit
9e50054
·
verified ·
1 Parent(s): 17330bc

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -0
app.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from selenium import webdriver
3
+ from selenium.webdriver.common.by import By
4
+ from selenium.webdriver.chrome.service import Service
5
+ from webdriver_manager.chrome import ChromeDriverManager
6
+ import time
7
+
8
# Function to scrape Naver blog title and content
def scrape_naver_blog(url):
    """Scrape the title and first content paragraph of a Naver blog post.

    Parameters
    ----------
    url : str
        Full URL of the Naver blog post.

    Returns
    -------
    str
        "제목: <title>\n내용: <content>" on success. If the title or
        content element cannot be located, that field carries an error
        message instead; if the WebDriver cannot be created (or
        navigation fails), a single error string is returned.
    """
    try:
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')  # Run in headless mode
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')

        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()), options=options
        )
        # FIX: the original only called driver.quit() on the success path,
        # leaking the Chrome process if driver.get() or the waits raised.
        # try/finally guarantees the browser is closed in every case.
        try:
            driver.get(url)
            time.sleep(3)  # Crude wait for the page to load

            try:
                # Extract title.
                # NOTE(review): absolute XPath — extremely brittle against
                # any Naver layout change; a CSS/class selector would be
                # more robust, but is kept as-is to preserve behavior.
                title_xpath = "/html/body/div[7]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div/div[8]/div[1]/div/table[2]/tbody/tr/td[2]/div[1]/div/div[1]/div/div/div[2]/div/p/span"
                title_element = driver.find_element(By.XPATH, title_xpath)
                title = title_element.text.strip()
            except Exception as e:
                title = f"Error extracting title: {e}"

            try:
                # Extract content (first paragraph only, per this XPath).
                content_xpath = "/html/body/div[7]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div/div[8]/div[1]/div/table[2]/tbody/tr/td[2]/div[1]/div/div[3]/div[4]/div/div/div/p[1]/span"
                content_element = driver.find_element(By.XPATH, content_xpath)
                content = content_element.text.strip()
            except Exception as e:
                content = f"Error extracting content: {e}"

            # Return the results
            return f"제목: {title}\n내용: {content}"
        finally:
            driver.quit()

    except Exception as e:
        return f"Error initializing WebDriver: {e}"
44
+
45
# Gradio Interface
def scrape_interface(url):
    """Gradio callback: forward *url* to the scraper and return its text."""
    return scrape_naver_blog(url)
49
+
50
# Wire up the web UI: one URL textbox in, one result textbox out.
url_box = gr.Textbox(label="Naver Blog URL")
result_box = gr.Textbox(label="Scraped Content")

interface = gr.Interface(
    fn=scrape_interface,
    inputs=url_box,
    outputs=result_box,
    title="Naver Blog Scraper",
    description="Enter the URL of a Naver blog to scrape its title and content.",
)

if __name__ == "__main__":
    # debug=True surfaces tracebacks in the console while developing.
    interface.launch(debug=True)